rover-df 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +99 -5
- data/lib/rover.rb +14 -6
- data/lib/rover/data_frame.rb +83 -9
- data/lib/rover/group.rb +49 -0
- data/lib/rover/vector.rb +129 -21
- data/lib/rover/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b003d311b623fdd38fee4c6fa76129ad4bba042e8193c1872928cb05085daad3
|
4
|
+
data.tar.gz: d0c8c04b2a8aec3ea5b7616cbcda61f03a12c96fc8f9a0c7aa29fc898948b759
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8033d8ae9e5fb8c8e767ba68897d37745cc5d35a7a82cb2847e2b1d2c3adf8eeb406914cd47949d8f4c3f21307617ab550f435e7a5c257fe1187ed47dd943829
|
7
|
+
data.tar.gz: acfdca4ad081e2722c4b5269824de123d26aadab28532837d2bfc717c2ca263f73dc4335963beae654b24d4c17cebbefdcce761a946f391340a306c9ca2a8c9b
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
## 0.1.1 (2020-06-10)
|
2
|
+
|
3
|
+
- Added methods and options for types
|
4
|
+
- Added grouping
|
5
|
+
- Added one-hot encoding
|
6
|
+
- Added `sample` to data frames
|
7
|
+
- Added `tally`, `var`, `std`, `take`, `count`, and `length` to vectors
|
8
|
+
- Improved error message for `read_csv` with no headers
|
9
|
+
|
1
10
|
## 0.1.0 (2020-05-13)
|
2
11
|
|
3
12
|
- First release
|
data/README.md
CHANGED
@@ -2,7 +2,9 @@
|
|
2
2
|
|
3
3
|
Simple, powerful data frames for Ruby
|
4
4
|
|
5
|
-
:mountain: Designed for data exploration and machine learning, and powered by [Numo](https://github.com/ruby-numo/numo-narray)
|
5
|
+
:mountain: Designed for data exploration and machine learning, and powered by [Numo](https://github.com/ruby-numo/numo-narray)
|
6
|
+
|
7
|
+
[![Build Status](https://travis-ci.org/ankane/rover.svg?branch=master)](https://travis-ci.org/ankane/rover)
|
6
8
|
|
7
9
|
## Installation
|
8
10
|
|
@@ -16,12 +18,22 @@ gem 'rover-df'
|
|
16
18
|
|
17
19
|
A data frame is an in-memory table. It’s a useful data structure for data analysis and machine learning. It uses columnar storage for fast operations on columns.
|
18
20
|
|
21
|
+
Try it out for forecasting by clicking the button below:
|
22
|
+
|
23
|
+
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ankane/ml-stack/master?filepath=Forecasting.ipynb)
|
24
|
+
|
25
|
+
Use the `Run` button (or `SHIFT` + `ENTER`) to run each line.
|
26
|
+
|
19
27
|
## Creating Data Frames
|
20
28
|
|
21
29
|
From an array
|
22
30
|
|
23
31
|
```ruby
|
24
|
-
Rover::DataFrame.new([
|
32
|
+
Rover::DataFrame.new([
|
33
|
+
{a: 1, b: "one"},
|
34
|
+
{a: 2, b: "two"},
|
35
|
+
{a: 3, b: "three"}
|
36
|
+
])
|
25
37
|
```
|
26
38
|
|
27
39
|
From a hash
|
@@ -33,7 +45,7 @@ Rover::DataFrame.new({
|
|
33
45
|
})
|
34
46
|
```
|
35
47
|
|
36
|
-
From
|
48
|
+
From Active Record
|
37
49
|
|
38
50
|
```ruby
|
39
51
|
Rover::DataFrame.new(User.all)
|
@@ -75,6 +87,8 @@ Select a column
|
|
75
87
|
df[:a]
|
76
88
|
```
|
77
89
|
|
90
|
+
> Note that strings and symbols are different keys, just like hashes
|
91
|
+
|
78
92
|
Select multiple columns
|
79
93
|
|
80
94
|
```ruby
|
@@ -158,13 +172,39 @@ df[:a].min
|
|
158
172
|
df[:a].max
|
159
173
|
```
|
160
174
|
|
175
|
+
Count occurrences
|
176
|
+
|
177
|
+
```ruby
|
178
|
+
df[:a].tally
|
179
|
+
```
|
180
|
+
|
161
181
|
Cross tabulation
|
162
182
|
|
163
183
|
```ruby
|
164
184
|
df[:a].crosstab(df[:b])
|
165
185
|
```
|
166
186
|
|
167
|
-
## Updating Data
|
187
|
+
## Grouping
|
188
|
+
|
189
|
+
Group
|
190
|
+
|
191
|
+
```ruby
|
192
|
+
df.group(:a).count
|
193
|
+
```
|
194
|
+
|
195
|
+
Works with all summary statistics
|
196
|
+
|
197
|
+
```ruby
|
198
|
+
df.group(:a).max(:b)
|
199
|
+
```
|
200
|
+
|
201
|
+
Multiple groups
|
202
|
+
|
203
|
+
```ruby
|
204
|
+
df.group([:a, :b]).count
|
205
|
+
```
|
206
|
+
|
207
|
+
## Updating Data
|
168
208
|
|
169
209
|
Add a new column
|
170
210
|
|
@@ -214,7 +254,7 @@ Rename a column
|
|
214
254
|
df[:new_a] = df.delete(:a)
|
215
255
|
```
|
216
256
|
|
217
|
-
Sort
|
257
|
+
Sort rows
|
218
258
|
|
219
259
|
```ruby
|
220
260
|
df.sort_by! { |r| r[:a] }
|
@@ -258,6 +298,20 @@ Left join
|
|
258
298
|
df.left_join(other_df)
|
259
299
|
```
|
260
300
|
|
301
|
+
## Encoding
|
302
|
+
|
303
|
+
One-hot encoding
|
304
|
+
|
305
|
+
```ruby
|
306
|
+
df.one_hot
|
307
|
+
```
|
308
|
+
|
309
|
+
Drop a variable in each category to avoid the dummy variable trap
|
310
|
+
|
311
|
+
```ruby
|
312
|
+
df.one_hot(drop: true)
|
313
|
+
```
|
314
|
+
|
261
315
|
## Conversion
|
262
316
|
|
263
317
|
Array of hashes
|
@@ -284,6 +338,46 @@ CSV
|
|
284
338
|
df.to_csv
|
285
339
|
```
|
286
340
|
|
341
|
+
## Types
|
342
|
+
|
343
|
+
Pass column types when creating a data frame
|
344
|
+
|
345
|
+
```ruby
|
346
|
+
Rover::DataFrame.new(data, types: {"a" => :int, "b" => :float})
|
347
|
+
```
|
348
|
+
|
349
|
+
Or
|
350
|
+
|
351
|
+
```ruby
|
352
|
+
Rover.read_csv("data.csv", types: {"a" => :int, "b" => :float})
|
353
|
+
```
|
354
|
+
|
355
|
+
Supported types are:
|
356
|
+
|
357
|
+
- boolean - `bool`
|
358
|
+
- float - `float`, `float32`
|
359
|
+
- integer - `int`, `int32`, `int16`, `int8`
|
360
|
+
- unsigned integer - `uint`, `uint32`, `uint16`, `uint8`
|
361
|
+
- object - `object`
|
362
|
+
|
363
|
+
Get column types
|
364
|
+
|
365
|
+
```ruby
|
366
|
+
df.types
|
367
|
+
```
|
368
|
+
|
369
|
+
For a specific column
|
370
|
+
|
371
|
+
```ruby
|
372
|
+
df[:a].type
|
373
|
+
```
|
374
|
+
|
375
|
+
Change the type of a column
|
376
|
+
|
377
|
+
```ruby
|
378
|
+
df[:a] = df[:a].to(:int)
|
379
|
+
```
|
380
|
+
|
287
381
|
## History
|
288
382
|
|
289
383
|
View the [changelog](https://github.com/ankane/rover/blob/master/CHANGELOG.md)
|
data/lib/rover.rb
CHANGED
@@ -3,30 +3,38 @@ require "numo/narray"
|
|
3
3
|
|
4
4
|
# modules
|
5
5
|
require "rover/data_frame"
|
6
|
+
require "rover/group"
|
6
7
|
require "rover/vector"
|
7
8
|
require "rover/version"
|
8
9
|
|
9
10
|
module Rover
|
10
11
|
class << self
|
11
|
-
def read_csv(path, **options)
|
12
|
+
def read_csv(path, types: nil, **options)
|
12
13
|
require "csv"
|
13
|
-
csv_to_df(CSV.read(path,
|
14
|
+
csv_to_df(CSV.read(path, **csv_options(options)), types: types)
|
14
15
|
end
|
15
16
|
|
16
|
-
def parse_csv(str, **options)
|
17
|
+
def parse_csv(str, types: nil, **options)
|
17
18
|
require "csv"
|
18
|
-
csv_to_df(CSV.parse(str,
|
19
|
+
csv_to_df(CSV.parse(str, **csv_options(options)), types: types)
|
19
20
|
end
|
20
21
|
|
21
22
|
private
|
22
23
|
|
23
|
-
|
24
|
+
# TODO use date converter
|
25
|
+
def csv_options(options)
|
26
|
+
options = {headers: true, converters: :numeric}.merge(options)
|
27
|
+
raise ArgumentError, "Must specify headers" unless options[:headers]
|
28
|
+
options
|
29
|
+
end
|
30
|
+
|
31
|
+
def csv_to_df(table, types: nil)
|
24
32
|
table.by_col!
|
25
33
|
data = {}
|
26
34
|
table.each do |k, v|
|
27
35
|
data[k] = v
|
28
36
|
end
|
29
|
-
DataFrame.new(data)
|
37
|
+
DataFrame.new(data, types: types)
|
30
38
|
end
|
31
39
|
end
|
32
40
|
end
|
data/lib/rover/data_frame.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
module Rover
|
2
2
|
class DataFrame
|
3
|
-
def initialize(
|
3
|
+
def initialize(*args)
|
4
|
+
data, options = process_args(args)
|
5
|
+
|
4
6
|
@vectors = {}
|
7
|
+
types = options[:types] || {}
|
5
8
|
|
6
9
|
if data.is_a?(DataFrame)
|
7
10
|
data.vectors.each do |k, v|
|
@@ -11,7 +14,7 @@ module Rover
|
|
11
14
|
data.to_h.each do |k, v|
|
12
15
|
@vectors[k] =
|
13
16
|
if v.respond_to?(:to_a)
|
14
|
-
Vector.new(v)
|
17
|
+
Vector.new(v, type: types[k])
|
15
18
|
else
|
16
19
|
v
|
17
20
|
end
|
@@ -20,7 +23,7 @@ module Rover
|
|
20
23
|
# handle scalars
|
21
24
|
size = @vectors.values.find { |v| v.is_a?(Vector) }&.size || 1
|
22
25
|
@vectors.each_key do |k|
|
23
|
-
@vectors[k] = to_vector(@vectors[k], size)
|
26
|
+
@vectors[k] = to_vector(@vectors[k], size: size, type: types[k])
|
24
27
|
end
|
25
28
|
elsif data.is_a?(Array)
|
26
29
|
vectors = {}
|
@@ -35,12 +38,12 @@ module Rover
|
|
35
38
|
end
|
36
39
|
end
|
37
40
|
vectors.each do |k, v|
|
38
|
-
@vectors[k] = to_vector(v)
|
41
|
+
@vectors[k] = to_vector(v, type: types[k])
|
39
42
|
end
|
40
43
|
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
|
41
44
|
result = data.connection.select_all(data.all.to_sql)
|
42
45
|
result.columns.each_with_index do |k, i|
|
43
|
-
@vectors[k] = to_vector(result.rows.map { |r| r[i] })
|
46
|
+
@vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
|
44
47
|
end
|
45
48
|
else
|
46
49
|
raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
|
@@ -90,9 +93,13 @@ module Rover
|
|
90
93
|
@vectors.dup
|
91
94
|
end
|
92
95
|
|
96
|
+
def types
|
97
|
+
@vectors.map { |k, v| [k, v.type] }.to_h
|
98
|
+
end
|
99
|
+
|
93
100
|
def []=(k, v)
|
94
101
|
check_key(k)
|
95
|
-
v = to_vector(v, size)
|
102
|
+
v = to_vector(v, size: size)
|
96
103
|
raise ArgumentError, "Size mismatch: expected #{size}, got #{v.size}" if @vectors.any? && v.size != size
|
97
104
|
@vectors[k] = v
|
98
105
|
end
|
@@ -170,6 +177,12 @@ module Rover
|
|
170
177
|
DataFrame.new(new_vectors)
|
171
178
|
end
|
172
179
|
|
180
|
+
def sample(*args, **kwargs)
|
181
|
+
# TODO make more efficient
|
182
|
+
indexes = (0...size).to_a.sample(*args, **kwargs)
|
183
|
+
self[indexes]
|
184
|
+
end
|
185
|
+
|
173
186
|
def to_a
|
174
187
|
a = []
|
175
188
|
each_row do |row|
|
@@ -190,6 +203,25 @@ module Rover
|
|
190
203
|
Numo::NArray.column_stack(vectors.values.map(&:to_numo))
|
191
204
|
end
|
192
205
|
|
206
|
+
# TODO raise error when collision
|
207
|
+
def one_hot(drop: false)
|
208
|
+
df = DataFrame.new
|
209
|
+
vectors.each do |k, v|
|
210
|
+
if v.to_numo.is_a?(Numo::RObject)
|
211
|
+
df.merge!(v.one_hot(drop: drop, prefix: "#{k}_"))
|
212
|
+
else
|
213
|
+
df[k] = v
|
214
|
+
end
|
215
|
+
end
|
216
|
+
df
|
217
|
+
rescue ArgumentError => e
|
218
|
+
if e.message == "All elements must be strings"
|
219
|
+
# better error message
|
220
|
+
raise ArgumentError, "All elements must be numeric or strings"
|
221
|
+
end
|
222
|
+
raise e
|
223
|
+
end
|
224
|
+
|
193
225
|
def to_csv
|
194
226
|
require "csv"
|
195
227
|
CSV.generate do |csv|
|
@@ -258,6 +290,17 @@ module Rover
|
|
258
290
|
dup.sort_by!(&block)
|
259
291
|
end
|
260
292
|
|
293
|
+
def group(columns)
|
294
|
+
Group.new(self, columns)
|
295
|
+
end
|
296
|
+
|
297
|
+
[:max, :min, :median, :mean, :percentile, :sum].each do |name|
|
298
|
+
define_method(name) do |column, *args|
|
299
|
+
check_column(column)
|
300
|
+
self[column].send(name, *args)
|
301
|
+
end
|
302
|
+
end
|
303
|
+
|
261
304
|
def dup
|
262
305
|
df = DataFrame.new
|
263
306
|
@vectors.each do |k, v|
|
@@ -375,8 +418,15 @@ module Rover
|
|
375
418
|
raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
|
376
419
|
end
|
377
420
|
|
378
|
-
def
|
379
|
-
|
421
|
+
def check_column(key)
|
422
|
+
raise ArgumentError, "Missing column: #{key}" unless include?(key)
|
423
|
+
end
|
424
|
+
|
425
|
+
def to_vector(v, size: nil, type: nil)
|
426
|
+
if v.is_a?(Vector)
|
427
|
+
v = v.to(type) if type && v.type != type
|
428
|
+
return v
|
429
|
+
end
|
380
430
|
|
381
431
|
if size && !v.respond_to?(:to_a)
|
382
432
|
v =
|
@@ -392,7 +442,31 @@ module Rover
|
|
392
442
|
end
|
393
443
|
end
|
394
444
|
|
395
|
-
Vector.new(v)
|
445
|
+
Vector.new(v, type: type)
|
446
|
+
end
|
447
|
+
|
448
|
+
# can't use data = {} and keyword arguments
|
449
|
+
# as this causes an unknown keyword error when data is passed as
|
450
|
+
# DataFrame.new({a: ..., b: ...})
|
451
|
+
#
|
452
|
+
# at the moment, there doesn't appear to be a way to distinguish between
|
453
|
+
# DataFrame.new({types: ...}) which should set data, and
|
454
|
+
# DataFrame.new(types: ...) which should set options
|
455
|
+
# https://bugs.ruby-lang.org/issues/16891
|
456
|
+
#
|
457
|
+
# there aren't currently options that should be used without data
|
458
|
+
# if this is ever the case, we should still require data
|
459
|
+
# to prevent new options from breaking existing code
|
460
|
+
def process_args(args)
|
461
|
+
data = args[0] || {}
|
462
|
+
options = args.size > 1 && args.last.is_a?(Hash) ? args.pop : {}
|
463
|
+
raise ArgumentError, "wrong number of arguments (given #{args.size}, expected 0..1)" if args.size > 1
|
464
|
+
|
465
|
+
known_keywords = [:types]
|
466
|
+
unknown_keywords = options.keys - known_keywords
|
467
|
+
raise ArgumentError, "unknown keywords: #{unknown_keywords.join(", ")}" if unknown_keywords.any?
|
468
|
+
|
469
|
+
[data, options]
|
396
470
|
end
|
397
471
|
end
|
398
472
|
end
|
data/lib/rover/group.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
module Rover
|
2
|
+
class Group
|
3
|
+
def initialize(df, columns)
|
4
|
+
@df = df
|
5
|
+
@columns = Array(columns)
|
6
|
+
end
|
7
|
+
|
8
|
+
[:count, :max, :min, :mean, :median, :percentile, :sum].each do |name|
|
9
|
+
define_method(name) do |*args|
|
10
|
+
result = {}
|
11
|
+
grouped_dfs.each do |k, df|
|
12
|
+
result[k] = df.send(name, *args)
|
13
|
+
end
|
14
|
+
result
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
# TODO make more efficient
|
21
|
+
def grouped_dfs
|
22
|
+
# cache here so we can reuse for multiple calcuations if needed
|
23
|
+
@grouped_dfs ||= begin
|
24
|
+
raise ArgumentError, "No columns given" if @columns.empty?
|
25
|
+
missing_keys = @columns - @df.keys
|
26
|
+
raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
|
27
|
+
|
28
|
+
groups = Hash.new { |hash, key| hash[key] = [] }
|
29
|
+
if @columns.size == 1
|
30
|
+
@df[@columns.first].each_with_index do |v, i|
|
31
|
+
groups[v] << i
|
32
|
+
end
|
33
|
+
else
|
34
|
+
i = 0
|
35
|
+
@df.each_row do |row|
|
36
|
+
groups[@columns.map { |c| row[c] }] << i
|
37
|
+
i += 1
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
result = {}
|
42
|
+
groups.each do |k, indexes|
|
43
|
+
result[k] = @df[indexes]
|
44
|
+
end
|
45
|
+
result
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/rover/vector.rb
CHANGED
@@ -1,27 +1,39 @@
|
|
1
1
|
module Rover
|
2
2
|
class Vector
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
3
|
+
# if a user never specifies types,
|
4
|
+
# the defaults are bool, float, int, and object
|
5
|
+
# keep these simple
|
6
|
+
#
|
7
|
+
# we could create aliases for float64, int64, uint64
|
8
|
+
# if so, type should still return the simple type
|
9
|
+
TYPE_CAST_MAPPING = {
|
10
|
+
bool: Numo::Bit,
|
11
|
+
float32: Numo::SFloat,
|
12
|
+
float: Numo::DFloat,
|
13
|
+
int8: Numo::Int8,
|
14
|
+
int16: Numo::Int16,
|
15
|
+
int32: Numo::Int32,
|
16
|
+
int: Numo::Int64,
|
17
|
+
object: Numo::RObject,
|
18
|
+
uint8: Numo::UInt8,
|
19
|
+
uint16: Numo::UInt16,
|
20
|
+
uint32: Numo::UInt32,
|
21
|
+
uint: Numo::UInt64
|
22
|
+
}
|
23
|
+
|
24
|
+
def initialize(data, type: nil)
|
25
|
+
@data = cast_data(data, type: type)
|
22
26
|
raise ArgumentError, "Bad size: #{@data.shape}" unless @data.ndim == 1
|
23
27
|
end
|
24
28
|
|
29
|
+
def type
|
30
|
+
TYPE_CAST_MAPPING.find { |_, v| @data.is_a?(v) }[0]
|
31
|
+
end
|
32
|
+
|
33
|
+
def to(type)
|
34
|
+
Vector.new(self, type: type)
|
35
|
+
end
|
36
|
+
|
25
37
|
def to_numo
|
26
38
|
@data
|
27
39
|
end
|
@@ -35,6 +47,8 @@ module Rover
|
|
35
47
|
def size
|
36
48
|
@data.size
|
37
49
|
end
|
50
|
+
alias_method :length, :size
|
51
|
+
alias_method :count, :size
|
38
52
|
|
39
53
|
def uniq
|
40
54
|
Vector.new(@data.to_a.uniq)
|
@@ -148,6 +162,15 @@ module Rover
|
|
148
162
|
Vector.new(mapped)
|
149
163
|
end
|
150
164
|
|
165
|
+
def tally
|
166
|
+
result = Hash.new(0)
|
167
|
+
@data.each do |v|
|
168
|
+
result[v] += 1
|
169
|
+
end
|
170
|
+
result.default = nil
|
171
|
+
result
|
172
|
+
end
|
173
|
+
|
151
174
|
def sort
|
152
175
|
Vector.new(@data.respond_to?(:sort) ? @data.sort : @data.to_a.sort)
|
153
176
|
end
|
@@ -157,7 +180,11 @@ module Rover
|
|
157
180
|
end
|
158
181
|
|
159
182
|
def each(&block)
|
160
|
-
|
183
|
+
@data.each(&block)
|
184
|
+
end
|
185
|
+
|
186
|
+
def each_with_index(&block)
|
187
|
+
@data.each_with_index(&block)
|
161
188
|
end
|
162
189
|
|
163
190
|
def max
|
@@ -176,7 +203,7 @@ module Rover
|
|
176
203
|
|
177
204
|
def median
|
178
205
|
# need to cast to get correct result
|
179
|
-
#
|
206
|
+
# https://github.com/ruby-numo/numo-narray/issues/165
|
180
207
|
@data.cast_to(Numo::DFloat).median
|
181
208
|
end
|
182
209
|
|
@@ -188,6 +215,16 @@ module Rover
|
|
188
215
|
@data.sum
|
189
216
|
end
|
190
217
|
|
218
|
+
# uses Bessel's correction for now since that's all Numo supports
|
219
|
+
def std
|
220
|
+
@data.cast_to(Numo::DFloat).stddev
|
221
|
+
end
|
222
|
+
|
223
|
+
# uses Bessel's correction for now since that's all Numo supports
|
224
|
+
def var
|
225
|
+
@data.cast_to(Numo::DFloat).var
|
226
|
+
end
|
227
|
+
|
191
228
|
def all?(&block)
|
192
229
|
@data.to_a.all?(&block)
|
193
230
|
end
|
@@ -208,6 +245,11 @@ module Rover
|
|
208
245
|
Vector.new(@data[-n..-1])
|
209
246
|
end
|
210
247
|
|
248
|
+
def take(n)
|
249
|
+
raise ArgumentError, "attempt to take negative size" if n < 0
|
250
|
+
first(n)
|
251
|
+
end
|
252
|
+
|
211
253
|
def crosstab(other)
|
212
254
|
index = uniq.sort
|
213
255
|
index_pos = index.to_a.map.with_index.to_h
|
@@ -231,6 +273,20 @@ module Rover
|
|
231
273
|
last(n)
|
232
274
|
end
|
233
275
|
|
276
|
+
def one_hot(drop: false, prefix: nil)
|
277
|
+
raise ArgumentError, "All elements must be strings" unless all? { |vi| vi.is_a?(String) }
|
278
|
+
|
279
|
+
new_vectors = {}
|
280
|
+
# maybe sort values first
|
281
|
+
values = uniq.to_a
|
282
|
+
values.shift if drop
|
283
|
+
values.each do |v2|
|
284
|
+
# TODO use types
|
285
|
+
new_vectors["#{prefix}#{v2}"] = (self == v2).to_numo.cast_to(Numo::Int64)
|
286
|
+
end
|
287
|
+
DataFrame.new(new_vectors)
|
288
|
+
end
|
289
|
+
|
234
290
|
# TODO add type and size?
|
235
291
|
def inspect
|
236
292
|
elements = first(5).to_a.map(&:inspect)
|
@@ -244,5 +300,57 @@ module Rover
|
|
244
300
|
require "iruby"
|
245
301
|
IRuby::HTML.table(to_a)
|
246
302
|
end
|
303
|
+
|
304
|
+
private
|
305
|
+
|
306
|
+
def cast_data(data, type: nil)
|
307
|
+
numo_type = numo_type(type) if type
|
308
|
+
|
309
|
+
data = data.to_numo if data.is_a?(Vector)
|
310
|
+
|
311
|
+
if data.is_a?(Numo::NArray)
|
312
|
+
raise ArgumentError, "Complex types not supported yet" if data.is_a?(Numo::DComplex) || data.is_a?(Numo::SComplex)
|
313
|
+
|
314
|
+
if type
|
315
|
+
case type
|
316
|
+
when /int/
|
317
|
+
# Numo does not check these when casting
|
318
|
+
raise RangeError, "float NaN out of range of integer" if data.respond_to?(:isnan) && data.isnan.any?
|
319
|
+
raise RangeError, "float Inf out of range of integer" if data.respond_to?(:isinf) && data.isinf.any?
|
320
|
+
|
321
|
+
data = data.to_a.map { |v| v.nil? ? nil : v.to_i } if data.is_a?(Numo::RObject)
|
322
|
+
when /float/
|
323
|
+
data = data.to_a.map { |v| v.nil? ? Float::NAN : v.to_f } if data.is_a?(Numo::RObject)
|
324
|
+
end
|
325
|
+
|
326
|
+
data = numo_type.cast(data)
|
327
|
+
end
|
328
|
+
else
|
329
|
+
data = data.to_a
|
330
|
+
|
331
|
+
if type
|
332
|
+
data = numo_type.cast(data)
|
333
|
+
else
|
334
|
+
data =
|
335
|
+
if data.all? { |v| v.is_a?(Integer) }
|
336
|
+
Numo::Int64.cast(data)
|
337
|
+
elsif data.all? { |v| v.is_a?(Numeric) || v.nil? }
|
338
|
+
Numo::DFloat.cast(data.map { |v| v || Float::NAN })
|
339
|
+
elsif data.all? { |v| v == true || v == false }
|
340
|
+
Numo::Bit.cast(data)
|
341
|
+
else
|
342
|
+
Numo::RObject.cast(data)
|
343
|
+
end
|
344
|
+
end
|
345
|
+
end
|
346
|
+
|
347
|
+
data
|
348
|
+
end
|
349
|
+
|
350
|
+
def numo_type(type)
|
351
|
+
numo_type = TYPE_CAST_MAPPING[type]
|
352
|
+
raise ArgumentError, "Invalid type: #{type}" unless numo_type
|
353
|
+
numo_type
|
354
|
+
end
|
247
355
|
end
|
248
356
|
end
|
data/lib/rover/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rover-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-05-13 00:00:00.000000000 Z
|
11
|
+
date: 2020-06-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -120,6 +120,7 @@ files:
|
|
120
120
|
- lib/rover-df.rb
|
121
121
|
- lib/rover.rb
|
122
122
|
- lib/rover/data_frame.rb
|
123
|
+
- lib/rover/group.rb
|
123
124
|
- lib/rover/vector.rb
|
124
125
|
- lib/rover/version.rb
|
125
126
|
homepage: https://github.com/ankane/rover
|