rover-df 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +307 -0
- data/lib/rover-df.rb +1 -0
- data/lib/rover.rb +32 -0
- data/lib/rover/data_frame.rb +398 -0
- data/lib/rover/vector.rb +248 -0
- data/lib/rover/version.rb +3 -0
- metadata +148 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 4588d0b3b5633a3821a4c07e7102e5933edca92179836db041f2400d8be88538
|
4
|
+
data.tar.gz: 9b01cd2bae5fb6ba9f426fe0d347752cd30c63619b00284fb68e8f711ec38ddf
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b2d35866786a7fbe17b274585419c752b08c817b2db1bf939a6c3f92a7ae2cd282d725614f96db730fd2590cbb8c24710d0fb1f713255d2c348c0fed0b874a35
|
7
|
+
data.tar.gz: 4bf0ba38ce2c3ef4765d702591948af18fddf142efb7e559e26cc4ab504538775a1771c839f1570230f7d101fa20bfbbeb5044f6bf567637790575ee9b95be87
|
data/CHANGELOG.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2020 Andrew Kane
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,307 @@
|
|
1
|
+
# Rover
|
2
|
+
|
3
|
+
Simple, powerful data frames for Ruby
|
4
|
+
|
5
|
+
:mountain: Designed for data exploration and machine learning, and powered by [Numo](https://github.com/ruby-numo/numo-narray) for blazing performance
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application’s Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'rover-df'
|
13
|
+
```
|
14
|
+
|
15
|
+
## Intro
|
16
|
+
|
17
|
+
A data frame is an in-memory table. It’s a useful data structure for data analysis and machine learning. It uses columnar storage for fast operations on columns.
|
18
|
+
|
19
|
+
## Creating Data Frames
|
20
|
+
|
21
|
+
From an array
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
Rover::DataFrame.new([{a: 1, b: "one"}, {a: 2, b: "two"}, {a: 3, b: "three"}])
|
25
|
+
```
|
26
|
+
|
27
|
+
From a hash
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
Rover::DataFrame.new({
|
31
|
+
a: [1, 2, 3],
|
32
|
+
b: ["one", "two", "three"]
|
33
|
+
})
|
34
|
+
```
|
35
|
+
|
36
|
+
From an Active Record relation
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
Rover::DataFrame.new(User.all)
|
40
|
+
```
|
41
|
+
|
42
|
+
From a CSV
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
Rover.read_csv("file.csv")
|
46
|
+
# or
|
47
|
+
Rover.parse_csv("CSV,data,string")
|
48
|
+
```
|
49
|
+
|
50
|
+
## Attributes
|
51
|
+
|
52
|
+
Get number of rows
|
53
|
+
|
54
|
+
```ruby
|
55
|
+
df.count
|
56
|
+
```
|
57
|
+
|
58
|
+
Get column names
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
df.keys
|
62
|
+
```
|
63
|
+
|
64
|
+
Check if a column exists
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
df.include?(name)
|
68
|
+
```
|
69
|
+
|
70
|
+
## Selecting Data
|
71
|
+
|
72
|
+
Select a column
|
73
|
+
|
74
|
+
```ruby
|
75
|
+
df[:a]
|
76
|
+
```
|
77
|
+
|
78
|
+
Select multiple columns
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
df[[:a, :b]]
|
82
|
+
```
|
83
|
+
|
84
|
+
Select first rows
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
df.head
|
88
|
+
# or
|
89
|
+
df.first(5)
|
90
|
+
```
|
91
|
+
|
92
|
+
Select last rows
|
93
|
+
|
94
|
+
```ruby
|
95
|
+
df.tail
|
96
|
+
# or
|
97
|
+
df.last(5)
|
98
|
+
```
|
99
|
+
|
100
|
+
Select rows by index
|
101
|
+
|
102
|
+
```ruby
|
103
|
+
df[1]
|
104
|
+
# or
|
105
|
+
df[1..3]
|
106
|
+
# or
|
107
|
+
df[[1, 4, 5]]
|
108
|
+
```
|
109
|
+
|
110
|
+
## Filtering
|
111
|
+
|
112
|
+
Filter on a condition
|
113
|
+
|
114
|
+
```ruby
|
115
|
+
df[df[:a] > 100]
|
116
|
+
```
|
117
|
+
|
118
|
+
And
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
df[(df[:a] > 100) & (df[:b] == "one")]
|
122
|
+
```
|
123
|
+
|
124
|
+
Or
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
df[(df[:a] > 100) | (df[:b] == "one")]
|
128
|
+
```
|
129
|
+
|
130
|
+
Not
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
df[df[:a] != 100]
|
134
|
+
```
|
135
|
+
|
136
|
+
## Operations
|
137
|
+
|
138
|
+
Basic operations
|
139
|
+
|
140
|
+
```ruby
|
141
|
+
df[:a] + 5
|
142
|
+
df[:a] - 5
|
143
|
+
df[:a] * 5
|
144
|
+
df[:a] / 5
|
145
|
+
df[:a] % 5
|
146
|
+
df[:a] ** 2
|
147
|
+
```
|
148
|
+
|
149
|
+
Summary statistics
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
df[:a].count
|
153
|
+
df[:a].sum
|
154
|
+
df[:a].mean
|
155
|
+
df[:a].median
|
156
|
+
df[:a].percentile(90)
|
157
|
+
df[:a].min
|
158
|
+
df[:a].max
|
159
|
+
```
|
160
|
+
|
161
|
+
Cross tabulation
|
162
|
+
|
163
|
+
```ruby
|
164
|
+
df[:a].crosstab(df[:b])
|
165
|
+
```
|
166
|
+
|
167
|
+
## Updates
|
168
|
+
|
169
|
+
Add a new column
|
170
|
+
|
171
|
+
```ruby
|
172
|
+
df[:a] = 1
|
173
|
+
# or
|
174
|
+
df[:a] = [1, 2, 3]
|
175
|
+
```
|
176
|
+
|
177
|
+
Update a single element
|
178
|
+
|
179
|
+
```ruby
|
180
|
+
df[:a][0] = 100
|
181
|
+
```
|
182
|
+
|
183
|
+
Update multiple elements
|
184
|
+
|
185
|
+
```ruby
|
186
|
+
df[:a][0..2] = 1
|
187
|
+
# or
|
188
|
+
df[:a][0..2] = [1, 2, 3]
|
189
|
+
```
|
190
|
+
|
191
|
+
Update elements matching a condition
|
192
|
+
|
193
|
+
```ruby
|
194
|
+
df[:a][df[:a] > 100] = 0
|
195
|
+
```
|
196
|
+
|
197
|
+
Clamp
|
198
|
+
|
199
|
+
```ruby
|
200
|
+
df[:a].clamp!(0, 100)
|
201
|
+
```
|
202
|
+
|
203
|
+
Delete columns
|
204
|
+
|
205
|
+
```ruby
|
206
|
+
df.delete(:a)
|
207
|
+
# or
|
208
|
+
df.except!(:a, :b)
|
209
|
+
```
|
210
|
+
|
211
|
+
Rename a column
|
212
|
+
|
213
|
+
```ruby
|
214
|
+
df[:new_a] = df.delete(:a)
|
215
|
+
```
|
216
|
+
|
217
|
+
Sort data
|
218
|
+
|
219
|
+
```ruby
|
220
|
+
df.sort_by! { |r| r[:a] }
|
221
|
+
```
|
222
|
+
|
223
|
+
Clear all data
|
224
|
+
|
225
|
+
```ruby
|
226
|
+
df.clear
|
227
|
+
```
|
228
|
+
|
229
|
+
## Combining Data Frames
|
230
|
+
|
231
|
+
Add rows
|
232
|
+
|
233
|
+
```ruby
|
234
|
+
df.concat(other_df)
|
235
|
+
```
|
236
|
+
|
237
|
+
Add columns
|
238
|
+
|
239
|
+
```ruby
|
240
|
+
df.merge!(other_df)
|
241
|
+
```
|
242
|
+
|
243
|
+
Inner join
|
244
|
+
|
245
|
+
```ruby
|
246
|
+
df.inner_join(other_df)
|
247
|
+
# or
|
248
|
+
df.inner_join(other_df, on: :a)
|
249
|
+
# or
|
250
|
+
df.inner_join(other_df, on: [:a, :b])
|
251
|
+
# or
|
252
|
+
df.inner_join(other_df, on: {df_col: :other_df_col})
|
253
|
+
```
|
254
|
+
|
255
|
+
Left join
|
256
|
+
|
257
|
+
```ruby
|
258
|
+
df.left_join(other_df)
|
259
|
+
```
|
260
|
+
|
261
|
+
## Conversion
|
262
|
+
|
263
|
+
Array of hashes
|
264
|
+
|
265
|
+
```ruby
|
266
|
+
df.to_a
|
267
|
+
```
|
268
|
+
|
269
|
+
Hash of arrays
|
270
|
+
|
271
|
+
```ruby
|
272
|
+
df.to_h
|
273
|
+
```
|
274
|
+
|
275
|
+
Numo array
|
276
|
+
|
277
|
+
```ruby
|
278
|
+
df.to_numo
|
279
|
+
```
|
280
|
+
|
281
|
+
CSV
|
282
|
+
|
283
|
+
```ruby
|
284
|
+
df.to_csv
|
285
|
+
```
|
286
|
+
|
287
|
+
## History
|
288
|
+
|
289
|
+
View the [changelog](https://github.com/ankane/rover/blob/master/CHANGELOG.md)
|
290
|
+
|
291
|
+
## Contributing
|
292
|
+
|
293
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
294
|
+
|
295
|
+
- [Report bugs](https://github.com/ankane/rover/issues)
|
296
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/rover/pulls)
|
297
|
+
- Write, clarify, or fix documentation
|
298
|
+
- Suggest or add new features
|
299
|
+
|
300
|
+
To get started with development:
|
301
|
+
|
302
|
+
```sh
|
303
|
+
git clone https://github.com/ankane/rover.git
|
304
|
+
cd rover
|
305
|
+
bundle install
|
306
|
+
bundle exec rake test
|
307
|
+
```
|
data/lib/rover-df.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "rover"
|
data/lib/rover.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# dependencies
|
2
|
+
require "numo/narray"
|
3
|
+
|
4
|
+
# modules
|
5
|
+
require "rover/data_frame"
|
6
|
+
require "rover/vector"
|
7
|
+
require "rover/version"
|
8
|
+
|
9
|
+
module Rover
  class << self
    # Reads the CSV file at +path+ and converts it to a data frame.
    # The first row is treated as headers and numeric-looking fields are
    # converted; any extra +options+ are forwarded to CSV.read and take
    # precedence over those defaults.
    def read_csv(path, **options)
      require "csv"
      table = CSV.read(path, headers: true, converters: :numeric, **options)
      csv_to_df(table)
    end

    # Same as read_csv, but parses CSV text held in the string +str+.
    def parse_csv(str, **options)
      require "csv"
      table = CSV.parse(str, headers: true, converters: :numeric, **options)
      csv_to_df(table)
    end

    private

    # Switches the parsed table to column mode and collects each
    # header/values pair into a hash that DataFrame.new accepts.
    def csv_to_df(table)
      table.by_col!
      columns = table.each_with_object({}) do |(header, values), acc|
        acc[header] = values
      end
      DataFrame.new(columns)
    end
  end
end
|
@@ -0,0 +1,398 @@
|
|
1
|
+
module Rover
  # An in-memory table stored as a hash of column name => Vector, where all
  # vectors have the same size and names are Strings or Symbols.
  class DataFrame
    # Builds a data frame from one of:
    # - another DataFrame (its vectors are reused, not copied)
    # - a Hash of column name => array-like values or a scalar; scalars are
    #   repeated to the size of the first vector column (or 1 if there is none)
    # - an Array of Hashes, one per row; keys absent from a row become nil
    # - an ActiveRecord relation or model class, when ActiveRecord is loaded
    #
    # Raises ArgumentError for any other input, for keys that are not a
    # String or Symbol, and for columns of different sizes.
    def initialize(data = {})
      @vectors = {}

      if data.is_a?(DataFrame)
        data.vectors.each do |k, v|
          @vectors[k] = v
        end
      elsif data.is_a?(Hash)
        data.to_h.each do |k, v|
          @vectors[k] =
            if v.respond_to?(:to_a)
              Vector.new(v)
            else
              v
            end
        end

        # handle scalars
        size = @vectors.values.find { |v| v.is_a?(Vector) }&.size || 1
        @vectors.each_key do |k|
          @vectors[k] = to_vector(@vectors[k], size)
        end
      elsif data.is_a?(Array)
        vectors = {}
        raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }
        keys = data.flat_map(&:keys).uniq
        keys.each do |k|
          vectors[k] = []
        end
        data.each do |d|
          keys.each do |k|
            vectors[k] << d[k]
          end
        end
        vectors.each do |k, v|
          @vectors[k] = to_vector(v)
        end
      elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
        result = data.connection.select_all(data.all.to_sql)
        result.columns.each_with_index do |k, i|
          @vectors[k] = to_vector(result.rows.map { |r| r[i] })
        end
      else
        raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
      end

      # check keys
      @vectors.each_key do |k|
        check_key(k)
      end

      # check sizes
      sizes = @vectors.values.map(&:size).uniq
      if sizes.size > 1
        raise ArgumentError, "Different sizes: #{sizes}"
      end
    end

    # Selects rows or columns depending on the type of +where+:
    # - a bit-typed Vector, a Numeric, a Range or an Array of Integers is
    #   applied to every column and the result is wrapped in a new DataFrame
    # - any other Array is treated as a list of column names (new DataFrame)
    # - anything else is a single column name; returns that Vector or nil
    def [](where)
      row_selector =
        (where.is_a?(Vector) && where.to_numo.is_a?(Numo::Bit)) ||
        where.is_a?(Numeric) ||
        where.is_a?(Range) ||
        (where.is_a?(Array) && where.all? { |v| v.is_a?(Integer) })

      if row_selector
        new_vectors = {}
        @vectors.each do |k, v|
          new_vectors[k] = v[where]
        end
        DataFrame.new(new_vectors)
      elsif where.is_a?(Array)
        # multiple columns
        df = DataFrame.new
        where.each do |k|
          df[k] = @vectors[k]
        end
        df
      else
        # single column
        @vectors[where]
      end
    end

    # return each row as a hash
    def each_row
      size.times do |i|
        yield @vectors.map { |k, v| [k, v[i]] }.to_h
      end
    end

    # dup to prevent direct modification of keys
    def vectors
      @vectors.dup
    end

    # Adds or replaces column +k+. A scalar +v+ is repeated to the current
    # row count. Raises ArgumentError for an invalid key, or when the frame
    # already has columns and the new column's size differs.
    def []=(k, v)
      check_key(k)
      v = to_vector(v, size)
      raise ArgumentError, "Size mismatch: expected #{size}, got #{v.size}" if @vectors.any? && v.size != size
      @vectors[k] = v
    end

    # Number of rows, taken from the first column (0 when there are none).
    def size
      @vectors.values.first&.size || 0
    end
    alias_method :length, :size
    alias_method :count, :size

    # should this check for columns as well?
    def any?
      size > 0
    end

    # should this check for columns as well?
    def empty?
      size == 0
    end

    # Removes every column.
    def clear
      @vectors.clear
    end

    # Returns [number of rows, number of columns].
    def shape
      [size, @vectors.size]
    end

    # Column names in insertion order.
    def keys
      @vectors.keys
    end
    alias_method :names, :keys
    alias_method :vector_names, :keys

    # Removes column +key+ and returns its Vector (nil when absent).
    def delete(key)
      @vectors.delete(key)
    end

    # Returns a copy of the frame without the given columns.
    def except(*keys)
      dup.except!(*keys)
    end

    # Removes the given columns in place and returns self.
    def except!(*keys)
      keys.each do |key|
        delete(key)
      end
      self
    end

    # True when a column named +key+ exists.
    def include?(key)
      @vectors.include?(key)
    end

    # First +n+ rows (default 5) as a new DataFrame.
    def head(n = 5)
      first(n)
    end

    # Last +n+ rows (default 5) as a new DataFrame.
    def tail(n = 5)
      last(n)
    end

    # First +n+ rows as a new DataFrame. The default is 1 (it used to be nil,
    # which made a bare +first+ fail inside Vector#first on `nil >= size`).
    def first(n = 1)
      new_vectors = {}
      @vectors.each do |k, v|
        new_vectors[k] = v.first(n)
      end
      DataFrame.new(new_vectors)
    end

    # Last +n+ rows as a new DataFrame. Defaults to 1 for the same reason
    # as #first.
    def last(n = 1)
      new_vectors = {}
      @vectors.each do |k, v|
        new_vectors[k] = v.last(n)
      end
      DataFrame.new(new_vectors)
    end

    # Array of row hashes.
    def to_a
      a = []
      each_row do |row|
        a << row
      end
      a
    end

    # Hash of column name => plain Ruby array.
    def to_h
      hsh = {}
      @vectors.each do |k, v|
        hsh[k] = v.to_a
      end
      hsh
    end

    # Stacks the columns' Numo arrays side by side into one Numo array.
    def to_numo
      Numo::NArray.column_stack(vectors.values.map(&:to_numo))
    end

    # CSV text: a header row of column names followed by one line per row.
    def to_csv
      require "csv"
      CSV.generate do |csv|
        csv << keys
        numo = vectors.values.map(&:to_numo)
        size.times do |i|
          csv << numo.map { |n| n[i] }
        end
      end
    end

    # for IRuby
    def to_html
      require "iruby"
      IRuby::HTML.table(to_h)
    end

    # Text preview: column names plus up to five rows, right-aligned, with a
    # "..." row when the frame has more than five rows. Columns that would
    # push a line past 120 characters start a new group of lines.
    # TODO handle long text better
    def inspect
      return "#<Rover::DataFrame>" if keys.empty?

      lines = []
      line_start = 0
      spaces = 2

      @vectors.each do |k, v|
        values = v.first(5).to_a
        width = ([k] + values).map(&:to_s).map(&:size).max
        width = 3 if width < 3

        # lines[-2] is the last populated line of the current group
        # (lines[-1] is the blank separator appended below)
        if lines.empty? || lines[-2].map { |l| l.size + spaces }.sum + width > 120
          line_start = lines.size
          lines << []                              # header line
          [size, 5].min.times { lines << [] }      # one line per shown row
          lines << [] if size > 5                  # "..." line
          lines << []                              # blank separator
        end

        lines[line_start] << "%#{width}s" % k.to_s
        values.each_with_index do |cell, i|
          lines[line_start + 1 + i] << "%#{width}s" % cell.to_s
        end
        lines[line_start + 6] << "%#{width}s" % "..." if size > 5
      end

      lines.pop # drop the trailing blank separator
      lines.map { |l| l.join(" " * spaces) }.join("\n")
    end
    alias_method :to_s, :inspect # alias like hash

    # Reorders the rows in place by the value the block returns for each row
    # hash. Returns self.
    def sort_by!
      indexes =
        size.times.sort_by do |i|
          yield @vectors.map { |k, v| [k, v[i]] }.to_h
        end

      @vectors.each do |k, v|
        self[k] = v.to_numo.at(indexes)
      end
      self
    end

    # Non-mutating variant of #sort_by!.
    def sort_by(&block)
      dup.sort_by!(&block)
    end

    # New frame holding the same Vector objects (the vectors themselves are
    # not copied).
    def dup
      df = DataFrame.new
      @vectors.each do |k, v|
        df[k] = v
      end
      df
    end

    # New frame with the rows of +other+ appended.
    def +(other)
      dup.concat(other)
    end

    # in-place, like Array#concat
    # Columns missing on either side are padded with nil.
    # Raises ArgumentError unless +other+ is a DataFrame.
    # TODO make more performant
    def concat(other)
      raise ArgumentError, "Must be a data frame" unless other.is_a?(DataFrame)

      original_size = size
      # iterate over a copy (the #vectors method) because @vectors is
      # reassigned inside the loop
      vectors.each do |k, v|
        @vectors[k] = Vector.new(v.to_a + (other[k] ? other[k].to_a : [nil] * other.size))
      end
      (other.vector_names - vector_names).each do |k|
        @vectors[k] = Vector.new([nil] * original_size + other[k].to_a)
      end
      self
    end

    # New frame with the columns of +other+ added or replaced.
    def merge(other)
      dup.merge!(other)
    end

    # Adds or replaces columns from +other+ in place. Returns self.
    def merge!(other)
      other.vectors.each do |k, v|
        self[k] = v
      end
      self
    end

    # see join for options
    def inner_join(other, on: nil)
      join(other, on: on, how: "inner")
    end

    # see join for options
    def left_join(other, on: nil)
      join(other, on: on, how: "left")
    end

    # True when +other+ is a DataFrame with the same size, the same keys in
    # the same order, and equal column contents. Column dtypes are not
    # compared (don't check types).
    #
    # The underlying Numo arrays are compared because Vector#== is
    # element-wise and returns a Vector, which is always truthy; the previous
    # `self[k] == other[k]` therefore reported frames with different values
    # as equal. Relies on Numo::NArray#== returning a single boolean.
    def ==(other)
      other.is_a?(DataFrame) &&
        size == other.size &&
        keys == other.keys &&
        keys.all? { |k| self[k].to_numo == other[k].to_numo }
    end

    private

    # Raises ArgumentError unless +key+ is a String or Symbol.
    def check_key(key)
      raise ArgumentError, "Key must be a string or symbol, got #{key.inspect}" unless key.is_a?(String) || key.is_a?(Symbol)
    end

    # TODO make more efficient
    # TODO add option to prefix/suffix keys?
    # Supports:
    # - on: :key
    # - on: [:key1, :key2]
    # - on: {key1a: :key1b, key2a: :key2b}
    #
    # With on: nil the columns common to both frames are used. +how+ is
    # "left" to keep unmatched rows of self; any other value behaves as an
    # inner join. Raises ArgumentError when there are no join keys or a key
    # is missing from its frame.
    def join(other, how:, on: nil)
      self_on, other_on =
        if on.is_a?(Hash)
          [on.keys, on.values]
        else
          on ||= keys & other.keys
          on = [on] unless on.is_a?(Array)
          [on, on]
        end

      check_join_keys(self, self_on)
      check_join_keys(other, other_on)

      # rows of other grouped by their join-key values
      indexed = other.to_a.group_by { |r| r.values_at(*other_on) }
      indexed.default = []

      left = how == "left"

      columns = {}
      all_keys = (keys + other.keys).uniq
      all_keys.each do |k|
        columns[k] = []
      end

      each_row do |r|
        matches = indexed[r.values_at(*self_on)]
        if matches.empty?
          if left
            all_keys.each do |k|
              columns[k] << r[k]
            end
          end
        else
          matches.each do |r2|
            all_keys.each do |k|
              # prefer the value from other, falling back to self only when
              # it is nil (a plain `||` would also discard `false`)
              value = r2[k]
              columns[k] << (value.nil? ? r[k] : value)
            end
          end
        end
      end

      DataFrame.new(columns)
    end

    # Raises ArgumentError when +keys+ is empty or names columns that +df+
    # does not have.
    def check_join_keys(df, keys)
      raise ArgumentError, "No keys" if keys.empty?
      missing_keys = keys.reject { |k| df.include?(k) }
      raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
    end

    # Converts +v+ to a Vector. When +size+ is given and +v+ is a scalar
    # (does not respond to to_a), the scalar is repeated +size+ times using a
    # Numo type chosen from its Ruby class.
    def to_vector(v, size = nil)
      return v if v.is_a?(Vector)

      if size && !v.respond_to?(:to_a)
        v =
          if v.is_a?(Integer)
            Numo::Int64.new(size).fill(v)
          elsif v.is_a?(Numeric)
            Numo::DFloat.new(size).fill(v)
          elsif v == true || v == false
            Numo::Bit.new(size).fill(v)
          else
            # TODO make more efficient
            [v] * size
          end
      end

      Vector.new(v)
    end
  end
end
|
data/lib/rover/vector.rb
ADDED
@@ -0,0 +1,248 @@
|
|
1
|
+
module Rover
  # A one-dimensional column of values backed by a Numo array.
  class Vector
    # Wraps +data+ in a Vector.
    # - a Vector or Numo::NArray is used as-is (the array is not copied)
    # - anything else is converted with to_a and cast by content: all
    #   Integers => Int64; Numerics and nils => DFloat with nil stored as NaN;
    #   all true/false => Bit; otherwise RObject
    #
    # Raises ArgumentError unless the resulting array is one-dimensional.
    def initialize(data)
      @data =
        if data.is_a?(Vector)
          data.to_numo
        elsif data.is_a?(Numo::NArray)
          data
        else
          data = data.to_a
          if data.all? { |v| v.is_a?(Integer) }
            Numo::Int64.cast(data)
          elsif data.all? { |v| v.is_a?(Numeric) || v.nil? }
            Numo::DFloat.cast(data.map { |v| v || Float::NAN })
          elsif data.all? { |v| v == true || v == false }
            Numo::Bit.cast(data)
          else
            Numo::RObject.cast(data)
          end
        end

      raise ArgumentError, "Bad size: #{@data.shape}" unless @data.ndim == 1
    end

    # The underlying Numo array (not a copy).
    def to_numo
      @data
    end

    # Plain Ruby array of the values; Bit data is returned as true/false
    # rather than 1/0.
    def to_a
      a = @data.to_a
      a.map! { |v| !v.zero? } if @data.is_a?(Numo::Bit)
      a
    end

    # Number of elements.
    def size
      @data.size
    end

    # Vector of distinct values. Built from #to_a so that Bit vectors keep
    # their true/false values instead of turning into integer 1/0.
    def uniq
      Vector.new(to_a.uniq)
    end

    # Bit vector marking missing elements: nil for RObject data, NaN for
    # types that respond to isnan, and all zeros otherwise.
    def missing
      bit =
        if @data.is_a?(Numo::RObject)
          Numo::Bit.cast(@data.map(&:nil?))
        elsif @data.respond_to?(:isnan)
          @data.isnan
        else
          Numo::Bit.new(size).fill(0)
        end

      Vector.new(bit)
    end

    # keep same number of rows as original
    # to make it easy to add to original data frame
    # (a NaN is inserted at position 0 of the DFloat differences)
    def diff
      diff = @data.cast_to(Numo::DFloat).diff
      Vector.new(diff.insert(0, Float::NAN))
    end

    # Element access. A Vector argument is used as a mask over the data and
    # yields a new Vector; any other argument is passed straight to the Numo
    # array's #[] and its result is returned unwrapped.
    def [](v)
      if v.is_a?(Vector)
        Vector.new(v.to_numo.mask(@data))
      else
        @data[v]
      end
    end

    # Assigns +v+ at +k+; a Vector key is unwrapped to its Numo array first.
    def []=(k, v)
      k = k.to_numo if k.is_a?(Vector)
      @data[k] = v
    end

    # Element-wise binary operators. `|` and `^` are included so that masks
    # can be combined with "or" as well as "and". Each operator is sent to
    # the Numo array; whether a given dtype supports it is up to Numo.
    %w(+ - * / % ** & | ^).each do |op|
      define_method(op) do |other|
        other = other.to_numo if other.is_a?(Vector)
        # TODO better logic
        if @data.is_a?(Numo::RObject)
          map { |v| v.send(op, other) }
        else
          Vector.new(@data.send(op, other))
        end
      end
    end

    # Element-wise comparison operators; each returns a Bit Vector.
    {
      "==" => "eq",
      "!=" => "ne",
      ">" => "gt",
      ">=" => "ge",
      "<" => "lt",
      "<=" => "le"
    }.each do |op, meth|
      define_method(op) do |other|
        other = other.to_numo if other.is_a?(Vector)
        result =
          if other.is_a?(Numo::RObject)
            # compare pairwise with the requested operator (this branch used
            # to apply == for every operator)
            @data.to_a.zip(other.to_a).map { |mine, theirs| mine.send(op, theirs) }
          elsif other.is_a?(Numeric) || other.is_a?(Numo::NArray)
            @data.send(meth, other)
          else
            @data.map { |v| v.send(op, other) }
          end
        Vector.new(Numo::Bit.cast(result))
      end
    end

    # Bit Vector marking elements equal to any entry of +values+.
    def in?(values)
      ret = Numo::Bit.new(size).fill(false)
      values.each do |v|
        comp =
          if v.is_a?(Numeric) || v.is_a?(Numo::NArray)
            @data.eq(v)
          else
            Numo::Bit.cast(@data.map { |d| d == v })
          end
        ret |= comp
      end
      Vector.new(ret)
    end

    # Logical negation; only implemented for Bit vectors.
    def !
      if @data.is_a?(Numo::Bit)
        Vector.new(@data.eq(0))
      else
        raise "Not implemented yet"
      end
    end

    # Unary minus.
    def -@
      self * -1
    end

    # Clips the values to [min, max] in place (the backing array is replaced
    # with the clipped result). Returns self.
    def clamp!(min, max)
      @data = @data.clip(min, max)
      self
    end

    # Non-mutating variant of #clamp!.
    def clamp(min, max)
      dup.clamp!(min, max)
    end

    # Applies the block to each element and returns a new Vector.
    def map(&block)
      mapped = @data.map(&block)
      mapped = mapped.to_a if mapped.is_a?(Numo::RObject) # re-evaluate cast
      Vector.new(mapped)
    end

    # Sorted copy; falls back to sorting a Ruby array when the Numo type has
    # no sort method.
    def sort
      Vector.new(@data.respond_to?(:sort) ? @data.sort : @data.to_a.sort)
    end

    # Element-wise absolute value.
    def abs
      Vector.new(@data.abs)
    end

    # Iterates over the values as returned by #to_a.
    def each(&block)
      to_a.each(&block)
    end

    def max
      @data.max
    end

    def min
      @data.min
    end

    def mean
      # currently only floats have mean in Numo
      # https://github.com/ruby-numo/numo-narray/issues/79
      @data.cast_to(Numo::DFloat).mean
    end

    def median
      # need to cast to get correct result
      # TODO file bug with Numo
      @data.cast_to(Numo::DFloat).median
    end

    def percentile(q)
      @data.percentile(q)
    end

    def sum
      @data.sum
    end

    # Like Enumerable#all? over #to_a, so a Bit vector is tested as
    # true/false (raw 0/1 integers would all be truthy).
    def all?(&block)
      to_a.all?(&block)
    end

    # Like Enumerable#any? over #to_a; see #all?.
    def any?(&block)
      to_a.any?(&block)
    end

    # First +n+ elements as a Vector; the whole vector when n >= size.
    def first(n = 1)
      if n >= size
        Vector.new(@data)
      else
        Vector.new(@data[0...n])
      end
    end

    # Last +n+ elements as a Vector; the whole vector when n >= size
    # (mirrors #first instead of indexing before the start of the array).
    def last(n = 1)
      if n >= size
        Vector.new(@data)
      else
        Vector.new(@data[-n..-1])
      end
    end

    # Counts co-occurrences of this vector's values with +other+'s values.
    # Returns a DataFrame with a "_" column holding this vector's sorted
    # distinct values and one count column per distinct value of +other+.
    def crosstab(other)
      index = uniq.sort
      index_pos = index.to_a.map.with_index.to_h
      df = DataFrame.new({"_" => index})
      other.uniq.sort.each do |k|
        df[k] = 0
      end
      to_a.zip(other.to_a) do |v1, v2|
        df[v2][index_pos[v1]] += 1
      end
      df
    end

    # First +n+ elements; a negative +n+ means "all but the last -n".
    def head(n = 5)
      n += size if n < 0
      first(n)
    end

    # Last +n+ elements; a negative +n+ means "all but the first -n".
    def tail(n = 5)
      n += size if n < 0
      last(n)
    end

    # TODO add type and size?
    def inspect
      elements = first(5).to_a.map(&:inspect)
      elements << "..." if size > 5
      "#<Rover::Vector [#{elements.join(", ")}]>"
    end
    alias_method :to_s, :inspect # alias like hash

    # for IRuby
    def to_html
      require "iruby"
      IRuby::HTML.table(to_a)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rover-df
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrew Kane
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-05-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: numo-narray
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.9.1.7
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.9.1.7
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '5'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '5'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: activerecord
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '5'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '5'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: sqlite3
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: iruby
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
description:
|
112
|
+
email: andrew@chartkick.com
|
113
|
+
executables: []
|
114
|
+
extensions: []
|
115
|
+
extra_rdoc_files: []
|
116
|
+
files:
|
117
|
+
- CHANGELOG.md
|
118
|
+
- LICENSE.txt
|
119
|
+
- README.md
|
120
|
+
- lib/rover-df.rb
|
121
|
+
- lib/rover.rb
|
122
|
+
- lib/rover/data_frame.rb
|
123
|
+
- lib/rover/vector.rb
|
124
|
+
- lib/rover/version.rb
|
125
|
+
homepage: https://github.com/ankane/rover
|
126
|
+
licenses:
|
127
|
+
- MIT
|
128
|
+
metadata: {}
|
129
|
+
post_install_message:
|
130
|
+
rdoc_options: []
|
131
|
+
require_paths:
|
132
|
+
- lib
|
133
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - ">="
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '2.4'
|
138
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
|
+
requirements:
|
140
|
+
- - ">="
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
143
|
+
requirements: []
|
144
|
+
rubygems_version: 3.1.2
|
145
|
+
signing_key:
|
146
|
+
specification_version: 4
|
147
|
+
summary: Simple, powerful data frames for Ruby
|
148
|
+
test_files: []
|