rover-df 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +307 -0
- data/lib/rover-df.rb +1 -0
- data/lib/rover.rb +32 -0
- data/lib/rover/data_frame.rb +398 -0
- data/lib/rover/vector.rb +248 -0
- data/lib/rover/version.rb +3 -0
- metadata +148 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 4588d0b3b5633a3821a4c07e7102e5933edca92179836db041f2400d8be88538
|
4
|
+
data.tar.gz: 9b01cd2bae5fb6ba9f426fe0d347752cd30c63619b00284fb68e8f711ec38ddf
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b2d35866786a7fbe17b274585419c752b08c817b2db1bf939a6c3f92a7ae2cd282d725614f96db730fd2590cbb8c24710d0fb1f713255d2c348c0fed0b874a35
|
7
|
+
data.tar.gz: 4bf0ba38ce2c3ef4765d702591948af18fddf142efb7e559e26cc4ab504538775a1771c839f1570230f7d101fa20bfbbeb5044f6bf567637790575ee9b95be87
|
data/CHANGELOG.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2020 Andrew Kane
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,307 @@
|
|
1
|
+
# Rover
|
2
|
+
|
3
|
+
Simple, powerful data frames for Ruby
|
4
|
+
|
5
|
+
:mountain: Designed for data exploration and machine learning, and powered by [Numo](https://github.com/ruby-numo/numo-narray) for blazing performance
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application’s Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'rover-df'
|
13
|
+
```
|
14
|
+
|
15
|
+
## Intro
|
16
|
+
|
17
|
+
A data frame is an in-memory table. It’s a useful data structure for data analysis and machine learning. It uses columnar storage for fast operations on columns.
|
18
|
+
|
19
|
+
## Creating Data Frames
|
20
|
+
|
21
|
+
From an array
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
Rover::DataFrame.new([{a: 1, b: "one"}, {a: 2, b: "two"}, {a: 3, b: "three"}])
|
25
|
+
```
|
26
|
+
|
27
|
+
From a hash
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
Rover::DataFrame.new({
|
31
|
+
a: [1, 2, 3],
|
32
|
+
b: ["one", "two", "three"]
|
33
|
+
})
|
34
|
+
```
|
35
|
+
|
36
|
+
From an Active Record relation
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
Rover::DataFrame.new(User.all)
|
40
|
+
```
|
41
|
+
|
42
|
+
From a CSV
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
Rover.read_csv("file.csv")
|
46
|
+
# or
|
47
|
+
Rover.parse_csv("CSV,data,string")
|
48
|
+
```
|
49
|
+
|
50
|
+
## Attributes
|
51
|
+
|
52
|
+
Get number of rows
|
53
|
+
|
54
|
+
```ruby
|
55
|
+
df.count
|
56
|
+
```
|
57
|
+
|
58
|
+
Get column names
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
df.keys
|
62
|
+
```
|
63
|
+
|
64
|
+
Check if a column exists
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
df.include?(name)
|
68
|
+
```
|
69
|
+
|
70
|
+
## Selecting Data
|
71
|
+
|
72
|
+
Select a column
|
73
|
+
|
74
|
+
```ruby
|
75
|
+
df[:a]
|
76
|
+
```
|
77
|
+
|
78
|
+
Select multiple columns
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
df[[:a, :b]]
|
82
|
+
```
|
83
|
+
|
84
|
+
Select first rows
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
df.head
|
88
|
+
# or
|
89
|
+
df.first(5)
|
90
|
+
```
|
91
|
+
|
92
|
+
Select last rows
|
93
|
+
|
94
|
+
```ruby
|
95
|
+
df.tail
|
96
|
+
# or
|
97
|
+
df.last(5)
|
98
|
+
```
|
99
|
+
|
100
|
+
Select rows by index
|
101
|
+
|
102
|
+
```ruby
|
103
|
+
df[1]
|
104
|
+
# or
|
105
|
+
df[1..3]
|
106
|
+
# or
|
107
|
+
df[[1, 4, 5]]
|
108
|
+
```
|
109
|
+
|
110
|
+
## Filtering
|
111
|
+
|
112
|
+
Filter on a condition
|
113
|
+
|
114
|
+
```ruby
|
115
|
+
df[df[:a] > 100]
|
116
|
+
```
|
117
|
+
|
118
|
+
And
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
df[(df[:a] > 100) & (df[:b] == "one")]
|
122
|
+
```
|
123
|
+
|
124
|
+
Or
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
df[(df[:a] > 100) | (df[:b] == "one")]
|
128
|
+
```
|
129
|
+
|
130
|
+
Not
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
df[df[:a] != 100]
|
134
|
+
```
|
135
|
+
|
136
|
+
## Operations
|
137
|
+
|
138
|
+
Basic operations
|
139
|
+
|
140
|
+
```ruby
|
141
|
+
df[:a] + 5
|
142
|
+
df[:a] - 5
|
143
|
+
df[:a] * 5
|
144
|
+
df[:a] / 5
|
145
|
+
df[:a] % 5
|
146
|
+
df[:a] ** 2
|
147
|
+
```
|
148
|
+
|
149
|
+
Summary statistics
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
df[:a].count
|
153
|
+
df[:a].sum
|
154
|
+
df[:a].mean
|
155
|
+
df[:a].median
|
156
|
+
df[:a].percentile(90)
|
157
|
+
df[:a].min
|
158
|
+
df[:a].max
|
159
|
+
```
|
160
|
+
|
161
|
+
Cross tabulation
|
162
|
+
|
163
|
+
```ruby
|
164
|
+
df[:a].crosstab(df[:b])
|
165
|
+
```
|
166
|
+
|
167
|
+
## Updates
|
168
|
+
|
169
|
+
Add a new column
|
170
|
+
|
171
|
+
```ruby
|
172
|
+
df[:a] = 1
|
173
|
+
# or
|
174
|
+
df[:a] = [1, 2, 3]
|
175
|
+
```
|
176
|
+
|
177
|
+
Update a single element
|
178
|
+
|
179
|
+
```ruby
|
180
|
+
df[:a][0] = 100
|
181
|
+
```
|
182
|
+
|
183
|
+
Update multiple elements
|
184
|
+
|
185
|
+
```ruby
|
186
|
+
df[:a][0..2] = 1
|
187
|
+
# or
|
188
|
+
df[:a][0..2] = [1, 2, 3]
|
189
|
+
```
|
190
|
+
|
191
|
+
Update elements matching a condition
|
192
|
+
|
193
|
+
```ruby
|
194
|
+
df[:a][df[:a] > 100] = 0
|
195
|
+
```
|
196
|
+
|
197
|
+
Clamp
|
198
|
+
|
199
|
+
```ruby
|
200
|
+
df[:a].clamp!(0, 100)
|
201
|
+
```
|
202
|
+
|
203
|
+
Delete columns
|
204
|
+
|
205
|
+
```ruby
|
206
|
+
df.delete(:a)
|
207
|
+
# or
|
208
|
+
df.except!(:a, :b)
|
209
|
+
```
|
210
|
+
|
211
|
+
Rename a column
|
212
|
+
|
213
|
+
```ruby
|
214
|
+
df[:new_a] = df.delete(:a)
|
215
|
+
```
|
216
|
+
|
217
|
+
Sort data
|
218
|
+
|
219
|
+
```ruby
|
220
|
+
df.sort_by! { |r| r[:a] }
|
221
|
+
```
|
222
|
+
|
223
|
+
Clear all data
|
224
|
+
|
225
|
+
```ruby
|
226
|
+
df.clear
|
227
|
+
```
|
228
|
+
|
229
|
+
## Combining Data Frames
|
230
|
+
|
231
|
+
Add rows
|
232
|
+
|
233
|
+
```ruby
|
234
|
+
df.concat(other_df)
|
235
|
+
```
|
236
|
+
|
237
|
+
Add columns
|
238
|
+
|
239
|
+
```ruby
|
240
|
+
df.merge!(other_df)
|
241
|
+
```
|
242
|
+
|
243
|
+
Inner join
|
244
|
+
|
245
|
+
```ruby
|
246
|
+
df.inner_join(other_df)
|
247
|
+
# or
|
248
|
+
df.inner_join(other_df, on: :a)
|
249
|
+
# or
|
250
|
+
df.inner_join(other_df, on: [:a, :b])
|
251
|
+
# or
|
252
|
+
df.inner_join(other_df, on: {df_col: :other_df_col})
|
253
|
+
```
|
254
|
+
|
255
|
+
Left join
|
256
|
+
|
257
|
+
```ruby
|
258
|
+
df.left_join(other_df)
|
259
|
+
```
|
260
|
+
|
261
|
+
## Conversion
|
262
|
+
|
263
|
+
Array of hashes
|
264
|
+
|
265
|
+
```ruby
|
266
|
+
df.to_a
|
267
|
+
```
|
268
|
+
|
269
|
+
Hash of arrays
|
270
|
+
|
271
|
+
```ruby
|
272
|
+
df.to_h
|
273
|
+
```
|
274
|
+
|
275
|
+
Numo array
|
276
|
+
|
277
|
+
```ruby
|
278
|
+
df.to_numo
|
279
|
+
```
|
280
|
+
|
281
|
+
CSV
|
282
|
+
|
283
|
+
```ruby
|
284
|
+
df.to_csv
|
285
|
+
```
|
286
|
+
|
287
|
+
## History
|
288
|
+
|
289
|
+
View the [changelog](https://github.com/ankane/rover/blob/master/CHANGELOG.md)
|
290
|
+
|
291
|
+
## Contributing
|
292
|
+
|
293
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
294
|
+
|
295
|
+
- [Report bugs](https://github.com/ankane/rover/issues)
|
296
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/rover/pulls)
|
297
|
+
- Write, clarify, or fix documentation
|
298
|
+
- Suggest or add new features
|
299
|
+
|
300
|
+
To get started with development:
|
301
|
+
|
302
|
+
```sh
|
303
|
+
git clone https://github.com/ankane/rover.git
|
304
|
+
cd rover
|
305
|
+
bundle install
|
306
|
+
bundle exec rake test
|
307
|
+
```
|
data/lib/rover-df.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "rover"
|
data/lib/rover.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# dependencies
|
2
|
+
require "numo/narray"
|
3
|
+
|
4
|
+
# modules
|
5
|
+
require "rover/data_frame"
|
6
|
+
require "rover/vector"
|
7
|
+
require "rover/version"
|
8
|
+
|
9
|
+
module Rover
  class << self
    # Reads the CSV file at +path+ and returns it as a Rover::DataFrame.
    #
    # The first row is treated as the header and numeric-looking fields are
    # converted to numbers. Any +options+ are forwarded to CSV.read and take
    # precedence over those defaults.
    def read_csv(path, **options)
      require "csv" # loaded lazily so CSV is only required when used
      csv_to_df(CSV.read(path, **csv_options(options)))
    end

    # Parses the CSV text in +str+ and returns it as a Rover::DataFrame.
    #
    # Uses the same defaults and option handling as read_csv, but hands the
    # string to CSV.parse instead of reading a file.
    def parse_csv(str, **options)
      require "csv" # loaded lazily so CSV is only required when used
      csv_to_df(CSV.parse(str, **csv_options(options)))
    end

    private

    # Default CSV options shared by read_csv and parse_csv; caller-supplied
    # options override the defaults.
    def csv_options(options)
      {headers: true, converters: :numeric}.merge(options)
    end

    # Converts a parsed CSV table into a DataFrame with one column per header.
    # The table is switched to column mode in place so that iterating it
    # yields [header, values] pairs.
    def csv_to_df(table)
      columns = table.by_col!.each_with_object({}) do |(header, values), acc|
        acc[header] = values
      end
      DataFrame.new(columns)
    end
  end
end
|
@@ -0,0 +1,398 @@
|
|
1
|
+
module Rover
  # An in-memory table stored column by column.
  #
  # Each column is a Rover::Vector kept in an insertion-ordered hash keyed by
  # the column name (a String or Symbol). All columns must have the same size.
  class DataFrame
    # Builds a data frame from one of:
    # - another DataFrame (the column vectors are shared, not copied)
    # - a Hash of column name => values (scalars are repeated to the column size)
    # - an Array of Hashes, one per row (missing keys become nil)
    # - an ActiveRecord relation or model class, when ActiveRecord is loaded
    #
    # Raises ArgumentError for any other input, for keys that are not strings
    # or symbols, and for columns of differing sizes.
    def initialize(data = {})
      @vectors = {}

      if data.is_a?(DataFrame)
        data.vectors.each do |k, v|
          @vectors[k] = v
        end
      elsif data.is_a?(Hash)
        data.each do |k, v|
          @vectors[k] = v.respond_to?(:to_a) ? Vector.new(v) : v
        end

        # handle scalars: repeat them to the size of the first real column
        size = @vectors.values.find { |v| v.is_a?(Vector) }&.size || 1
        @vectors.each_key do |k|
          @vectors[k] = to_vector(@vectors[k], size)
        end
      elsif data.is_a?(Array)
        raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }

        # the union of all row keys, in first-seen order, defines the columns
        data.flat_map(&:keys).uniq.each do |k|
          @vectors[k] = to_vector(data.map { |d| d[k] })
        end
      elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
        result = data.connection.select_all(data.all.to_sql)
        result.columns.each_with_index do |k, i|
          @vectors[k] = to_vector(result.rows.map { |r| r[i] })
        end
      else
        raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
      end

      # check keys
      @vectors.each_key do |k|
        check_key(k)
      end

      # check sizes
      sizes = @vectors.values.map(&:size).uniq
      if sizes.size > 1
        raise ArgumentError, "Different sizes: #{sizes}"
      end
    end

    # Selects rows or columns depending on the type of +where+:
    # - a boolean (Numo::Bit) Vector, a Numeric, a Range, or an Array of
    #   Integers selects rows and returns a new DataFrame
    # - any other Array selects the named columns and returns a new DataFrame
    # - anything else is treated as a single column name and returns its
    #   Vector, or nil when there is no such column
    #
    # Raises ArgumentError when an Array of column names includes a name that
    # is not in the frame.
    def [](where)
      if row_selector?(where)
        new_vectors = {}
        @vectors.each do |k, v|
          new_vectors[k] = v[where]
        end
        DataFrame.new(new_vectors)
      elsif where.is_a?(Array)
        # multiple columns
        missing = where.reject { |k| @vectors.key?(k) }
        raise ArgumentError, "Missing keys: #{missing.join(", ")}" if missing.any?

        df = DataFrame.new
        where.each do |k|
          df[k] = @vectors[k]
        end
        df
      else
        # single column
        @vectors[where]
      end
    end

    # Yields each row as a hash of column name => value.
    # Returns an Enumerator when no block is given.
    def each_row
      return enum_for(:each_row) { size } unless block_given?

      size.times do |i|
        yield @vectors.map { |k, v| [k, v[i]] }.to_h
      end
    end

    # dup to prevent direct modification of keys
    # (the Vector values themselves are shared with the frame)
    def vectors
      @vectors.dup
    end

    # Adds or replaces column +k+. +v+ may be a Vector, an array-like object,
    # or a scalar that is repeated for every row.
    # Raises ArgumentError for an invalid key or a size that does not match
    # the existing columns.
    def []=(k, v)
      check_key(k)
      v = to_vector(v, size)
      raise ArgumentError, "Size mismatch: expected #{size}, got #{v.size}" if @vectors.any? && v.size != size
      @vectors[k] = v
    end

    # Number of rows (0 when there are no columns).
    def size
      @vectors.values.first&.size || 0
    end
    alias_method :length, :size
    alias_method :count, :size

    # should this check for columns as well?
    def any?
      size > 0
    end

    # should this check for columns as well?
    def empty?
      size == 0
    end

    # Removes every column.
    def clear
      @vectors.clear
    end

    # Returns [number of rows, number of columns].
    def shape
      [size, @vectors.size]
    end

    # Column names, in insertion order.
    def keys
      @vectors.keys
    end
    alias_method :names, :keys
    alias_method :vector_names, :keys

    # Removes column +key+ and returns its Vector (nil if it did not exist).
    def delete(key)
      @vectors.delete(key)
    end

    # Returns a copy without the given columns.
    def except(*keys)
      dup.except!(*keys)
    end

    # Removes the given columns in place and returns self.
    def except!(*keys)
      keys.each do |key|
        delete(key)
      end
      self
    end

    # True when a column named +key+ exists.
    def include?(key)
      @vectors.include?(key)
    end

    # First +n+ rows as a new DataFrame.
    def head(n = 5)
      first(n)
    end

    # Last +n+ rows as a new DataFrame.
    def tail(n = 5)
      last(n)
    end

    # First +n+ rows as a new DataFrame.
    # The default is 1, matching Vector#first; the previous nil default
    # was passed straight to Vector#first, which cannot compare nil to a size.
    def first(n = 1)
      new_vectors = {}
      @vectors.each do |k, v|
        new_vectors[k] = v.first(n)
      end
      DataFrame.new(new_vectors)
    end

    # Last +n+ rows as a new DataFrame.
    # The default is 1, matching Vector#last; the previous nil default
    # was passed straight to Vector#last, which cannot negate nil.
    def last(n = 1)
      new_vectors = {}
      @vectors.each do |k, v|
        new_vectors[k] = v.last(n)
      end
      DataFrame.new(new_vectors)
    end

    # Array of row hashes.
    def to_a
      each_row.to_a
    end

    # Hash of column name => array of values.
    def to_h
      @vectors.transform_values(&:to_a)
    end

    # Stacks the columns' Numo arrays side by side into a single Numo array.
    def to_numo
      Numo::NArray.column_stack(vectors.values.map(&:to_numo))
    end

    # CSV text with a header row followed by one line per row.
    def to_csv
      require "csv"
      CSV.generate do |csv|
        csv << keys
        numo = vectors.values.map(&:to_numo)
        size.times do |i|
          csv << numo.map { |n| n[i] }
        end
      end
    end

    # for IRuby
    def to_html
      require "iruby"
      IRuby::HTML.table(to_h)
    end

    # Text preview of up to the first 5 rows, with right-aligned columns.
    # Columns that would push a line past 120 characters start a new group
    # of lines below the previous one.
    # TODO handle long text better
    def inspect
      return "#<Rover::DataFrame>" if keys.empty?

      lines = []
      line_start = 0
      spaces = 2

      @vectors.each do |k, v|
        v = v.first(5).to_a
        width = ([k] + v).map(&:to_s).map(&:size).max
        width = 3 if width < 3

        # lines[-2] is the last non-blank line of the current group
        if lines.empty? || lines[-2].map { |l| l.size + spaces }.sum + width > 120
          # start a new group: header, one line per shown row,
          # an optional "..." line, and a blank separator
          line_start = lines.size
          lines << []
          [size, 5].min.times do
            lines << []
          end
          lines << [] if size > 5
          lines << []
        end

        lines[line_start] << "%#{width}s" % k.to_s
        v.each_with_index do |v2, i|
          lines[line_start + 1 + i] << "%#{width}s" % v2.to_s
        end
        lines[line_start + 6] << "%#{width}s" % "..." if size > 5
      end

      # drop the trailing blank separator
      lines.pop
      lines.map { |l| l.join(" " * spaces) }.join("\n")
    end
    alias_method :to_s, :inspect # alias like hash

    # Reorders the rows in place by the value the block returns for each row
    # hash, and returns self.
    def sort_by!
      indexes =
        size.times.sort_by do |i|
          yield @vectors.map { |k, v| [k, v[i]] }.to_h
        end

      @vectors.each do |k, v|
        self[k] = v.to_numo.at(indexes)
      end
      self
    end

    # Returns a sorted copy; see sort_by!.
    def sort_by(&block)
      dup.sort_by!(&block)
    end

    # Returns a new frame with the same columns.
    # The copy is shallow: the Vector objects are shared with the original.
    def dup
      df = DataFrame.new
      @vectors.each do |k, v|
        df[k] = v
      end
      df
    end

    # Returns a new frame with the rows of +other+ appended.
    def +(other)
      dup.concat(other)
    end

    # in-place, like Array#concat
    # Columns present in only one of the frames are padded with nil.
    # TODO make more performant
    def concat(other)
      raise ArgumentError, "Must be a data frame" unless other.is_a?(DataFrame)

      # capture the row count before any column is extended
      size = self.size
      vectors.each do |k, v|
        @vectors[k] = Vector.new(v.to_a + (other[k] ? other[k].to_a : [nil] * other.size))
      end
      (other.vector_names - vector_names).each do |k|
        @vectors[k] = Vector.new([nil] * size + other[k].to_a)
      end
      self
    end

    # Returns a copy with the columns of +other+ added.
    def merge(other)
      dup.merge!(other)
    end

    # Adds (or replaces) columns from +other+ in place and returns self.
    def merge!(other)
      other.vectors.each do |k, v|
        self[k] = v
      end
      self
    end

    # see join for options
    def inner_join(other, on: nil)
      join(other, on: on, how: "inner")
    end

    # see join for options
    def left_join(other, on: nil)
      join(other, on: on, how: "left")
    end

    # True when +other+ is a data frame with the same size, the same keys in
    # the same order, and equal column contents.
    #
    # Columns are compared through their underlying Numo arrays because
    # Vector#== is element-wise and returns a Vector, which is always truthy.
    # don't check types
    def ==(other)
      other.is_a?(DataFrame) &&
        size == other.size &&
        keys == other.keys &&
        keys.all? { |k| self[k].to_numo == other[k].to_numo }
    end

    private

    # True when +where+ should select rows rather than columns.
    def row_selector?(where)
      (where.is_a?(Vector) && where.to_numo.is_a?(Numo::Bit)) ||
        where.is_a?(Numeric) ||
        where.is_a?(Range) ||
        (where.is_a?(Array) && where.all? { |v| v.is_a?(Integer) })
    end

    def check_key(key)
      raise ArgumentError, "Key must be a string or symbol, got #{key.inspect}" unless key.is_a?(String) || key.is_a?(Symbol)
    end

    # TODO make more efficient
    # TODO add option to prefix/suffix keys?
    # Supports:
    # - on: :key
    # - on: [:key1, :key2]
    # - on: {key1a: :key1b, key2a: :key2b}
    # When +on+ is nil, the columns common to both frames are used.
    # +how+ is "left" to keep unmatched rows of self; any other value
    # performs an inner join.
    def join(other, how:, on: nil)
      self_on, other_on =
        if on.is_a?(Hash)
          [on.keys, on.values]
        else
          on ||= keys & other.keys
          on = [on] unless on.is_a?(Array)
          [on, on]
        end

      check_join_keys(self, self_on)
      check_join_keys(other, other_on)

      # index the rows of other by their join values; the shared default
      # array is only read, never mutated
      indexed = other.to_a.group_by { |r| r.values_at(*other_on) }
      indexed.default = []

      left = how == "left"

      vectors = {}
      keys = (self.keys + other.keys).uniq
      keys.each do |k|
        vectors[k] = []
      end

      each_row do |r|
        matches = indexed[r.values_at(*self_on)]
        if matches.empty?
          if left
            keys.each do |k|
              vectors[k] << r[k]
            end
          end
        else
          matches.each do |r2|
            keys.each do |k|
              # prefer the value from other, falling back to self
              vectors[k] << (r2[k] || r[k])
            end
          end
        end
      end

      DataFrame.new(vectors)
    end

    def check_join_keys(df, keys)
      raise ArgumentError, "No keys" if keys.empty?
      missing_keys = keys.select { |k| !df.include?(k) }
      raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
    end

    # Coerces +v+ to a Vector. When +size+ is given and +v+ is a scalar
    # (does not respond to to_a), the scalar is repeated +size+ times.
    def to_vector(v, size = nil)
      return v if v.is_a?(Vector)

      if size && !v.respond_to?(:to_a)
        v =
          if v.is_a?(Integer)
            Numo::Int64.new(size).fill(v)
          elsif v.is_a?(Numeric)
            Numo::DFloat.new(size).fill(v)
          elsif v == true || v == false
            Numo::Bit.new(size).fill(v)
          else
            # TODO make more efficient
            [v] * size
          end
      end

      Vector.new(v)
    end
  end
end
|
data/lib/rover/vector.rb
ADDED
@@ -0,0 +1,248 @@
|
|
1
|
+
module Rover
  # A one-dimensional column of values backed by a Numo::NArray.
  #
  # Note that the comparison operators (including == and !=) are
  # element-wise and return a boolean Vector, not true/false.
  class Vector
    # Wraps +data+, which may be another Vector (its Numo array is reused),
    # a Numo::NArray (used as is), or anything responding to to_a.
    #
    # Array-like input is cast by content: all Integers => Int64, Numerics
    # and nils => DFloat (nil becomes NaN), all booleans => Bit, anything
    # else => RObject. Raises ArgumentError unless the result is 1-D.
    def initialize(data)
      @data =
        if data.is_a?(Vector)
          data.to_numo
        elsif data.is_a?(Numo::NArray)
          data
        else
          data = data.to_a
          if data.all? { |v| v.is_a?(Integer) }
            Numo::Int64.cast(data)
          elsif data.all? { |v| v.is_a?(Numeric) || v.nil? }
            Numo::DFloat.cast(data.map { |v| v || Float::NAN })
          elsif data.all? { |v| v == true || v == false }
            Numo::Bit.cast(data)
          else
            Numo::RObject.cast(data)
          end
        end

      raise ArgumentError, "Bad size: #{@data.shape}" unless @data.ndim == 1
    end

    # The underlying Numo array (not a copy).
    def to_numo
      @data
    end

    # Values as a Ruby array; Bit vectors are returned as true/false.
    def to_a
      a = @data.to_a
      a.map! { |v| !v.zero? } if @data.is_a?(Numo::Bit)
      a
    end

    def size
      @data.size
    end

    # New Vector of the distinct values, in first-seen order.
    def uniq
      Vector.new(@data.to_a.uniq)
    end

    # Boolean Vector marking missing values: nil for object vectors,
    # NaN for types that support isnan, and never for other types.
    def missing
      bit =
        if @data.is_a?(Numo::RObject)
          Numo::Bit.cast(@data.map(&:nil?))
        elsif @data.respond_to?(:isnan)
          @data.isnan
        else
          Numo::Bit.new(size).fill(0)
        end

      Vector.new(bit)
    end

    # keep same number of rows as original
    # to make it easy to add to original data frame
    # (the first element is NaN)
    def diff
      diff = @data.cast_to(Numo::DFloat).diff
      Vector.new(diff.insert(0, Float::NAN))
    end

    # With a Vector argument, returns a new Vector of the elements selected
    # by that mask. Otherwise indexes the underlying Numo array directly.
    def [](v)
      if v.is_a?(Vector)
        Vector.new(v.to_numo.mask(@data))
      else
        @data[v]
      end
    end

    # Assigns +v+ at index/mask +k+ in place; +k+ may be a Vector mask.
    def []=(k, v)
      k = k.to_numo if k.is_a?(Vector)
      @data[k] = v
    end

    # Element-wise arithmetic and logical operators.
    # | and ^ are defined alongside & so that boolean filters can be
    # combined with "or" as well as "and".
    %w(+ - * / % ** & | ^).each do |op|
      define_method(op) do |other|
        other = other.to_numo if other.is_a?(Vector)
        # TODO better logic
        if @data.is_a?(Numo::RObject)
          map { |v| v.send(op, other) }
        else
          Vector.new(@data.send(op, other))
        end
      end
    end

    # Element-wise comparison operators; each returns a boolean Vector.
    {
      "==" => "eq",
      "!=" => "ne",
      ">" => "gt",
      ">=" => "ge",
      "<" => "lt",
      "<=" => "le"
    }.each do |op, meth|
      define_method(op) do |other|
        other = other.to_numo if other.is_a?(Vector)
        result =
          if other.is_a?(Numo::RObject)
            # compare pairwise with the requested operator
            # (previously this branch always tested equality)
            @data.to_a.zip(other.to_a).map { |a, b| a.public_send(op, b) }
          elsif other.is_a?(Numeric) || other.is_a?(Numo::NArray)
            @data.send(meth, other)
          else
            @data.map { |v| v.send(op, other) }
          end
        Vector.new(Numo::Bit.cast(result))
      end
    end

    # Boolean Vector marking elements equal to any of +values+.
    def in?(values)
      ret = Numo::Bit.new(size).fill(false)
      values.each do |v|
        comp =
          if v.is_a?(Numeric) || v.is_a?(Numo::NArray)
            @data.eq(v)
          else
            Numo::Bit.cast(@data.map { |d| d == v })
          end
        ret |= comp
      end
      Vector.new(ret)
    end

    # Element-wise negation; only implemented for boolean vectors.
    def !
      if @data.is_a?(Numo::Bit)
        Vector.new(@data.eq(0))
      else
        raise "Not implemented yet"
      end
    end

    def -@
      self * -1
    end

    # Limits every element to the range min..max in place; returns self.
    def clamp!(min, max)
      @data = @data.clip(min, max)
      self
    end

    # Returns a clamped copy; see clamp!.
    def clamp(min, max)
      dup.clamp!(min, max)
    end

    # New Vector of the block's results; the element type is re-detected
    # when the mapped values are generic objects.
    def map(&block)
      mapped = @data.map(&block)
      mapped = mapped.to_a if mapped.is_a?(Numo::RObject) # re-evaluate cast
      Vector.new(mapped)
    end

    def sort
      Vector.new(@data.respond_to?(:sort) ? @data.sort : @data.to_a.sort)
    end

    def abs
      Vector.new(@data.abs)
    end

    def each(&block)
      to_a.each(&block)
    end

    def max
      @data.max
    end

    def min
      @data.min
    end

    def mean
      # currently only floats have mean in Numo
      # https://github.com/ruby-numo/numo-narray/issues/79
      @data.cast_to(Numo::DFloat).mean
    end

    def median
      # need to cast to get correct result
      # TODO file bug with Numo
      @data.cast_to(Numo::DFloat).median
    end

    def percentile(q)
      @data.percentile(q)
    end

    def sum
      @data.sum
    end

    def all?(&block)
      @data.to_a.all?(&block)
    end

    def any?(&block)
      @data.to_a.any?(&block)
    end

    # First +n+ elements as a new Vector (everything when n >= size).
    def first(n = 1)
      if n >= size
        Vector.new(@data)
      else
        Vector.new(@data[0...n])
      end
    end

    # Last +n+ elements as a new Vector (everything when n >= size,
    # mirroring first instead of indexing past the start of the data).
    def last(n = 1)
      if n >= size
        Vector.new(@data)
      else
        Vector.new(@data[-n..-1])
      end
    end

    # Cross tabulation of this vector against +other+: a DataFrame with one
    # row per distinct value of self (in column "_") and one count column
    # per distinct value of +other+.
    def crosstab(other)
      index = uniq.sort
      index_pos = index.to_a.map.with_index.to_h
      df = DataFrame.new({"_" => index})
      other.uniq.sort.each do |k|
        df[k] = 0
      end
      to_a.zip(other.to_a) do |v1, v2|
        df[v2][index_pos[v1]] += 1
      end
      df
    end

    # First +n+ elements; a negative +n+ means "all but the last -n".
    def head(n = 5)
      n += size if n < 0
      first(n)
    end

    # Last +n+ elements; a negative +n+ means "all but the first -n".
    def tail(n = 5)
      n += size if n < 0
      last(n)
    end

    # TODO add type and size?
    def inspect
      elements = first(5).to_a.map(&:inspect)
      elements << "..." if size > 5
      "#<Rover::Vector [#{elements.join(", ")}]>"
    end
    alias_method :to_s, :inspect # alias like hash

    # for IRuby
    def to_html
      require "iruby"
      IRuby::HTML.table(to_a)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rover-df
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrew Kane
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-05-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: numo-narray
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.9.1.7
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.9.1.7
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '5'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '5'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: activerecord
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '5'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '5'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: sqlite3
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: iruby
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
description:
|
112
|
+
email: andrew@chartkick.com
|
113
|
+
executables: []
|
114
|
+
extensions: []
|
115
|
+
extra_rdoc_files: []
|
116
|
+
files:
|
117
|
+
- CHANGELOG.md
|
118
|
+
- LICENSE.txt
|
119
|
+
- README.md
|
120
|
+
- lib/rover-df.rb
|
121
|
+
- lib/rover.rb
|
122
|
+
- lib/rover/data_frame.rb
|
123
|
+
- lib/rover/vector.rb
|
124
|
+
- lib/rover/version.rb
|
125
|
+
homepage: https://github.com/ankane/rover
|
126
|
+
licenses:
|
127
|
+
- MIT
|
128
|
+
metadata: {}
|
129
|
+
post_install_message:
|
130
|
+
rdoc_options: []
|
131
|
+
require_paths:
|
132
|
+
- lib
|
133
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - ">="
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '2.4'
|
138
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
|
+
requirements:
|
140
|
+
- - ">="
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
143
|
+
requirements: []
|
144
|
+
rubygems_version: 3.1.2
|
145
|
+
signing_key:
|
146
|
+
specification_version: 4
|
147
|
+
summary: Simple, powerful data frames for Ruby
|
148
|
+
test_files: []
|