rover-df 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 4588d0b3b5633a3821a4c07e7102e5933edca92179836db041f2400d8be88538
4
+ data.tar.gz: 9b01cd2bae5fb6ba9f426fe0d347752cd30c63619b00284fb68e8f711ec38ddf
5
+ SHA512:
6
+ metadata.gz: b2d35866786a7fbe17b274585419c752b08c817b2db1bf939a6c3f92a7ae2cd282d725614f96db730fd2590cbb8c24710d0fb1f713255d2c348c0fed0b874a35
7
+ data.tar.gz: 4bf0ba38ce2c3ef4765d702591948af18fddf142efb7e559e26cc4ab504538775a1771c839f1570230f7d101fa20bfbbeb5044f6bf567637790575ee9b95be87
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (2020-05-13)
2
+
3
+ - First release
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2020 Andrew Kane
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,307 @@
1
+ # Rover
2
+
3
+ Simple, powerful data frames for Ruby
4
+
5
+ :mountain: Designed for data exploration and machine learning, and powered by [Numo](https://github.com/ruby-numo/numo-narray) for blazing performance
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application’s Gemfile:
10
+
11
+ ```ruby
12
+ gem 'rover-df'
13
+ ```
14
+
15
+ ## Intro
16
+
17
+ A data frame is an in-memory table. It’s a useful data structure for data analysis and machine learning. It uses columnar storage for fast operations on columns.
18
+
19
+ ## Creating Data Frames
20
+
21
+ From an array
22
+
23
+ ```ruby
24
+ Rover::DataFrame.new([{a: 1, b: "one"}, {a: 2, b: "two"}, {a: 3, b: "three"}])
25
+ ```
26
+
27
+ From a hash
28
+
29
+ ```ruby
30
+ Rover::DataFrame.new({
31
+ a: [1, 2, 3],
32
+ b: ["one", "two", "three"]
33
+ })
34
+ ```
35
+
36
+ From an Active Record relation
37
+
38
+ ```ruby
39
+ Rover::DataFrame.new(User.all)
40
+ ```
41
+
42
+ From a CSV
43
+
44
+ ```ruby
45
+ Rover.read_csv("file.csv")
46
+ # or
47
+ Rover.parse_csv("CSV,data,string")
48
+ ```
49
+
50
+ ## Attributes
51
+
52
+ Get number of rows
53
+
54
+ ```ruby
55
+ df.count
56
+ ```
57
+
58
+ Get column names
59
+
60
+ ```ruby
61
+ df.keys
62
+ ```
63
+
64
+ Check if a column exists
65
+
66
+ ```ruby
67
+ df.include?(name)
68
+ ```
69
+
70
+ ## Selecting Data
71
+
72
+ Select a column
73
+
74
+ ```ruby
75
+ df[:a]
76
+ ```
77
+
78
+ Select multiple columns
79
+
80
+ ```ruby
81
+ df[[:a, :b]]
82
+ ```
83
+
84
+ Select first rows
85
+
86
+ ```ruby
87
+ df.head
88
+ # or
89
+ df.first(5)
90
+ ```
91
+
92
+ Select last rows
93
+
94
+ ```ruby
95
+ df.tail
96
+ # or
97
+ df.last(5)
98
+ ```
99
+
100
+ Select rows by index
101
+
102
+ ```ruby
103
+ df[1]
104
+ # or
105
+ df[1..3]
106
+ # or
107
+ df[[1, 4, 5]]
108
+ ```
109
+
110
+ ## Filtering
111
+
112
+ Filter on a condition
113
+
114
+ ```ruby
115
+ df[df[:a] > 100]
116
+ ```
117
+
118
+ And
119
+
120
+ ```ruby
121
+ df[(df[:a] > 100) & (df[:b] == "one")]
122
+ ```
123
+
124
+ Or
125
+
126
+ ```ruby
127
+ df[(df[:a] > 100) | (df[:b] == "one")]
128
+ ```
129
+
130
+ Not
131
+
132
+ ```ruby
133
+ df[df[:a] != 100]
134
+ ```
135
+
136
+ ## Operations
137
+
138
+ Basic operations
139
+
140
+ ```ruby
141
+ df[:a] + 5
142
+ df[:a] - 5
143
+ df[:a] * 5
144
+ df[:a] / 5
145
+ df[:a] % 5
146
+ df[:a] ** 2
147
+ ```
148
+
149
+ Summary statistics
150
+
151
+ ```ruby
152
+ df[:a].count
153
+ df[:a].sum
154
+ df[:a].mean
155
+ df[:a].median
156
+ df[:a].percentile(90)
157
+ df[:a].min
158
+ df[:a].max
159
+ ```
160
+
161
+ Cross tabulation
162
+
163
+ ```ruby
164
+ df[:a].crosstab(df[:b])
165
+ ```
166
+
167
+ ## Updates
168
+
169
+ Add a new column
170
+
171
+ ```ruby
172
+ df[:a] = 1
173
+ # or
174
+ df[:a] = [1, 2, 3]
175
+ ```
176
+
177
+ Update a single element
178
+
179
+ ```ruby
180
+ df[:a][0] = 100
181
+ ```
182
+
183
+ Update multiple elements
184
+
185
+ ```ruby
186
+ df[:a][0..2] = 1
187
+ # or
188
+ df[:a][0..2] = [1, 2, 3]
189
+ ```
190
+
191
+ Update elements matching a condition
192
+
193
+ ```ruby
194
+ df[:a][df[:a] > 100] = 0
195
+ ```
196
+
197
+ Clamp
198
+
199
+ ```ruby
200
+ df[:a].clamp!(0, 100)
201
+ ```
202
+
203
+ Delete columns
204
+
205
+ ```ruby
206
+ df.delete(:a)
207
+ # or
208
+ df.except!(:a, :b)
209
+ ```
210
+
211
+ Rename a column
212
+
213
+ ```ruby
214
+ df[:new_a] = df.delete(:a)
215
+ ```
216
+
217
+ Sort data
218
+
219
+ ```ruby
220
+ df.sort_by! { |r| r[:a] }
221
+ ```
222
+
223
+ Clear all data
224
+
225
+ ```ruby
226
+ df.clear
227
+ ```
228
+
229
+ ## Combining Data Frames
230
+
231
+ Add rows
232
+
233
+ ```ruby
234
+ df.concat(other_df)
235
+ ```
236
+
237
+ Add columns
238
+
239
+ ```ruby
240
+ df.merge!(other_df)
241
+ ```
242
+
243
+ Inner join
244
+
245
+ ```ruby
246
+ df.inner_join(other_df)
247
+ # or
248
+ df.inner_join(other_df, on: :a)
249
+ # or
250
+ df.inner_join(other_df, on: [:a, :b])
251
+ # or
252
+ df.inner_join(other_df, on: {df_col: :other_df_col})
253
+ ```
254
+
255
+ Left join
256
+
257
+ ```ruby
258
+ df.left_join(other_df)
259
+ ```
260
+
261
+ ## Conversion
262
+
263
+ Array of hashes
264
+
265
+ ```ruby
266
+ df.to_a
267
+ ```
268
+
269
+ Hash of arrays
270
+
271
+ ```ruby
272
+ df.to_h
273
+ ```
274
+
275
+ Numo array
276
+
277
+ ```ruby
278
+ df.to_numo
279
+ ```
280
+
281
+ CSV
282
+
283
+ ```ruby
284
+ df.to_csv
285
+ ```
286
+
287
+ ## History
288
+
289
+ View the [changelog](https://github.com/ankane/rover/blob/master/CHANGELOG.md)
290
+
291
+ ## Contributing
292
+
293
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
294
+
295
+ - [Report bugs](https://github.com/ankane/rover/issues)
296
+ - Fix bugs and [submit pull requests](https://github.com/ankane/rover/pulls)
297
+ - Write, clarify, or fix documentation
298
+ - Suggest or add new features
299
+
300
+ To get started with development:
301
+
302
+ ```sh
303
+ git clone https://github.com/ankane/rover.git
304
+ cd rover
305
+ bundle install
306
+ bundle exec rake test
307
+ ```
@@ -0,0 +1 @@
1
+ require "rover"
@@ -0,0 +1,32 @@
1
+ # dependencies
2
+ require "numo/narray"
3
+
4
+ # modules
5
+ require "rover/data_frame"
6
+ require "rover/vector"
7
+ require "rover/version"
8
+
9
module Rover
  class << self
    # Reads the CSV file at +path+ and returns it as a data frame.
    # The first row is used as headers and numeric-looking fields are
    # converted; any extra +options+ are passed on to CSV.read and take
    # precedence over those defaults.
    def read_csv(path, **options)
      require "csv"
      csv_to_df(CSV.read(path, **csv_options(options)))
    end

    # Same as read_csv, but parses CSV content held in the string +str+.
    def parse_csv(str, **options)
      require "csv"
      csv_to_df(CSV.parse(str, **csv_options(options)))
    end

    private

    # Default parsing options, overridable by the caller's options.
    def csv_options(overrides)
      {headers: true, converters: :numeric}.merge(overrides)
    end

    # Turns a parsed CSV table into a data frame by walking the table
    # column by column and collecting header => values pairs.
    def csv_to_df(table)
      columns = {}
      table.by_col!
      table.each { |header, values| columns[header] = values }
      DataFrame.new(columns)
    end
  end
end
@@ -0,0 +1,398 @@
1
module Rover
  # An in-memory table stored column-wise: an ordered hash that maps each
  # column name (String or Symbol) to a Rover::Vector. All vectors in a
  # frame must have the same size.
  class DataFrame
    # Builds a data frame from one of:
    # - another DataFrame (its vectors are reused, not copied)
    # - a Hash of column name => values (scalars are expanded to the
    #   size of the first vector-like column, or 1 if there is none)
    # - an Array of Hashes (one hash per row; missing keys become nil)
    # - an ActiveRecord relation or model class (when ActiveRecord is loaded)
    #
    # Raises ArgumentError for unsupported input, for keys that are not
    # strings or symbols, and when columns differ in size.
    def initialize(data = {})
      @vectors = {}

      if data.is_a?(DataFrame)
        data.vectors.each do |k, v|
          @vectors[k] = v
        end
      elsif data.is_a?(Hash)
        data.to_h.each do |k, v|
          @vectors[k] =
            if v.respond_to?(:to_a)
              Vector.new(v)
            else
              v
            end
        end

        # handle scalars
        size = @vectors.values.find { |v| v.is_a?(Vector) }&.size || 1
        @vectors.each_key do |k|
          @vectors[k] = to_vector(@vectors[k], size)
        end
      elsif data.is_a?(Array)
        vectors = {}
        raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }
        keys = data.flat_map(&:keys).uniq
        keys.each do |k|
          vectors[k] = []
        end
        data.each do |d|
          keys.each do |k|
            vectors[k] << d[k]
          end
        end
        vectors.each do |k, v|
          @vectors[k] = to_vector(v)
        end
      elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
        result = data.connection.select_all(data.all.to_sql)
        result.columns.each_with_index do |k, i|
          @vectors[k] = to_vector(result.rows.map { |r| r[i] })
        end
      else
        raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
      end

      # check keys
      @vectors.each_key do |k|
        check_key(k)
      end

      # check sizes
      sizes = @vectors.values.map(&:size).uniq
      if sizes.size > 1
        raise ArgumentError, "Different sizes: #{sizes}"
      end
    end

    # Selects data depending on the type of +where+:
    # - boolean Vector, Numeric, Range or Array of Integers: rows,
    #   returned as a new DataFrame
    # - any other Array: the named columns, returned as a new DataFrame
    # - anything else: the single column stored under that key
    #   (nil when there is no such column)
    def [](where)
      if (where.is_a?(Vector) && where.to_numo.is_a?(Numo::Bit)) || where.is_a?(Numeric) || where.is_a?(Range) || (where.is_a?(Array) && where.all? { |v| v.is_a?(Integer) } )
        new_vectors = {}
        @vectors.each do |k, v|
          new_vectors[k] = v[where]
        end
        DataFrame.new(new_vectors)
      elsif where.is_a?(Array)
        # multiple columns
        df = DataFrame.new
        where.each do |k|
          df[k] = @vectors[k]
        end
        df
      else
        # single column
        @vectors[where]
      end
    end

    # Yields each row as a hash of column name => value.
    # Returns an enumerator when no block is given.
    def each_row
      return enum_for(:each_row) unless block_given?

      size.times do |i|
        yield @vectors.map { |k, v| [k, v[i]] }.to_h
      end
    end

    # dup to prevent direct modification of keys
    def vectors
      @vectors.dup
    end

    # Adds or replaces column +k+. Scalars are expanded to the current
    # number of rows. Raises ArgumentError when the key is not a string or
    # symbol, or when the new column's size differs from the frame's.
    def []=(k, v)
      check_key(k)
      v = to_vector(v, size)
      raise ArgumentError, "Size mismatch: expected #{size}, got #{v.size}" if @vectors.any? && v.size != size
      @vectors[k] = v
    end

    # Number of rows (0 when the frame has no columns).
    def size
      @vectors.values.first&.size || 0
    end
    alias_method :length, :size
    alias_method :count, :size

    # should this check for columns as well?
    def any?
      size > 0
    end

    # should this check for columns as well?
    def empty?
      size == 0
    end

    # Removes every column.
    def clear
      @vectors.clear
    end

    # [number of rows, number of columns]
    def shape
      [size, @vectors.size]
    end

    # Column names, in insertion order.
    def keys
      @vectors.keys
    end
    alias_method :names, :keys
    alias_method :vector_names, :keys

    # Removes column +key+ and returns its vector (nil if absent).
    def delete(key)
      @vectors.delete(key)
    end

    # Returns a copy of the frame without the given columns.
    def except(*keys)
      dup.except!(*keys)
    end

    # Removes the given columns in place and returns self.
    def except!(*keys)
      keys.each do |key|
        delete(key)
      end
      self
    end

    # True when a column named +key+ exists.
    def include?(key)
      @vectors.include?(key)
    end

    # First +n+ rows (5 by default) as a new frame.
    def head(n = 5)
      first(n)
    end

    # Last +n+ rows (5 by default) as a new frame.
    def tail(n = 5)
      last(n)
    end

    # First +n+ rows as a new frame. The default is 1 because the
    # underlying Vector#first compares n with the size, so a nil default
    # made a bare `df.first` raise NoMethodError.
    def first(n = 1)
      new_vectors = {}
      @vectors.each do |k, v|
        new_vectors[k] = v.first(n)
      end
      DataFrame.new(new_vectors)
    end

    # Last +n+ rows as a new frame. The default is 1 for the same reason
    # as #first: Vector#last negates n, which fails for nil.
    def last(n = 1)
      new_vectors = {}
      @vectors.each do |k, v|
        new_vectors[k] = v.last(n)
      end
      DataFrame.new(new_vectors)
    end

    # Array of row hashes.
    def to_a
      a = []
      each_row do |row|
        a << row
      end
      a
    end

    # Hash of column name => array of values.
    def to_h
      hsh = {}
      @vectors.each do |k, v|
        hsh[k] = v.to_a
      end
      hsh
    end

    # Stacks the columns' Numo arrays side by side.
    def to_numo
      Numo::NArray.column_stack(vectors.values.map(&:to_numo))
    end

    # CSV string with a header row followed by one line per row.
    def to_csv
      require "csv"
      CSV.generate do |csv|
        csv << keys
        numo = vectors.values.map(&:to_numo)
        size.times do |i|
          csv << numo.map { |n| n[i] }
        end
      end
    end

    # for IRuby
    def to_html
      require "iruby"
      IRuby::HTML.table(to_h)
    end

    # Text preview of up to the first five rows. Columns are right-aligned
    # and wrapped into a new group of lines once a line would exceed 120
    # characters.
    # TODO handle long text better
    def inspect
      return "#<Rover::DataFrame>" if keys.empty?

      lines = []
      line_start = 0
      spaces = 2

      @vectors.each do |k, v|
        v = v.first(5).to_a
        width = ([k] + v).map(&:to_s).map(&:size).max
        width = 3 if width < 3

        # start a new group: header line, up to five value lines,
        # an optional "..." line and a blank separator line
        if lines.empty? || lines[-2].map { |l| l.size + spaces }.sum + width > 120
          line_start = lines.size
          lines << []
          [size, 5].min.times do |i|
            lines << []
          end
          lines << [] if size > 5
          lines << []
        end

        lines[line_start] << "%#{width}s" % k.to_s
        v.each_with_index do |v2, i|
          lines[line_start + 1 + i] << "%#{width}s" % v2.to_s
        end
        lines[line_start + 6] << "%#{width}s" % "..." if size > 5
      end

      # drop the trailing separator line
      lines.pop
      lines.map { |l| l.join(" " * spaces) }.join("\n")
    end
    alias_method :to_s, :inspect # alias like hash

    # Reorders the rows in place by the value the block returns for each
    # row hash. Returns self.
    def sort_by!
      indexes =
        size.times.sort_by do |i|
          yield @vectors.map { |k, v| [k, v[i]] }.to_h
        end

      @vectors.each do |k, v|
        self[k] = v.to_numo.at(indexes)
      end
      self
    end

    # Like sort_by!, but returns a sorted copy.
    def sort_by(&block)
      dup.sort_by!(&block)
    end

    # New frame holding the same vector objects under the same keys.
    def dup
      df = DataFrame.new
      @vectors.each do |k, v|
        df[k] = v
      end
      df
    end

    # New frame with the rows of +other+ appended.
    def +(other)
      dup.concat(other)
    end

    # in-place, like Array#concat
    # Columns missing on either side are padded with nil.
    # TODO make more performant
    def concat(other)
      raise ArgumentError, "Must be a data frame" unless other.is_a?(DataFrame)

      # capture the row count before any column grows
      size = self.size
      vectors.each do |k, v|
        @vectors[k] = Vector.new(v.to_a + (other[k] ? other[k].to_a : [nil] * other.size))
      end
      (other.vector_names - vector_names).each do |k|
        @vectors[k] = Vector.new([nil] * size + other[k].to_a)
      end
      self
    end

    # New frame with the columns of +other+ added (or replaced).
    def merge(other)
      dup.merge!(other)
    end

    # Adds (or replaces) the columns of +other+ in place. Returns self.
    def merge!(other)
      other.vectors.each do |k, v|
        self[k] = v
      end
      self
    end

    # see join for options
    def inner_join(other, on: nil)
      join(other, on: on, how: "inner")
    end

    # see join for options
    def left_join(other, on: nil)
      join(other, on: on, how: "left")
    end

    # True when +other+ is a data frame with the same size, the same keys
    # in the same order and element-wise equal columns.
    # don't check types
    def ==(other)
      # Vector#== returns a (always truthy) Vector of booleans, so the
      # per-column result has to be reduced explicitly
      other.is_a?(DataFrame) &&
        size == other.size &&
        keys == other.keys &&
        keys.all? { |k| (self[k] == other[k]).to_a.all? }
    end

    private

    def check_key(key)
      raise ArgumentError, "Key must be a string or symbol, got #{key.inspect}" unless key.is_a?(String) || key.is_a?(Symbol)
    end

    # TODO make more efficient
    # TODO add option to prefix/suffix keys?
    # Supports:
    # - on: :key
    # - on: [:key1, :key2]
    # - on: {key1a: :key1b, key2a: :key2b}
    # When +on+ is nil, the keys common to both frames are used.
    # +how+ is "left" to keep unmatched rows of self; anything else
    # behaves as an inner join.
    def join(other, how:, on: nil)
      self_on, other_on =
        if on.is_a?(Hash)
          [on.keys, on.values]
        else
          on ||= keys & other.keys
          on = [on] unless on.is_a?(Array)
          [on, on]
        end

      check_join_keys(self, self_on)
      check_join_keys(other, other_on)

      # rows of other grouped by their join key values
      indexed = other.to_a.group_by { |r| r.values_at(*other_on) }
      indexed.default = []

      left = how == "left"

      vectors = {}
      keys = (self.keys + other.keys).uniq
      keys.each do |k|
        vectors[k] = []
      end

      each_row do |r|
        matches = indexed[r.values_at(*self_on)]
        if matches.empty?
          if left
            keys.each do |k|
              vectors[k] << r[k]
            end
          end
        else
          matches.each do |r2|
            keys.each do |k|
              # prefer the value from other, falling back to self only
              # when other has none (nil); a plain || would also discard
              # a legitimate false
              other_value = r2[k]
              vectors[k] << (other_value.nil? ? r[k] : other_value)
            end
          end
        end
      end

      DataFrame.new(vectors)
    end

    def check_join_keys(df, keys)
      raise ArgumentError, "No keys" if keys.empty?
      missing_keys = keys.select { |k| !df.include?(k) }
      raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
    end

    # Converts +v+ to a Vector. When +size+ is given and +v+ is a scalar
    # (does not respond to to_a), it is repeated +size+ times.
    def to_vector(v, size = nil)
      return v if v.is_a?(Vector)

      if size && !v.respond_to?(:to_a)
        v =
          if v.is_a?(Integer)
            Numo::Int64.new(size).fill(v)
          elsif v.is_a?(Numeric)
            Numo::DFloat.new(size).fill(v)
          elsif v == true || v == false
            Numo::Bit.new(size).fill(v)
          else
            # TODO make more efficient
            [v] * size
          end
      end

      Vector.new(v)
    end
  end
end
@@ -0,0 +1,248 @@
1
module Rover
  # A one-dimensional column of values backed by a Numo::NArray.
  class Vector
    # Wraps +data+, which may be another Vector (its Numo array is reused),
    # a Numo::NArray, or anything responding to to_a. For plain values the
    # storage type is chosen from the elements: all Integers => Int64,
    # Numerics and nils => DFloat (nil stored as NaN), all booleans => Bit,
    # anything else => RObject.
    #
    # Raises ArgumentError unless the resulting array is one-dimensional.
    def initialize(data)
      @data =
        if data.is_a?(Vector)
          data.to_numo
        elsif data.is_a?(Numo::NArray)
          data
        else
          data = data.to_a
          if data.all? { |v| v.is_a?(Integer) }
            Numo::Int64.cast(data)
          elsif data.all? { |v| v.is_a?(Numeric) || v.nil? }
            Numo::DFloat.cast(data.map { |v| v || Float::NAN })
          elsif data.all? { |v| v == true || v == false }
            Numo::Bit.cast(data)
          else
            Numo::RObject.cast(data)
          end
        end

      raise ArgumentError, "Bad size: #{@data.shape}" unless @data.ndim == 1
    end

    # The underlying Numo array (not a copy).
    def to_numo
      @data
    end

    # Ruby array of the values; Bit data is returned as true/false.
    def to_a
      a = @data.to_a
      a.map! { |v| !v.zero? } if @data.is_a?(Numo::Bit)
      a
    end

    def size
      @data.size
    end

    # Vector of distinct values. Goes through #to_a so that boolean data
    # stays boolean instead of becoming 0/1 integers.
    def uniq
      Vector.new(to_a.uniq)
    end

    # Boolean vector marking missing entries: nil for object data, NaN for
    # data that supports isnan, and all false otherwise.
    def missing
      bit =
        if @data.is_a?(Numo::RObject)
          Numo::Bit.cast(@data.map(&:nil?))
        elsif @data.respond_to?(:isnan)
          @data.isnan
        else
          Numo::Bit.new(size).fill(0)
        end

      Vector.new(bit)
    end

    # keep same number of rows as original
    # to make it easy to add to original data frame
    def diff
      diff = @data.cast_to(Numo::DFloat).diff
      Vector.new(diff.insert(0, Float::NAN))
    end

    # With a Vector argument, treats it as a mask and returns the selected
    # elements as a new Vector; otherwise indexes the Numo array directly.
    def [](v)
      if v.is_a?(Vector)
        Vector.new(v.to_numo.mask(@data))
      else
        @data[v]
      end
    end

    # Assigns +v+ at index/range/mask +k+ (a Vector mask is unwrapped).
    def []=(k, v)
      k = k.to_numo if k.is_a?(Vector)
      @data[k] = v
    end

    # Element-wise arithmetic and bitwise/logical operators.
    # | and ^ are included so boolean masks can be combined with "or"
    # as well as "and".
    %w(+ - * / % ** & | ^).each do |op|
      define_method(op) do |other|
        other = other.to_numo if other.is_a?(Vector)
        # TODO better logic
        if @data.is_a?(Numo::RObject)
          map { |v| v.send(op, other) }
        else
          Vector.new(@data.send(op, other))
        end
      end
    end

    # Element-wise comparisons; each returns a boolean Vector.
    {
      "==" => "eq",
      "!=" => "ne",
      ">" => "gt",
      ">=" => "ge",
      "<" => "lt",
      "<=" => "le"
    }.each do |op, meth|
      define_method(op) do |other|
        other = other.to_numo if other.is_a?(Vector)
        result =
          if other.is_a?(Numo::RObject)
            # compare pairwise with the requested operator
            # (previously this always tested equality)
            @data.to_a.zip(other.to_a).map { |mine, theirs| mine.send(op, theirs) }
          elsif other.is_a?(Numeric) || other.is_a?(Numo::NArray)
            @data.send(meth, other)
          else
            # go through plain Ruby values so the boolean results are not
            # written back into an array of the original element type
            to_a.map { |mine| mine.send(op, other) }
          end
        Vector.new(Numo::Bit.cast(result))
      end
    end

    # Boolean vector that is true where the element equals any of +values+.
    def in?(values)
      ret = Numo::Bit.new(size).fill(false)
      values.each do |v|
        comp =
          if v.is_a?(Numeric) || v.is_a?(Numo::NArray)
            @data.eq(v)
          else
            Numo::Bit.cast(to_a.map { |d| d == v })
          end
        ret |= comp
      end
      Vector.new(ret)
    end

    # Negates a boolean vector; raises for any other type.
    def !
      if @data.is_a?(Numo::Bit)
        Vector.new(@data.eq(0))
      else
        raise "Not implemented yet"
      end
    end

    def -@
      self * -1
    end

    # Limits values to the range min..max in place. Returns self.
    def clamp!(min, max)
      @data = @data.clip(min, max)
      self
    end

    # Like clamp!, but returns a new vector.
    def clamp(min, max)
      dup.clamp!(min, max)
    end

    # New vector with the block applied to each element.
    def map(&block)
      mapped = @data.map(&block)
      mapped = mapped.to_a if mapped.is_a?(Numo::RObject) # re-evaluate cast
      Vector.new(mapped)
    end

    def sort
      Vector.new(@data.respond_to?(:sort) ? @data.sort : @data.to_a.sort)
    end

    def abs
      Vector.new(@data.abs)
    end

    def each(&block)
      to_a.each(&block)
    end

    def max
      @data.max
    end

    def min
      @data.min
    end

    def mean
      # currently only floats have mean in Numo
      # https://github.com/ruby-numo/numo-narray/issues/79
      @data.cast_to(Numo::DFloat).mean
    end

    def median
      # need to cast to get correct result
      # TODO file bug with Numo
      @data.cast_to(Numo::DFloat).median
    end

    def percentile(q)
      @data.percentile(q)
    end

    def sum
      @data.sum
    end

    # Uses #to_a so boolean data is tested as true/false; the raw 0/1
    # integers are both truthy in Ruby.
    def all?(&block)
      to_a.all?(&block)
    end

    # See all? for why #to_a is used.
    def any?(&block)
      to_a.any?(&block)
    end

    # First +n+ elements (the whole vector when n >= size).
    def first(n = 1)
      if n >= size
        Vector.new(@data)
      else
        Vector.new(@data[0...n])
      end
    end

    # Last +n+ elements. Mirrors #first: asking for at least as many
    # elements as exist returns the whole vector instead of indexing
    # before the start of the array.
    def last(n = 1)
      if n >= size
        Vector.new(@data)
      else
        Vector.new(@data[-n..-1])
      end
    end

    # Counts co-occurrences of this vector's values with +other+'s.
    # The result has one row per distinct value of self (in column "_")
    # and one count column per distinct value of +other+.
    def crosstab(other)
      index = uniq.sort
      index_pos = index.to_a.map.with_index.to_h
      df = DataFrame.new({"_" => index})
      other.uniq.sort.each do |k|
        df[k] = 0
      end
      to_a.zip(other.to_a) do |v1, v2|
        df[v2][index_pos[v1]] += 1
      end
      df
    end

    # First +n+ elements; a negative n means "all but the last |n|".
    def head(n = 5)
      n += size if n < 0
      first(n)
    end

    # Last +n+ elements; a negative n means "all but the first |n|".
    def tail(n = 5)
      n += size if n < 0
      last(n)
    end

    # TODO add type and size?
    def inspect
      elements = first(5).to_a.map(&:inspect)
      elements << "..." if size > 5
      "#<Rover::Vector [#{elements.join(", ")}]>"
    end
    alias_method :to_s, :inspect # alias like hash

    # for IRuby
    def to_html
      require "iruby"
      IRuby::HTML.table(to_a)
    end
  end
end
@@ -0,0 +1,3 @@
1
module Rover
  # Gem version; frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
metadata ADDED
@@ -0,0 +1,148 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rover-df
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Kane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-05-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: numo-narray
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.9.1.7
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.9.1.7
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '5'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '5'
69
+ - !ruby/object:Gem::Dependency
70
+ name: activerecord
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '5'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '5'
83
+ - !ruby/object:Gem::Dependency
84
+ name: sqlite3
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: iruby
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description:
112
+ email: andrew@chartkick.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - CHANGELOG.md
118
+ - LICENSE.txt
119
+ - README.md
120
+ - lib/rover-df.rb
121
+ - lib/rover.rb
122
+ - lib/rover/data_frame.rb
123
+ - lib/rover/vector.rb
124
+ - lib/rover/version.rb
125
+ homepage: https://github.com/ankane/rover
126
+ licenses:
127
+ - MIT
128
+ metadata: {}
129
+ post_install_message:
130
+ rdoc_options: []
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '2.4'
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ requirements: []
144
+ rubygems_version: 3.1.2
145
+ signing_key:
146
+ specification_version: 4
147
+ summary: Simple, powerful data frames for Ruby
148
+ test_files: []