rover-df 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 4588d0b3b5633a3821a4c07e7102e5933edca92179836db041f2400d8be88538
4
+ data.tar.gz: 9b01cd2bae5fb6ba9f426fe0d347752cd30c63619b00284fb68e8f711ec38ddf
5
+ SHA512:
6
+ metadata.gz: b2d35866786a7fbe17b274585419c752b08c817b2db1bf939a6c3f92a7ae2cd282d725614f96db730fd2590cbb8c24710d0fb1f713255d2c348c0fed0b874a35
7
+ data.tar.gz: 4bf0ba38ce2c3ef4765d702591948af18fddf142efb7e559e26cc4ab504538775a1771c839f1570230f7d101fa20bfbbeb5044f6bf567637790575ee9b95be87
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (2020-05-13)
2
+
3
+ - First release
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2020 Andrew Kane
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,307 @@
1
+ # Rover
2
+
3
+ Simple, powerful data frames for Ruby
4
+
5
+ :mountain: Designed for data exploration and machine learning, and powered by [Numo](https://github.com/ruby-numo/numo-narray) for blazing performance
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application’s Gemfile:
10
+
11
+ ```ruby
12
+ gem 'rover-df'
13
+ ```
14
+
15
+ ## Intro
16
+
17
+ A data frame is an in-memory table. It’s a useful data structure for data analysis and machine learning. It uses columnar storage for fast operations on columns.
18
+
19
+ ## Creating Data Frames
20
+
21
+ From an array
22
+
23
+ ```ruby
24
+ Rover::DataFrame.new([{a: 1, b: "one"}, {a: 2, b: "two"}, {a: 3, b: "three"}])
25
+ ```
26
+
27
+ From a hash
28
+
29
+ ```ruby
30
+ Rover::DataFrame.new({
31
+ a: [1, 2, 3],
32
+ b: ["one", "two", "three"]
33
+ })
34
+ ```
35
+
36
+ From an Active Record relation
37
+
38
+ ```ruby
39
+ Rover::DataFrame.new(User.all)
40
+ ```
41
+
42
+ From a CSV
43
+
44
+ ```ruby
45
+ Rover.read_csv("file.csv")
46
+ # or
47
+ Rover.parse_csv("CSV,data,string")
48
+ ```
49
+
50
+ ## Attributes
51
+
52
+ Get number of rows
53
+
54
+ ```ruby
55
+ df.count
56
+ ```
57
+
58
+ Get column names
59
+
60
+ ```ruby
61
+ df.keys
62
+ ```
63
+
64
+ Check if a column exists
65
+
66
+ ```ruby
67
+ df.include?(name)
68
+ ```
69
+
70
+ ## Selecting Data
71
+
72
+ Select a column
73
+
74
+ ```ruby
75
+ df[:a]
76
+ ```
77
+
78
+ Select multiple columns
79
+
80
+ ```ruby
81
+ df[[:a, :b]]
82
+ ```
83
+
84
+ Select first rows
85
+
86
+ ```ruby
87
+ df.head
88
+ # or
89
+ df.first(5)
90
+ ```
91
+
92
+ Select last rows
93
+
94
+ ```ruby
95
+ df.tail
96
+ # or
97
+ df.last(5)
98
+ ```
99
+
100
+ Select rows by index
101
+
102
+ ```ruby
103
+ df[1]
104
+ # or
105
+ df[1..3]
106
+ # or
107
+ df[[1, 4, 5]]
108
+ ```
109
+
110
+ ## Filtering
111
+
112
+ Filter on a condition
113
+
114
+ ```ruby
115
+ df[df[:a] > 100]
116
+ ```
117
+
118
+ And
119
+
120
+ ```ruby
121
+ df[(df[:a] > 100) & (df[:b] == "one")]
122
+ ```
123
+
124
+ Or
125
+
126
+ ```ruby
127
+ df[(df[:a] > 100) | (df[:b] == "one")]
128
+ ```
129
+
130
+ Not
131
+
132
+ ```ruby
133
+ df[df[:a] != 100]
134
+ ```
135
+
136
+ ## Operations
137
+
138
+ Basic operations
139
+
140
+ ```ruby
141
+ df[:a] + 5
142
+ df[:a] - 5
143
+ df[:a] * 5
144
+ df[:a] / 5
145
+ df[:a] % 5
146
+ df[:a] ** 2
147
+ ```
148
+
149
+ Summary statistics
150
+
151
+ ```ruby
152
+ df[:a].count
153
+ df[:a].sum
154
+ df[:a].mean
155
+ df[:a].median
156
+ df[:a].percentile(90)
157
+ df[:a].min
158
+ df[:a].max
159
+ ```
160
+
161
+ Cross tabulation
162
+
163
+ ```ruby
164
+ df[:a].crosstab(df[:b])
165
+ ```
166
+
167
+ ## Updates
168
+
169
+ Add a new column
170
+
171
+ ```ruby
172
+ df[:a] = 1
173
+ # or
174
+ df[:a] = [1, 2, 3]
175
+ ```
176
+
177
+ Update a single element
178
+
179
+ ```ruby
180
+ df[:a][0] = 100
181
+ ```
182
+
183
+ Update multiple elements
184
+
185
+ ```ruby
186
+ df[:a][0..2] = 1
187
+ # or
188
+ df[:a][0..2] = [1, 2, 3]
189
+ ```
190
+
191
+ Update elements matching a condition
192
+
193
+ ```ruby
194
+ df[:a][df[:a] > 100] = 0
195
+ ```
196
+
197
+ Clamp
198
+
199
+ ```ruby
200
+ df[:a].clamp!(0, 100)
201
+ ```
202
+
203
+ Delete columns
204
+
205
+ ```ruby
206
+ df.delete(:a)
207
+ # or
208
+ df.except!(:a, :b)
209
+ ```
210
+
211
+ Rename a column
212
+
213
+ ```ruby
214
+ df[:new_a] = df.delete(:a)
215
+ ```
216
+
217
+ Sort data
218
+
219
+ ```ruby
220
+ df.sort_by! { |r| r[:a] }
221
+ ```
222
+
223
+ Clear all data
224
+
225
+ ```ruby
226
+ df.clear
227
+ ```
228
+
229
+ ## Combining Data Frames
230
+
231
+ Add rows
232
+
233
+ ```ruby
234
+ df.concat(other_df)
235
+ ```
236
+
237
+ Add columns
238
+
239
+ ```ruby
240
+ df.merge!(other_df)
241
+ ```
242
+
243
+ Inner join
244
+
245
+ ```ruby
246
+ df.inner_join(other_df)
247
+ # or
248
+ df.inner_join(other_df, on: :a)
249
+ # or
250
+ df.inner_join(other_df, on: [:a, :b])
251
+ # or
252
+ df.inner_join(other_df, on: {df_col: :other_df_col})
253
+ ```
254
+
255
+ Left join
256
+
257
+ ```ruby
258
+ df.left_join(other_df)
259
+ ```
260
+
261
+ ## Conversion
262
+
263
+ Array of hashes
264
+
265
+ ```ruby
266
+ df.to_a
267
+ ```
268
+
269
+ Hash of arrays
270
+
271
+ ```ruby
272
+ df.to_h
273
+ ```
274
+
275
+ Numo array
276
+
277
+ ```ruby
278
+ df.to_numo
279
+ ```
280
+
281
+ CSV
282
+
283
+ ```ruby
284
+ df.to_csv
285
+ ```
286
+
287
+ ## History
288
+
289
+ View the [changelog](https://github.com/ankane/rover/blob/master/CHANGELOG.md)
290
+
291
+ ## Contributing
292
+
293
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
294
+
295
+ - [Report bugs](https://github.com/ankane/rover/issues)
296
+ - Fix bugs and [submit pull requests](https://github.com/ankane/rover/pulls)
297
+ - Write, clarify, or fix documentation
298
+ - Suggest or add new features
299
+
300
+ To get started with development:
301
+
302
+ ```sh
303
+ git clone https://github.com/ankane/rover.git
304
+ cd rover
305
+ bundle install
306
+ bundle exec rake test
307
+ ```
@@ -0,0 +1 @@
1
+ require "rover"
@@ -0,0 +1,32 @@
1
+ # dependencies
2
+ require "numo/narray"
3
+
4
+ # modules
5
+ require "rover/data_frame"
6
+ require "rover/vector"
7
+ require "rover/version"
8
+
9
+ module Rover
10
+ class << self
11
+ def read_csv(path, **options)
12
+ require "csv"
13
+ csv_to_df(CSV.read(path, headers: true, converters: :numeric, **options))
14
+ end
15
+
16
+ def parse_csv(str, **options)
17
+ require "csv"
18
+ csv_to_df(CSV.parse(str, headers: true, converters: :numeric, **options))
19
+ end
20
+
21
+ private
22
+
23
+ def csv_to_df(table)
24
+ table.by_col!
25
+ data = {}
26
+ table.each do |k, v|
27
+ data[k] = v
28
+ end
29
+ DataFrame.new(data)
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,398 @@
1
+ module Rover
2
+ class DataFrame
3
+ def initialize(data = {})
4
+ @vectors = {}
5
+
6
+ if data.is_a?(DataFrame)
7
+ data.vectors.each do |k, v|
8
+ @vectors[k] = v
9
+ end
10
+ elsif data.is_a?(Hash)
11
+ data.to_h.each do |k, v|
12
+ @vectors[k] =
13
+ if v.respond_to?(:to_a)
14
+ Vector.new(v)
15
+ else
16
+ v
17
+ end
18
+ end
19
+
20
+ # handle scalars
21
+ size = @vectors.values.find { |v| v.is_a?(Vector) }&.size || 1
22
+ @vectors.each_key do |k|
23
+ @vectors[k] = to_vector(@vectors[k], size)
24
+ end
25
+ elsif data.is_a?(Array)
26
+ vectors = {}
27
+ raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }
28
+ keys = data.flat_map(&:keys).uniq
29
+ keys.each do |k|
30
+ vectors[k] = []
31
+ end
32
+ data.each do |d|
33
+ keys.each do |k|
34
+ vectors[k] << d[k]
35
+ end
36
+ end
37
+ vectors.each do |k, v|
38
+ @vectors[k] = to_vector(v)
39
+ end
40
+ elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
41
+ result = data.connection.select_all(data.all.to_sql)
42
+ result.columns.each_with_index do |k, i|
43
+ @vectors[k] = to_vector(result.rows.map { |r| r[i] })
44
+ end
45
+ else
46
+ raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
47
+ end
48
+
49
+ # check keys
50
+ @vectors.each_key do |k|
51
+ check_key(k)
52
+ end
53
+
54
+ # check sizes
55
+ sizes = @vectors.values.map(&:size).uniq
56
+ if sizes.size > 1
57
+ raise ArgumentError, "Different sizes: #{sizes}"
58
+ end
59
+ end
60
+
61
+ def [](where)
62
+ if (where.is_a?(Vector) && where.to_numo.is_a?(Numo::Bit)) || where.is_a?(Numeric) || where.is_a?(Range) || (where.is_a?(Array) && where.all? { |v| v.is_a?(Integer) } )
63
+ new_vectors = {}
64
+ @vectors.each do |k, v|
65
+ new_vectors[k] = v[where]
66
+ end
67
+ DataFrame.new(new_vectors)
68
+ elsif where.is_a?(Array)
69
+ # multiple columns
70
+ df = DataFrame.new
71
+ where.each do |k|
72
+ df[k] = @vectors[k]
73
+ end
74
+ df
75
+ else
76
+ # single column
77
+ @vectors[where]
78
+ end
79
+ end
80
+
81
+ # return each row as a hash
82
+ def each_row
83
+ size.times do |i|
84
+ yield @vectors.map { |k, v| [k, v[i]] }.to_h
85
+ end
86
+ end
87
+
88
+ # dup to prevent direct modification of keys
89
+ def vectors
90
+ @vectors.dup
91
+ end
92
+
93
+ def []=(k, v)
94
+ check_key(k)
95
+ v = to_vector(v, size)
96
+ raise ArgumentError, "Size mismatch: expected #{size}, got #{v.size}" if @vectors.any? && v.size != size
97
+ @vectors[k] = v
98
+ end
99
+
100
+ def size
101
+ @vectors.values.first&.size || 0
102
+ end
103
+ alias_method :length, :size
104
+ alias_method :count, :size
105
+
106
+ # should this check for columns as well?
107
+ def any?
108
+ size > 0
109
+ end
110
+
111
+ # should this check for columns as well?
112
+ def empty?
113
+ size == 0
114
+ end
115
+
116
+ def clear
117
+ @vectors.clear
118
+ end
119
+
120
+ def shape
121
+ [size, @vectors.size]
122
+ end
123
+
124
+ def keys
125
+ @vectors.keys
126
+ end
127
+ alias_method :names, :keys
128
+ alias_method :vector_names, :keys
129
+
130
+ def delete(key)
131
+ @vectors.delete(key)
132
+ end
133
+
134
+ def except(*keys)
135
+ dup.except!(*keys)
136
+ end
137
+
138
+ def except!(*keys)
139
+ keys.each do |key|
140
+ delete(key)
141
+ end
142
+ self
143
+ end
144
+
145
+ def include?(key)
146
+ @vectors.include?(key)
147
+ end
148
+
149
+ def head(n = 5)
150
+ first(n)
151
+ end
152
+
153
+ def tail(n = 5)
154
+ last(n)
155
+ end
156
+
157
+ def first(n = nil)
158
+ new_vectors = {}
159
+ @vectors.each do |k, v|
160
+ new_vectors[k] = v.first(n)
161
+ end
162
+ DataFrame.new(new_vectors)
163
+ end
164
+
165
+ def last(n = nil)
166
+ new_vectors = {}
167
+ @vectors.each do |k, v|
168
+ new_vectors[k] = v.last(n)
169
+ end
170
+ DataFrame.new(new_vectors)
171
+ end
172
+
173
+ def to_a
174
+ a = []
175
+ each_row do |row|
176
+ a << row
177
+ end
178
+ a
179
+ end
180
+
181
+ def to_h
182
+ hsh = {}
183
+ @vectors.each do |k, v|
184
+ hsh[k] = v.to_a
185
+ end
186
+ hsh
187
+ end
188
+
189
+ def to_numo
190
+ Numo::NArray.column_stack(vectors.values.map(&:to_numo))
191
+ end
192
+
193
+ def to_csv
194
+ require "csv"
195
+ CSV.generate do |csv|
196
+ csv << keys
197
+ numo = vectors.values.map(&:to_numo)
198
+ size.times do |i|
199
+ csv << numo.map { |n| n[i] }
200
+ end
201
+ end
202
+ end
203
+
204
+ # for IRuby
205
+ def to_html
206
+ require "iruby"
207
+ IRuby::HTML.table(to_h)
208
+ end
209
+
210
+ # TODO handle long text better
211
+ def inspect
212
+ return "#<Rover::DataFrame>" if keys.empty?
213
+
214
+ lines = []
215
+ line_start = 0
216
+ spaces = 2
217
+
218
+ @vectors.each do |k, v|
219
+ v = v.first(5).to_a
220
+ width = ([k] + v).map(&:to_s).map(&:size).max
221
+ width = 3 if width < 3
222
+
223
+ if lines.empty? || lines[-2].map { |l| l.size + spaces }.sum + width > 120
224
+ line_start = lines.size
225
+ lines << []
226
+ [size, 5].min.times do |i|
227
+ lines << []
228
+ end
229
+ lines << [] if size > 5
230
+ lines << []
231
+ end
232
+
233
+ lines[line_start] << "%#{width}s" % k.to_s
234
+ v.each_with_index do |v2, i|
235
+ lines[line_start + 1 + i] << "%#{width}s" % v2.to_s
236
+ end
237
+ lines[line_start + 6] << "%#{width}s" % "..." if size > 5
238
+ end
239
+
240
+ lines.pop
241
+ lines.map { |l| l.join(" " * spaces) }.join("\n")
242
+ end
243
+ alias_method :to_s, :inspect # alias like hash
244
+
245
+ def sort_by!
246
+ indexes =
247
+ size.times.sort_by do |i|
248
+ yield @vectors.map { |k, v| [k, v[i]] }.to_h
249
+ end
250
+
251
+ @vectors.each do |k, v|
252
+ self[k] = v.to_numo.at(indexes)
253
+ end
254
+ self
255
+ end
256
+
257
+ def sort_by(&block)
258
+ dup.sort_by!(&block)
259
+ end
260
+
261
+ def dup
262
+ df = DataFrame.new
263
+ @vectors.each do |k, v|
264
+ df[k] = v
265
+ end
266
+ df
267
+ end
268
+
269
+ def +(other)
270
+ dup.concat(other)
271
+ end
272
+
273
+ # in-place, like Array#concat
274
+ # TODO make more performant
275
+ def concat(other)
276
+ raise ArgumentError, "Must be a data frame" unless other.is_a?(DataFrame)
277
+
278
+ size = self.size
279
+ vectors.each do |k, v|
280
+ @vectors[k] = Vector.new(v.to_a + (other[k] ? other[k].to_a : [nil] * other.size))
281
+ end
282
+ (other.vector_names - vector_names).each do |k|
283
+ @vectors[k] = Vector.new([nil] * size + other[k].to_a)
284
+ end
285
+ self
286
+ end
287
+
288
+ def merge(other)
289
+ dup.merge!(other)
290
+ end
291
+
292
+ def merge!(other)
293
+ other.vectors.each do |k, v|
294
+ self[k] = v
295
+ end
296
+ self
297
+ end
298
+
299
+ # see join for options
300
+ def inner_join(other, on: nil)
301
+ join(other, on: on, how: "inner")
302
+ end
303
+
304
+ # see join for options
305
+ def left_join(other, on: nil)
306
+ join(other, on: on, how: "left")
307
+ end
308
+
309
+ # don't check types
310
+ def ==(other)
311
+ size == other.size &&
312
+ keys == other.keys &&
313
+ keys.all? { |k| self[k] == other[k] }
314
+ end
315
+
316
+ private
317
+
318
+ def check_key(key)
319
+ raise ArgumentError, "Key must be a string or symbol, got #{key.inspect}" unless key.is_a?(String) || key.is_a?(Symbol)
320
+ end
321
+
322
+ # TODO make more efficient
323
+ # TODO add option to prefix/suffix keys?
324
+ # Supports:
325
+ # - on: :key
326
+ # - on: [:key1, :key2]
327
+ # - on: {key1a: :key1b, key2a: :key2b}
328
+ def join(other, how:, on: nil)
329
+ self_on, other_on =
330
+ if on.is_a?(Hash)
331
+ [on.keys, on.values]
332
+ else
333
+ on ||= keys & other.keys
334
+ on = [on] unless on.is_a?(Array)
335
+ [on, on]
336
+ end
337
+
338
+ check_join_keys(self, self_on)
339
+ check_join_keys(other, other_on)
340
+
341
+ indexed = other.to_a.group_by { |r| r.values_at(*other_on) }
342
+ indexed.default = []
343
+
344
+ left = how == "left"
345
+
346
+ vectors = {}
347
+ keys = (self.keys + other.keys).uniq
348
+ keys.each do |k|
349
+ vectors[k] = []
350
+ end
351
+
352
+ each_row do |r|
353
+ matches = indexed[r.values_at(*self_on)]
354
+ if matches.empty?
355
+ if left
356
+ keys.each do |k|
357
+ vectors[k] << r[k]
358
+ end
359
+ end
360
+ else
361
+ matches.each do |r2|
362
+ keys.each do |k|
363
+ vectors[k] << (r2[k] || r[k])
364
+ end
365
+ end
366
+ end
367
+ end
368
+
369
+ DataFrame.new(vectors)
370
+ end
371
+
372
+ def check_join_keys(df, keys)
373
+ raise ArgumentError, "No keys" if keys.empty?
374
+ missing_keys = keys.select { |k| !df.include?(k) }
375
+ raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
376
+ end
377
+
378
+ def to_vector(v, size = nil)
379
+ return v if v.is_a?(Vector)
380
+
381
+ if size && !v.respond_to?(:to_a)
382
+ v =
383
+ if v.is_a?(Integer)
384
+ Numo::Int64.new(size).fill(v)
385
+ elsif v.is_a?(Numeric)
386
+ Numo::DFloat.new(size).fill(v)
387
+ elsif v == true || v == false
388
+ Numo::Bit.new(size).fill(v)
389
+ else
390
+ # TODO make more efficient
391
+ [v] * size
392
+ end
393
+ end
394
+
395
+ Vector.new(v)
396
+ end
397
+ end
398
+ end
@@ -0,0 +1,248 @@
1
+ module Rover
2
+ class Vector
3
+ def initialize(data)
4
+ @data =
5
+ if data.is_a?(Vector)
6
+ data.to_numo
7
+ elsif data.is_a?(Numo::NArray)
8
+ data
9
+ else
10
+ data = data.to_a
11
+ if data.all? { |v| v.is_a?(Integer) }
12
+ Numo::Int64.cast(data)
13
+ elsif data.all? { |v| v.is_a?(Numeric) || v.nil? }
14
+ Numo::DFloat.cast(data.map { |v| v || Float::NAN })
15
+ elsif data.all? { |v| v == true || v == false }
16
+ Numo::Bit.cast(data)
17
+ else
18
+ Numo::RObject.cast(data)
19
+ end
20
+ end
21
+
22
+ raise ArgumentError, "Bad size: #{@data.shape}" unless @data.ndim == 1
23
+ end
24
+
25
+ def to_numo
26
+ @data
27
+ end
28
+
29
+ def to_a
30
+ a = @data.to_a
31
+ a.map! { |v| !v.zero? } if @data.is_a?(Numo::Bit)
32
+ a
33
+ end
34
+
35
+ def size
36
+ @data.size
37
+ end
38
+
39
+ def uniq
40
+ Vector.new(@data.to_a.uniq)
41
+ end
42
+
43
+ def missing
44
+ bit =
45
+ if @data.is_a?(Numo::RObject)
46
+ Numo::Bit.cast(@data.map(&:nil?))
47
+ elsif @data.respond_to?(:isnan)
48
+ @data.isnan
49
+ else
50
+ Numo::Bit.new(size).fill(0)
51
+ end
52
+
53
+ Vector.new(bit)
54
+ end
55
+
56
+ # keep same number of rows as original
57
+ # to make it easy to add to original data frame
58
+ def diff
59
+ diff = @data.cast_to(Numo::DFloat).diff
60
+ Vector.new(diff.insert(0, Float::NAN))
61
+ end
62
+
63
+ def [](v)
64
+ if v.is_a?(Vector)
65
+ Vector.new(v.to_numo.mask(@data))
66
+ else
67
+ @data[v]
68
+ end
69
+ end
70
+
71
+ def []=(k, v)
72
+ k = k.to_numo if k.is_a?(Vector)
73
+ @data[k] = v
74
+ end
75
+
76
+ %w(+ - * / % ** &).each do |op|
77
+ define_method(op) do |other|
78
+ other = other.to_numo if other.is_a?(Vector)
79
+ # TODO better logic
80
+ if @data.is_a?(Numo::RObject)
81
+ map { |v| v.send(op, other) }
82
+ else
83
+ Vector.new(@data.send(op, other))
84
+ end
85
+ end
86
+ end
87
+
88
+ {
89
+ "==" => "eq",
90
+ "!=" => "ne",
91
+ ">" => "gt",
92
+ ">=" => "ge",
93
+ "<" => "lt",
94
+ "<=" => "le"
95
+ }.each do |op, meth|
96
+ define_method(op) do |other|
97
+ other = other.to_numo if other.is_a?(Vector)
98
+ v =
99
+ if other.is_a?(Numo::RObject)
100
+ @data.to_a.zip(other).map { |v, ov| v == ov }
101
+ elsif other.is_a?(Numeric) || other.is_a?(Numo::NArray)
102
+ @data.send(meth, other)
103
+ else
104
+ @data.map { |v| v.send(op, other) }
105
+ end
106
+ Vector.new(Numo::Bit.cast(v))
107
+ end
108
+ end
109
+
110
+ def in?(values)
111
+ ret = Numo::Bit.new(size).fill(false)
112
+ values.each do |v|
113
+ comp =
114
+ if v.is_a?(Numeric) || v.is_a?(Numo::NArray)
115
+ @data.eq(v)
116
+ else
117
+ Numo::Bit.cast(@data.map { |d| d == v })
118
+ end
119
+ ret |= comp
120
+ end
121
+ Vector.new(ret)
122
+ end
123
+
124
+ def !
125
+ if @data.is_a?(Numo::Bit)
126
+ Vector.new(@data.eq(0))
127
+ else
128
+ raise "Not implemented yet"
129
+ end
130
+ end
131
+
132
+ def -@
133
+ self * -1
134
+ end
135
+
136
+ def clamp!(min, max)
137
+ @data = @data.clip(min, max)
138
+ self
139
+ end
140
+
141
+ def clamp(min, max)
142
+ dup.clamp!(min, max)
143
+ end
144
+
145
+ def map(&block)
146
+ mapped = @data.map(&block)
147
+ mapped = mapped.to_a if mapped.is_a?(Numo::RObject) # re-evaluate cast
148
+ Vector.new(mapped)
149
+ end
150
+
151
+ def sort
152
+ Vector.new(@data.respond_to?(:sort) ? @data.sort : @data.to_a.sort)
153
+ end
154
+
155
+ def abs
156
+ Vector.new(@data.abs)
157
+ end
158
+
159
+ def each(&block)
160
+ to_a.each(&block)
161
+ end
162
+
163
+ def max
164
+ @data.max
165
+ end
166
+
167
+ def min
168
+ @data.min
169
+ end
170
+
171
+ def mean
172
+ # currently only floats have mean in Numo
173
+ # https://github.com/ruby-numo/numo-narray/issues/79
174
+ @data.cast_to(Numo::DFloat).mean
175
+ end
176
+
177
+ def median
178
+ # need to cast to get correct result
179
+ # TODO file bug with Numo
180
+ @data.cast_to(Numo::DFloat).median
181
+ end
182
+
183
+ def percentile(q)
184
+ @data.percentile(q)
185
+ end
186
+
187
+ def sum
188
+ @data.sum
189
+ end
190
+
191
+ def all?(&block)
192
+ @data.to_a.all?(&block)
193
+ end
194
+
195
+ def any?(&block)
196
+ @data.to_a.any?(&block)
197
+ end
198
+
199
+ def first(n = 1)
200
+ if n >= size
201
+ Vector.new(@data)
202
+ else
203
+ Vector.new(@data[0...n])
204
+ end
205
+ end
206
+
207
+ def last(n = 1)
208
+ Vector.new(@data[-n..-1])
209
+ end
210
+
211
+ def crosstab(other)
212
+ index = uniq.sort
213
+ index_pos = index.to_a.map.with_index.to_h
214
+ df = DataFrame.new({"_" => index})
215
+ other.uniq.sort.each do |k|
216
+ df[k] = 0
217
+ end
218
+ to_a.zip(other.to_a) do |v1, v2|
219
+ df[v2][index_pos[v1]] += 1
220
+ end
221
+ df
222
+ end
223
+
224
+ def head(n = 5)
225
+ n += size if n < 0
226
+ first(n)
227
+ end
228
+
229
+ def tail(n = 5)
230
+ n += size if n < 0
231
+ last(n)
232
+ end
233
+
234
+ # TODO add type and size?
235
+ def inspect
236
+ elements = first(5).to_a.map(&:inspect)
237
+ elements << "..." if size > 5
238
+ "#<Rover::Vector [#{elements.join(", ")}]>"
239
+ end
240
+ alias_method :to_s, :inspect # alias like hash
241
+
242
+ # for IRuby
243
+ def to_html
244
+ require "iruby"
245
+ IRuby::HTML.table(to_a)
246
+ end
247
+ end
248
+ end
@@ -0,0 +1,3 @@
1
+ module Rover
2
+ VERSION = "0.1.0"
3
+ end
metadata ADDED
@@ -0,0 +1,148 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rover-df
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Kane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-05-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: numo-narray
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.9.1.7
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.9.1.7
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '5'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '5'
69
+ - !ruby/object:Gem::Dependency
70
+ name: activerecord
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '5'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '5'
83
+ - !ruby/object:Gem::Dependency
84
+ name: sqlite3
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: iruby
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description:
112
+ email: andrew@chartkick.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - CHANGELOG.md
118
+ - LICENSE.txt
119
+ - README.md
120
+ - lib/rover-df.rb
121
+ - lib/rover.rb
122
+ - lib/rover/data_frame.rb
123
+ - lib/rover/vector.rb
124
+ - lib/rover/version.rb
125
+ homepage: https://github.com/ankane/rover
126
+ licenses:
127
+ - MIT
128
+ metadata: {}
129
+ post_install_message:
130
+ rdoc_options: []
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '2.4'
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ requirements: []
144
+ rubygems_version: 3.1.2
145
+ signing_key:
146
+ specification_version: 4
147
+ summary: Simple, powerful data frames for Ruby
148
+ test_files: []