rover-df 0.2.3 → 0.2.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b8ac8c0dda5ee8ea5482b5d52927446e52a60151c05959324970b6b420c6b825
4
- data.tar.gz: cbabf42c40195303fa62a85b40c3d516dff7cb56a4059c2ab6867921fae62bb9
3
+ metadata.gz: c75bed3989211e806e54c296290e5f7b3af236a15742daac876e211e3ca5a76f
4
+ data.tar.gz: 5865ff8f1d0036423f18cfee867da63214ee50f79d373b0f0f244853d8efbefa
5
5
  SHA512:
6
- metadata.gz: 2b906f49a0accbbf4682216808faf3113c3f31c24e9e434a03f996d8e8e9b4db1c8ca0ccfb3f604e798261f97d88b26a5376bace349f230b5eda5949b492fb88
7
- data.tar.gz: 8f3d590c6df3d588f92c6c84b327211a3dce6b27452b4a1161492ca90dc87cfd6aad02a3c7ef038a9c6cb69155558f2a332acbfd65a9bb4ba1d220333b051872
6
+ metadata.gz: 11718bc8ade75a605e92cabe05c29e55c6d4dfe427cd5ada0a8a216db678b32a88f4a43843d1e7dcda7b7a64adb63b76969f1d958e91ca57c4f71989632e14aa
7
+ data.tar.gz: 16940236090625bef69cb14d6d9f9f50720314edea1b5892f60443799e5389700ddfb0d79a29ee1e193168097add9d7195799e7f049d85f9c9dc9c443843a678
data/CHANGELOG.md CHANGED
@@ -1,3 +1,25 @@
1
+ ## 0.2.7 (2022-01-16)
2
+
3
+ - Added support for booleans to Parquet methods
4
+ - Added support for creating data frames from `ActiveRecord::Result`
5
+ - Added `types` option to `read_parquet` and `parse_parquet` methods
6
+
7
+ ## 0.2.6 (2021-10-27)
8
+
9
+ - Added support for `nil` headers to `read_csv` and `parse_csv`
10
+ - Added `read_parquet`, `parse_parquet`, and `to_parquet` methods
11
+
12
+ ## 0.2.5 (2021-09-25)
13
+
14
+ - Fixed column types with joins
15
+
16
+ ## 0.2.4 (2021-06-03)
17
+
18
+ - Added grouping for `std` and `var`
19
+ - Fixed `==` for data frames
20
+ - Fixed error with `first` and `last` for data frames
21
+ - Fixed error with `last` when vector size is smaller than `n`
22
+
1
23
  ## 0.2.3 (2021-02-08)
2
24
 
3
25
  - Added `select`, `reject`, and `map!` methods to vectors
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2020-2021 Andrew Kane
1
+ Copyright (c) 2020-2022 Andrew Kane
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -20,7 +20,7 @@ gem 'rover-df'
20
20
 
21
21
  A data frame is an in-memory table. It’s a useful data structure for data analysis and machine learning. It uses columnar storage for fast operations on columns.
22
22
 
23
- Try it out for forecasting by clicking the button below:
23
+ Try it out for forecasting by clicking the button below (it can take a few minutes to start):
24
24
 
25
25
  [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ankane/ml-stack/master?filepath=Forecasting.ipynb)
26
26
 
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
61
61
  Rover.parse_csv("CSV,data,string")
62
62
  ```
63
63
 
64
+ From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
65
+
66
+ ```ruby
67
+ Rover.read_parquet("file.parquet")
68
+ # or
69
+ Rover.parse_parquet("PAR1...")
70
+ ```
71
+
64
72
  ## Attributes
65
73
 
66
74
  Get number of rows
@@ -89,7 +97,7 @@ Select a column
89
97
  df[:a]
90
98
  ```
91
99
 
92
- > Note that strings and symbols are different keys, just like hashes
100
+ > Note that strings and symbols are different keys, just like hashes. Creating a data frame from Active Record, a CSV, or Parquet uses strings.
93
101
 
94
102
  Select multiple columns
95
103
 
@@ -123,6 +131,20 @@ df[1..3]
123
131
  df[[1, 4, 5]]
124
132
  ```
125
133
 
134
+ Iterate over rows
135
+
136
+ ```ruby
137
+ df.each_row { |row| ... }
138
+ ```
139
+
140
+ Iterate over a column
141
+
142
+ ```ruby
143
+ df[:a].each { |item| ... }
144
+ # or
145
+ df[:a].each_with_index { |item, index| ... }
146
+ ```
147
+
126
148
  ## Filtering
127
149
 
128
150
  Filter on a condition
@@ -181,6 +203,8 @@ df[:a].median
181
203
  df[:a].percentile(90)
182
204
  df[:a].min
183
205
  df[:a].max
206
+ df[:a].std
207
+ df[:a].var
184
208
  ```
185
209
 
186
210
  Count occurrences
@@ -259,6 +283,14 @@ df[:a][0..2] = 1
259
283
  df[:a][0..2] = [1, 2, 3]
260
284
  ```
261
285
 
286
+ Update all elements
287
+
288
+ ```ruby
289
+ df[:a] = df[:a].map { |v| v.gsub("a", "b") }
290
+ # or
291
+ df[:a].map! { |v| v.gsub("a", "b") }
292
+ ```
293
+
262
294
  Update elements matching a condition
263
295
 
264
296
  ```ruby
@@ -369,6 +401,12 @@ CSV
369
401
  df.to_csv
370
402
  ```
371
403
 
404
+ Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
405
+
406
+ ```ruby
407
+ df.to_parquet
408
+ ```
409
+
372
410
  ## Types
373
411
 
374
412
  You can specify column types when creating a data frame
@@ -40,8 +40,8 @@ module Rover
40
40
  vectors.each do |k, v|
41
41
  @vectors[k] = to_vector(v, type: types[k])
42
42
  end
43
- elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
44
- result = data.connection.select_all(data.all.to_sql)
43
+ elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
44
+ result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.all.to_sql)
45
45
  result.columns.each_with_index do |k, i|
46
46
  @vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
47
47
  end
@@ -163,7 +163,7 @@ module Rover
163
163
  last(n)
164
164
  end
165
165
 
166
- def first(n = nil)
166
+ def first(n = 1)
167
167
  new_vectors = {}
168
168
  @vectors.each do |k, v|
169
169
  new_vectors[k] = v.first(n)
@@ -171,7 +171,7 @@ module Rover
171
171
  DataFrame.new(new_vectors)
172
172
  end
173
173
 
174
- def last(n = nil)
174
+ def last(n = 1)
175
175
  new_vectors = {}
176
176
  @vectors.each do |k, v|
177
177
  new_vectors[k] = v.last(n)
@@ -235,6 +235,44 @@ module Rover
235
235
  end
236
236
  end
237
237
 
238
+ def to_parquet
239
+ require "parquet"
240
+
241
+ schema = {}
242
+ types.each do |name, type|
243
+ schema[name] =
244
+ case type
245
+ when :int
246
+ :int64
247
+ when :uint
248
+ :uint64
249
+ when :float
250
+ :double
251
+ when :float32
252
+ :float
253
+ when :bool
254
+ :boolean
255
+ when :object
256
+ if @vectors[name].all? { |v| v.is_a?(String) }
257
+ :string
258
+ else
259
+ raise "Unknown type"
260
+ end
261
+ else
262
+ type
263
+ end
264
+ end
265
+ # TODO improve performance
266
+ raw_records = []
267
+ size.times do |i|
268
+ raw_records << @vectors.map { |_, v| v[i] }
269
+ end
270
+ table = Arrow::Table.new(schema, raw_records)
271
+ buffer = Arrow::ResizableBuffer.new(1024)
272
+ table.save(buffer, format: :parquet)
273
+ buffer.data.to_s
274
+ end
275
+
238
276
  # for IRuby
239
277
  def to_html
240
278
  require "iruby"
@@ -301,7 +339,7 @@ module Rover
301
339
  Group.new(self, columns.flatten)
302
340
  end
303
341
 
304
- [:max, :min, :median, :mean, :percentile, :sum].each do |name|
342
+ [:max, :min, :median, :mean, :percentile, :sum, :std, :var].each do |name|
305
343
  define_method(name) do |column, *args|
306
344
  check_column(column)
307
345
  self[column].send(name, *args)
@@ -360,7 +398,7 @@ module Rover
360
398
  def ==(other)
361
399
  size == other.size &&
362
400
  keys == other.keys &&
363
- keys.all? { |k| self[k] == other[k] }
401
+ keys.all? { |k| self[k].to_numo == other[k].to_numo }
364
402
  end
365
403
 
366
404
  def plot(x = nil, y = nil, type: nil)
@@ -475,10 +513,12 @@ module Rover
475
513
 
476
514
  left = how == "left"
477
515
 
516
+ types = {}
478
517
  vectors = {}
479
518
  keys = (self.keys + other.keys).uniq
480
519
  keys.each do |k|
481
520
  vectors[k] = []
521
+ types[k] = join_type(self.types[k], other.types[k])
482
522
  end
483
523
 
484
524
  each_row do |r|
@@ -498,7 +538,7 @@ module Rover
498
538
  end
499
539
  end
500
540
 
501
- DataFrame.new(vectors)
541
+ DataFrame.new(vectors, types: types)
502
542
  end
503
543
 
504
544
  def check_join_keys(df, keys)
@@ -523,6 +563,19 @@ module Rover
523
563
  end
524
564
  end
525
565
 
566
+ def join_type(a, b)
567
+ if a.nil?
568
+ b
569
+ elsif b.nil?
570
+ a
571
+ elsif a == b
572
+ a
573
+ else
574
+ # TODO specify
575
+ nil
576
+ end
577
+ end
578
+
526
579
  def to_vector(v, size: nil, type: nil)
527
580
  if v.is_a?(Vector)
528
581
  v = v.to(type) if type && v.type != type
data/lib/rover/group.rb CHANGED
@@ -9,7 +9,7 @@ module Rover
9
9
  Group.new(@df, @columns + columns.flatten)
10
10
  end
11
11
 
12
- [:count, :max, :min, :mean, :median, :percentile, :sum].each do |name|
12
+ [:count, :max, :min, :mean, :median, :percentile, :sum, :std, :var].each do |name|
13
13
  define_method(name) do |*args|
14
14
  n = [name, args.first].compact.join("_")
15
15
 
data/lib/rover/vector.rb CHANGED
@@ -263,7 +263,11 @@ module Rover
263
263
  end
264
264
 
265
265
  def last(n = 1)
266
- Vector.new(@data[-n..-1])
266
+ if n >= size
267
+ Vector.new(@data)
268
+ else
269
+ Vector.new(@data[-n..-1])
270
+ end
267
271
  end
268
272
 
269
273
  def take(n)
@@ -355,6 +359,7 @@ module Rover
355
359
  data = data.to_a
356
360
 
357
361
  if type
362
+ data = data.map { |v| v || Float::NAN } if [:float, :float32].include?(type)
358
363
  data = numo_type.cast(data)
359
364
  else
360
365
  data =
data/lib/rover/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rover
2
- VERSION = "0.2.3"
2
+ VERSION = "0.2.7"
3
3
  end
data/lib/rover.rb CHANGED
@@ -19,6 +19,16 @@ module Rover
19
19
  csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
20
20
  end
21
21
 
22
+ def read_parquet(path, types: nil)
23
+ require "parquet"
24
+ parquet_to_df(Arrow::Table.load(path), types: types)
25
+ end
26
+
27
+ def parse_parquet(str, types: nil)
28
+ require "parquet"
29
+ parquet_to_df(Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet), types: types)
30
+ end
31
+
22
32
  private
23
33
 
24
34
  # TODO use date converter
@@ -35,10 +45,63 @@ module Rover
35
45
 
36
46
  table.by_col!
37
47
  data = {}
48
+ keys = table.map { |k, _| [k, true] }.to_h
49
+ unnamed_suffix = 1
38
50
  table.each do |k, v|
51
+ # TODO do same for empty string in 0.3.0
52
+ if k.nil?
53
+ k = "unnamed"
54
+ while keys.include?(k)
55
+ unnamed_suffix += 1
56
+ k = "unnamed#{unnamed_suffix}"
57
+ end
58
+ keys[k] = true
59
+ end
39
60
  data[k] = v
40
61
  end
62
+
41
63
  DataFrame.new(data, types: types)
42
64
  end
65
+
66
+ PARQUET_TYPE_MAPPING = {
67
+ "bool" => Numo::Bit,
68
+ "float" => Numo::SFloat,
69
+ "double" => Numo::DFloat,
70
+ "int8" => Numo::Int8,
71
+ "int16" => Numo::Int16,
72
+ "int32" => Numo::Int32,
73
+ "int64" => Numo::Int64,
74
+ "string" => Numo::RObject,
75
+ "uint8" => Numo::UInt8,
76
+ "uint16" => Numo::UInt16,
77
+ "uint32" => Numo::UInt32,
78
+ "uint64" => Numo::UInt64
79
+ }
80
+
81
+ def parquet_to_df(table, types: nil)
82
+ data = {}
83
+ types ||= {}
84
+ table.each_column do |column|
85
+ k = column.field.name
86
+ if types[k]
87
+ data[k] = Vector.new(column.data.values, type: types[k])
88
+ else
89
+ type = column.field.data_type.to_s
90
+ numo_type = PARQUET_TYPE_MAPPING[type]
91
+ raise "Unknown type: #{type}" unless numo_type
92
+
93
+ # TODO automatic conversion?
94
+ # int => float
95
+ # bool => object
96
+ if (type.include?("int") || type == "bool") && column.n_nulls > 0
97
+ raise "Nulls not supported for #{type} column: #{k}"
98
+ end
99
+
100
+ # TODO improve performance
101
+ data[k] = numo_type.cast(column.data.values)
102
+ end
103
+ end
104
+ DataFrame.new(data)
105
+ end
43
106
  end
44
107
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rover-df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-08 00:00:00.000000000 Z
11
+ date: 2022-01-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -58,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
58
58
  - !ruby/object:Gem::Version
59
59
  version: '0'
60
60
  requirements: []
61
- rubygems_version: 3.2.3
61
+ rubygems_version: 3.3.3
62
62
  signing_key:
63
63
  specification_version: 4
64
64
  summary: Simple, powerful data frames for Ruby