rover-df 0.2.5 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ca39a558c3c12103f03fed4cb8f007fbd00a1f8e84b839916fd0010aae4613ba
4
- data.tar.gz: 43df8cdc415cc036ac383f30b7c91a35b644067a3cb8ea199abd7452b98298d5
3
+ metadata.gz: 01e2a90ba133ae07ad6ad482bdca985df806d6a073fa2d93029b2b7e1b55dc49
4
+ data.tar.gz: 96f4171420dea68b38cffdd5a365657bc464f6d1f0c4f6bf1aefb20377c56179
5
5
  SHA512:
6
- metadata.gz: 2724c7e85ee7921f277be833cf89be638c14cbb37a44411bba86c42cacffe7c0e4b82ea04d4dfb3d694c6429ba41bc8e8c10f7cb40e5d34bf59d14755858735f
7
- data.tar.gz: fa860158decbca0a0b35ccb82e6f73d9a513c37b483eca52d140842d5dd255899a2e1ded3ec4375a492b86d3ec09ffa53d4871e05f1fdad39f3d2630215417dc
6
+ metadata.gz: 2451d6844c7ece459e61c8e1499047f8efd6472a0d57317b7e2e1110527d843e8177c16ccbb1aeb0fd61e647fdd4291ebf73d4bfe008560eff7b963b1ac22ee6
7
+ data.tar.gz: 18ad0cfb8fc22aeb63d2e1b11333b1a5989c7bcc0f2b5fbebedb11acf3d3dc26e7235109e1501d0e4b9a06b5aa7e47b71bdb23de6d6fcd87c9fb53d2bf0be330
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.2.6 (2021-10-27)
2
+
3
+ - Added support for `nil` headers to `read_csv` and `parse_csv`
4
+ - Added `read_parquet`, `parse_parquet`, and `to_parquet` methods
5
+
1
6
  ## 0.2.5 (2021-09-25)
2
7
 
3
8
  - Fixed column types with joins
data/README.md CHANGED
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
61
61
  Rover.parse_csv("CSV,data,string")
62
62
  ```
63
63
 
64
+ From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) [unreleased]
65
+
66
+ ```ruby
67
+ Rover.read_parquet("file.parquet")
68
+ # or
69
+ Rover.parse_parquet("PAR1...")
70
+ ```
71
+
64
72
  ## Attributes
65
73
 
66
74
  Get number of rows
@@ -89,7 +97,7 @@ Select a column
89
97
  df[:a]
90
98
  ```
91
99
 
92
- > Note that strings and symbols are different keys, just like hashes
100
+ > Note that strings and symbols are different keys, just like hashes. Data frames created from Active Record, a CSV, or Parquet use string keys.
93
101
 
94
102
  Select multiple columns
95
103
 
@@ -393,6 +401,12 @@ CSV
393
401
  df.to_csv
394
402
  ```
395
403
 
404
+ Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) [unreleased]
405
+
406
+ ```ruby
407
+ df.to_parquet
408
+ ```
409
+
396
410
  ## Types
397
411
 
398
412
  You can specify column types when creating a data frame
@@ -235,6 +235,42 @@ module Rover
235
235
  end
236
236
  end
237
237
 
238
+ def to_parquet
239
+ require "parquet"
240
+
241
+ schema = {}
242
+ types.each do |name, type|
243
+ schema[name] =
244
+ case type
245
+ when :int
246
+ :int64
247
+ when :uint
248
+ :uint64
249
+ when :float
250
+ :double
251
+ when :float32
252
+ :float
253
+ when :object
254
+ if @vectors[name].all? { |v| v.is_a?(String) }
255
+ :string
256
+ else
257
+ raise "Unknown type"
258
+ end
259
+ else
260
+ type
261
+ end
262
+ end
263
+ # TODO improve performance
264
+ raw_records = []
265
+ size.times do |i|
266
+ raw_records << @vectors.map { |_, v| v[i] }
267
+ end
268
+ table = Arrow::Table.new(schema, raw_records)
269
+ buffer = Arrow::ResizableBuffer.new(1024)
270
+ table.save(buffer, format: :parquet)
271
+ buffer.data.to_s
272
+ end
273
+
238
274
  # for IRuby
239
275
  def to_html
240
276
  require "iruby"
data/lib/rover/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rover
2
- VERSION = "0.2.5"
2
+ VERSION = "0.2.6"
3
3
  end
data/lib/rover.rb CHANGED
@@ -19,6 +19,16 @@ module Rover
19
19
  csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
20
20
  end
21
21
 
22
+ def read_parquet(path)
23
+ require "parquet"
24
+ parquet_to_df(Arrow::Table.load(path))
25
+ end
26
+
27
+ def parse_parquet(str)
28
+ require "parquet"
29
+ parquet_to_df(Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet))
30
+ end
31
+
22
32
  private
23
33
 
24
34
  # TODO use date converter
@@ -35,10 +45,49 @@ module Rover
35
45
 
36
46
  table.by_col!
37
47
  data = {}
48
+ keys = table.map { |k, _| [k, true] }.to_h
49
+ unnamed_suffix = 1
38
50
  table.each do |k, v|
51
+ # TODO do same for empty string in 0.3.0
52
+ if k.nil?
53
+ k = "unnamed"
54
+ while keys.include?(k)
55
+ unnamed_suffix += 1
56
+ k = "unnamed#{unnamed_suffix}"
57
+ end
58
+ keys[k] = true
59
+ end
39
60
  data[k] = v
40
61
  end
62
+
41
63
  DataFrame.new(data, types: types)
42
64
  end
65
+
66
+ PARQUET_TYPE_MAPPING = {
67
+ "float" => Numo::SFloat,
68
+ "double" => Numo::DFloat,
69
+ "int8" => Numo::Int8,
70
+ "int16" => Numo::Int16,
71
+ "int32" => Numo::Int32,
72
+ "int64" => Numo::Int64,
73
+ "string" => Numo::RObject,
74
+ "uint8" => Numo::UInt8,
75
+ "uint16" => Numo::UInt16,
76
+ "uint32" => Numo::UInt32,
77
+ "uint64" => Numo::UInt64
78
+ }
79
+
80
+ def parquet_to_df(table)
81
+ data = {}
82
+ table.each_column do |column|
83
+ k = column.field.name
84
+ type = column.field.data_type.to_s
85
+ numo_type = PARQUET_TYPE_MAPPING[type]
86
+ raise "Unknown type: #{type}" unless numo_type
87
+ # TODO improve performance
88
+ data[k] = numo_type.cast(column.data.values)
89
+ end
90
+ DataFrame.new(data)
91
+ end
43
92
  end
44
93
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rover-df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-09-25 00:00:00.000000000 Z
11
+ date: 2021-10-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray