rover-df 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +15 -1
- data/lib/rover/data_frame.rb +36 -0
- data/lib/rover/version.rb +1 -1
- data/lib/rover.rb +49 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 01e2a90ba133ae07ad6ad482bdca985df806d6a073fa2d93029b2b7e1b55dc49
|
4
|
+
data.tar.gz: 96f4171420dea68b38cffdd5a365657bc464f6d1f0c4f6bf1aefb20377c56179
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2451d6844c7ece459e61c8e1499047f8efd6472a0d57317b7e2e1110527d843e8177c16ccbb1aeb0fd61e647fdd4291ebf73d4bfe008560eff7b963b1ac22ee6
|
7
|
+
data.tar.gz: 18ad0cfb8fc22aeb63d2e1b11333b1a5989c7bcc0f2b5fbebedb11acf3d3dc26e7235109e1501d0e4b9a06b5aa7e47b71bdb23de6d6fcd87c9fb53d2bf0be330
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
|
|
61
61
|
Rover.parse_csv("CSV,data,string")
|
62
62
|
```
|
63
63
|
|
64
|
+
From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) [unreleased]
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
Rover.read_parquet("file.parquet")
|
68
|
+
# or
|
69
|
+
Rover.parse_parquet("PAR1...")
|
70
|
+
```
|
71
|
+
|
64
72
|
## Attributes
|
65
73
|
|
66
74
|
Get number of rows
|
@@ -89,7 +97,7 @@ Select a column
|
|
89
97
|
df[:a]
|
90
98
|
```
|
91
99
|
|
92
|
-
> Note that strings and symbols are different keys, just like hashes
|
100
|
+
> Note that strings and symbols are different keys, just like hashes. Creating a data frame from Active Record, a CSV, or Parquet uses strings.
|
93
101
|
|
94
102
|
Select multiple columns
|
95
103
|
|
@@ -393,6 +401,12 @@ CSV
|
|
393
401
|
df.to_csv
|
394
402
|
```
|
395
403
|
|
404
|
+
Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) [unreleased]
|
405
|
+
|
406
|
+
```ruby
|
407
|
+
df.to_parquet
|
408
|
+
```
|
409
|
+
|
396
410
|
## Types
|
397
411
|
|
398
412
|
You can specify column types when creating a data frame
|
data/lib/rover/data_frame.rb
CHANGED
@@ -235,6 +235,42 @@ module Rover
|
|
235
235
|
end
|
236
236
|
end
|
237
237
|
|
238
|
+
def to_parquet
|
239
|
+
require "parquet"
|
240
|
+
|
241
|
+
schema = {}
|
242
|
+
types.each do |name, type|
|
243
|
+
schema[name] =
|
244
|
+
case type
|
245
|
+
when :int
|
246
|
+
:int64
|
247
|
+
when :uint
|
248
|
+
:uint64
|
249
|
+
when :float
|
250
|
+
:double
|
251
|
+
when :float32
|
252
|
+
:float
|
253
|
+
when :object
|
254
|
+
if @vectors[name].all? { |v| v.is_a?(String) }
|
255
|
+
:string
|
256
|
+
else
|
257
|
+
raise "Unknown type"
|
258
|
+
end
|
259
|
+
else
|
260
|
+
type
|
261
|
+
end
|
262
|
+
end
|
263
|
+
# TODO improve performance
|
264
|
+
raw_records = []
|
265
|
+
size.times do |i|
|
266
|
+
raw_records << @vectors.map { |_, v| v[i] }
|
267
|
+
end
|
268
|
+
table = Arrow::Table.new(schema, raw_records)
|
269
|
+
buffer = Arrow::ResizableBuffer.new(1024)
|
270
|
+
table.save(buffer, format: :parquet)
|
271
|
+
buffer.data.to_s
|
272
|
+
end
|
273
|
+
|
238
274
|
# for IRuby
|
239
275
|
def to_html
|
240
276
|
require "iruby"
|
data/lib/rover/version.rb
CHANGED
data/lib/rover.rb
CHANGED
@@ -19,6 +19,16 @@ module Rover
|
|
19
19
|
csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
|
20
20
|
end
|
21
21
|
|
22
|
+
def read_parquet(path)
|
23
|
+
require "parquet"
|
24
|
+
parquet_to_df(Arrow::Table.load(path))
|
25
|
+
end
|
26
|
+
|
27
|
+
def parse_parquet(str)
|
28
|
+
require "parquet"
|
29
|
+
parquet_to_df(Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet))
|
30
|
+
end
|
31
|
+
|
22
32
|
private
|
23
33
|
|
24
34
|
# TODO use date converter
|
@@ -35,10 +45,49 @@ module Rover
|
|
35
45
|
|
36
46
|
table.by_col!
|
37
47
|
data = {}
|
48
|
+
keys = table.map { |k, _| [k, true] }.to_h
|
49
|
+
unnamed_suffix = 1
|
38
50
|
table.each do |k, v|
|
51
|
+
# TODO do same for empty string in 0.3.0
|
52
|
+
if k.nil?
|
53
|
+
k = "unnamed"
|
54
|
+
while keys.include?(k)
|
55
|
+
unnamed_suffix += 1
|
56
|
+
k = "unnamed#{unnamed_suffix}"
|
57
|
+
end
|
58
|
+
keys[k] = true
|
59
|
+
end
|
39
60
|
data[k] = v
|
40
61
|
end
|
62
|
+
|
41
63
|
DataFrame.new(data, types: types)
|
42
64
|
end
|
65
|
+
|
66
|
+
PARQUET_TYPE_MAPPING = {
|
67
|
+
"float" => Numo::SFloat,
|
68
|
+
"double" => Numo::DFloat,
|
69
|
+
"int8" => Numo::Int8,
|
70
|
+
"int16" => Numo::Int16,
|
71
|
+
"int32" => Numo::Int32,
|
72
|
+
"int64" => Numo::Int64,
|
73
|
+
"string" => Numo::RObject,
|
74
|
+
"uint8" => Numo::UInt8,
|
75
|
+
"uint16" => Numo::UInt16,
|
76
|
+
"uint32" => Numo::UInt32,
|
77
|
+
"uint64" => Numo::UInt64
|
78
|
+
}
|
79
|
+
|
80
|
+
def parquet_to_df(table)
|
81
|
+
data = {}
|
82
|
+
table.each_column do |column|
|
83
|
+
k = column.field.name
|
84
|
+
type = column.field.data_type.to_s
|
85
|
+
numo_type = PARQUET_TYPE_MAPPING[type]
|
86
|
+
raise "Unknown type: #{type}" unless numo_type
|
87
|
+
# TODO improve performance
|
88
|
+
data[k] = numo_type.cast(column.data.values)
|
89
|
+
end
|
90
|
+
DataFrame.new(data)
|
91
|
+
end
|
43
92
|
end
|
44
93
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rover-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.5
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|