rover-df 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +15 -1
- data/lib/rover/data_frame.rb +36 -0
- data/lib/rover/version.rb +1 -1
- data/lib/rover.rb +49 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 01e2a90ba133ae07ad6ad482bdca985df806d6a073fa2d93029b2b7e1b55dc49
|
4
|
+
data.tar.gz: 96f4171420dea68b38cffdd5a365657bc464f6d1f0c4f6bf1aefb20377c56179
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2451d6844c7ece459e61c8e1499047f8efd6472a0d57317b7e2e1110527d843e8177c16ccbb1aeb0fd61e647fdd4291ebf73d4bfe008560eff7b963b1ac22ee6
|
7
|
+
data.tar.gz: 18ad0cfb8fc22aeb63d2e1b11333b1a5989c7bcc0f2b5fbebedb11acf3d3dc26e7235109e1501d0e4b9a06b5aa7e47b71bdb23de6d6fcd87c9fb53d2bf0be330
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
|
|
61
61
|
Rover.parse_csv("CSV,data,string")
|
62
62
|
```
|
63
63
|
|
64
|
+
From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) [unreleased]
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
Rover.read_parquet("file.parquet")
|
68
|
+
# or
|
69
|
+
Rover.parse_parquet("PAR1...")
|
70
|
+
```
|
71
|
+
|
64
72
|
## Attributes
|
65
73
|
|
66
74
|
Get number of rows
|
@@ -89,7 +97,7 @@ Select a column
|
|
89
97
|
df[:a]
|
90
98
|
```
|
91
99
|
|
92
|
-
> Note that strings and symbols are different keys, just like hashes
|
100
|
+
> Note that strings and symbols are different keys, just like hashes. Creating a data frame from Active Record, a CSV, or Parquet uses strings.
|
93
101
|
|
94
102
|
Select multiple columns
|
95
103
|
|
@@ -393,6 +401,12 @@ CSV
|
|
393
401
|
df.to_csv
|
394
402
|
```
|
395
403
|
|
404
|
+
Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) [unreleased]
|
405
|
+
|
406
|
+
```ruby
|
407
|
+
df.to_parquet
|
408
|
+
```
|
409
|
+
|
396
410
|
## Types
|
397
411
|
|
398
412
|
You can specify column types when creating a data frame
|
data/lib/rover/data_frame.rb
CHANGED
@@ -235,6 +235,42 @@ module Rover
|
|
235
235
|
end
|
236
236
|
end
|
237
237
|
|
238
|
+
# Serializes the data frame to Parquet and returns the result as a String.
#
# Loads the "parquet" library on first use. Each column's rover type is
# translated to an Arrow type name, the data is handed to Arrow row by row,
# and the table is written into an in-memory buffer.
#
# Raises RuntimeError ("Unknown type") when an :object column holds
# anything other than Strings.
def to_parquet
  require "parquet"

  # rover type names that need renaming for Arrow; any type not listed here
  # (and not :object) is passed through unchanged.
  # NOTE(review): pass-through names are presumably ones Arrow already
  # understands - confirm for every type rover can produce.
  arrow_names = {int: :int64, uint: :uint64, float: :double, float32: :float}

  schema =
    types.each_with_object({}) do |(column, rover_type), fields|
      fields[column] =
        if rover_type == :object
          # object columns can only be written when every value is a String
          raise "Unknown type" unless @vectors[column].all? { |value| value.is_a?(String) }
          :string
        else
          arrow_names.fetch(rover_type, rover_type)
        end
    end

  # TODO improve performance
  # one array per row, holding that row's value from every column
  raw_records = Array.new(size) { |row| @vectors.map { |_, vector| vector[row] } }

  table = Arrow::Table.new(schema, raw_records)
  # 1024 is the size passed to the buffer constructor
  # (presumably an initial size that grows as needed - confirm)
  buffer = Arrow::ResizableBuffer.new(1024)
  table.save(buffer, format: :parquet)
  buffer.data.to_s
end
|
273
|
+
|
238
274
|
# for IRuby
|
239
275
|
def to_html
|
240
276
|
require "iruby"
|
data/lib/rover/version.rb
CHANGED
data/lib/rover.rb
CHANGED
@@ -19,6 +19,16 @@ module Rover
|
|
19
19
|
csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
|
20
20
|
end
|
21
21
|
|
22
|
+
# Reads the Parquet file at +path+ and returns it as a data frame.
# Loads the "parquet" library on first use.
def read_parquet(path)
  require "parquet"

  table = Arrow::Table.load(path)
  parquet_to_df(table)
end
|
26
|
+
|
27
|
+
# Builds a data frame from Parquet content that is already in memory.
# +str+ is wrapped in an Arrow buffer and loaded with the Parquet format
# stated explicitly. Loads the "parquet" library on first use.
def parse_parquet(str)
  require "parquet"

  buffer = Arrow::Buffer.new(str)
  table = Arrow::Table.load(buffer, format: :parquet)
  parquet_to_df(table)
end
|
31
|
+
|
22
32
|
private
|
23
33
|
|
24
34
|
# TODO use date converter
|
@@ -35,10 +45,49 @@ module Rover
|
|
35
45
|
|
36
46
|
table.by_col!
|
37
47
|
data = {}
|
48
|
+
keys = table.map { |k, _| [k, true] }.to_h
|
49
|
+
unnamed_suffix = 1
|
38
50
|
table.each do |k, v|
|
51
|
+
# TODO do same for empty string in 0.3.0
|
52
|
+
if k.nil?
|
53
|
+
k = "unnamed"
|
54
|
+
while keys.include?(k)
|
55
|
+
unnamed_suffix += 1
|
56
|
+
k = "unnamed#{unnamed_suffix}"
|
57
|
+
end
|
58
|
+
keys[k] = true
|
59
|
+
end
|
39
60
|
data[k] = v
|
40
61
|
end
|
62
|
+
|
41
63
|
DataFrame.new(data, types: types)
|
42
64
|
end
|
65
|
+
|
66
|
+
# Maps the string form of an Arrow column data type to the Numo class used
# to hold that column's values. parquet_to_df raises for any type that has
# no entry here.
# Frozen so this shared lookup table cannot be modified at runtime.
PARQUET_TYPE_MAPPING = {
  "float" => Numo::SFloat,
  "double" => Numo::DFloat,
  "int8" => Numo::Int8,
  "int16" => Numo::Int16,
  "int32" => Numo::Int32,
  "int64" => Numo::Int64,
  "string" => Numo::RObject,
  "uint8" => Numo::UInt8,
  "uint16" => Numo::UInt16,
  "uint32" => Numo::UInt32,
  "uint64" => Numo::UInt64
}.freeze
|
79
|
+
|
80
|
+
# Converts an Arrow table into a data frame, one column at a time.
#
# Each column is keyed by its field name and its values are cast to the
# Numo class that PARQUET_TYPE_MAPPING lists for the column's data type.
#
# Raises RuntimeError ("Unknown type: <type>") for a data type that has no
# entry in the mapping.
def parquet_to_df(table)
  columns = {}

  table.each_column do |column|
    field = column.field
    type_name = field.data_type.to_s
    # fetch with a block: no mapping value is nil or false, so a missing key
    # is the only way to reach the raise
    numo_class = PARQUET_TYPE_MAPPING.fetch(type_name) { raise "Unknown type: #{type_name}" }

    # TODO improve performance
    columns[field.name] = numo_class.cast(column.data.values)
  end

  DataFrame.new(columns)
end
|
43
92
|
end
|
44
93
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rover-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.5
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|