parqueteur 1.1.1 → 1.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +13 -13
- data/README.md +137 -8
- data/examples/cars.rb +40 -0
- data/examples/convert-and-compression.rb +56 -0
- data/examples/convert-methods.rb +54 -0
- data/examples/convert-to-io.rb +52 -0
- data/examples/convert-with-chunks.rb +54 -0
- data/examples/convert-without-compression.rb +52 -0
- data/examples/hello-world.rb +57 -0
- data/examples/readme-example.rb +44 -0
- data/lib/parqueteur/converter.rb +108 -79
- data/lib/parqueteur/input.rb +12 -27
- data/lib/parqueteur/type_resolver.rb +10 -0
- data/lib/parqueteur/types/date32_type.rb +15 -0
- data/lib/parqueteur/types/date64_type.rb +15 -0
- data/lib/parqueteur/types/decimal128_type.rb +29 -0
- data/lib/parqueteur/types/decimal256_type.rb +29 -0
- data/lib/parqueteur/types/int32_type.rb +0 -2
- data/lib/parqueteur/types/int64_type.rb +0 -2
- data/lib/parqueteur/types/time32_type.rb +19 -0
- data/lib/parqueteur/types/time64_type.rb +19 -0
- data/lib/parqueteur/types/timestamp_type.rb +3 -1
- data/lib/parqueteur/version.rb +1 -1
- data/lib/parqueteur.rb +9 -4
- data/parqueteur.gemspec +3 -3
- data/scripts/apache-arrow-ubuntu-install.sh +18 -0
- metadata +21 -9
- data/example.rb +0 -39
- data/lib/parqueteur/chunked_converter.rb +0 -28
- data/test.json +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6bae31969c118f27e2a1e3796a72dff3c7c63db4ce7b12db6f967d8f5b222ee5
|
4
|
+
data.tar.gz: 13bcbc2a1eaee55ba0b27d6aa2848147829c90d85a7089d35fb22e01ae586053
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04d7dc514fb2c0869a1b8ef071117b22da51ce162a31169b76a413c9bfb356b12ff7493e3fdbce5b8691033ad013254c53547d592f12f17bc554600e1ecc4d0d
|
7
|
+
data.tar.gz: fafaca859454f78f81b63961360c4baad6b0ec2614c0a8b675cb14a9df5c6aaef3bf4b72050effce84418415271567bf4a3032bef27abcb4bcbcd8e22ab160b2
|
data/Gemfile.lock
CHANGED
@@ -1,32 +1,32 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
parqueteur (1.
|
5
|
-
red-parquet (~>
|
4
|
+
parqueteur (1.3.2)
|
5
|
+
red-parquet (~> 6.0)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: https://rubygems.org/
|
9
9
|
specs:
|
10
|
-
bigdecimal (3.0.
|
11
|
-
extpp (0.0
|
12
|
-
gio2 (3.4.
|
13
|
-
gobject-introspection (= 3.4.
|
14
|
-
glib2 (3.4.
|
10
|
+
bigdecimal (3.0.2)
|
11
|
+
extpp (0.1.0)
|
12
|
+
gio2 (3.4.9)
|
13
|
+
gobject-introspection (= 3.4.9)
|
14
|
+
glib2 (3.4.9)
|
15
15
|
native-package-installer (>= 1.0.3)
|
16
16
|
pkg-config (>= 1.3.5)
|
17
|
-
gobject-introspection (3.4.
|
18
|
-
glib2 (= 3.4.
|
17
|
+
gobject-introspection (3.4.9)
|
18
|
+
glib2 (= 3.4.9)
|
19
19
|
native-package-installer (1.1.1)
|
20
20
|
pkg-config (1.4.6)
|
21
21
|
rake (13.0.6)
|
22
|
-
red-arrow (
|
22
|
+
red-arrow (6.0.0)
|
23
23
|
bigdecimal (>= 2.0.3)
|
24
24
|
extpp (>= 0.0.7)
|
25
|
-
gio2 (>= 3.4.
|
25
|
+
gio2 (>= 3.4.9)
|
26
26
|
native-package-installer
|
27
27
|
pkg-config
|
28
|
-
red-parquet (
|
29
|
-
red-arrow (=
|
28
|
+
red-parquet (6.0.0)
|
29
|
+
red-arrow (= 6.0.0)
|
30
30
|
|
31
31
|
PLATFORMS
|
32
32
|
ruby
|
data/README.md
CHANGED
@@ -1,15 +1,31 @@
|
|
1
1
|
# Parqueteur
|
2
2
|
|
3
|
-
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/parqueteur.svg)](https://badge.fury.io/rb/parqueteur)
|
4
4
|
|
5
|
-
|
5
|
+
Parqueteur enables you to generate Apache Parquet files from raw data.
|
6
6
|
|
7
|
+
## Dependencies
|
8
|
+
|
9
|
+
Since I only tested Parqueteur on Ubuntu, I don't have any install scripts for other operating systems.
|
10
|
+
### Debian/Ubuntu packages
|
11
|
+
- `libgirepository1.0-dev`
|
12
|
+
- `libarrow-dev`
|
13
|
+
- `libarrow-glib-dev`
|
14
|
+
- `libparquet-dev`
|
15
|
+
- `libparquet-glib-dev`
|
16
|
+
|
17
|
+
You can check the `scripts/apache-arrow-ubuntu-install.sh` script for a quick way to install all of them.
|
7
18
|
## Installation
|
8
19
|
|
9
20
|
Add this line to your application's Gemfile:
|
10
21
|
|
11
22
|
```ruby
|
12
|
-
gem 'parqueteur'
|
23
|
+
gem 'parqueteur', '~> 1.0'
|
24
|
+
```
|
25
|
+
|
26
|
+
> (optional) If you don't want to require Parqueteur globally you can add `require: false` to the Gemfile instruction:
|
27
|
+
```ruby
|
28
|
+
gem 'parqueteur', '~> 1.0', require: false
|
13
29
|
```
|
14
30
|
|
15
31
|
And then execute:
|
@@ -22,14 +38,127 @@ Or install it yourself as:
|
|
22
38
|
|
23
39
|
## Usage
|
24
40
|
|
25
|
-
|
41
|
+
Parqueteur provides an elegant way to generate Apache Parquet files from a defined schema.
|
42
|
+
|
43
|
+
Converters accept any object that implements `Enumerable` as a data source.
|
44
|
+
|
45
|
+
### Working example
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
require 'parqueteur'
|
49
|
+
|
50
|
+
class FooParquetConverter < Parqueteur::Converter
|
51
|
+
column :id, :bigint
|
52
|
+
column :reference, :string
|
53
|
+
column :datetime, :timestamp
|
54
|
+
end
|
55
|
+
|
56
|
+
data = [
|
57
|
+
{ 'id' => 1, 'reference' => 'hello world 1', 'datetime' => Time.now },
|
58
|
+
{ 'id' => 2, 'reference' => 'hello world 2', 'datetime' => Time.now },
|
59
|
+
{ 'id' => 3, 'reference' => 'hello world 3', 'datetime' => Time.now }
|
60
|
+
]
|
61
|
+
|
62
|
+
# initialize Converter with Parquet GZIP compression mode
|
63
|
+
converter = FooParquetConverter.new(data, compression: :gzip)
|
26
64
|
|
27
|
-
|
65
|
+
# write result to file
|
66
|
+
converter.write('hello_world.parquet')
|
28
67
|
|
29
|
-
|
68
|
+
# in-memory result (StringIO)
|
69
|
+
converter.to_io
|
70
|
+
|
71
|
+
# write to temporary file (Tempfile)
|
72
|
+
# don't forget to `close` / `unlink` it after usage
|
73
|
+
converter.to_tmpfile
|
74
|
+
|
75
|
+
# convert to Arrow::Table
|
76
|
+
pp converter.to_arrow_table
|
77
|
+
```
|
78
|
+
|
79
|
+
### Using transformers
|
80
|
+
|
81
|
+
You can use transformers to apply transformations to data items.
|
82
|
+
|
83
|
+
From `examples/cars.rb`:
|
84
|
+
|
85
|
+
```ruby
|
86
|
+
require 'parqueteur'
|
87
|
+
|
88
|
+
class Car
|
89
|
+
attr_reader :name, :production_year
|
90
|
+
|
91
|
+
def initialize(name, production_year)
|
92
|
+
@name = name
|
93
|
+
@production_year = production_year
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
class CarParquetConverter < Parqueteur::Converter
|
98
|
+
column :name, :string
|
99
|
+
column :production_year, :integer
|
100
|
+
|
101
|
+
transform do |car|
|
102
|
+
{
|
103
|
+
'name' => car.name,
|
104
|
+
'production_year' => car.production_year
|
105
|
+
}
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
cars = [
|
110
|
+
Car.new('Alfa Romeo 75', 1985),
|
111
|
+
Car.new('Alfa Romeo 33', 1983),
|
112
|
+
Car.new('Audi A3', 1996),
|
113
|
+
Car.new('Audi A4', 1994),
|
114
|
+
Car.new('BMW 503', 1956),
|
115
|
+
Car.new('BMW X5', 1999)
|
116
|
+
]
|
117
|
+
|
118
|
+
# initialize Converter with Parquet GZIP compression mode
|
119
|
+
converter = CarParquetConverter.new(cars, compression: :gzip)
|
120
|
+
|
121
|
+
# print the result as an Arrow::Table
|
122
|
+
pp converter.to_arrow_table
|
123
|
+
```
|
124
|
+
|
125
|
+
Output:
|
126
|
+
```
|
127
|
+
#<Arrow::Table:0x7fc1fb24b958 ptr=0x7fc1faedd910>
|
128
|
+
# name production_year
|
129
|
+
0 Alfa Romeo 75 1985
|
130
|
+
1 Alfa Romeo 33 1983
|
131
|
+
2 Audi A3 1996
|
132
|
+
3 Audi A4 1994
|
133
|
+
4 BMW 503 1956
|
134
|
+
5 BMW X5 1999
|
135
|
+
```
|
30
136
|
|
31
|
-
|
137
|
+
### Available Types
|
138
|
+
|
139
|
+
| Name (Symbol) | Apache Parquet Type |
|
140
|
+
| ------------- | --------- |
|
141
|
+
| `:array` | `Array` |
|
142
|
+
| `:bigdecimal` | `Decimal256` |
|
143
|
+
| `:bigint` | `Int64` or `UInt64` with `unsigned: true` option |
|
144
|
+
| `:boolean` | `Boolean` |
|
145
|
+
| `:date` | `Date32` |
|
146
|
+
| `:date32` | `Date32` |
|
147
|
+
| `:date64` | `Date64` |
|
148
|
+
| `:decimal` | `Decimal128` |
|
149
|
+
| `:decimal128` | `Decimal128` |
|
150
|
+
| `:decimal256` | `Decimal256` |
|
151
|
+
| `:int32` | `Int32` or `UInt32` with `unsigned: true` option |
|
152
|
+
| `:int64` | `Int64` or `UInt64` with `unsigned: true` option |
|
153
|
+
| `:integer` | `Int32` or `UInt32` with `unsigned: true` option |
|
154
|
+
| `:map` | `Map` |
|
155
|
+
| `:string` | `String` |
|
156
|
+
| `:struct` | `Struct` |
|
157
|
+
| `:time` | `Time32` |
|
158
|
+
| `:time32` | `Time32` |
|
159
|
+
| `:time64` | `Time64` |
|
160
|
+
| `:timestamp` | `Timestamp` |
|
32
161
|
|
33
162
|
## Contributing
|
34
163
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
164
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/pocketsizesun/parqueteur-ruby.
|
data/examples/cars.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
|
4
|
+
class Car
|
5
|
+
attr_reader :name, :production_year
|
6
|
+
|
7
|
+
def initialize(name, production_year)
|
8
|
+
@name = name
|
9
|
+
@production_year = production_year
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class CarParquetConverter < Parqueteur::Converter
|
14
|
+
column :name, :string
|
15
|
+
column :production_year, :integer
|
16
|
+
|
17
|
+
transform do |car|
|
18
|
+
{
|
19
|
+
'name' => car.name,
|
20
|
+
'production_year' => car.production_year
|
21
|
+
}
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
cars = [
|
26
|
+
Car.new('Alfa Romeo 75', 1985),
|
27
|
+
Car.new('Alfa Romeo 33', 1983),
|
28
|
+
Car.new('Audi A3', 1996),
|
29
|
+
Car.new('Audi A4', 1994),
|
30
|
+
Car.new('BMW 503', 1956),
|
31
|
+
Car.new('BMW X5', 1999)
|
32
|
+
]
|
33
|
+
|
34
|
+
# initialize Converter with Parquet GZIP compression mode
|
35
|
+
converter = CarParquetConverter.new(
|
36
|
+
cars, compression: :gzip
|
37
|
+
)
|
38
|
+
|
39
|
+
# print the result as an Arrow::Table
|
40
|
+
pp converter.to_arrow_table
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class FooParquetConverter < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
# initialize Converter with Parquet GZIP compression mode
|
52
|
+
converter = FooParquetConverter.new(data, compression: :gzip)
|
53
|
+
|
54
|
+
# write result to file
|
55
|
+
converter.write('tmp/example.gzip-compressed.parquet')
|
56
|
+
converter.write('tmp/example.no-gzip.parquet', compression: false)
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
converter = Foo.new(data, compression: :gzip)
|
52
|
+
pp converter.to_io
|
53
|
+
pp converter.to_arrow_table
|
54
|
+
converter.write('tmp/test.parquet')
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
io = Foo.convert(data)
|
52
|
+
pp io.read
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
converter = Foo.new(data, compression: :gzip)
|
52
|
+
converter.split(200).each_with_index do |chunk, idx|
|
53
|
+
puts "#{idx}: #{chunk.path}"
|
54
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
path = 'tmp/test.parquet'
|
52
|
+
Foo.convert_to(data, path)
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
|
4
|
+
class FooParquetConverter < Parqueteur::Converter
|
5
|
+
column :id, :bigint
|
6
|
+
column :my_string_array, :array, elements: :string
|
7
|
+
column :my_date, :date
|
8
|
+
column :my_decimal, :decimal, precision: 12, scale: 4
|
9
|
+
column :my_int, :integer
|
10
|
+
column :my_map, :map, key: :string, value: :string
|
11
|
+
column :my_string, :string
|
12
|
+
column :my_struct, :struct do
|
13
|
+
field :my_struct_str, :string
|
14
|
+
field :my_struct_int, :integer
|
15
|
+
end
|
16
|
+
column :my_time, :time
|
17
|
+
column :my_timestamp, :timestamp
|
18
|
+
end
|
19
|
+
|
20
|
+
data = 100.times.collect do |i|
|
21
|
+
{
|
22
|
+
'id' => i,
|
23
|
+
'my_string_array' => %w[a b c],
|
24
|
+
'my_date' => Date.today,
|
25
|
+
'my_decimal' => BigDecimal('0.03'),
|
26
|
+
'my_int' => rand(1..10),
|
27
|
+
'my_map' => { 'a' => 'b' },
|
28
|
+
'my_string' => 'Hello World',
|
29
|
+
'my_struct' => {
|
30
|
+
'my_struct_str' => 'Hello World',
|
31
|
+
'my_struct_int' => 1
|
32
|
+
},
|
33
|
+
'my_time' => 3600,
|
34
|
+
'my_timestamp' => Time.now
|
35
|
+
}
|
36
|
+
end
|
37
|
+
|
38
|
+
# initialize Converter with Parquet GZIP compression mode
|
39
|
+
converter = FooParquetConverter.new(data, compression: :gzip)
|
40
|
+
|
41
|
+
# write result to file
|
42
|
+
converter.write('tmp/hello_world.compressed.parquet')
|
43
|
+
converter.write('tmp/hello_world.parquet', compression: false)
|
44
|
+
|
45
|
+
# in-memory result (StringIO)
|
46
|
+
converter.to_io
|
47
|
+
|
48
|
+
# write to temporary file (Tempfile)
|
49
|
+
# don't forget to `close` / `unlink` it after usage
|
50
|
+
converter.to_tmpfile
|
51
|
+
|
52
|
+
# Arrow Table
|
53
|
+
table = converter.to_arrow_table
|
54
|
+
table.each_record do |record|
|
55
|
+
# pp record['my_decimal'].to_f
|
56
|
+
pp record.to_h
|
57
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
|
4
|
+
class FooParquetConverter < Parqueteur::Converter
|
5
|
+
column :id, :bigint
|
6
|
+
column :reference, :string
|
7
|
+
column :datetime, :timestamp
|
8
|
+
column :beers_count, :integer
|
9
|
+
|
10
|
+
transform do |item|
|
11
|
+
item.merge(
|
12
|
+
'datetime' => Time.now
|
13
|
+
)
|
14
|
+
end
|
15
|
+
|
16
|
+
transform :add_beers
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def add_beers(item)
|
21
|
+
item['beers_count'] += rand(1..3)
|
22
|
+
item
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
data = 10.times.lazy.map do |i|
|
27
|
+
{ 'id' => i + 1, 'reference' => 'hello world 1', 'beers_count' => 0 }
|
28
|
+
end
|
29
|
+
|
30
|
+
# initialize Converter with Parquet GZIP compression mode
|
31
|
+
converter = FooParquetConverter.new(data, compression: :gzip)
|
32
|
+
|
33
|
+
# write result to file
|
34
|
+
converter.write('tmp/hello_world.parquet')
|
35
|
+
|
36
|
+
# in-memory result (StringIO)
|
37
|
+
converter.to_io
|
38
|
+
|
39
|
+
# write to temporary file (Tempfile)
|
40
|
+
# don't forget to `close` / `unlink` it after usage
|
41
|
+
converter.to_tmpfile
|
42
|
+
|
43
|
+
# convert to Arrow::Table
|
44
|
+
pp converter.to_arrow_table
|