parqueteur 1.3.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +8 -8
- data/README.md +98 -4
- data/examples/cars.rb +40 -0
- data/examples/hello-world.rb +2 -1
- data/examples/readme-example.rb +44 -0
- data/lib/parqueteur/type_resolver.rb +1 -1
- data/lib/parqueteur/types/decimal128_type.rb +14 -3
- data/lib/parqueteur/types/decimal256_type.rb +14 -3
- data/lib/parqueteur/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 86f767a68e38cdd93da015e4ddfcda06b2eefe553ecb6d7a423b4cc0f2752183
|
|
4
|
+
data.tar.gz: cd741b1d023e44fcc14c08192b9791b675339736527f47dbe2393e85bdff9d07
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 58882a4d2d1ea5a0cb53f5643f96ac504352b87e6c1436d10841a616d2867bf127c70a31374f6ff2f455fb2894d29fe8cc439ca8a0efca5fe519c00ee312c8c5
|
|
7
|
+
data.tar.gz: ac185e758d8c0fa19d11ac05e96f77f93cb8260e055d203d1d0d71dd04e0ff6db70d773824a8a6fe5c63e5f235ab179f3417ac689b590403b69192cbad0bde98
|
data/Gemfile.lock
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
parqueteur (1.3.0)
|
|
4
|
+
parqueteur (1.3.1)
|
|
5
5
|
red-parquet (~> 5.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
8
8
|
remote: https://rubygems.org/
|
|
9
9
|
specs:
|
|
10
|
-
bigdecimal (3.0.
|
|
11
|
-
extpp (0.0
|
|
12
|
-
gio2 (3.4.
|
|
13
|
-
gobject-introspection (= 3.4.
|
|
14
|
-
glib2 (3.4.
|
|
10
|
+
bigdecimal (3.0.2)
|
|
11
|
+
extpp (0.1.0)
|
|
12
|
+
gio2 (3.4.9)
|
|
13
|
+
gobject-introspection (= 3.4.9)
|
|
14
|
+
glib2 (3.4.9)
|
|
15
15
|
native-package-installer (>= 1.0.3)
|
|
16
16
|
pkg-config (>= 1.3.5)
|
|
17
|
-
gobject-introspection (3.4.
|
|
18
|
-
glib2 (= 3.4.
|
|
17
|
+
gobject-introspection (3.4.9)
|
|
18
|
+
glib2 (= 3.4.9)
|
|
19
19
|
native-package-installer (1.1.1)
|
|
20
20
|
pkg-config (1.4.6)
|
|
21
21
|
rake (13.0.6)
|
data/README.md
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
# Parqueteur
|
|
2
2
|
|
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/parqueteur.svg)](https://badge.fury.io/rb/parqueteur)
|
|
4
|
+
|
|
3
5
|
Parqueteur enables you to generate Apache Parquet files from raw data.
|
|
4
6
|
|
|
5
7
|
## Dependencies
|
|
6
8
|
|
|
7
|
-
Since I only tested Parqueteur on Ubuntu, I don't have any install scripts for
|
|
9
|
+
Since I only tested Parqueteur on Ubuntu, I don't have any install scripts for other operating systems.
|
|
8
10
|
### Debian/Ubuntu packages
|
|
9
11
|
- `libgirepository1.0-dev`
|
|
10
12
|
- `libarrow-dev`
|
|
@@ -37,18 +39,24 @@ Or install it yourself as:
|
|
|
37
39
|
## Usage
|
|
38
40
|
|
|
39
41
|
Parqueteur provides an elegant way to generate Apache Parquet files from a defined schema.
|
|
42
|
+
|
|
43
|
+
Converters accept any object that implements `Enumerable` as a data source.
|
|
44
|
+
|
|
45
|
+
### Working example
|
|
46
|
+
|
|
40
47
|
```ruby
|
|
41
48
|
require 'parqueteur'
|
|
42
49
|
|
|
43
50
|
class FooParquetConverter < Parqueteur::Converter
|
|
44
51
|
column :id, :bigint
|
|
45
52
|
column :reference, :string
|
|
53
|
+
column :datetime, :timestamp
|
|
46
54
|
end
|
|
47
55
|
|
|
48
56
|
data = [
|
|
49
|
-
{ 'id' => 1, 'reference' => 'hello world 1' },
|
|
50
|
-
{ 'id' => 2, 'reference' => 'hello world 2' },
|
|
51
|
-
{ 'id' => 3, 'reference' => 'hello world 3' }
|
|
57
|
+
{ 'id' => 1, 'reference' => 'hello world 1', 'datetime' => Time.now },
|
|
58
|
+
{ 'id' => 2, 'reference' => 'hello world 2', 'datetime' => Time.now },
|
|
59
|
+
{ 'id' => 3, 'reference' => 'hello world 3', 'datetime' => Time.now }
|
|
52
60
|
]
|
|
53
61
|
|
|
54
62
|
# initialize Converter with Parquet GZIP compression mode
|
|
@@ -63,8 +71,94 @@ converter.to_io
|
|
|
63
71
|
# write to temporary file (Tempfile)
|
|
64
72
|
# don't forget to `close` / `unlink` it after usage
|
|
65
73
|
converter.to_tmpfile
|
|
74
|
+
|
|
75
|
+
# convert to Arrow::Table
|
|
76
|
+
pp converter.to_arrow_table
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Using transformers
|
|
80
|
+
|
|
81
|
+
You can use transformers to apply transformations to data items.
|
|
82
|
+
|
|
83
|
+
From `examples/cars.rb`:
|
|
84
|
+
|
|
85
|
+
```ruby
|
|
86
|
+
require 'parqueteur'
|
|
87
|
+
|
|
88
|
+
class Car
|
|
89
|
+
attr_reader :name, :production_year
|
|
90
|
+
|
|
91
|
+
def initialize(name, production_year)
|
|
92
|
+
@name = name
|
|
93
|
+
@production_year = production_year
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
class CarParquetConverter < Parqueteur::Converter
|
|
98
|
+
column :name, :string
|
|
99
|
+
column :production_year, :integer
|
|
100
|
+
|
|
101
|
+
transform do |car|
|
|
102
|
+
{
|
|
103
|
+
'name' => car.name,
|
|
104
|
+
'production_year' => car.production_year
|
|
105
|
+
}
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
cars = [
|
|
110
|
+
Car.new('Alfa Romeo 75', 1985),
|
|
111
|
+
Car.new('Alfa Romeo 33', 1983),
|
|
112
|
+
Car.new('Audi A3', 1996),
|
|
113
|
+
Car.new('Audi A4', 1994),
|
|
114
|
+
Car.new('BMW 503', 1956),
|
|
115
|
+
Car.new('BMW X5', 1999)
|
|
116
|
+
]
|
|
117
|
+
|
|
118
|
+
# initialize Converter with Parquet GZIP compression mode
|
|
119
|
+
converter = CarParquetConverter.new(cars, compression: :gzip)
|
|
120
|
+
|
|
121
|
+
# convert to Arrow::Table and pretty-print it
|
|
122
|
+
pp converter.to_arrow_table
|
|
66
123
|
```
|
|
67
124
|
|
|
125
|
+
Output:
|
|
126
|
+
```
|
|
127
|
+
#<Arrow::Table:0x7fc1fb24b958 ptr=0x7fc1faedd910>
|
|
128
|
+
# name production_year
|
|
129
|
+
0 Alfa Romeo 75 1985
|
|
130
|
+
1 Alfa Romeo 33 1983
|
|
131
|
+
2 Audi A3 1996
|
|
132
|
+
3 Audi A4 1994
|
|
133
|
+
4 BMW 503 1956
|
|
134
|
+
5 BMW X5 1999
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Available Types
|
|
138
|
+
|
|
139
|
+
| Name (Symbol) | Apache Parquet Type |
|
|
140
|
+
| ------------- | --------- |
|
|
141
|
+
| `:array` | `Array` |
|
|
142
|
+
| `:bigdecimal` | `Decimal256` |
|
|
143
|
+
| `:bigint` | `Int64` or `UInt64` with `unsigned: true` option |
|
|
144
|
+
| `:boolean` | `Boolean` |
|
|
145
|
+
| `:date` | `Date32` |
|
|
146
|
+
| `:date32` | `Date32` |
|
|
147
|
+
| `:date64` | `Date64` |
|
|
148
|
+
| `:decimal` | `Decimal128` |
|
|
149
|
+
| `:decimal128` | `Decimal128` |
|
|
150
|
+
| `:decimal256` | `Decimal256` |
|
|
151
|
+
| `:int32` | `Int32` or `UInt32` with `unsigned: true` option |
|
|
152
|
+
| `:int64` | `Int64` or `UInt64` with `unsigned: true` option |
|
|
153
|
+
| `:integer` | `Int32` or `UInt32` with `unsigned: true` option |
|
|
154
|
+
| `:map` | `Map` |
|
|
155
|
+
| `:string` | `String` |
|
|
156
|
+
| `:struct` | `Struct` |
|
|
157
|
+
| `:time` | `Time32` |
|
|
158
|
+
| `:time32` | `Time32` |
|
|
159
|
+
| `:time64` | `Time64` |
|
|
160
|
+
| `:timestamp` | `Timestamp` |
|
|
161
|
+
|
|
68
162
|
## Contributing
|
|
69
163
|
|
|
70
164
|
Bug reports and pull requests are welcome on GitHub at https://github.com/pocketsizesun/parqueteur-ruby.
|
data/examples/cars.rb
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
require 'bundler/setup'
|
|
2
|
+
require 'parqueteur'
|
|
3
|
+
|
|
4
|
+
class Car
|
|
5
|
+
attr_reader :name, :production_year
|
|
6
|
+
|
|
7
|
+
def initialize(name, production_year)
|
|
8
|
+
@name = name
|
|
9
|
+
@production_year = production_year
|
|
10
|
+
end
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
class CarParquetConverter < Parqueteur::Converter
|
|
14
|
+
column :name, :string
|
|
15
|
+
column :production_year, :integer
|
|
16
|
+
|
|
17
|
+
transform do |car|
|
|
18
|
+
{
|
|
19
|
+
'name' => car.name,
|
|
20
|
+
'production_year' => car.production_year
|
|
21
|
+
}
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
cars = [
|
|
26
|
+
Car.new('Alfa Romeo 75', 1985),
|
|
27
|
+
Car.new('Alfa Romeo 33', 1983),
|
|
28
|
+
Car.new('Audi A3', 1996),
|
|
29
|
+
Car.new('Audi A4', 1994),
|
|
30
|
+
Car.new('BMW 503', 1956),
|
|
31
|
+
Car.new('BMW X5', 1999)
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
# initialize Converter with Parquet GZIP compression mode
|
|
35
|
+
converter = CarParquetConverter.new(
|
|
36
|
+
cars, compression: :gzip
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# convert to Arrow::Table and pretty-print it
|
|
40
|
+
pp converter.to_arrow_table
|
data/examples/hello-world.rb
CHANGED
|
@@ -22,7 +22,7 @@ data = 100.times.collect do |i|
|
|
|
22
22
|
'id' => i,
|
|
23
23
|
'my_string_array' => %w[a b c],
|
|
24
24
|
'my_date' => Date.today,
|
|
25
|
-
'my_decimal' => BigDecimal('
|
|
25
|
+
'my_decimal' => BigDecimal('0.03'),
|
|
26
26
|
'my_int' => rand(1..10),
|
|
27
27
|
'my_map' => { 'a' => 'b' },
|
|
28
28
|
'my_string' => 'Hello World',
|
|
@@ -52,5 +52,6 @@ converter.to_tmpfile
|
|
|
52
52
|
# Arrow Table
|
|
53
53
|
table = converter.to_arrow_table
|
|
54
54
|
table.each_record do |record|
|
|
55
|
+
# pp record['my_decimal'].to_f
|
|
55
56
|
pp record.to_h
|
|
56
57
|
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
require 'bundler/setup'
|
|
2
|
+
require 'parqueteur'
|
|
3
|
+
|
|
4
|
+
class FooParquetConverter < Parqueteur::Converter
|
|
5
|
+
column :id, :bigint
|
|
6
|
+
column :reference, :string
|
|
7
|
+
column :datetime, :timestamp
|
|
8
|
+
column :beers_count, :integer
|
|
9
|
+
|
|
10
|
+
transform do |item|
|
|
11
|
+
item.merge(
|
|
12
|
+
'datetime' => Time.now
|
|
13
|
+
)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
transform :add_beers
|
|
17
|
+
|
|
18
|
+
private
|
|
19
|
+
|
|
20
|
+
def add_beers(item)
|
|
21
|
+
item['beers_count'] += rand(1..3)
|
|
22
|
+
item
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
data = 10.times.lazy.map do |i|
|
|
27
|
+
{ 'id' => i + 1, 'reference' => 'hello world 1', 'beers_count' => 0 }
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# initialize Converter with Parquet GZIP compression mode
|
|
31
|
+
converter = FooParquetConverter.new(data, compression: :gzip)
|
|
32
|
+
|
|
33
|
+
# write result to file
|
|
34
|
+
converter.write('tmp/hello_world.parquet')
|
|
35
|
+
|
|
36
|
+
# in-memory result (StringIO)
|
|
37
|
+
converter.to_io
|
|
38
|
+
|
|
39
|
+
# write to temporary file (Tempfile)
|
|
40
|
+
# don't forget to `close` / `unlink` it after usage
|
|
41
|
+
converter.to_tmpfile
|
|
42
|
+
|
|
43
|
+
# convert to Arrow::Table
|
|
44
|
+
pp converter.to_arrow_table
|
|
@@ -11,7 +11,7 @@ module Parqueteur
|
|
|
11
11
|
bigint: Parqueteur::Types::Int64Type,
|
|
12
12
|
boolean: Parqueteur::Types::BooleanType,
|
|
13
13
|
date: Parqueteur::Types::Date32Type,
|
|
14
|
-
date32: Parqueteur::Types::
|
|
14
|
+
date32: Parqueteur::Types::Date32Type,
|
|
15
15
|
date64: Parqueteur::Types::Date64Type,
|
|
16
16
|
decimal: Parqueteur::Types::Decimal128Type,
|
|
17
17
|
decimal128: Parqueteur::Types::Decimal128Type,
|
|
@@ -3,14 +3,25 @@
|
|
|
3
3
|
module Parqueteur
|
|
4
4
|
module Types
|
|
5
5
|
class Decimal128Type < Parqueteur::Type
|
|
6
|
+
def initialize(options = {}, &block)
|
|
7
|
+
@scale = options.fetch(:scale)
|
|
8
|
+
@precision = options.fetch(:precision)
|
|
9
|
+
@format_str = "%.#{@scale}f"
|
|
10
|
+
super(options, &block)
|
|
11
|
+
end
|
|
12
|
+
|
|
6
13
|
def build_value_array(values)
|
|
7
|
-
Arrow::Decimal128ArrayBuilder.build(
|
|
14
|
+
Arrow::Decimal128ArrayBuilder.build(
|
|
15
|
+
@arrow_type,
|
|
16
|
+
values.map do |value|
|
|
17
|
+
Arrow::Decimal128.new(format(@format_str, BigDecimal(value)))
|
|
18
|
+
end
|
|
19
|
+
)
|
|
8
20
|
end
|
|
9
21
|
|
|
10
22
|
def arrow_type_builder
|
|
11
23
|
Arrow::Decimal128DataType.new(
|
|
12
|
-
precision
|
|
13
|
-
scale: @options.fetch(:scale)
|
|
24
|
+
@precision, @scale
|
|
14
25
|
)
|
|
15
26
|
end
|
|
16
27
|
end
|
|
@@ -3,14 +3,25 @@
|
|
|
3
3
|
module Parqueteur
|
|
4
4
|
module Types
|
|
5
5
|
class Decimal256Type < Parqueteur::Type
|
|
6
|
+
def initialize(options = {}, &block)
|
|
7
|
+
@scale = options.fetch(:scale)
|
|
8
|
+
@precision = options.fetch(:precision)
|
|
9
|
+
@format_str = "%.#{@scale}f"
|
|
10
|
+
super(options, &block)
|
|
11
|
+
end
|
|
12
|
+
|
|
6
13
|
def build_value_array(values)
|
|
7
|
-
Arrow::Decimal256ArrayBuilder.build(
|
|
14
|
+
Arrow::Decimal256ArrayBuilder.build(
|
|
15
|
+
@arrow_type,
|
|
16
|
+
values.map do |value|
|
|
17
|
+
Arrow::Decimal256.new(format(@format_str, BigDecimal(value)))
|
|
18
|
+
end
|
|
19
|
+
)
|
|
8
20
|
end
|
|
9
21
|
|
|
10
22
|
def arrow_type_builder
|
|
11
23
|
Arrow::Decimal256DataType.new(
|
|
12
|
-
precision
|
|
13
|
-
scale: @options.fetch(:scale)
|
|
24
|
+
@precision, @scale
|
|
14
25
|
)
|
|
15
26
|
end
|
|
16
27
|
end
|
data/lib/parqueteur/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: parqueteur
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.3.0
|
|
4
|
+
version: 1.3.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Julien D.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2021-10-
|
|
11
|
+
date: 2021-10-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: red-parquet
|
|
@@ -38,12 +38,14 @@ files:
|
|
|
38
38
|
- Rakefile
|
|
39
39
|
- bin/console
|
|
40
40
|
- bin/setup
|
|
41
|
+
- examples/cars.rb
|
|
41
42
|
- examples/convert-and-compression.rb
|
|
42
43
|
- examples/convert-methods.rb
|
|
43
44
|
- examples/convert-to-io.rb
|
|
44
45
|
- examples/convert-with-chunks.rb
|
|
45
46
|
- examples/convert-without-compression.rb
|
|
46
47
|
- examples/hello-world.rb
|
|
48
|
+
- examples/readme-example.rb
|
|
47
49
|
- lib/parqueteur.rb
|
|
48
50
|
- lib/parqueteur/column.rb
|
|
49
51
|
- lib/parqueteur/column_collection.rb
|