parqueteur 1.0.3 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/Gemfile.lock +1 -1
- data/README.md +43 -8
- data/examples/convert-and-compression.rb +56 -0
- data/examples/convert-methods.rb +54 -0
- data/examples/convert-to-io.rb +52 -0
- data/examples/convert-with-chunks.rb +54 -0
- data/examples/convert-without-compression.rb +52 -0
- data/examples/hello-world.rb +56 -0
- data/lib/parqueteur/column.rb +3 -20
- data/lib/parqueteur/column_collection.rb +8 -0
- data/lib/parqueteur/converter.rb +110 -67
- data/lib/parqueteur/input.rb +12 -27
- data/lib/parqueteur/struct.rb +25 -0
- data/lib/parqueteur/type.rb +21 -0
- data/lib/parqueteur/type_resolver.rb +44 -48
- data/lib/parqueteur/types/array_type.rb +21 -0
- data/lib/parqueteur/types/boolean_type.rb +15 -0
- data/lib/parqueteur/types/date32_type.rb +15 -0
- data/lib/parqueteur/types/date64_type.rb +15 -0
- data/lib/parqueteur/types/decimal128_type.rb +18 -0
- data/lib/parqueteur/types/decimal256_type.rb +18 -0
- data/lib/parqueteur/types/int32_type.rb +23 -0
- data/lib/parqueteur/types/int64_type.rb +23 -0
- data/lib/parqueteur/types/map_type.rb +36 -0
- data/lib/parqueteur/types/string_type.rb +20 -0
- data/lib/parqueteur/types/struct_type.rb +35 -0
- data/lib/parqueteur/types/time32_type.rb +19 -0
- data/lib/parqueteur/types/time64_type.rb +19 -0
- data/lib/parqueteur/types/timestamp_type.rb +24 -0
- data/lib/parqueteur/version.rb +1 -1
- data/lib/parqueteur.rb +24 -7
- data/parqueteur.gemspec +2 -2
- data/scripts/apache-arrow-ubuntu-install.sh +18 -0
- metadata +27 -8
- data/example.rb +0 -20
- data/lib/parqueteur/chunked_converter.rb +0 -28
- data/lib/parqueteur/value_array_builder.rb +0 -59
- data/test.json +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 715ea84521855ea8978e4f944fd6ae07f192bf97fdd40893b4b9bd292a3fe0b5
|
4
|
+
data.tar.gz: d05798d68479c37a8d7028cd65ff35438399de4459d84d4797399940c63513f6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a4d74f311c64f79c6e339ba05e631feb94552268936f58c86e0e9bcf70bb46fec8a94d452501cd395203ea33ab9440e015e4503f255cfcecc7703e6fc8d0a1b
|
7
|
+
data.tar.gz: 344ea6420b6c08bbe61f4f534a92c26f563193d3f6dc995dd7eb09d9df5e9f3342971bbd53570d8db0f6b910e457e62e95811c04ba2245578de3b8bb245f7dc1
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
# Parqueteur
|
2
2
|
|
3
|
-
|
3
|
+
Parqueteur enables you to generate Apache Parquet files from raw data.
|
4
4
|
|
5
|
-
|
5
|
+
## Dependencies
|
6
6
|
|
7
|
+
Since I have only tested Parqueteur on Ubuntu, I don't have install scripts for other operating systems.
|
8
|
+
### Debian/Ubuntu packages
|
9
|
+
- `libgirepository1.0-dev`
|
10
|
+
- `libarrow-dev`
|
11
|
+
- `libarrow-glib-dev`
|
12
|
+
- `libparquet-dev`
|
13
|
+
- `libparquet-glib-dev`
|
14
|
+
|
15
|
+
You can check the `scripts/apache-arrow-ubuntu-install.sh` script for a quick way to install all of them.
|
7
16
|
## Installation
|
8
17
|
|
9
18
|
Add this line to your application's Gemfile:
|
10
19
|
|
11
20
|
```ruby
|
12
|
-
gem 'parqueteur'
|
21
|
+
gem 'parqueteur', '~> 1.0'
|
22
|
+
```
|
23
|
+
|
24
|
+
> (optional) If you don't want to require Parqueteur globally, you can add `require: false` to the Gemfile entry:
|
25
|
+
```ruby
|
26
|
+
gem 'parqueteur', '~> 1.0', require: false
|
13
27
|
```
|
14
28
|
|
15
29
|
And then execute:
|
@@ -22,14 +36,35 @@ Or install it yourself as:
|
|
22
36
|
|
23
37
|
## Usage
|
24
38
|
|
25
|
-
|
39
|
+
Parqueteur provides an elegant way to generate Apache Parquet files from a defined schema.
|
40
|
+
```ruby
|
41
|
+
require 'parqueteur'
|
26
42
|
|
27
|
-
|
43
|
+
class FooParquetConverter < Parqueteur::Converter
|
44
|
+
column :id, :bigint
|
45
|
+
column :reference, :string
|
46
|
+
end
|
28
47
|
|
29
|
-
|
48
|
+
data = [
|
49
|
+
{ 'id' => 1, 'reference' => 'hello world 1' },
|
50
|
+
{ 'id' => 2, 'reference' => 'hello world 2' },
|
51
|
+
{ 'id' => 3, 'reference' => 'hello world 3' }
|
52
|
+
]
|
30
53
|
|
31
|
-
|
54
|
+
# initialize Converter with Parquet GZIP compression mode
|
55
|
+
converter = FooParquetConverter.new(data, compression: :gzip)
|
56
|
+
|
57
|
+
# write result to file
|
58
|
+
converter.write('hello_world.parquet')
|
59
|
+
|
60
|
+
# in-memory result (StringIO)
|
61
|
+
converter.to_io
|
62
|
+
|
63
|
+
# write to temporary file (Tempfile)
|
64
|
+
# don't forget to `close` / `unlink` it after usage
|
65
|
+
converter.to_tmpfile
|
66
|
+
```
|
32
67
|
|
33
68
|
## Contributing
|
34
69
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
70
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/pocketsizesun/parqueteur-ruby.
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class FooParquetConverter < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
# initialize Converter with Parquet GZIP compression mode
|
52
|
+
converter = FooParquetConverter.new(data, compression: :gzip)
|
53
|
+
|
54
|
+
# write result to file
|
55
|
+
converter.write('tmp/example.gzip-compressed.parquet')
|
56
|
+
converter.write('tmp/example.no-gzip.parquet', compression: false)
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
converter = Foo.new(data, compression: :gzip)
|
52
|
+
pp converter.to_io
|
53
|
+
pp converter.to_arrow_table
|
54
|
+
converter.write('tmp/test.parquet')
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
io = Foo.convert(data)
|
52
|
+
pp io.read
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
converter = Foo.new(data, compression: :gzip)
|
52
|
+
converter.split(200).each_with_index do |chunk, idx|
|
53
|
+
puts "#{idx}: #{chunk.path}"
|
54
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
path = 'tmp/test.parquet'
|
52
|
+
Foo.convert_to(data, path)
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
|
4
|
+
class FooParquetConverter < Parqueteur::Converter
|
5
|
+
column :id, :bigint
|
6
|
+
column :my_string_array, :array, elements: :string
|
7
|
+
column :my_date, :date
|
8
|
+
column :my_decimal, :decimal, precision: 12, scale: 4
|
9
|
+
column :my_int, :integer
|
10
|
+
column :my_map, :map, key: :string, value: :string
|
11
|
+
column :my_string, :string
|
12
|
+
column :my_struct, :struct do
|
13
|
+
field :my_struct_str, :string
|
14
|
+
field :my_struct_int, :integer
|
15
|
+
end
|
16
|
+
column :my_time, :time
|
17
|
+
column :my_timestamp, :timestamp
|
18
|
+
end
|
19
|
+
|
20
|
+
data = 100.times.collect do |i|
|
21
|
+
{
|
22
|
+
'id' => i,
|
23
|
+
'my_string_array' => %w[a b c],
|
24
|
+
'my_date' => Date.today,
|
25
|
+
'my_decimal' => BigDecimal('789000.5678'),
|
26
|
+
'my_int' => rand(1..10),
|
27
|
+
'my_map' => { 'a' => 'b' },
|
28
|
+
'my_string' => 'Hello World',
|
29
|
+
'my_struct' => {
|
30
|
+
'my_struct_str' => 'Hello World',
|
31
|
+
'my_struct_int' => 1
|
32
|
+
},
|
33
|
+
'my_time' => 3600,
|
34
|
+
'my_timestamp' => Time.now
|
35
|
+
}
|
36
|
+
end
|
37
|
+
|
38
|
+
# initialize Converter with Parquet GZIP compression mode
|
39
|
+
converter = FooParquetConverter.new(data, compression: :gzip)
|
40
|
+
|
41
|
+
# write result to file
|
42
|
+
converter.write('tmp/hello_world.compressed.parquet')
|
43
|
+
converter.write('tmp/hello_world.parquet', compression: false)
|
44
|
+
|
45
|
+
# in-memory result (StringIO)
|
46
|
+
converter.to_io
|
47
|
+
|
48
|
+
# write to temporary file (Tempfile)
|
49
|
+
# don't forget to `close` / `unlink` it after usage
|
50
|
+
converter.to_tmpfile
|
51
|
+
|
52
|
+
# Arrow Table
|
53
|
+
table = converter.to_arrow_table
|
54
|
+
table.each_record do |record|
|
55
|
+
pp record.to_h
|
56
|
+
end
|
data/lib/parqueteur/column.rb
CHANGED
@@ -4,31 +4,14 @@ module Parqueteur
|
|
4
4
|
class Column
|
5
5
|
attr_reader :name, :type, :options
|
6
6
|
|
7
|
-
def initialize(name, type, options = {})
|
7
|
+
def initialize(name, type, options = {}, &block)
|
8
8
|
@name = name.to_s
|
9
|
-
@type = type
|
9
|
+
@type = Parqueteur::TypeResolver.resolve(type, options, &block)
|
10
10
|
@options = options
|
11
11
|
end
|
12
12
|
|
13
13
|
def arrow_type
|
14
|
-
@arrow_type
|
15
|
-
end
|
16
|
-
|
17
|
-
def cast(value)
|
18
|
-
case @type
|
19
|
-
when :string then value.to_s
|
20
|
-
when :boolean then value == true
|
21
|
-
when :integer then value.to_i
|
22
|
-
when :long then value.to_i
|
23
|
-
when :timestamp
|
24
|
-
case value
|
25
|
-
when String then Time.parse(value).to_i
|
26
|
-
when Integer then value
|
27
|
-
else
|
28
|
-
raise ArgumentError, "Unable to cast '#{value}' to timestamp"
|
29
|
-
end
|
30
|
-
when :map then value
|
31
|
-
end
|
14
|
+
@type.arrow_type
|
32
15
|
end
|
33
16
|
|
34
17
|
def to_arrow_field
|
@@ -4,11 +4,18 @@ module Parqueteur
|
|
4
4
|
class ColumnCollection
|
5
5
|
include Enumerable
|
6
6
|
|
7
|
+
attr_reader :column_names
|
8
|
+
|
7
9
|
def initialize
|
8
10
|
@columns = []
|
11
|
+
@column_names = []
|
9
12
|
@columns_idx = {}
|
10
13
|
end
|
11
14
|
|
15
|
+
def key?(key)
|
16
|
+
@columns_idx.key?(key)
|
17
|
+
end
|
18
|
+
|
12
19
|
def each(&block)
|
13
20
|
@columns.each(&block)
|
14
21
|
end
|
@@ -17,6 +24,7 @@ module Parqueteur
|
|
17
24
|
unless @columns_idx.key?(column.name)
|
18
25
|
@columns_idx[column.name] = column
|
19
26
|
@columns << column
|
27
|
+
@column_names << column.name
|
20
28
|
end
|
21
29
|
|
22
30
|
true
|