parqueteur 1.0.2 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/Gemfile.lock +1 -1
- data/examples/convert-methods.rb +54 -0
- data/examples/convert-to-io.rb +52 -0
- data/examples/convert-with-chunks.rb +54 -0
- data/examples/convert-with-compression.rb +52 -0
- data/examples/convert-without-compression.rb +52 -0
- data/lib/parqueteur/column.rb +3 -20
- data/lib/parqueteur/column_collection.rb +8 -0
- data/lib/parqueteur/converter.rb +99 -66
- data/lib/parqueteur/input.rb +12 -27
- data/lib/parqueteur/struct.rb +25 -0
- data/lib/parqueteur/type.rb +21 -0
- data/lib/parqueteur/type_resolver.rb +34 -48
- data/lib/parqueteur/types/array_type.rb +21 -0
- data/lib/parqueteur/types/boolean_type.rb +15 -0
- data/lib/parqueteur/types/int32_type.rb +25 -0
- data/lib/parqueteur/types/int64_type.rb +25 -0
- data/lib/parqueteur/types/map_type.rb +36 -0
- data/lib/parqueteur/types/string_type.rb +20 -0
- data/lib/parqueteur/types/struct_type.rb +35 -0
- data/lib/parqueteur/types/timestamp_type.rb +22 -0
- data/lib/parqueteur/version.rb +1 -1
- data/lib/parqueteur.rb +17 -5
- metadata +16 -4
- data/example.rb +0 -20
- data/lib/parqueteur/chunked_converter.rb +0 -28
- data/lib/parqueteur/value_array_builder.rb +0 -59
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 056b9208a8bffcd163464dbd2cf276a9b0704e96788b77555d545eb339a4e798
|
4
|
+
data.tar.gz: 1e20d31b1fc6f198fee42546939ce289d71d66f65ffa66562cdd7841e0f24f61
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fe08a7b282c4ededc08acb5aa9f4b485ead828aee4fd1444e8bb1af80cc56ea8c20411aefe136809f91ad808bee52db261218e8b5e6b7538bfa53d1eb38eb4b5
|
7
|
+
data.tar.gz: 0fee8ec94698b7b4c9d3a089fd0094a52bd83dfda56d0652f8a5b08dfe84a88b251736e62a9da7f510e0fa3d1842e2551161178ce30b5e0f5c6ee9b903917a2c
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
converter = Foo.new(data, compression: :gzip)
|
52
|
+
pp converter.to_io
|
53
|
+
pp converter.to_arrow_table
|
54
|
+
converter.write('tmp/test.parquet')
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
io = Foo.convert(data)
|
52
|
+
pp io.read
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
converter = Foo.new(data, compression: :gzip)
|
52
|
+
converter.split(200).each_with_index do |chunk, idx|
|
53
|
+
puts "#{idx}: #{chunk.path}"
|
54
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
path = 'tmp/test.parquet'
|
52
|
+
Foo.convert_to(data, path, compression: :gzip)
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
path = 'tmp/test.parquet'
|
52
|
+
Foo.convert_to(data, path)
|
data/lib/parqueteur/column.rb
CHANGED
@@ -4,31 +4,14 @@ module Parqueteur
|
|
4
4
|
class Column
|
5
5
|
attr_reader :name, :type, :options
|
6
6
|
|
7
|
-
def initialize(name, type, options = {})
|
7
|
+
def initialize(name, type, options = {}, &block)
|
8
8
|
@name = name.to_s
|
9
|
-
@type = type
|
9
|
+
@type = Parqueteur::TypeResolver.resolve(type, options, &block)
|
10
10
|
@options = options
|
11
11
|
end
|
12
12
|
|
13
13
|
def arrow_type
|
14
|
-
@arrow_type
|
15
|
-
end
|
16
|
-
|
17
|
-
def cast(value)
|
18
|
-
case @type
|
19
|
-
when :string then value.to_s
|
20
|
-
when :boolean then value == true
|
21
|
-
when :integer then value.to_i
|
22
|
-
when :long then value.to_i
|
23
|
-
when :timestamp
|
24
|
-
case value
|
25
|
-
when String then Time.parse(value).to_i
|
26
|
-
when Integer then value
|
27
|
-
else
|
28
|
-
raise ArgumentError, "Unable to cast '#{value}' to timestamp"
|
29
|
-
end
|
30
|
-
when :map then value
|
31
|
-
end
|
14
|
+
@type.arrow_type
|
32
15
|
end
|
33
16
|
|
34
17
|
def to_arrow_field
|
@@ -4,11 +4,18 @@ module Parqueteur
|
|
4
4
|
class ColumnCollection
|
5
5
|
include Enumerable
|
6
6
|
|
7
|
+
attr_reader :column_names
|
8
|
+
|
7
9
|
def initialize
|
8
10
|
@columns = []
|
11
|
+
@column_names = []
|
9
12
|
@columns_idx = {}
|
10
13
|
end
|
11
14
|
|
15
|
+
def key?(key)
|
16
|
+
@columns_idx.key?(key)
|
17
|
+
end
|
18
|
+
|
12
19
|
def each(&block)
|
13
20
|
@columns.each(&block)
|
14
21
|
end
|
@@ -17,6 +24,7 @@ module Parqueteur
|
|
17
24
|
unless @columns_idx.key?(column.name)
|
18
25
|
@columns_idx[column.name] = column
|
19
26
|
@columns << column
|
27
|
+
@column_names << column.name
|
20
28
|
end
|
21
29
|
|
22
30
|
true
|
data/lib/parqueteur/converter.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
module Parqueteur
|
4
4
|
class Converter
|
5
|
-
|
5
|
+
DEFAULT_BATCH_SIZE = 10
|
6
6
|
|
7
7
|
def self.inline(&block)
|
8
8
|
Class.new(self, &block)
|
@@ -12,104 +12,137 @@ module Parqueteur
|
|
12
12
|
@columns ||= Parqueteur::ColumnCollection.new
|
13
13
|
end
|
14
14
|
|
15
|
-
def self.column(name, type, options = {})
|
16
|
-
columns.add(Parqueteur::Column.new(name, type, options))
|
15
|
+
def self.column(name, type, options = {}, &block)
|
16
|
+
columns.add(Parqueteur::Column.new(name, type, options, &block))
|
17
17
|
end
|
18
18
|
|
19
19
|
def self.transforms
|
20
20
|
@transforms ||= []
|
21
21
|
end
|
22
22
|
|
23
|
-
def self.transform(method_name, &block)
|
23
|
+
def self.transform(method_name = nil, &block)
|
24
24
|
transforms << (method_name || block)
|
25
25
|
end
|
26
26
|
|
27
|
-
def self.convert(input,
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
27
|
+
def self.convert(input, **kwargs)
|
28
|
+
new(input, **kwargs).to_io
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.convert_to(input, output_path, **kwargs)
|
32
|
+
converter = new(input, **kwargs)
|
33
|
+
converter.write(output_path)
|
34
|
+
end
|
35
|
+
|
36
|
+
# @param [Enumerable] An enumerable object
|
37
|
+
# @option [Symbol] compression - :gzip
|
38
|
+
def initialize(input, **kwargs)
|
39
|
+
@input = Parqueteur::Input.from(input)
|
40
|
+
@batch_size = kwargs.fetch(:batch_size, DEFAULT_BATCH_SIZE)
|
41
|
+
@compression = kwargs.fetch(:compression, nil)&.to_sym
|
42
|
+
end
|
43
|
+
|
44
|
+
def split(size)
|
45
|
+
Enumerator.new do |arr|
|
46
|
+
@input.each_slice(size) do |records|
|
47
|
+
local_converter = self.class.new(
|
48
|
+
records, batch_size: @batch_size, compression: @compression
|
49
|
+
)
|
50
|
+
file = local_converter.to_tmpfile
|
51
|
+
arr << file
|
52
|
+
file.close
|
53
|
+
file.unlink
|
54
|
+
end
|
33
55
|
end
|
34
56
|
end
|
35
57
|
|
36
|
-
def
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
when :io
|
43
|
-
to_io
|
44
|
-
when String
|
45
|
-
to_arrow_table.save(output)
|
46
|
-
when StringIO, IO
|
47
|
-
buffer = Arrow::ResizableBuffer.new(0)
|
48
|
-
to_arrow_table.save(buffer, format: :parquet)
|
49
|
-
output.write(buffer.data.to_s)
|
50
|
-
output.rewind
|
51
|
-
output
|
52
|
-
else
|
53
|
-
raise ArgumentError, "unsupported output: #{output.class}, accepted: String (filename), IO, StringIO"
|
58
|
+
def split_by_io(size)
|
59
|
+
Enumerator.new do |arr|
|
60
|
+
@input.each_slice(size) do |records|
|
61
|
+
local_converter = self.class.new(records)
|
62
|
+
arr << local_converter.to_io
|
63
|
+
end
|
54
64
|
end
|
55
65
|
end
|
56
66
|
|
57
|
-
def
|
58
|
-
|
67
|
+
def write(path)
|
68
|
+
arrow_schema = self.class.columns.arrow_schema
|
69
|
+
writer_properties = Parquet::WriterProperties.new
|
70
|
+
writer_properties.set_compression(@compression) unless @compression.nil?
|
71
|
+
|
72
|
+
Arrow::FileOutputStream.open(path, false) do |output|
|
73
|
+
Parquet::ArrowFileWriter.open(arrow_schema, output, writer_properties) do |writer|
|
74
|
+
@input.each_slice(@batch_size) do |records|
|
75
|
+
arrow_table = build_arrow_table(records)
|
76
|
+
writer.write_table(arrow_table, 1024)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
true
|
82
|
+
end
|
83
|
+
|
84
|
+
def to_tmpfile
|
85
|
+
tempfile = Tempfile.new
|
86
|
+
tempfile.binmode
|
87
|
+
write(tempfile.path)
|
88
|
+
tempfile.rewind
|
89
|
+
tempfile
|
59
90
|
end
|
60
91
|
|
61
92
|
def to_io
|
62
|
-
|
93
|
+
tmpfile = to_tmpfile
|
94
|
+
strio = StringIO.new(tmpfile.read)
|
95
|
+
tmpfile.close
|
96
|
+
tmpfile.unlink
|
97
|
+
strio
|
98
|
+
end
|
99
|
+
|
100
|
+
def to_arrow_table
|
101
|
+
file = to_tmpfile
|
102
|
+
table = Arrow::Table.load(file.path, format: :parquet)
|
103
|
+
file.close
|
104
|
+
file.unlink
|
105
|
+
table
|
63
106
|
end
|
64
107
|
|
65
108
|
def to_blob
|
66
|
-
|
109
|
+
to_io.read
|
67
110
|
end
|
68
111
|
|
69
|
-
|
70
|
-
transforms = self.class.transforms
|
112
|
+
private
|
71
113
|
|
72
|
-
|
73
|
-
|
74
|
-
values = self.class.columns.each_with_object({}) do |column, hash|
|
75
|
-
hash[column.name] = []
|
76
|
-
end
|
114
|
+
def build_arrow_table(records)
|
115
|
+
transforms = self.class.transforms
|
77
116
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
item = \
|
82
|
-
if transform.is_a?(Symbol)
|
83
|
-
__send__(transform, item)
|
84
|
-
else
|
85
|
-
transform.call(item)
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
117
|
+
values = self.class.columns.each_with_object({}) do |column, hash|
|
118
|
+
hash[column.name] = []
|
119
|
+
end
|
89
120
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
121
|
+
records.each do |item|
|
122
|
+
if transforms.length > 0
|
123
|
+
transforms.each do |transform|
|
124
|
+
item = \
|
125
|
+
if transform.is_a?(Symbol)
|
126
|
+
__send__(transform, item)
|
127
|
+
else
|
128
|
+
transform.call(item)
|
129
|
+
end
|
96
130
|
end
|
97
131
|
end
|
98
132
|
|
99
|
-
values.
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
)
|
106
|
-
)
|
133
|
+
values.each_key do |value_key|
|
134
|
+
if item.key?(value_key)
|
135
|
+
values[value_key] << item[value_key]
|
136
|
+
else
|
137
|
+
values[value_key] << nil
|
138
|
+
end
|
107
139
|
end
|
108
140
|
end
|
109
141
|
|
110
142
|
Arrow::Table.new(
|
111
|
-
|
112
|
-
|
143
|
+
values.each_with_object({}) do |item, hash|
|
144
|
+
column = self.class.columns.find(item[0])
|
145
|
+
hash[item[0]] = column.type.build_value_array(item[1])
|
113
146
|
end
|
114
147
|
)
|
115
148
|
end
|
data/lib/parqueteur/input.rb
CHANGED
@@ -4,40 +4,25 @@ module Parqueteur
|
|
4
4
|
class Input
|
5
5
|
include Enumerable
|
6
6
|
|
7
|
-
def self.from(arg
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
if File.exist?(arg)
|
12
|
-
File.new(arg, 'r')
|
13
|
-
else
|
14
|
-
arg.split("\n")
|
15
|
-
end
|
16
|
-
when Array, Enumerator
|
17
|
-
arg
|
18
|
-
end,
|
19
|
-
options
|
20
|
-
)
|
7
|
+
def self.from(arg)
|
8
|
+
return arg if arg.is_a?(self)
|
9
|
+
|
10
|
+
new(arg)
|
21
11
|
end
|
22
12
|
|
23
|
-
def initialize(source
|
13
|
+
def initialize(source)
|
14
|
+
unless source.is_a?(Enumerable)
|
15
|
+
raise ArgumentError, 'Enumerable object expected'
|
16
|
+
end
|
17
|
+
|
24
18
|
@source = source
|
25
|
-
@options = options
|
26
19
|
end
|
27
20
|
|
28
21
|
def each(&block)
|
29
|
-
|
30
|
-
when File
|
31
|
-
if @options.fetch(:json_newlines, true) == true
|
32
|
-
@source.each_line do |line|
|
33
|
-
yield(JSON.parse(line.strip))
|
34
|
-
end
|
35
|
-
else
|
36
|
-
JSON.parse(@source.read).each(&block)
|
37
|
-
end
|
38
|
-
@source.rewind
|
39
|
-
when Array, Enumerator
|
22
|
+
if block_given?
|
40
23
|
@source.each(&block)
|
24
|
+
else
|
25
|
+
@source.to_enum(:each)
|
41
26
|
end
|
42
27
|
end
|
43
28
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Parqueteur
|
4
|
+
class Struct
|
5
|
+
def initialize(&block)
|
6
|
+
instance_exec(&block)
|
7
|
+
end
|
8
|
+
|
9
|
+
def fields
|
10
|
+
@fields ||= Parqueteur::ColumnCollection.new
|
11
|
+
end
|
12
|
+
|
13
|
+
def field(name, type, options = {}, &block)
|
14
|
+
fields.add(Parqueteur::Column.new(name, type, options, &block))
|
15
|
+
end
|
16
|
+
|
17
|
+
def key?(key)
|
18
|
+
fields.key?(key)
|
19
|
+
end
|
20
|
+
|
21
|
+
def to_arrow_type
|
22
|
+
fields.collect(&:to_arrow_field)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Parqueteur
|
4
|
+
class Type
|
5
|
+
attr_reader :options, :arrow_type
|
6
|
+
|
7
|
+
def initialize(options = {}, &block)
|
8
|
+
@options = options
|
9
|
+
@block = block
|
10
|
+
@arrow_type = arrow_type_builder
|
11
|
+
end
|
12
|
+
|
13
|
+
def build_value_array(values)
|
14
|
+
raise "#to_arrow_field must be implemented in #{self.class}"
|
15
|
+
end
|
16
|
+
|
17
|
+
def resolve(type, options = {})
|
18
|
+
Parqueteur::TypeResolver.resolve(type, options)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -2,60 +2,46 @@
|
|
2
2
|
|
3
3
|
module Parqueteur
|
4
4
|
class TypeResolver
|
5
|
-
|
6
|
-
|
5
|
+
include Singleton
|
6
|
+
|
7
|
+
def self.registered_types
|
8
|
+
@registered_types ||= {
|
9
|
+
array: Parqueteur::Types::ArrayType,
|
10
|
+
bigint: Parqueteur::Types::Int64Type,
|
11
|
+
boolean: Parqueteur::Types::BooleanType,
|
12
|
+
int32: Parqueteur::Types::Int32Type,
|
13
|
+
int64: Parqueteur::Types::Int64Type,
|
14
|
+
integer: Parqueteur::Types::Int32Type,
|
15
|
+
map: Parqueteur::Types::MapType,
|
16
|
+
string: Parqueteur::Types::StringType,
|
17
|
+
struct: Parqueteur::Types::StructType,
|
18
|
+
timestamp: Parqueteur::Types::TimestampType
|
19
|
+
}
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.register_type(type, klass)
|
23
|
+
registered_types[type] = klass
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.resolve(*args, &block)
|
27
|
+
instance.resolve(*args, &block)
|
7
28
|
end
|
8
29
|
|
9
|
-
def resolve(type, options = {})
|
10
|
-
|
11
|
-
|
12
|
-
elements_opt = options.fetch(:elements)
|
13
|
-
Arrow::ListDataType.new(
|
14
|
-
if elements_opt.is_a?(Hash)
|
15
|
-
resolve(elements_opt.fetch(:type), elements_opt)
|
16
|
-
else
|
17
|
-
resolve(elements_opt)
|
18
|
-
end
|
19
|
-
)
|
20
|
-
when :boolean
|
21
|
-
Arrow::BooleanDataType.new
|
22
|
-
when :integer
|
23
|
-
if options.fetch(:unsigned, false) == true
|
24
|
-
Arrow::UInt32DataType.new
|
25
|
-
else
|
26
|
-
Arrow::Int32DataType.new
|
27
|
-
end
|
28
|
-
when :long
|
29
|
-
if options.fetch(:unsigned, false) == true
|
30
|
-
Arrow::UInt64DataType.new
|
31
|
-
else
|
32
|
-
Arrow::Int64DataType.new
|
33
|
-
end
|
34
|
-
when :timestamp
|
35
|
-
Arrow::TimestampDataType.new(
|
36
|
-
options.fetch(:unit, :second)
|
37
|
-
)
|
38
|
-
when :string
|
39
|
-
Arrow::StringDataType.new
|
40
|
-
when :map
|
41
|
-
map_value = options.fetch(:value)
|
42
|
-
Arrow::MapDataType.new(
|
43
|
-
resolve(options.fetch(:key)),
|
44
|
-
if map_value.is_a?(Hash)
|
45
|
-
resolve(map_value.fetch(:type), map_value)
|
46
|
-
else
|
47
|
-
resolve(map_value)
|
48
|
-
end
|
49
|
-
)
|
30
|
+
def resolve(type, options = {}, &block)
|
31
|
+
if type.is_a?(Symbol)
|
32
|
+
resolve_from_symbol(type, options, &block)
|
50
33
|
else
|
51
|
-
|
34
|
+
type.new(options, &block)
|
52
35
|
end
|
53
36
|
end
|
54
|
-
end
|
55
|
-
end
|
56
37
|
|
57
|
-
private
|
38
|
+
private
|
58
39
|
|
59
|
-
def
|
40
|
+
def resolve_from_symbol(type, options, &block)
|
41
|
+
type_klass = self.class.registered_types.fetch(type.to_sym, nil)
|
42
|
+
raise Parqueteur::TypeNotFound, type if type_klass.nil?
|
60
43
|
|
44
|
+
type_klass.new(options, &block)
|
45
|
+
end
|
46
|
+
end
|
61
47
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Parqueteur
|
4
|
+
module Types
|
5
|
+
class ArrayType < Parqueteur::Type
|
6
|
+
def build_value_array(values)
|
7
|
+
Arrow::ListArrayBuilder.build(arrow_type, values)
|
8
|
+
end
|
9
|
+
|
10
|
+
def arrow_type_builder
|
11
|
+
Arrow::ListDataType.new(
|
12
|
+
if options[:elements].is_a?(Hash)
|
13
|
+
resolve(options[:elements].fetch(:type), options[:elements]).arrow_type
|
14
|
+
else
|
15
|
+
resolve(options[:elements]).arrow_type
|
16
|
+
end
|
17
|
+
)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Parqueteur
|
4
|
+
module Types
|
5
|
+
class BooleanType < Parqueteur::Type
|
6
|
+
def build_value_array(values)
|
7
|
+
Arrow::BooleanArray.new(values)
|
8
|
+
end
|
9
|
+
|
10
|
+
def arrow_type_builder
|
11
|
+
Arrow::BooleanDataType.new
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Parqueteur
|
4
|
+
module Types
|
5
|
+
class Int32Type < Parqueteur::Type
|
6
|
+
def build_value_array(values)
|
7
|
+
if options.fetch(:unsigned, false) == true
|
8
|
+
Arrow::UInt32Array.new(values)
|
9
|
+
else
|
10
|
+
Arrow::Int32Array.new(values)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def arrow_type_builder
|
15
|
+
if options.fetch(:unsigned, false) == true
|
16
|
+
Arrow::UInt32DataType.new
|
17
|
+
else
|
18
|
+
Arrow::Int32DataType.new
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# when :integer
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Parqueteur
|
4
|
+
module Types
|
5
|
+
class Int64Type < Parqueteur::Type
|
6
|
+
def build_value_array(values)
|
7
|
+
if options.fetch(:unsigned, false) == true
|
8
|
+
Arrow::UInt64Array.new(values)
|
9
|
+
else
|
10
|
+
Arrow::Int64Array.new(values)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def arrow_type_builder
|
15
|
+
if options.fetch(:unsigned, false) == true
|
16
|
+
Arrow::UInt64DataType.new
|
17
|
+
else
|
18
|
+
Arrow::Int64DataType.new
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# when :integer
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Parqueteur
|
4
|
+
module Types
|
5
|
+
class MapType < Parqueteur::Type
|
6
|
+
def build_value_array(values)
|
7
|
+
builder = Arrow::MapArrayBuilder.new(arrow_type)
|
8
|
+
values.each do |entry|
|
9
|
+
builder.append_value
|
10
|
+
next if entry.nil?
|
11
|
+
|
12
|
+
entry.each do |k, v|
|
13
|
+
builder.key_builder.append(k)
|
14
|
+
builder.item_builder.append(v)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
builder.finish
|
19
|
+
end
|
20
|
+
|
21
|
+
def arrow_type_builder
|
22
|
+
map_value = options.fetch(:value)
|
23
|
+
|
24
|
+
Arrow::MapDataType.new(
|
25
|
+
resolve(options.fetch(:key)).arrow_type,
|
26
|
+
if map_value.is_a?(Hash)
|
27
|
+
resolve(map_value.fetch(:type), map_value).arrow_type
|
28
|
+
else
|
29
|
+
resolve(map_value).arrow_type
|
30
|
+
end
|
31
|
+
)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# when :timestamp
|
4
|
+
# Arrow::TimestampDataType.new(
|
5
|
+
# options.fetch(:unit, :second)
|
6
|
+
# )
|
7
|
+
|
8
|
+
module Parqueteur
|
9
|
+
module Types
|
10
|
+
class StringType < Parqueteur::Type
|
11
|
+
def build_value_array(values)
|
12
|
+
Arrow::StringArray.new(values)
|
13
|
+
end
|
14
|
+
|
15
|
+
def arrow_type_builder
|
16
|
+
Arrow::StringDataType.new
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# when :timestamp
|
4
|
+
# Arrow::TimestampDataType.new(
|
5
|
+
# options.fetch(:unit, :second)
|
6
|
+
# )
|
7
|
+
|
8
|
+
module Parqueteur
|
9
|
+
module Types
|
10
|
+
class StructType < Parqueteur::Type
|
11
|
+
def build_value_array(values)
|
12
|
+
values.each do |value|
|
13
|
+
next if value.nil?
|
14
|
+
|
15
|
+
value.each_key do |key|
|
16
|
+
next if struct_object.key?(key)
|
17
|
+
|
18
|
+
raise Parqueteur::Error, "Struct field '#{key}' not found"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
Arrow::StructArrayBuilder.build(arrow_type, values)
|
22
|
+
end
|
23
|
+
|
24
|
+
def arrow_type_builder
|
25
|
+
Arrow::StructDataType.new(struct_object.to_arrow_type)
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def struct_object
|
31
|
+
@struct_object ||= Parqueteur::Struct.new(&@block)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# when :timestamp
|
4
|
+
# Arrow::TimestampDataType.new(
|
5
|
+
# options.fetch(:unit, :second)
|
6
|
+
# )
|
7
|
+
|
8
|
+
module Parqueteur
|
9
|
+
module Types
|
10
|
+
class TimestampType < Parqueteur::Type
|
11
|
+
def build_value_array(values)
|
12
|
+
Arrow::TimestampArray.new(values)
|
13
|
+
end
|
14
|
+
|
15
|
+
def arrow_type_builder
|
16
|
+
Arrow::TimestampDataType.new(
|
17
|
+
options.fetch(:unit, :second)
|
18
|
+
)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/parqueteur/version.rb
CHANGED
data/lib/parqueteur.rb
CHANGED
@@ -1,17 +1,29 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
require '
|
3
|
+
require 'json'
|
4
|
+
require 'singleton'
|
5
|
+
require 'tempfile'
|
6
|
+
|
7
|
+
require_relative 'parqueteur/version'
|
5
8
|
require 'parqueteur/column'
|
6
9
|
require 'parqueteur/column_collection'
|
7
10
|
require 'parqueteur/converter'
|
8
|
-
require 'parqueteur/chunked_converter'
|
9
11
|
require 'parqueteur/input'
|
10
|
-
require 'parqueteur/
|
11
|
-
require '
|
12
|
+
require 'parqueteur/struct'
|
13
|
+
require 'parqueteur/type'
|
14
|
+
require 'parqueteur/type_resolver'
|
15
|
+
require 'parqueteur/types/array_type'
|
16
|
+
require 'parqueteur/types/boolean_type'
|
17
|
+
require 'parqueteur/types/int32_type'
|
18
|
+
require 'parqueteur/types/int64_type'
|
19
|
+
require 'parqueteur/types/map_type'
|
20
|
+
require 'parqueteur/types/string_type'
|
21
|
+
require 'parqueteur/types/struct_type'
|
22
|
+
require 'parqueteur/types/timestamp_type'
|
12
23
|
require 'parquet'
|
13
24
|
|
14
25
|
module Parqueteur
|
15
26
|
class Error < StandardError; end
|
27
|
+
class TypeNotFound < Error; end
|
16
28
|
# Your code goes here...
|
17
29
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parqueteur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Julien D.
|
@@ -38,15 +38,27 @@ files:
|
|
38
38
|
- Rakefile
|
39
39
|
- bin/console
|
40
40
|
- bin/setup
|
41
|
-
-
|
41
|
+
- examples/convert-methods.rb
|
42
|
+
- examples/convert-to-io.rb
|
43
|
+
- examples/convert-with-chunks.rb
|
44
|
+
- examples/convert-with-compression.rb
|
45
|
+
- examples/convert-without-compression.rb
|
42
46
|
- lib/parqueteur.rb
|
43
|
-
- lib/parqueteur/chunked_converter.rb
|
44
47
|
- lib/parqueteur/column.rb
|
45
48
|
- lib/parqueteur/column_collection.rb
|
46
49
|
- lib/parqueteur/converter.rb
|
47
50
|
- lib/parqueteur/input.rb
|
51
|
+
- lib/parqueteur/struct.rb
|
52
|
+
- lib/parqueteur/type.rb
|
48
53
|
- lib/parqueteur/type_resolver.rb
|
49
|
-
- lib/parqueteur/
|
54
|
+
- lib/parqueteur/types/array_type.rb
|
55
|
+
- lib/parqueteur/types/boolean_type.rb
|
56
|
+
- lib/parqueteur/types/int32_type.rb
|
57
|
+
- lib/parqueteur/types/int64_type.rb
|
58
|
+
- lib/parqueteur/types/map_type.rb
|
59
|
+
- lib/parqueteur/types/string_type.rb
|
60
|
+
- lib/parqueteur/types/struct_type.rb
|
61
|
+
- lib/parqueteur/types/timestamp_type.rb
|
50
62
|
- lib/parqueteur/version.rb
|
51
63
|
- parqueteur.gemspec
|
52
64
|
- test.json
|
data/example.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
require 'bundler/setup'
|
2
|
-
require 'parqueteur'
|
3
|
-
|
4
|
-
class Foo < Parqueteur::Converter
|
5
|
-
column :id, :long
|
6
|
-
column :reference, :string
|
7
|
-
column :hash, :map, key: :string, value: :string
|
8
|
-
column :valid, :boolean
|
9
|
-
column :total, :integer
|
10
|
-
end
|
11
|
-
|
12
|
-
LETTERS = ('a'..'z').to_a
|
13
|
-
|
14
|
-
data = 1000.times.collect do |i|
|
15
|
-
{ 'id' => i + 1, 'reference' => "coucou:#{i}", 'hash' => { 'a' => LETTERS.sample }, 'valid' => rand < 0.5, 'total' => rand(100..500) }
|
16
|
-
end
|
17
|
-
|
18
|
-
chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
|
19
|
-
pp chunked_converter.write_files('test')
|
20
|
-
# puts Foo.convert(data, output: 'test.parquet')
|
@@ -1,28 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Parqueteur
|
4
|
-
class ChunkedConverter
|
5
|
-
attr_reader :schema
|
6
|
-
|
7
|
-
def initialize(input, converter, chunk_size = 200)
|
8
|
-
@input = Parqueteur::Input.from(input)
|
9
|
-
@converter = converter
|
10
|
-
@chunk_size = chunk_size
|
11
|
-
end
|
12
|
-
|
13
|
-
def chunks
|
14
|
-
Enumerator.new do |arr|
|
15
|
-
@input.each_slice(@chunk_size) do |chunk|
|
16
|
-
local_converter = @converter.new(chunk)
|
17
|
-
arr << local_converter.to_io
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
def write_files(prefix)
|
23
|
-
chunks.each_with_index do |chunk, idx|
|
24
|
-
File.write("#{prefix}.#{idx}.parquet", chunk.read)
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
@@ -1,59 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Parqueteur
|
4
|
-
class ValueArrayBuilder
|
5
|
-
attr_reader :type, :options, :arrow_type
|
6
|
-
|
7
|
-
def self.build(input, type, options)
|
8
|
-
new(type, options).build(input)
|
9
|
-
end
|
10
|
-
|
11
|
-
def initialize(type, options)
|
12
|
-
@type = type
|
13
|
-
@options = options
|
14
|
-
@arrow_type = Parqueteur::TypeResolver.resolve(type, options)
|
15
|
-
end
|
16
|
-
|
17
|
-
def build(input)
|
18
|
-
return if input.nil?
|
19
|
-
|
20
|
-
case type
|
21
|
-
when :array
|
22
|
-
Arrow::ListArrayBuilder.build(arrow_type, input)
|
23
|
-
when :map
|
24
|
-
builder = Arrow::MapArrayBuilder.new(arrow_type)
|
25
|
-
input.each do |entry|
|
26
|
-
builder.append_value
|
27
|
-
next if entry.nil?
|
28
|
-
|
29
|
-
entry.each do |k, v|
|
30
|
-
builder.key_builder.append(k)
|
31
|
-
builder.item_builder.append(v)
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
builder.finish
|
36
|
-
when :boolean
|
37
|
-
Arrow::BooleanArray.new(input)
|
38
|
-
when :integer
|
39
|
-
if options.fetch(:unsigned, false) == true
|
40
|
-
Arrow::UInt32Array.new(input)
|
41
|
-
else
|
42
|
-
Arrow::Int32Array.new(input)
|
43
|
-
end
|
44
|
-
when :long
|
45
|
-
if options.fetch(:unsigned, false) == true
|
46
|
-
Arrow::UInt64Array.new(input)
|
47
|
-
else
|
48
|
-
Arrow::Int64Array.new(input)
|
49
|
-
end
|
50
|
-
when :string
|
51
|
-
Arrow::StringArray.new(input)
|
52
|
-
when :timestamp
|
53
|
-
Arrow::TimestampArray.new(input)
|
54
|
-
else
|
55
|
-
raise Error, "unknown type: #{type}"
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|