parqueteur 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/examples/convert-methods.rb +54 -0
- data/examples/convert-to-io.rb +52 -0
- data/examples/convert-with-chunks.rb +54 -0
- data/examples/convert-with-compression.rb +52 -0
- data/examples/convert-without-compression.rb +52 -0
- data/lib/parqueteur/converter.rb +96 -77
- data/lib/parqueteur/input.rb +12 -27
- data/lib/parqueteur/version.rb +1 -1
- data/lib/parqueteur.rb +2 -2
- metadata +6 -3
- data/example.rb +0 -39
- data/lib/parqueteur/chunked_converter.rb +0 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 056b9208a8bffcd163464dbd2cf276a9b0704e96788b77555d545eb339a4e798
|
4
|
+
data.tar.gz: 1e20d31b1fc6f198fee42546939ce289d71d66f65ffa66562cdd7841e0f24f61
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fe08a7b282c4ededc08acb5aa9f4b485ead828aee4fd1444e8bb1af80cc56ea8c20411aefe136809f91ad808bee52db261218e8b5e6b7538bfa53d1eb38eb4b5
|
7
|
+
data.tar.gz: 0fee8ec94698b7b4c9d3a089fd0094a52bd83dfda56d0652f8a5b08dfe84a88b251736e62a9da7f510e0fa3d1842e2551161178ce30b5e0f5c6ee9b903917a2c
|
data/Gemfile.lock
CHANGED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
converter = Foo.new(data, compression: :gzip)
|
52
|
+
pp converter.to_io
|
53
|
+
pp converter.to_arrow_table
|
54
|
+
converter.write('tmp/test.parquet')
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
io = Foo.convert(data)
|
52
|
+
pp io.read
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
converter = Foo.new(data, compression: :gzip)
|
52
|
+
converter.split(200).each_with_index do |chunk, idx|
|
53
|
+
puts "#{idx}: #{chunk.path}"
|
54
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
path = 'tmp/test.parquet'
|
52
|
+
Foo.convert_to(data, path, compression: :gzip)
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'parqueteur'
|
3
|
+
require 'securerandom'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
class Foo < Parqueteur::Converter
|
7
|
+
column :id, :bigint
|
8
|
+
column :reference, :string
|
9
|
+
column :hash, :map, key: :string, value: :string
|
10
|
+
# column :hash2, :map, key: :string, value: :string
|
11
|
+
# column :hash3, :map, key: :string, value: :string
|
12
|
+
column :valid, :boolean
|
13
|
+
column :total, :integer
|
14
|
+
column :numbers, :array, elements: :integer
|
15
|
+
column :my_struct, :struct do
|
16
|
+
field :test, :string
|
17
|
+
field :mon_nombre, :integer
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def random_hash
|
22
|
+
{
|
23
|
+
'a' => SecureRandom.hex(128),
|
24
|
+
'b' => SecureRandom.hex(128),
|
25
|
+
'c' => SecureRandom.hex(128),
|
26
|
+
'd' => SecureRandom.hex(128),
|
27
|
+
'e' => SecureRandom.hex(128),
|
28
|
+
'f' => SecureRandom.hex(128),
|
29
|
+
'g' => SecureRandom.hex(128),
|
30
|
+
'h' => SecureRandom.hex(128),
|
31
|
+
'i' => SecureRandom.hex(128),
|
32
|
+
'j' => SecureRandom.hex(128),
|
33
|
+
'k' => SecureRandom.hex(128),
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
data = 10000.times.collect do |i|
|
38
|
+
{
|
39
|
+
'id' => i + 1,
|
40
|
+
'reference' => "coucou:#{i}",
|
41
|
+
'hash' => random_hash,
|
42
|
+
# 'hash2' => random_hash,
|
43
|
+
# 'hash3' => random_hash,
|
44
|
+
'valid' => rand < 0.5,
|
45
|
+
'total' => rand(100..500),
|
46
|
+
'numbers' => [1, 2, 3]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
puts "data generation OK"
|
50
|
+
|
51
|
+
path = 'tmp/test.parquet'
|
52
|
+
Foo.convert_to(data, path)
|
data/lib/parqueteur/converter.rb
CHANGED
@@ -2,9 +2,7 @@
|
|
2
2
|
|
3
3
|
module Parqueteur
|
4
4
|
class Converter
|
5
|
-
|
6
|
-
|
7
|
-
DEFAULT_BATCH_SIZE = 25
|
5
|
+
DEFAULT_BATCH_SIZE = 10
|
8
6
|
|
9
7
|
def self.inline(&block)
|
10
8
|
Class.new(self, &block)
|
@@ -26,106 +24,127 @@ module Parqueteur
|
|
26
24
|
transforms << (method_name || block)
|
27
25
|
end
|
28
26
|
|
29
|
-
def self.convert(input,
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
27
|
+
def self.convert(input, **kwargs)
|
28
|
+
new(input, **kwargs).to_io
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.convert_to(input, output_path, **kwargs)
|
32
|
+
converter = new(input, **kwargs)
|
33
|
+
converter.write(output_path)
|
34
|
+
end
|
35
|
+
|
36
|
+
# @param [Enumerable] An enumerable object
|
37
|
+
# @option [Symbol] compression - :gzip
|
38
|
+
def initialize(input, **kwargs)
|
39
|
+
@input = Parqueteur::Input.from(input)
|
40
|
+
@batch_size = kwargs.fetch(:batch_size, DEFAULT_BATCH_SIZE)
|
41
|
+
@compression = kwargs.fetch(:compression, nil)&.to_sym
|
42
|
+
end
|
43
|
+
|
44
|
+
def split(size)
|
45
|
+
Enumerator.new do |arr|
|
46
|
+
@input.each_slice(size) do |records|
|
47
|
+
local_converter = self.class.new(
|
48
|
+
records, batch_size: @batch_size, compression: @compression
|
49
|
+
)
|
50
|
+
file = local_converter.to_tmpfile
|
51
|
+
arr << file
|
52
|
+
file.close
|
53
|
+
file.unlink
|
54
|
+
end
|
35
55
|
end
|
36
56
|
end
|
37
57
|
|
38
|
-
def
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
output
|
54
|
-
|
55
|
-
|
56
|
-
|
58
|
+
def split_by_io(size)
|
59
|
+
Enumerator.new do |arr|
|
60
|
+
@input.each_slice(size) do |records|
|
61
|
+
local_converter = self.class.new(records)
|
62
|
+
arr << local_converter.to_io
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def write(path)
|
68
|
+
arrow_schema = self.class.columns.arrow_schema
|
69
|
+
writer_properties = Parquet::WriterProperties.new
|
70
|
+
writer_properties.set_compression(@compression) unless @compression.nil?
|
71
|
+
|
72
|
+
Arrow::FileOutputStream.open(path, false) do |output|
|
73
|
+
Parquet::ArrowFileWriter.open(arrow_schema, output, writer_properties) do |writer|
|
74
|
+
@input.each_slice(@batch_size) do |records|
|
75
|
+
arrow_table = build_arrow_table(records)
|
76
|
+
writer.write_table(arrow_table, 1024)
|
77
|
+
end
|
78
|
+
end
|
57
79
|
end
|
80
|
+
|
81
|
+
true
|
58
82
|
end
|
59
83
|
|
60
|
-
def
|
61
|
-
|
84
|
+
def to_tmpfile
|
85
|
+
tempfile = Tempfile.new
|
86
|
+
tempfile.binmode
|
87
|
+
write(tempfile.path)
|
88
|
+
tempfile.rewind
|
89
|
+
tempfile
|
62
90
|
end
|
63
91
|
|
64
92
|
def to_io
|
65
|
-
|
93
|
+
tmpfile = to_tmpfile
|
94
|
+
strio = StringIO.new(tmpfile.read)
|
95
|
+
tmpfile.close
|
96
|
+
tmpfile.unlink
|
97
|
+
strio
|
98
|
+
end
|
99
|
+
|
100
|
+
def to_arrow_table
|
101
|
+
file = to_tmpfile
|
102
|
+
table = Arrow::Table.load(file.path, format: :parquet)
|
103
|
+
file.close
|
104
|
+
file.unlink
|
105
|
+
table
|
66
106
|
end
|
67
107
|
|
68
108
|
def to_blob
|
69
|
-
|
109
|
+
to_io.read
|
70
110
|
end
|
71
111
|
|
72
|
-
|
112
|
+
private
|
113
|
+
|
114
|
+
def build_arrow_table(records)
|
73
115
|
transforms = self.class.transforms
|
74
116
|
|
75
|
-
|
117
|
+
values = self.class.columns.each_with_object({}) do |column, hash|
|
76
118
|
hash[column.name] = []
|
77
119
|
end
|
78
120
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
item = \
|
89
|
-
if transform.is_a?(Symbol)
|
90
|
-
__send__(transform, item)
|
91
|
-
else
|
92
|
-
transform.call(item)
|
93
|
-
end
|
94
|
-
end
|
121
|
+
records.each do |item|
|
122
|
+
if transforms.length > 0
|
123
|
+
transforms.each do |transform|
|
124
|
+
item = \
|
125
|
+
if transform.is_a?(Symbol)
|
126
|
+
__send__(transform, item)
|
127
|
+
else
|
128
|
+
transform.call(item)
|
129
|
+
end
|
95
130
|
end
|
131
|
+
end
|
96
132
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
end
|
133
|
+
values.each_key do |value_key|
|
134
|
+
if item.key?(value_key)
|
135
|
+
values[value_key] << item[value_key]
|
136
|
+
else
|
137
|
+
values[value_key] << nil
|
103
138
|
end
|
104
139
|
end
|
140
|
+
end
|
105
141
|
|
106
|
-
|
142
|
+
Arrow::Table.new(
|
143
|
+
values.each_with_object({}) do |item, hash|
|
107
144
|
column = self.class.columns.find(item[0])
|
108
|
-
hash[item[0]].
|
109
|
-
column.type.build_value_array(item[1])
|
110
|
-
)
|
145
|
+
hash[item[0]] = column.type.build_value_array(item[1])
|
111
146
|
end
|
112
|
-
|
113
|
-
items_count += items.length
|
114
|
-
end
|
115
|
-
|
116
|
-
if items_count > 0
|
117
|
-
Arrow::Table.new(
|
118
|
-
chunks.transform_values! do |value|
|
119
|
-
Arrow::ChunkedArray.new(value)
|
120
|
-
end
|
121
|
-
)
|
122
|
-
else
|
123
|
-
Arrow::Table.new(
|
124
|
-
self.class.columns.each_with_object({}) do |column, hash|
|
125
|
-
hash[column.name] = column.type.build_value_array([])
|
126
|
-
end
|
127
|
-
)
|
128
|
-
end
|
147
|
+
)
|
129
148
|
end
|
130
149
|
end
|
131
150
|
end
|
data/lib/parqueteur/input.rb
CHANGED
@@ -4,40 +4,25 @@ module Parqueteur
|
|
4
4
|
class Input
|
5
5
|
include Enumerable
|
6
6
|
|
7
|
-
def self.from(arg
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
if File.exist?(arg)
|
12
|
-
File.new(arg, 'r')
|
13
|
-
else
|
14
|
-
arg.split("\n")
|
15
|
-
end
|
16
|
-
when Enumerable
|
17
|
-
arg
|
18
|
-
end,
|
19
|
-
options
|
20
|
-
)
|
7
|
+
def self.from(arg)
|
8
|
+
return arg if arg.is_a?(self)
|
9
|
+
|
10
|
+
new(arg)
|
21
11
|
end
|
22
12
|
|
23
|
-
def initialize(source
|
13
|
+
def initialize(source)
|
14
|
+
unless source.is_a?(Enumerable)
|
15
|
+
raise ArgumentError, 'Enumerable object expected'
|
16
|
+
end
|
17
|
+
|
24
18
|
@source = source
|
25
|
-
@options = options
|
26
19
|
end
|
27
20
|
|
28
21
|
def each(&block)
|
29
|
-
|
30
|
-
when File
|
31
|
-
if @options.fetch(:json_newlines, true) == true
|
32
|
-
@source.each_line do |line|
|
33
|
-
yield(JSON.parse(line.strip))
|
34
|
-
end
|
35
|
-
else
|
36
|
-
JSON.parse(@source.read).each(&block)
|
37
|
-
end
|
38
|
-
@source.rewind
|
39
|
-
when Enumerable
|
22
|
+
if block_given?
|
40
23
|
@source.each(&block)
|
24
|
+
else
|
25
|
+
@source.to_enum(:each)
|
41
26
|
end
|
42
27
|
end
|
43
28
|
end
|
data/lib/parqueteur/version.rb
CHANGED
data/lib/parqueteur.rb
CHANGED
@@ -2,9 +2,9 @@
|
|
2
2
|
|
3
3
|
require 'json'
|
4
4
|
require 'singleton'
|
5
|
+
require 'tempfile'
|
5
6
|
|
6
|
-
require_relative
|
7
|
-
require 'parqueteur/chunked_converter'
|
7
|
+
require_relative 'parqueteur/version'
|
8
8
|
require 'parqueteur/column'
|
9
9
|
require 'parqueteur/column_collection'
|
10
10
|
require 'parqueteur/converter'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parqueteur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Julien D.
|
@@ -38,9 +38,12 @@ files:
|
|
38
38
|
- Rakefile
|
39
39
|
- bin/console
|
40
40
|
- bin/setup
|
41
|
-
-
|
41
|
+
- examples/convert-methods.rb
|
42
|
+
- examples/convert-to-io.rb
|
43
|
+
- examples/convert-with-chunks.rb
|
44
|
+
- examples/convert-with-compression.rb
|
45
|
+
- examples/convert-without-compression.rb
|
42
46
|
- lib/parqueteur.rb
|
43
|
-
- lib/parqueteur/chunked_converter.rb
|
44
47
|
- lib/parqueteur/column.rb
|
45
48
|
- lib/parqueteur/column_collection.rb
|
46
49
|
- lib/parqueteur/converter.rb
|
data/example.rb
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
require 'bundler/setup'
|
2
|
-
require 'parqueteur'
|
3
|
-
|
4
|
-
class Foo < Parqueteur::Converter
|
5
|
-
column :id, :bigint
|
6
|
-
column :reference, :string
|
7
|
-
column :hash, :map, key: :string, value: :string
|
8
|
-
column :valid, :boolean
|
9
|
-
column :total, :integer
|
10
|
-
column :numbers, :array, elements: :integer
|
11
|
-
column :my_struct, :struct do
|
12
|
-
field :test, :string
|
13
|
-
field :mon_nombre, :integer
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
LETTERS = ('a'..'z').to_a
|
18
|
-
|
19
|
-
data = 1000.times.collect do |i|
|
20
|
-
{
|
21
|
-
'id' => i + 1,
|
22
|
-
'reference' => "coucou:#{i}",
|
23
|
-
'hash' => { 'a' => LETTERS.sample },
|
24
|
-
'valid' => rand < 0.5,
|
25
|
-
'total' => rand(100..500),
|
26
|
-
'numbers' => [1, 2, 3],
|
27
|
-
'my_struct' => {
|
28
|
-
'test' => 'super'
|
29
|
-
}
|
30
|
-
}
|
31
|
-
end
|
32
|
-
|
33
|
-
# chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
|
34
|
-
# pp chunked_converter.write_files('test')
|
35
|
-
puts Foo.convert(data, output: 'tmp/test.parquet')
|
36
|
-
table = Arrow::Table.load('tmp/test.parquet')
|
37
|
-
table.each_record do |record|
|
38
|
-
puts record.to_h
|
39
|
-
end
|
@@ -1,28 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Parqueteur
|
4
|
-
class ChunkedConverter
|
5
|
-
attr_reader :schema
|
6
|
-
|
7
|
-
def initialize(input, converter, chunk_size = 200)
|
8
|
-
@input = Parqueteur::Input.from(input)
|
9
|
-
@converter = converter
|
10
|
-
@chunk_size = chunk_size
|
11
|
-
end
|
12
|
-
|
13
|
-
def chunks
|
14
|
-
Enumerator.new do |arr|
|
15
|
-
@input.each_slice(@chunk_size) do |chunk|
|
16
|
-
local_converter = @converter.new(chunk)
|
17
|
-
arr << local_converter.to_io
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
def write_files(prefix)
|
23
|
-
chunks.each_with_index do |chunk, idx|
|
24
|
-
File.write("#{prefix}.#{idx}.parquet", chunk.read)
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|