parqueteur 1.1.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1805a7356005e238c3d8f4318a54ea498ff58a0fcd93819c51cc445c4f8b6ee0
4
- data.tar.gz: 1360c3d481eac282e684f588e49cbcd142c7cc0f6c345d8a98e48e402a9edf22
3
+ metadata.gz: 056b9208a8bffcd163464dbd2cf276a9b0704e96788b77555d545eb339a4e798
4
+ data.tar.gz: 1e20d31b1fc6f198fee42546939ce289d71d66f65ffa66562cdd7841e0f24f61
5
5
  SHA512:
6
- metadata.gz: 28c157c39f25744a10223e8d11d165b7e4e25eb70e45cf2e22513795b4b1b4b1b9f44bc04d5ef8e10b74ebd82d1c87bc318a02b32fc25dbd6c32c0763e79801c
7
- data.tar.gz: 6f0433e8090c49c48278bcd4d5f36f96c394bd8d8386fbf2bc22d224d42ef58036beaa0307c32420cbbb3b33eec463964b28e616f862385de6afa1f8c98c9310
6
+ metadata.gz: fe08a7b282c4ededc08acb5aa9f4b485ead828aee4fd1444e8bb1af80cc56ea8c20411aefe136809f91ad808bee52db261218e8b5e6b7538bfa53d1eb38eb4b5
7
+ data.tar.gz: 0fee8ec94698b7b4c9d3a089fd0094a52bd83dfda56d0652f8a5b08dfe84a88b251736e62a9da7f510e0fa3d1842e2551161178ce30b5e0f5c6ee9b903917a2c
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- parqueteur (1.1.1)
4
+ parqueteur (1.2.0)
5
5
  red-parquet (~> 5.0)
6
6
 
7
7
  GEM
@@ -0,0 +1,54 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ converter = Foo.new(data, compression: :gzip)
52
+ pp converter.to_io
53
+ pp converter.to_arrow_table
54
+ converter.write('tmp/test.parquet')
@@ -0,0 +1,52 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ io = Foo.convert(data)
52
+ pp io.read
@@ -0,0 +1,54 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ converter = Foo.new(data, compression: :gzip)
52
+ converter.split(200).each_with_index do |chunk, idx|
53
+ puts "#{idx}: #{chunk.path}"
54
+ end
@@ -0,0 +1,52 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ path = 'tmp/test.parquet'
52
+ Foo.convert_to(data, path, compression: :gzip)
@@ -0,0 +1,52 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ path = 'tmp/test.parquet'
52
+ Foo.convert_to(data, path)
@@ -2,9 +2,7 @@
2
2
 
3
3
  module Parqueteur
4
4
  class Converter
5
- attr_reader :schema
6
-
7
- DEFAULT_BATCH_SIZE = 25
5
+ DEFAULT_BATCH_SIZE = 10
8
6
 
9
7
  def self.inline(&block)
10
8
  Class.new(self, &block)
@@ -26,106 +24,127 @@ module Parqueteur
26
24
  transforms << (method_name || block)
27
25
  end
28
26
 
29
- def self.convert(input, output: nil, batch_size: DEFAULT_BATCH_SIZE)
30
- converter = new(input, bactch_size: batch_size)
31
- if !output.nil?
32
- converter.write(output)
33
- else
34
- converter.to_blob
27
+ def self.convert(input, **kwargs)
28
+ new(input, **kwargs).to_io
29
+ end
30
+
31
+ def self.convert_to(input, output_path, **kwargs)
32
+ converter = new(input, **kwargs)
33
+ converter.write(output_path)
34
+ end
35
+
36
+ # @param [Enumerable] An enumerable object
37
+ # @option [Symbol] compression - :gzip
38
+ def initialize(input, **kwargs)
39
+ @input = Parqueteur::Input.from(input)
40
+ @batch_size = kwargs.fetch(:batch_size, DEFAULT_BATCH_SIZE)
41
+ @compression = kwargs.fetch(:compression, nil)&.to_sym
42
+ end
43
+
44
+ def split(size)
45
+ Enumerator.new do |arr|
46
+ @input.each_slice(size) do |records|
47
+ local_converter = self.class.new(
48
+ records, batch_size: @batch_size, compression: @compression
49
+ )
50
+ file = local_converter.to_tmpfile
51
+ arr << file
52
+ file.close
53
+ file.unlink
54
+ end
35
55
  end
36
56
  end
37
57
 
38
- def initialize(input, options = {})
39
- @input = Parqueteur::Input.from(input, options)
40
- @batch_size = options.fetch(:batch_size, DEFAULT_BATCH_SIZE)
41
- end
42
-
43
- def write(output)
44
- case output
45
- when :io
46
- to_io
47
- when String
48
- to_arrow_table.save(output)
49
- when StringIO, IO
50
- buffer = Arrow::ResizableBuffer.new(0)
51
- to_arrow_table.save(buffer, format: :parquet)
52
- output.write(buffer.data.to_s)
53
- output.rewind
54
- output
55
- else
56
- raise ArgumentError, "unsupported output: #{output.class}, accepted: String (filename), IO, StringIO"
58
+ def split_by_io(size)
59
+ Enumerator.new do |arr|
60
+ @input.each_slice(size) do |records|
61
+ local_converter = self.class.new(records)
62
+ arr << local_converter.to_io
63
+ end
64
+ end
65
+ end
66
+
67
+ def write(path)
68
+ arrow_schema = self.class.columns.arrow_schema
69
+ writer_properties = Parquet::WriterProperties.new
70
+ writer_properties.set_compression(@compression) unless @compression.nil?
71
+
72
+ Arrow::FileOutputStream.open(path, false) do |output|
73
+ Parquet::ArrowFileWriter.open(arrow_schema, output, writer_properties) do |writer|
74
+ @input.each_slice(@batch_size) do |records|
75
+ arrow_table = build_arrow_table(records)
76
+ writer.write_table(arrow_table, 1024)
77
+ end
78
+ end
57
79
  end
80
+
81
+ true
58
82
  end
59
83
 
60
- def to_s
61
- inspect
84
+ def to_tmpfile
85
+ tempfile = Tempfile.new
86
+ tempfile.binmode
87
+ write(tempfile.path)
88
+ tempfile.rewind
89
+ tempfile
62
90
  end
63
91
 
64
92
  def to_io
65
- write(StringIO.new)
93
+ tmpfile = to_tmpfile
94
+ strio = StringIO.new(tmpfile.read)
95
+ tmpfile.close
96
+ tmpfile.unlink
97
+ strio
98
+ end
99
+
100
+ def to_arrow_table
101
+ file = to_tmpfile
102
+ table = Arrow::Table.load(file.path, format: :parquet)
103
+ file.close
104
+ file.unlink
105
+ table
66
106
  end
67
107
 
68
108
  def to_blob
69
- write(StringIO.new).read
109
+ to_io.read
70
110
  end
71
111
 
72
- def to_arrow_table
112
+ private
113
+
114
+ def build_arrow_table(records)
73
115
  transforms = self.class.transforms
74
116
 
75
- chunks = self.class.columns.each_with_object({}) do |column, hash|
117
+ values = self.class.columns.each_with_object({}) do |column, hash|
76
118
  hash[column.name] = []
77
119
  end
78
120
 
79
- items_count = 0
80
- @input.each_slice(@batch_size) do |items|
81
- values = self.class.columns.each_with_object({}) do |column, hash|
82
- hash[column.name] = []
83
- end
84
-
85
- items.each do |item|
86
- if transforms.length > 0
87
- transforms.each do |transform|
88
- item = \
89
- if transform.is_a?(Symbol)
90
- __send__(transform, item)
91
- else
92
- transform.call(item)
93
- end
94
- end
121
+ records.each do |item|
122
+ if transforms.length > 0
123
+ transforms.each do |transform|
124
+ item = \
125
+ if transform.is_a?(Symbol)
126
+ __send__(transform, item)
127
+ else
128
+ transform.call(item)
129
+ end
95
130
  end
131
+ end
96
132
 
97
- values.each_key do |value_key|
98
- if item.key?(value_key)
99
- values[value_key] << item[value_key]
100
- else
101
- values[value_key] << nil
102
- end
133
+ values.each_key do |value_key|
134
+ if item.key?(value_key)
135
+ values[value_key] << item[value_key]
136
+ else
137
+ values[value_key] << nil
103
138
  end
104
139
  end
140
+ end
105
141
 
106
- values.each_with_object(chunks) do |item, hash|
142
+ Arrow::Table.new(
143
+ values.each_with_object({}) do |item, hash|
107
144
  column = self.class.columns.find(item[0])
108
- hash[item[0]].push(
109
- column.type.build_value_array(item[1])
110
- )
145
+ hash[item[0]] = column.type.build_value_array(item[1])
111
146
  end
112
-
113
- items_count += items.length
114
- end
115
-
116
- if items_count > 0
117
- Arrow::Table.new(
118
- chunks.transform_values! do |value|
119
- Arrow::ChunkedArray.new(value)
120
- end
121
- )
122
- else
123
- Arrow::Table.new(
124
- self.class.columns.each_with_object({}) do |column, hash|
125
- hash[column.name] = column.type.build_value_array([])
126
- end
127
- )
128
- end
147
+ )
129
148
  end
130
149
  end
131
150
  end
@@ -4,40 +4,25 @@ module Parqueteur
4
4
  class Input
5
5
  include Enumerable
6
6
 
7
- def self.from(arg, options = {})
8
- new(
9
- case arg
10
- when String
11
- if File.exist?(arg)
12
- File.new(arg, 'r')
13
- else
14
- arg.split("\n")
15
- end
16
- when Enumerable
17
- arg
18
- end,
19
- options
20
- )
7
+ def self.from(arg)
8
+ return arg if arg.is_a?(self)
9
+
10
+ new(arg)
21
11
  end
22
12
 
23
- def initialize(source, options = {})
13
+ def initialize(source)
14
+ unless source.is_a?(Enumerable)
15
+ raise ArgumentError, 'Enumerable object expected'
16
+ end
17
+
24
18
  @source = source
25
- @options = options
26
19
  end
27
20
 
28
21
  def each(&block)
29
- case @source
30
- when File
31
- if @options.fetch(:json_newlines, true) == true
32
- @source.each_line do |line|
33
- yield(JSON.parse(line.strip))
34
- end
35
- else
36
- JSON.parse(@source.read).each(&block)
37
- end
38
- @source.rewind
39
- when Enumerable
22
+ if block_given?
40
23
  @source.each(&block)
24
+ else
25
+ @source.to_enum(:each)
41
26
  end
42
27
  end
43
28
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Parqueteur
4
- VERSION = '1.1.1'
4
+ VERSION = '1.2.0'
5
5
  end
data/lib/parqueteur.rb CHANGED
@@ -2,9 +2,9 @@
2
2
 
3
3
  require 'json'
4
4
  require 'singleton'
5
+ require 'tempfile'
5
6
 
6
- require_relative "parqueteur/version"
7
- require 'parqueteur/chunked_converter'
7
+ require_relative 'parqueteur/version'
8
8
  require 'parqueteur/column'
9
9
  require 'parqueteur/column_collection'
10
10
  require 'parqueteur/converter'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parqueteur
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julien D.
@@ -38,9 +38,12 @@ files:
38
38
  - Rakefile
39
39
  - bin/console
40
40
  - bin/setup
41
- - example.rb
41
+ - examples/convert-methods.rb
42
+ - examples/convert-to-io.rb
43
+ - examples/convert-with-chunks.rb
44
+ - examples/convert-with-compression.rb
45
+ - examples/convert-without-compression.rb
42
46
  - lib/parqueteur.rb
43
- - lib/parqueteur/chunked_converter.rb
44
47
  - lib/parqueteur/column.rb
45
48
  - lib/parqueteur/column_collection.rb
46
49
  - lib/parqueteur/converter.rb
data/example.rb DELETED
@@ -1,39 +0,0 @@
1
- require 'bundler/setup'
2
- require 'parqueteur'
3
-
4
- class Foo < Parqueteur::Converter
5
- column :id, :bigint
6
- column :reference, :string
7
- column :hash, :map, key: :string, value: :string
8
- column :valid, :boolean
9
- column :total, :integer
10
- column :numbers, :array, elements: :integer
11
- column :my_struct, :struct do
12
- field :test, :string
13
- field :mon_nombre, :integer
14
- end
15
- end
16
-
17
- LETTERS = ('a'..'z').to_a
18
-
19
- data = 1000.times.collect do |i|
20
- {
21
- 'id' => i + 1,
22
- 'reference' => "coucou:#{i}",
23
- 'hash' => { 'a' => LETTERS.sample },
24
- 'valid' => rand < 0.5,
25
- 'total' => rand(100..500),
26
- 'numbers' => [1, 2, 3],
27
- 'my_struct' => {
28
- 'test' => 'super'
29
- }
30
- }
31
- end
32
-
33
- # chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
34
- # pp chunked_converter.write_files('test')
35
- puts Foo.convert(data, output: 'tmp/test.parquet')
36
- table = Arrow::Table.load('tmp/test.parquet')
37
- table.each_record do |record|
38
- puts record.to_h
39
- end
@@ -1,28 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Parqueteur
4
- class ChunkedConverter
5
- attr_reader :schema
6
-
7
- def initialize(input, converter, chunk_size = 200)
8
- @input = Parqueteur::Input.from(input)
9
- @converter = converter
10
- @chunk_size = chunk_size
11
- end
12
-
13
- def chunks
14
- Enumerator.new do |arr|
15
- @input.each_slice(@chunk_size) do |chunk|
16
- local_converter = @converter.new(chunk)
17
- arr << local_converter.to_io
18
- end
19
- end
20
- end
21
-
22
- def write_files(prefix)
23
- chunks.each_with_index do |chunk, idx|
24
- File.write("#{prefix}.#{idx}.parquet", chunk.read)
25
- end
26
- end
27
- end
28
- end