parqueteur 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1805a7356005e238c3d8f4318a54ea498ff58a0fcd93819c51cc445c4f8b6ee0
4
- data.tar.gz: 1360c3d481eac282e684f588e49cbcd142c7cc0f6c345d8a98e48e402a9edf22
3
+ metadata.gz: 056b9208a8bffcd163464dbd2cf276a9b0704e96788b77555d545eb339a4e798
4
+ data.tar.gz: 1e20d31b1fc6f198fee42546939ce289d71d66f65ffa66562cdd7841e0f24f61
5
5
  SHA512:
6
- metadata.gz: 28c157c39f25744a10223e8d11d165b7e4e25eb70e45cf2e22513795b4b1b4b1b9f44bc04d5ef8e10b74ebd82d1c87bc318a02b32fc25dbd6c32c0763e79801c
7
- data.tar.gz: 6f0433e8090c49c48278bcd4d5f36f96c394bd8d8386fbf2bc22d224d42ef58036beaa0307c32420cbbb3b33eec463964b28e616f862385de6afa1f8c98c9310
6
+ metadata.gz: fe08a7b282c4ededc08acb5aa9f4b485ead828aee4fd1444e8bb1af80cc56ea8c20411aefe136809f91ad808bee52db261218e8b5e6b7538bfa53d1eb38eb4b5
7
+ data.tar.gz: 0fee8ec94698b7b4c9d3a089fd0094a52bd83dfda56d0652f8a5b08dfe84a88b251736e62a9da7f510e0fa3d1842e2551161178ce30b5e0f5c6ee9b903917a2c
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- parqueteur (1.1.1)
4
+ parqueteur (1.2.0)
5
5
  red-parquet (~> 5.0)
6
6
 
7
7
  GEM
@@ -0,0 +1,54 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ converter = Foo.new(data, compression: :gzip)
52
+ pp converter.to_io
53
+ pp converter.to_arrow_table
54
+ converter.write('tmp/test.parquet')
@@ -0,0 +1,52 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ io = Foo.convert(data)
52
+ pp io.read
@@ -0,0 +1,54 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ converter = Foo.new(data, compression: :gzip)
52
+ converter.split(200).each_with_index do |chunk, idx|
53
+ puts "#{idx}: #{chunk.path}"
54
+ end
@@ -0,0 +1,52 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ path = 'tmp/test.parquet'
52
+ Foo.convert_to(data, path, compression: :gzip)
@@ -0,0 +1,52 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ path = 'tmp/test.parquet'
52
+ Foo.convert_to(data, path)
@@ -2,9 +2,7 @@
2
2
 
3
3
  module Parqueteur
4
4
  class Converter
5
- attr_reader :schema
6
-
7
- DEFAULT_BATCH_SIZE = 25
5
+ DEFAULT_BATCH_SIZE = 10
8
6
 
9
7
  def self.inline(&block)
10
8
  Class.new(self, &block)
@@ -26,106 +24,127 @@ module Parqueteur
26
24
  transforms << (method_name || block)
27
25
  end
28
26
 
29
- def self.convert(input, output: nil, batch_size: DEFAULT_BATCH_SIZE)
30
- converter = new(input, bactch_size: batch_size)
31
- if !output.nil?
32
- converter.write(output)
33
- else
34
- converter.to_blob
27
+ def self.convert(input, **kwargs)
28
+ new(input, **kwargs).to_io
29
+ end
30
+
31
+ def self.convert_to(input, output_path, **kwargs)
32
+ converter = new(input, **kwargs)
33
+ converter.write(output_path)
34
+ end
35
+
36
+ # @param [Enumerable] An enumerable object
37
+ # @option [Symbol] compression - :gzip
38
+ def initialize(input, **kwargs)
39
+ @input = Parqueteur::Input.from(input)
40
+ @batch_size = kwargs.fetch(:batch_size, DEFAULT_BATCH_SIZE)
41
+ @compression = kwargs.fetch(:compression, nil)&.to_sym
42
+ end
43
+
44
+ def split(size)
45
+ Enumerator.new do |arr|
46
+ @input.each_slice(size) do |records|
47
+ local_converter = self.class.new(
48
+ records, batch_size: @batch_size, compression: @compression
49
+ )
50
+ file = local_converter.to_tmpfile
51
+ arr << file
52
+ file.close
53
+ file.unlink
54
+ end
35
55
  end
36
56
  end
37
57
 
38
- def initialize(input, options = {})
39
- @input = Parqueteur::Input.from(input, options)
40
- @batch_size = options.fetch(:batch_size, DEFAULT_BATCH_SIZE)
41
- end
42
-
43
- def write(output)
44
- case output
45
- when :io
46
- to_io
47
- when String
48
- to_arrow_table.save(output)
49
- when StringIO, IO
50
- buffer = Arrow::ResizableBuffer.new(0)
51
- to_arrow_table.save(buffer, format: :parquet)
52
- output.write(buffer.data.to_s)
53
- output.rewind
54
- output
55
- else
56
- raise ArgumentError, "unsupported output: #{output.class}, accepted: String (filename), IO, StringIO"
58
+ def split_by_io(size)
59
+ Enumerator.new do |arr|
60
+ @input.each_slice(size) do |records|
61
+ local_converter = self.class.new(records)
62
+ arr << local_converter.to_io
63
+ end
64
+ end
65
+ end
66
+
67
+ def write(path)
68
+ arrow_schema = self.class.columns.arrow_schema
69
+ writer_properties = Parquet::WriterProperties.new
70
+ writer_properties.set_compression(@compression) unless @compression.nil?
71
+
72
+ Arrow::FileOutputStream.open(path, false) do |output|
73
+ Parquet::ArrowFileWriter.open(arrow_schema, output, writer_properties) do |writer|
74
+ @input.each_slice(@batch_size) do |records|
75
+ arrow_table = build_arrow_table(records)
76
+ writer.write_table(arrow_table, 1024)
77
+ end
78
+ end
57
79
  end
80
+
81
+ true
58
82
  end
59
83
 
60
- def to_s
61
- inspect
84
+ def to_tmpfile
85
+ tempfile = Tempfile.new
86
+ tempfile.binmode
87
+ write(tempfile.path)
88
+ tempfile.rewind
89
+ tempfile
62
90
  end
63
91
 
64
92
  def to_io
65
- write(StringIO.new)
93
+ tmpfile = to_tmpfile
94
+ strio = StringIO.new(tmpfile.read)
95
+ tmpfile.close
96
+ tmpfile.unlink
97
+ strio
98
+ end
99
+
100
+ def to_arrow_table
101
+ file = to_tmpfile
102
+ table = Arrow::Table.load(file.path, format: :parquet)
103
+ file.close
104
+ file.unlink
105
+ table
66
106
  end
67
107
 
68
108
  def to_blob
69
- write(StringIO.new).read
109
+ to_io.read
70
110
  end
71
111
 
72
- def to_arrow_table
112
+ private
113
+
114
+ def build_arrow_table(records)
73
115
  transforms = self.class.transforms
74
116
 
75
- chunks = self.class.columns.each_with_object({}) do |column, hash|
117
+ values = self.class.columns.each_with_object({}) do |column, hash|
76
118
  hash[column.name] = []
77
119
  end
78
120
 
79
- items_count = 0
80
- @input.each_slice(@batch_size) do |items|
81
- values = self.class.columns.each_with_object({}) do |column, hash|
82
- hash[column.name] = []
83
- end
84
-
85
- items.each do |item|
86
- if transforms.length > 0
87
- transforms.each do |transform|
88
- item = \
89
- if transform.is_a?(Symbol)
90
- __send__(transform, item)
91
- else
92
- transform.call(item)
93
- end
94
- end
121
+ records.each do |item|
122
+ if transforms.length > 0
123
+ transforms.each do |transform|
124
+ item = \
125
+ if transform.is_a?(Symbol)
126
+ __send__(transform, item)
127
+ else
128
+ transform.call(item)
129
+ end
95
130
  end
131
+ end
96
132
 
97
- values.each_key do |value_key|
98
- if item.key?(value_key)
99
- values[value_key] << item[value_key]
100
- else
101
- values[value_key] << nil
102
- end
133
+ values.each_key do |value_key|
134
+ if item.key?(value_key)
135
+ values[value_key] << item[value_key]
136
+ else
137
+ values[value_key] << nil
103
138
  end
104
139
  end
140
+ end
105
141
 
106
- values.each_with_object(chunks) do |item, hash|
142
+ Arrow::Table.new(
143
+ values.each_with_object({}) do |item, hash|
107
144
  column = self.class.columns.find(item[0])
108
- hash[item[0]].push(
109
- column.type.build_value_array(item[1])
110
- )
145
+ hash[item[0]] = column.type.build_value_array(item[1])
111
146
  end
112
-
113
- items_count += items.length
114
- end
115
-
116
- if items_count > 0
117
- Arrow::Table.new(
118
- chunks.transform_values! do |value|
119
- Arrow::ChunkedArray.new(value)
120
- end
121
- )
122
- else
123
- Arrow::Table.new(
124
- self.class.columns.each_with_object({}) do |column, hash|
125
- hash[column.name] = column.type.build_value_array([])
126
- end
127
- )
128
- end
147
+ )
129
148
  end
130
149
  end
131
150
  end
@@ -4,40 +4,25 @@ module Parqueteur
4
4
  class Input
5
5
  include Enumerable
6
6
 
7
- def self.from(arg, options = {})
8
- new(
9
- case arg
10
- when String
11
- if File.exist?(arg)
12
- File.new(arg, 'r')
13
- else
14
- arg.split("\n")
15
- end
16
- when Enumerable
17
- arg
18
- end,
19
- options
20
- )
7
+ def self.from(arg)
8
+ return arg if arg.is_a?(self)
9
+
10
+ new(arg)
21
11
  end
22
12
 
23
- def initialize(source, options = {})
13
+ def initialize(source)
14
+ unless source.is_a?(Enumerable)
15
+ raise ArgumentError, 'Enumerable object expected'
16
+ end
17
+
24
18
  @source = source
25
- @options = options
26
19
  end
27
20
 
28
21
  def each(&block)
29
- case @source
30
- when File
31
- if @options.fetch(:json_newlines, true) == true
32
- @source.each_line do |line|
33
- yield(JSON.parse(line.strip))
34
- end
35
- else
36
- JSON.parse(@source.read).each(&block)
37
- end
38
- @source.rewind
39
- when Enumerable
22
+ if block_given?
40
23
  @source.each(&block)
24
+ else
25
+ @source.to_enum(:each)
41
26
  end
42
27
  end
43
28
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Parqueteur
4
- VERSION = '1.1.1'
4
+ VERSION = '1.2.0'
5
5
  end
data/lib/parqueteur.rb CHANGED
@@ -2,9 +2,9 @@
2
2
 
3
3
  require 'json'
4
4
  require 'singleton'
5
+ require 'tempfile'
5
6
 
6
- require_relative "parqueteur/version"
7
- require 'parqueteur/chunked_converter'
7
+ require_relative 'parqueteur/version'
8
8
  require 'parqueteur/column'
9
9
  require 'parqueteur/column_collection'
10
10
  require 'parqueteur/converter'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parqueteur
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julien D.
@@ -38,9 +38,12 @@ files:
38
38
  - Rakefile
39
39
  - bin/console
40
40
  - bin/setup
41
- - example.rb
41
+ - examples/convert-methods.rb
42
+ - examples/convert-to-io.rb
43
+ - examples/convert-with-chunks.rb
44
+ - examples/convert-with-compression.rb
45
+ - examples/convert-without-compression.rb
42
46
  - lib/parqueteur.rb
43
- - lib/parqueteur/chunked_converter.rb
44
47
  - lib/parqueteur/column.rb
45
48
  - lib/parqueteur/column_collection.rb
46
49
  - lib/parqueteur/converter.rb
data/example.rb DELETED
@@ -1,39 +0,0 @@
1
- require 'bundler/setup'
2
- require 'parqueteur'
3
-
4
- class Foo < Parqueteur::Converter
5
- column :id, :bigint
6
- column :reference, :string
7
- column :hash, :map, key: :string, value: :string
8
- column :valid, :boolean
9
- column :total, :integer
10
- column :numbers, :array, elements: :integer
11
- column :my_struct, :struct do
12
- field :test, :string
13
- field :mon_nombre, :integer
14
- end
15
- end
16
-
17
- LETTERS = ('a'..'z').to_a
18
-
19
- data = 1000.times.collect do |i|
20
- {
21
- 'id' => i + 1,
22
- 'reference' => "coucou:#{i}",
23
- 'hash' => { 'a' => LETTERS.sample },
24
- 'valid' => rand < 0.5,
25
- 'total' => rand(100..500),
26
- 'numbers' => [1, 2, 3],
27
- 'my_struct' => {
28
- 'test' => 'super'
29
- }
30
- }
31
- end
32
-
33
- # chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
34
- # pp chunked_converter.write_files('test')
35
- puts Foo.convert(data, output: 'tmp/test.parquet')
36
- table = Arrow::Table.load('tmp/test.parquet')
37
- table.each_record do |record|
38
- puts record.to_h
39
- end
@@ -1,28 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Parqueteur
4
- class ChunkedConverter
5
- attr_reader :schema
6
-
7
- def initialize(input, converter, chunk_size = 200)
8
- @input = Parqueteur::Input.from(input)
9
- @converter = converter
10
- @chunk_size = chunk_size
11
- end
12
-
13
- def chunks
14
- Enumerator.new do |arr|
15
- @input.each_slice(@chunk_size) do |chunk|
16
- local_converter = @converter.new(chunk)
17
- arr << local_converter.to_io
18
- end
19
- end
20
- end
21
-
22
- def write_files(prefix)
23
- chunks.each_with_index do |chunk, idx|
24
- File.write("#{prefix}.#{idx}.parquet", chunk.read)
25
- end
26
- end
27
- end
28
- end