parqueteur 1.0.2 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b60aefc8e90564af2abeeb01a36aaaabe8cf08c65efeebdbf6f034372c8100c1
4
- data.tar.gz: b4f93aeac25321e50fdf8d2a8c8e0216feb4ec68d0ac356d2ed98118820c61a4
3
+ metadata.gz: 056b9208a8bffcd163464dbd2cf276a9b0704e96788b77555d545eb339a4e798
4
+ data.tar.gz: 1e20d31b1fc6f198fee42546939ce289d71d66f65ffa66562cdd7841e0f24f61
5
5
  SHA512:
6
- metadata.gz: 1dbb0d1a870ff2291909014f595a6c63a35c11c80adc4e1972e20852ac87ba20447ce31dee9feb2f11dcea1a2ec5276ad4a05ca3eb406ab26a2b606ea369b809
7
- data.tar.gz: ef4dec6ca81564972112468dc1729053c9c03bdc61a947a01990c1a870dc45e8136561553390fbfbb1b49d22ca1f8221a94cb8403a7ecfc6bf18f10ba07acd9b
6
+ metadata.gz: fe08a7b282c4ededc08acb5aa9f4b485ead828aee4fd1444e8bb1af80cc56ea8c20411aefe136809f91ad808bee52db261218e8b5e6b7538bfa53d1eb38eb4b5
7
+ data.tar.gz: 0fee8ec94698b7b4c9d3a089fd0094a52bd83dfda56d0652f8a5b08dfe84a88b251736e62a9da7f510e0fa3d1842e2551161178ce30b5e0f5c6ee9b903917a2c
data/.gitignore CHANGED
@@ -6,3 +6,6 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ /*.gem
10
+ /tmp/*
11
+ !/tmp/.keep
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- parqueteur (1.0.2)
4
+ parqueteur (1.2.0)
5
5
  red-parquet (~> 5.0)
6
6
 
7
7
  GEM
@@ -0,0 +1,54 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ converter = Foo.new(data, compression: :gzip)
52
+ pp converter.to_io
53
+ pp converter.to_arrow_table
54
+ converter.write('tmp/test.parquet')
@@ -0,0 +1,52 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ io = Foo.convert(data)
52
+ pp io.read
@@ -0,0 +1,54 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ converter = Foo.new(data, compression: :gzip)
52
+ converter.split(200).each_with_index do |chunk, idx|
53
+ puts "#{idx}: #{chunk.path}"
54
+ end
@@ -0,0 +1,52 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ path = 'tmp/test.parquet'
52
+ Foo.convert_to(data, path, compression: :gzip)
@@ -0,0 +1,52 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+ require 'securerandom'
4
+ require 'benchmark'
5
+
6
+ class Foo < Parqueteur::Converter
7
+ column :id, :bigint
8
+ column :reference, :string
9
+ column :hash, :map, key: :string, value: :string
10
+ # column :hash2, :map, key: :string, value: :string
11
+ # column :hash3, :map, key: :string, value: :string
12
+ column :valid, :boolean
13
+ column :total, :integer
14
+ column :numbers, :array, elements: :integer
15
+ column :my_struct, :struct do
16
+ field :test, :string
17
+ field :mon_nombre, :integer
18
+ end
19
+ end
20
+
21
+ def random_hash
22
+ {
23
+ 'a' => SecureRandom.hex(128),
24
+ 'b' => SecureRandom.hex(128),
25
+ 'c' => SecureRandom.hex(128),
26
+ 'd' => SecureRandom.hex(128),
27
+ 'e' => SecureRandom.hex(128),
28
+ 'f' => SecureRandom.hex(128),
29
+ 'g' => SecureRandom.hex(128),
30
+ 'h' => SecureRandom.hex(128),
31
+ 'i' => SecureRandom.hex(128),
32
+ 'j' => SecureRandom.hex(128),
33
+ 'k' => SecureRandom.hex(128),
34
+ }
35
+ end
36
+
37
+ data = 10000.times.collect do |i|
38
+ {
39
+ 'id' => i + 1,
40
+ 'reference' => "coucou:#{i}",
41
+ 'hash' => random_hash,
42
+ # 'hash2' => random_hash,
43
+ # 'hash3' => random_hash,
44
+ 'valid' => rand < 0.5,
45
+ 'total' => rand(100..500),
46
+ 'numbers' => [1, 2, 3]
47
+ }
48
+ end
49
+ puts "data generation OK"
50
+
51
+ path = 'tmp/test.parquet'
52
+ Foo.convert_to(data, path)
@@ -4,31 +4,14 @@ module Parqueteur
4
4
  class Column
5
5
  attr_reader :name, :type, :options
6
6
 
7
- def initialize(name, type, options = {})
7
+ def initialize(name, type, options = {}, &block)
8
8
  @name = name.to_s
9
- @type = type
9
+ @type = Parqueteur::TypeResolver.resolve(type, options, &block)
10
10
  @options = options
11
11
  end
12
12
 
13
13
  def arrow_type
14
- @arrow_type ||= Parqueteur::TypeResolver.resolve(@type, @options)
15
- end
16
-
17
- def cast(value)
18
- case @type
19
- when :string then value.to_s
20
- when :boolean then value == true
21
- when :integer then value.to_i
22
- when :long then value.to_i
23
- when :timestamp
24
- case value
25
- when String then Time.parse(value).to_i
26
- when Integer then value
27
- else
28
- raise ArgumentError, "Unable to cast '#{value}' to timestamp"
29
- end
30
- when :map then value
31
- end
14
+ @type.arrow_type
32
15
  end
33
16
 
34
17
  def to_arrow_field
@@ -4,11 +4,18 @@ module Parqueteur
4
4
  class ColumnCollection
5
5
  include Enumerable
6
6
 
7
+ attr_reader :column_names
8
+
7
9
  def initialize
8
10
  @columns = []
11
+ @column_names = []
9
12
  @columns_idx = {}
10
13
  end
11
14
 
15
+ def key?(key)
16
+ @columns_idx.key?(key)
17
+ end
18
+
12
19
  def each(&block)
13
20
  @columns.each(&block)
14
21
  end
@@ -17,6 +24,7 @@ module Parqueteur
17
24
  unless @columns_idx.key?(column.name)
18
25
  @columns_idx[column.name] = column
19
26
  @columns << column
27
+ @column_names << column.name
20
28
  end
21
29
 
22
30
  true
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Parqueteur
4
4
  class Converter
5
- attr_reader :schema
5
+ DEFAULT_BATCH_SIZE = 10
6
6
 
7
7
  def self.inline(&block)
8
8
  Class.new(self, &block)
@@ -12,104 +12,137 @@ module Parqueteur
12
12
  @columns ||= Parqueteur::ColumnCollection.new
13
13
  end
14
14
 
15
- def self.column(name, type, options = {})
16
- columns.add(Parqueteur::Column.new(name, type, options))
15
+ def self.column(name, type, options = {}, &block)
16
+ columns.add(Parqueteur::Column.new(name, type, options, &block))
17
17
  end
18
18
 
19
19
  def self.transforms
20
20
  @transforms ||= []
21
21
  end
22
22
 
23
- def self.transform(method_name, &block)
23
+ def self.transform(method_name = nil, &block)
24
24
  transforms << (method_name || block)
25
25
  end
26
26
 
27
- def self.convert(input, output: nil)
28
- converter = new(input)
29
- if !output.nil?
30
- converter.write(output)
31
- else
32
- converter.to_blob
27
+ def self.convert(input, **kwargs)
28
+ new(input, **kwargs).to_io
29
+ end
30
+
31
+ def self.convert_to(input, output_path, **kwargs)
32
+ converter = new(input, **kwargs)
33
+ converter.write(output_path)
34
+ end
35
+
36
+ # @param [Enumerable] An enumerable object
37
+ # @option [Symbol] compression - :gzip
38
+ def initialize(input, **kwargs)
39
+ @input = Parqueteur::Input.from(input)
40
+ @batch_size = kwargs.fetch(:batch_size, DEFAULT_BATCH_SIZE)
41
+ @compression = kwargs.fetch(:compression, nil)&.to_sym
42
+ end
43
+
44
+ def split(size)
45
+ Enumerator.new do |arr|
46
+ @input.each_slice(size) do |records|
47
+ local_converter = self.class.new(
48
+ records, batch_size: @batch_size, compression: @compression
49
+ )
50
+ file = local_converter.to_tmpfile
51
+ arr << file
52
+ file.close
53
+ file.unlink
54
+ end
33
55
  end
34
56
  end
35
57
 
36
- def initialize(input, options = {})
37
- @input = Parqueteur::Input.from(input, options)
38
- end
39
-
40
- def write(output)
41
- case output
42
- when :io
43
- to_io
44
- when String
45
- to_arrow_table.save(output)
46
- when StringIO, IO
47
- buffer = Arrow::ResizableBuffer.new(0)
48
- to_arrow_table.save(buffer, format: :parquet)
49
- output.write(buffer.data.to_s)
50
- output.rewind
51
- output
52
- else
53
- raise ArgumentError, "unsupported output: #{output.class}, accepted: String (filename), IO, StringIO"
58
+ def split_by_io(size)
59
+ Enumerator.new do |arr|
60
+ @input.each_slice(size) do |records|
61
+ local_converter = self.class.new(records)
62
+ arr << local_converter.to_io
63
+ end
54
64
  end
55
65
  end
56
66
 
57
- def to_s
58
- inspect
67
+ def write(path)
68
+ arrow_schema = self.class.columns.arrow_schema
69
+ writer_properties = Parquet::WriterProperties.new
70
+ writer_properties.set_compression(@compression) unless @compression.nil?
71
+
72
+ Arrow::FileOutputStream.open(path, false) do |output|
73
+ Parquet::ArrowFileWriter.open(arrow_schema, output, writer_properties) do |writer|
74
+ @input.each_slice(@batch_size) do |records|
75
+ arrow_table = build_arrow_table(records)
76
+ writer.write_table(arrow_table, 1024)
77
+ end
78
+ end
79
+ end
80
+
81
+ true
82
+ end
83
+
84
+ def to_tmpfile
85
+ tempfile = Tempfile.new
86
+ tempfile.binmode
87
+ write(tempfile.path)
88
+ tempfile.rewind
89
+ tempfile
59
90
  end
60
91
 
61
92
  def to_io
62
- write(StringIO.new)
93
+ tmpfile = to_tmpfile
94
+ strio = StringIO.new(tmpfile.read)
95
+ tmpfile.close
96
+ tmpfile.unlink
97
+ strio
98
+ end
99
+
100
+ def to_arrow_table
101
+ file = to_tmpfile
102
+ table = Arrow::Table.load(file.path, format: :parquet)
103
+ file.close
104
+ file.unlink
105
+ table
63
106
  end
64
107
 
65
108
  def to_blob
66
- write(StringIO.new).read
109
+ to_io.read
67
110
  end
68
111
 
69
- def to_arrow_table
70
- transforms = self.class.transforms
112
+ private
71
113
 
72
- chunks = {}
73
- @input.each_slice(100) do |items|
74
- values = self.class.columns.each_with_object({}) do |column, hash|
75
- hash[column.name] = []
76
- end
114
+ def build_arrow_table(records)
115
+ transforms = self.class.transforms
77
116
 
78
- items.each do |item|
79
- if transforms.length > 0
80
- transforms.each do |transform|
81
- item = \
82
- if transform.is_a?(Symbol)
83
- __send__(transform, item)
84
- else
85
- transform.call(item)
86
- end
87
- end
88
- end
117
+ values = self.class.columns.each_with_object({}) do |column, hash|
118
+ hash[column.name] = []
119
+ end
89
120
 
90
- values.each_key do |value_key|
91
- if item.key?(value_key)
92
- values[value_key] << item[value_key]
93
- else
94
- values[value_key] << nil
95
- end
121
+ records.each do |item|
122
+ if transforms.length > 0
123
+ transforms.each do |transform|
124
+ item = \
125
+ if transform.is_a?(Symbol)
126
+ __send__(transform, item)
127
+ else
128
+ transform.call(item)
129
+ end
96
130
  end
97
131
  end
98
132
 
99
- values.each_with_object(chunks) do |item, hash|
100
- column = self.class.columns.find(item[0])
101
- hash[item[0]] ||= []
102
- hash[item[0]].push(
103
- Parqueteur::ValueArrayBuilder.build(
104
- item[1], column.type, column.options
105
- )
106
- )
133
+ values.each_key do |value_key|
134
+ if item.key?(value_key)
135
+ values[value_key] << item[value_key]
136
+ else
137
+ values[value_key] << nil
138
+ end
107
139
  end
108
140
  end
109
141
 
110
142
  Arrow::Table.new(
111
- chunks.transform_values! do |value|
112
- Arrow::ChunkedArray.new(value)
143
+ values.each_with_object({}) do |item, hash|
144
+ column = self.class.columns.find(item[0])
145
+ hash[item[0]] = column.type.build_value_array(item[1])
113
146
  end
114
147
  )
115
148
  end
@@ -4,40 +4,25 @@ module Parqueteur
4
4
  class Input
5
5
  include Enumerable
6
6
 
7
- def self.from(arg, options = {})
8
- new(
9
- case arg
10
- when String
11
- if File.exist?(arg)
12
- File.new(arg, 'r')
13
- else
14
- arg.split("\n")
15
- end
16
- when Array, Enumerator
17
- arg
18
- end,
19
- options
20
- )
7
+ def self.from(arg)
8
+ return arg if arg.is_a?(self)
9
+
10
+ new(arg)
21
11
  end
22
12
 
23
- def initialize(source, options = {})
13
+ def initialize(source)
14
+ unless source.is_a?(Enumerable)
15
+ raise ArgumentError, 'Enumerable object expected'
16
+ end
17
+
24
18
  @source = source
25
- @options = options
26
19
  end
27
20
 
28
21
  def each(&block)
29
- case @source
30
- when File
31
- if @options.fetch(:json_newlines, true) == true
32
- @source.each_line do |line|
33
- yield(JSON.parse(line.strip))
34
- end
35
- else
36
- JSON.parse(@source.read).each(&block)
37
- end
38
- @source.rewind
39
- when Array, Enumerator
22
+ if block_given?
40
23
  @source.each(&block)
24
+ else
25
+ @source.to_enum(:each)
41
26
  end
42
27
  end
43
28
  end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parqueteur
4
+ class Struct
5
+ def initialize(&block)
6
+ instance_exec(&block)
7
+ end
8
+
9
+ def fields
10
+ @fields ||= Parqueteur::ColumnCollection.new
11
+ end
12
+
13
+ def field(name, type, options = {}, &block)
14
+ fields.add(Parqueteur::Column.new(name, type, options, &block))
15
+ end
16
+
17
+ def key?(key)
18
+ fields.key?(key)
19
+ end
20
+
21
+ def to_arrow_type
22
+ fields.collect(&:to_arrow_field)
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parqueteur
4
+ class Type
5
+ attr_reader :options, :arrow_type
6
+
7
+ def initialize(options = {}, &block)
8
+ @options = options
9
+ @block = block
10
+ @arrow_type = arrow_type_builder
11
+ end
12
+
13
+ def build_value_array(values)
14
+ raise "#to_arrow_field must be implemented in #{self.class}"
15
+ end
16
+
17
+ def resolve(type, options = {})
18
+ Parqueteur::TypeResolver.resolve(type, options)
19
+ end
20
+ end
21
+ end
@@ -2,60 +2,46 @@
2
2
 
3
3
  module Parqueteur
4
4
  class TypeResolver
5
- def self.resolve(*args)
6
- new.resolve(*args)
5
+ include Singleton
6
+
7
+ def self.registered_types
8
+ @registered_types ||= {
9
+ array: Parqueteur::Types::ArrayType,
10
+ bigint: Parqueteur::Types::Int64Type,
11
+ boolean: Parqueteur::Types::BooleanType,
12
+ int32: Parqueteur::Types::Int32Type,
13
+ int64: Parqueteur::Types::Int64Type,
14
+ integer: Parqueteur::Types::Int32Type,
15
+ map: Parqueteur::Types::MapType,
16
+ string: Parqueteur::Types::StringType,
17
+ struct: Parqueteur::Types::StructType,
18
+ timestamp: Parqueteur::Types::TimestampType
19
+ }
20
+ end
21
+
22
+ def self.register_type(type, klass)
23
+ registered_types[type] = klass
24
+ end
25
+
26
+ def self.resolve(*args, &block)
27
+ instance.resolve(*args, &block)
7
28
  end
8
29
 
9
- def resolve(type, options = {})
10
- case type
11
- when :array
12
- elements_opt = options.fetch(:elements)
13
- Arrow::ListDataType.new(
14
- if elements_opt.is_a?(Hash)
15
- resolve(elements_opt.fetch(:type), elements_opt)
16
- else
17
- resolve(elements_opt)
18
- end
19
- )
20
- when :boolean
21
- Arrow::BooleanDataType.new
22
- when :integer
23
- if options.fetch(:unsigned, false) == true
24
- Arrow::UInt32DataType.new
25
- else
26
- Arrow::Int32DataType.new
27
- end
28
- when :long
29
- if options.fetch(:unsigned, false) == true
30
- Arrow::UInt64DataType.new
31
- else
32
- Arrow::Int64DataType.new
33
- end
34
- when :timestamp
35
- Arrow::TimestampDataType.new(
36
- options.fetch(:unit, :second)
37
- )
38
- when :string
39
- Arrow::StringDataType.new
40
- when :map
41
- map_value = options.fetch(:value)
42
- Arrow::MapDataType.new(
43
- resolve(options.fetch(:key)),
44
- if map_value.is_a?(Hash)
45
- resolve(map_value.fetch(:type), map_value)
46
- else
47
- resolve(map_value)
48
- end
49
- )
30
+ def resolve(type, options = {}, &block)
31
+ if type.is_a?(Symbol)
32
+ resolve_from_symbol(type, options, &block)
50
33
  else
51
- raise Error, "unknown type: #{type}"
34
+ type.new(options, &block)
52
35
  end
53
36
  end
54
- end
55
- end
56
37
 
57
- private
38
+ private
58
39
 
59
- def build_arrow_type(type, options = {})
40
+ def resolve_from_symbol(type, options, &block)
41
+ type_klass = self.class.registered_types.fetch(type.to_sym, nil)
42
+ raise Parqueteur::TypeNotFound, type if type_klass.nil?
60
43
 
44
+ type_klass.new(options, &block)
45
+ end
46
+ end
61
47
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parqueteur
4
+ module Types
5
+ class ArrayType < Parqueteur::Type
6
+ def build_value_array(values)
7
+ Arrow::ListArrayBuilder.build(arrow_type, values)
8
+ end
9
+
10
+ def arrow_type_builder
11
+ Arrow::ListDataType.new(
12
+ if options[:elements].is_a?(Hash)
13
+ resolve(options[:elements].fetch(:type), options[:elements]).arrow_type
14
+ else
15
+ resolve(options[:elements]).arrow_type
16
+ end
17
+ )
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parqueteur
4
+ module Types
5
+ class BooleanType < Parqueteur::Type
6
+ def build_value_array(values)
7
+ Arrow::BooleanArray.new(values)
8
+ end
9
+
10
+ def arrow_type_builder
11
+ Arrow::BooleanDataType.new
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parqueteur
4
+ module Types
5
+ class Int32Type < Parqueteur::Type
6
+ def build_value_array(values)
7
+ if options.fetch(:unsigned, false) == true
8
+ Arrow::UInt32Array.new(values)
9
+ else
10
+ Arrow::Int32Array.new(values)
11
+ end
12
+ end
13
+
14
+ def arrow_type_builder
15
+ if options.fetch(:unsigned, false) == true
16
+ Arrow::UInt32DataType.new
17
+ else
18
+ Arrow::Int32DataType.new
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
24
+
25
+ # when :integer
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parqueteur
4
+ module Types
5
+ class Int64Type < Parqueteur::Type
6
+ def build_value_array(values)
7
+ if options.fetch(:unsigned, false) == true
8
+ Arrow::UInt64Array.new(values)
9
+ else
10
+ Arrow::Int64Array.new(values)
11
+ end
12
+ end
13
+
14
+ def arrow_type_builder
15
+ if options.fetch(:unsigned, false) == true
16
+ Arrow::UInt64DataType.new
17
+ else
18
+ Arrow::Int64DataType.new
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
24
+
25
+ # when :integer
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parqueteur
4
+ module Types
5
+ class MapType < Parqueteur::Type
6
+ def build_value_array(values)
7
+ builder = Arrow::MapArrayBuilder.new(arrow_type)
8
+ values.each do |entry|
9
+ builder.append_value
10
+ next if entry.nil?
11
+
12
+ entry.each do |k, v|
13
+ builder.key_builder.append(k)
14
+ builder.item_builder.append(v)
15
+ end
16
+ end
17
+
18
+ builder.finish
19
+ end
20
+
21
+ def arrow_type_builder
22
+ map_value = options.fetch(:value)
23
+
24
+ Arrow::MapDataType.new(
25
+ resolve(options.fetch(:key)).arrow_type,
26
+ if map_value.is_a?(Hash)
27
+ resolve(map_value.fetch(:type), map_value).arrow_type
28
+ else
29
+ resolve(map_value).arrow_type
30
+ end
31
+ )
32
+ end
33
+ end
34
+ end
35
+ end
36
+
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ # when :timestamp
4
+ # Arrow::TimestampDataType.new(
5
+ # options.fetch(:unit, :second)
6
+ # )
7
+
8
+ module Parqueteur
9
+ module Types
10
+ class StringType < Parqueteur::Type
11
+ def build_value_array(values)
12
+ Arrow::StringArray.new(values)
13
+ end
14
+
15
+ def arrow_type_builder
16
+ Arrow::StringDataType.new
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ # when :timestamp
4
+ # Arrow::TimestampDataType.new(
5
+ # options.fetch(:unit, :second)
6
+ # )
7
+
8
+ module Parqueteur
9
+ module Types
10
+ class StructType < Parqueteur::Type
11
+ def build_value_array(values)
12
+ values.each do |value|
13
+ next if value.nil?
14
+
15
+ value.each_key do |key|
16
+ next if struct_object.key?(key)
17
+
18
+ raise Parqueteur::Error, "Struct field '#{key}' not found"
19
+ end
20
+ end
21
+ Arrow::StructArrayBuilder.build(arrow_type, values)
22
+ end
23
+
24
+ def arrow_type_builder
25
+ Arrow::StructDataType.new(struct_object.to_arrow_type)
26
+ end
27
+
28
+ private
29
+
30
+ def struct_object
31
+ @struct_object ||= Parqueteur::Struct.new(&@block)
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ # when :timestamp
4
+ # Arrow::TimestampDataType.new(
5
+ # options.fetch(:unit, :second)
6
+ # )
7
+
8
+ module Parqueteur
9
+ module Types
10
+ class TimestampType < Parqueteur::Type
11
+ def build_value_array(values)
12
+ Arrow::TimestampArray.new(values)
13
+ end
14
+
15
+ def arrow_type_builder
16
+ Arrow::TimestampDataType.new(
17
+ options.fetch(:unit, :second)
18
+ )
19
+ end
20
+ end
21
+ end
22
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Parqueteur
4
- VERSION = '1.0.2'
4
+ VERSION = '1.2.0'
5
5
  end
data/lib/parqueteur.rb CHANGED
@@ -1,17 +1,29 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "parqueteur/version"
4
- require 'parqueteur/type_resolver'
3
+ require 'json'
4
+ require 'singleton'
5
+ require 'tempfile'
6
+
7
+ require_relative 'parqueteur/version'
5
8
  require 'parqueteur/column'
6
9
  require 'parqueteur/column_collection'
7
10
  require 'parqueteur/converter'
8
- require 'parqueteur/chunked_converter'
9
11
  require 'parqueteur/input'
10
- require 'parqueteur/value_array_builder'
11
- require 'json'
12
+ require 'parqueteur/struct'
13
+ require 'parqueteur/type'
14
+ require 'parqueteur/type_resolver'
15
+ require 'parqueteur/types/array_type'
16
+ require 'parqueteur/types/boolean_type'
17
+ require 'parqueteur/types/int32_type'
18
+ require 'parqueteur/types/int64_type'
19
+ require 'parqueteur/types/map_type'
20
+ require 'parqueteur/types/string_type'
21
+ require 'parqueteur/types/struct_type'
22
+ require 'parqueteur/types/timestamp_type'
12
23
  require 'parquet'
13
24
 
14
25
  module Parqueteur
15
26
  class Error < StandardError; end
27
+ class TypeNotFound < Error; end
16
28
  # Your code goes here...
17
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parqueteur
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julien D.
@@ -38,15 +38,27 @@ files:
38
38
  - Rakefile
39
39
  - bin/console
40
40
  - bin/setup
41
- - example.rb
41
+ - examples/convert-methods.rb
42
+ - examples/convert-to-io.rb
43
+ - examples/convert-with-chunks.rb
44
+ - examples/convert-with-compression.rb
45
+ - examples/convert-without-compression.rb
42
46
  - lib/parqueteur.rb
43
- - lib/parqueteur/chunked_converter.rb
44
47
  - lib/parqueteur/column.rb
45
48
  - lib/parqueteur/column_collection.rb
46
49
  - lib/parqueteur/converter.rb
47
50
  - lib/parqueteur/input.rb
51
+ - lib/parqueteur/struct.rb
52
+ - lib/parqueteur/type.rb
48
53
  - lib/parqueteur/type_resolver.rb
49
- - lib/parqueteur/value_array_builder.rb
54
+ - lib/parqueteur/types/array_type.rb
55
+ - lib/parqueteur/types/boolean_type.rb
56
+ - lib/parqueteur/types/int32_type.rb
57
+ - lib/parqueteur/types/int64_type.rb
58
+ - lib/parqueteur/types/map_type.rb
59
+ - lib/parqueteur/types/string_type.rb
60
+ - lib/parqueteur/types/struct_type.rb
61
+ - lib/parqueteur/types/timestamp_type.rb
50
62
  - lib/parqueteur/version.rb
51
63
  - parqueteur.gemspec
52
64
  - test.json
data/example.rb DELETED
@@ -1,20 +0,0 @@
1
- require 'bundler/setup'
2
- require 'parqueteur'
3
-
4
- class Foo < Parqueteur::Converter
5
- column :id, :long
6
- column :reference, :string
7
- column :hash, :map, key: :string, value: :string
8
- column :valid, :boolean
9
- column :total, :integer
10
- end
11
-
12
- LETTERS = ('a'..'z').to_a
13
-
14
- data = 1000.times.collect do |i|
15
- { 'id' => i + 1, 'reference' => "coucou:#{i}", 'hash' => { 'a' => LETTERS.sample }, 'valid' => rand < 0.5, 'total' => rand(100..500) }
16
- end
17
-
18
- chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
19
- pp chunked_converter.write_files('test')
20
- # puts Foo.convert(data, output: 'test.parquet')
@@ -1,28 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Parqueteur
4
- class ChunkedConverter
5
- attr_reader :schema
6
-
7
- def initialize(input, converter, chunk_size = 200)
8
- @input = Parqueteur::Input.from(input)
9
- @converter = converter
10
- @chunk_size = chunk_size
11
- end
12
-
13
- def chunks
14
- Enumerator.new do |arr|
15
- @input.each_slice(@chunk_size) do |chunk|
16
- local_converter = @converter.new(chunk)
17
- arr << local_converter.to_io
18
- end
19
- end
20
- end
21
-
22
- def write_files(prefix)
23
- chunks.each_with_index do |chunk, idx|
24
- File.write("#{prefix}.#{idx}.parquet", chunk.read)
25
- end
26
- end
27
- end
28
- end
@@ -1,59 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Parqueteur
4
- class ValueArrayBuilder
5
- attr_reader :type, :options, :arrow_type
6
-
7
- def self.build(input, type, options)
8
- new(type, options).build(input)
9
- end
10
-
11
- def initialize(type, options)
12
- @type = type
13
- @options = options
14
- @arrow_type = Parqueteur::TypeResolver.resolve(type, options)
15
- end
16
-
17
- def build(input)
18
- return if input.nil?
19
-
20
- case type
21
- when :array
22
- Arrow::ListArrayBuilder.build(arrow_type, input)
23
- when :map
24
- builder = Arrow::MapArrayBuilder.new(arrow_type)
25
- input.each do |entry|
26
- builder.append_value
27
- next if entry.nil?
28
-
29
- entry.each do |k, v|
30
- builder.key_builder.append(k)
31
- builder.item_builder.append(v)
32
- end
33
- end
34
-
35
- builder.finish
36
- when :boolean
37
- Arrow::BooleanArray.new(input)
38
- when :integer
39
- if options.fetch(:unsigned, false) == true
40
- Arrow::UInt32Array.new(input)
41
- else
42
- Arrow::Int32Array.new(input)
43
- end
44
- when :long
45
- if options.fetch(:unsigned, false) == true
46
- Arrow::UInt64Array.new(input)
47
- else
48
- Arrow::Int64Array.new(input)
49
- end
50
- when :string
51
- Arrow::StringArray.new(input)
52
- when :timestamp
53
- Arrow::TimestampArray.new(input)
54
- else
55
- raise Error, "unknown type: #{type}"
56
- end
57
- end
58
- end
59
- end