parqueteur 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b60aefc8e90564af2abeeb01a36aaaabe8cf08c65efeebdbf6f034372c8100c1
4
- data.tar.gz: b4f93aeac25321e50fdf8d2a8c8e0216feb4ec68d0ac356d2ed98118820c61a4
3
+ metadata.gz: 056b9208a8bffcd163464dbd2cf276a9b0704e96788b77555d545eb339a4e798
4
+ data.tar.gz: 1e20d31b1fc6f198fee42546939ce289d71d66f65ffa66562cdd7841e0f24f61
5
5
  SHA512:
6
- metadata.gz: 1dbb0d1a870ff2291909014f595a6c63a35c11c80adc4e1972e20852ac87ba20447ce31dee9feb2f11dcea1a2ec5276ad4a05ca3eb406ab26a2b606ea369b809
7
- data.tar.gz: ef4dec6ca81564972112468dc1729053c9c03bdc61a947a01990c1a870dc45e8136561553390fbfbb1b49d22ca1f8221a94cb8403a7ecfc6bf18f10ba07acd9b
6
+ metadata.gz: fe08a7b282c4ededc08acb5aa9f4b485ead828aee4fd1444e8bb1af80cc56ea8c20411aefe136809f91ad808bee52db261218e8b5e6b7538bfa53d1eb38eb4b5
7
+ data.tar.gz: 0fee8ec94698b7b4c9d3a089fd0094a52bd83dfda56d0652f8a5b08dfe84a88b251736e62a9da7f510e0fa3d1842e2551161178ce30b5e0f5c6ee9b903917a2c
data/.gitignore CHANGED
@@ -6,3 +6,6 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ /*.gem
10
+ /tmp/*
11
+ !/tmp/.keep
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- parqueteur (1.0.2)
4
+ parqueteur (1.2.0)
5
5
  red-parquet (~> 5.0)
6
6
 
7
7
  GEM
@@ -0,0 +1,54 @@
# Example script: builds 10,000 in-memory records and runs them through the
# three output helpers of a gzip-compressed converter (#to_io,
# #to_arrow_table and #write).
require 'bundler/setup'
require 'parqueteur'
require 'securerandom'
require 'benchmark'

# Demo schema mixing scalar, map, array and struct columns.
class Foo < Parqueteur::Converter
  column :id, :bigint
  column :reference, :string
  column :hash, :map, key: :string, value: :string
  # column :hash2, :map, key: :string, value: :string
  # column :hash3, :map, key: :string, value: :string
  column :valid, :boolean
  column :total, :integer
  column :numbers, :array, elements: :integer
  # Nested struct column; its fields are declared inside the block.
  column :my_struct, :struct do
    field :test, :string
    field :mon_nombre, :integer
  end
end

# Returns a Hash with the keys 'a' through 'k', each mapped to a freshly
# generated random hex string (SecureRandom.hex(128) => 256 hex characters).
def random_hash
  ('a'..'k').each_with_object({}) do |letter, payload|
    payload[letter] = SecureRandom.hex(128)
  end
end

# One Hash per row, keyed by column name. The rows do not set 'my_struct'.
data = Array.new(10_000) do |i|
  {
    'id' => i + 1,
    'reference' => "coucou:#{i}",
    'hash' => random_hash,
    # 'hash2' => random_hash,
    # 'hash3' => random_hash,
    'valid' => rand < 0.5,
    'total' => rand(100..500),
    'numbers' => [1, 2, 3]
  }
end
puts "data generation OK"

converter = Foo.new(data, compression: :gzip)
# In-memory IO holding the Parquet bytes.
pp converter.to_io
# Same data loaded back as an Arrow table.
pp converter.to_arrow_table
# Write the Parquet file to disk.
converter.write('tmp/test.parquet')
@@ -0,0 +1,52 @@
# Example script: converts 10,000 in-memory records with the class-level
# Foo.convert helper and prints the bytes read from the returned IO.
require 'bundler/setup'
require 'parqueteur'
require 'securerandom'
require 'benchmark'

# Demo schema mixing scalar, map, array and struct columns.
class Foo < Parqueteur::Converter
  column :id, :bigint
  column :reference, :string
  column :hash, :map, key: :string, value: :string
  # column :hash2, :map, key: :string, value: :string
  # column :hash3, :map, key: :string, value: :string
  column :valid, :boolean
  column :total, :integer
  column :numbers, :array, elements: :integer
  # Nested struct column; its fields are declared inside the block.
  column :my_struct, :struct do
    field :test, :string
    field :mon_nombre, :integer
  end
end

# Returns a Hash with the keys 'a' through 'k', each mapped to a freshly
# generated random hex string (SecureRandom.hex(128) => 256 hex characters).
def random_hash
  ('a'..'k').each_with_object({}) do |letter, payload|
    payload[letter] = SecureRandom.hex(128)
  end
end

# One Hash per row, keyed by column name. The rows do not set 'my_struct'.
data = Array.new(10_000) do |i|
  {
    'id' => i + 1,
    'reference' => "coucou:#{i}",
    'hash' => random_hash,
    # 'hash2' => random_hash,
    # 'hash3' => random_hash,
    'valid' => rand < 0.5,
    'total' => rand(100..500),
    'numbers' => [1, 2, 3]
  }
end
puts "data generation OK"

# No keyword options are passed here, so no compression codec is requested.
io = Foo.convert(data)
pp io.read
@@ -0,0 +1,54 @@
# Example script: builds 10,000 in-memory records and splits the conversion
# into chunks of 200 records, printing the path of each chunk file.
require 'bundler/setup'
require 'parqueteur'
require 'securerandom'
require 'benchmark'

# Demo schema mixing scalar, map, array and struct columns.
class Foo < Parqueteur::Converter
  column :id, :bigint
  column :reference, :string
  column :hash, :map, key: :string, value: :string
  # column :hash2, :map, key: :string, value: :string
  # column :hash3, :map, key: :string, value: :string
  column :valid, :boolean
  column :total, :integer
  column :numbers, :array, elements: :integer
  # Nested struct column; its fields are declared inside the block.
  column :my_struct, :struct do
    field :test, :string
    field :mon_nombre, :integer
  end
end

# Returns a Hash with the keys 'a' through 'k', each mapped to a freshly
# generated random hex string (SecureRandom.hex(128) => 256 hex characters).
def random_hash
  ('a'..'k').each_with_object({}) do |letter, payload|
    payload[letter] = SecureRandom.hex(128)
  end
end

# One Hash per row, keyed by column name. The rows do not set 'my_struct'.
data = Array.new(10_000) do |i|
  {
    'id' => i + 1,
    'reference' => "coucou:#{i}",
    'hash' => random_hash,
    # 'hash2' => random_hash,
    # 'hash3' => random_hash,
    'valid' => rand < 0.5,
    'total' => rand(100..500),
    'numbers' => [1, 2, 3]
  }
end
puts "data generation OK"

converter = Foo.new(data, compression: :gzip)
# Each yielded chunk responds to #path; print it alongside its index.
chunks = converter.split(200)
chunks.each_with_index do |chunk, idx|
  puts "#{idx}: #{chunk.path}"
end
@@ -0,0 +1,52 @@
# Example script: builds 10,000 in-memory records and writes them to
# tmp/test.parquet with gzip compression via Foo.convert_to.
require 'bundler/setup'
require 'parqueteur'
require 'securerandom'
require 'benchmark'

# Demo schema mixing scalar, map, array and struct columns.
class Foo < Parqueteur::Converter
  column :id, :bigint
  column :reference, :string
  column :hash, :map, key: :string, value: :string
  # column :hash2, :map, key: :string, value: :string
  # column :hash3, :map, key: :string, value: :string
  column :valid, :boolean
  column :total, :integer
  column :numbers, :array, elements: :integer
  # Nested struct column; its fields are declared inside the block.
  column :my_struct, :struct do
    field :test, :string
    field :mon_nombre, :integer
  end
end

# Returns a Hash with the keys 'a' through 'k', each mapped to a freshly
# generated random hex string (SecureRandom.hex(128) => 256 hex characters).
def random_hash
  ('a'..'k').each_with_object({}) do |letter, payload|
    payload[letter] = SecureRandom.hex(128)
  end
end

# One Hash per row, keyed by column name. The rows do not set 'my_struct'.
data = Array.new(10_000) do |i|
  {
    'id' => i + 1,
    'reference' => "coucou:#{i}",
    'hash' => random_hash,
    # 'hash2' => random_hash,
    # 'hash3' => random_hash,
    'valid' => rand < 0.5,
    'total' => rand(100..500),
    'numbers' => [1, 2, 3]
  }
end
puts "data generation OK"

# Destination file, relative to the current working directory.
path = 'tmp/test.parquet'
Foo.convert_to(data, path, compression: :gzip)
@@ -0,0 +1,52 @@
# Example script: builds 10,000 in-memory records and writes them to
# tmp/test.parquet via Foo.convert_to without requesting a compression codec.
require 'bundler/setup'
require 'parqueteur'
require 'securerandom'
require 'benchmark'

# Demo schema mixing scalar, map, array and struct columns.
class Foo < Parqueteur::Converter
  column :id, :bigint
  column :reference, :string
  column :hash, :map, key: :string, value: :string
  # column :hash2, :map, key: :string, value: :string
  # column :hash3, :map, key: :string, value: :string
  column :valid, :boolean
  column :total, :integer
  column :numbers, :array, elements: :integer
  # Nested struct column; its fields are declared inside the block.
  column :my_struct, :struct do
    field :test, :string
    field :mon_nombre, :integer
  end
end

# Returns a Hash with the keys 'a' through 'k', each mapped to a freshly
# generated random hex string (SecureRandom.hex(128) => 256 hex characters).
def random_hash
  ('a'..'k').each_with_object({}) do |letter, payload|
    payload[letter] = SecureRandom.hex(128)
  end
end

# One Hash per row, keyed by column name. The rows do not set 'my_struct'.
data = Array.new(10_000) do |i|
  {
    'id' => i + 1,
    'reference' => "coucou:#{i}",
    'hash' => random_hash,
    # 'hash2' => random_hash,
    # 'hash3' => random_hash,
    'valid' => rand < 0.5,
    'total' => rand(100..500),
    'numbers' => [1, 2, 3]
  }
end
puts "data generation OK"

# Destination file, relative to the current working directory.
path = 'tmp/test.parquet'
Foo.convert_to(data, path)
@@ -4,31 +4,14 @@ module Parqueteur
4
4
  class Column
5
5
  attr_reader :name, :type, :options
6
6
 
7
- def initialize(name, type, options = {})
7
+ def initialize(name, type, options = {}, &block)
8
8
  @name = name.to_s
9
- @type = type
9
+ @type = Parqueteur::TypeResolver.resolve(type, options, &block)
10
10
  @options = options
11
11
  end
12
12
 
13
13
  def arrow_type
14
- @arrow_type ||= Parqueteur::TypeResolver.resolve(@type, @options)
15
- end
16
-
17
- def cast(value)
18
- case @type
19
- when :string then value.to_s
20
- when :boolean then value == true
21
- when :integer then value.to_i
22
- when :long then value.to_i
23
- when :timestamp
24
- case value
25
- when String then Time.parse(value).to_i
26
- when Integer then value
27
- else
28
- raise ArgumentError, "Unable to cast '#{value}' to timestamp"
29
- end
30
- when :map then value
31
- end
14
+ @type.arrow_type
32
15
  end
33
16
 
34
17
  def to_arrow_field
@@ -4,11 +4,18 @@ module Parqueteur
4
4
  class ColumnCollection
5
5
  include Enumerable
6
6
 
7
+ attr_reader :column_names
8
+
7
9
  def initialize
8
10
  @columns = []
11
+ @column_names = []
9
12
  @columns_idx = {}
10
13
  end
11
14
 
15
+ def key?(key)
16
+ @columns_idx.key?(key)
17
+ end
18
+
12
19
  def each(&block)
13
20
  @columns.each(&block)
14
21
  end
@@ -17,6 +24,7 @@ module Parqueteur
17
24
  unless @columns_idx.key?(column.name)
18
25
  @columns_idx[column.name] = column
19
26
  @columns << column
27
+ @column_names << column.name
20
28
  end
21
29
 
22
30
  true
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Parqueteur
4
4
  class Converter
5
- attr_reader :schema
5
+ DEFAULT_BATCH_SIZE = 10
6
6
 
7
7
  def self.inline(&block)
8
8
  Class.new(self, &block)
@@ -12,104 +12,137 @@ module Parqueteur
12
12
  @columns ||= Parqueteur::ColumnCollection.new
13
13
  end
14
14
 
15
- def self.column(name, type, options = {})
16
- columns.add(Parqueteur::Column.new(name, type, options))
15
+ def self.column(name, type, options = {}, &block)
16
+ columns.add(Parqueteur::Column.new(name, type, options, &block))
17
17
  end
18
18
 
19
19
  def self.transforms
20
20
  @transforms ||= []
21
21
  end
22
22
 
23
- def self.transform(method_name, &block)
23
+ def self.transform(method_name = nil, &block)
24
24
  transforms << (method_name || block)
25
25
  end
26
26
 
27
- def self.convert(input, output: nil)
28
- converter = new(input)
29
- if !output.nil?
30
- converter.write(output)
31
- else
32
- converter.to_blob
27
+ def self.convert(input, **kwargs)
28
+ new(input, **kwargs).to_io
29
+ end
30
+
31
+ def self.convert_to(input, output_path, **kwargs)
32
+ converter = new(input, **kwargs)
33
+ converter.write(output_path)
34
+ end
35
+
36
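+ # @param input [Enumerable] records to convert (wrapped with Parqueteur::Input.from)
37
+ # @option kwargs [Symbol, String] :compression codec given to the Parquet writer, e.g. :gzip (kwargs also accepts :batch_size, default DEFAULT_BATCH_SIZE)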
38
+ def initialize(input, **kwargs)
39
+ @input = Parqueteur::Input.from(input)
40
+ @batch_size = kwargs.fetch(:batch_size, DEFAULT_BATCH_SIZE)
41
+ @compression = kwargs.fetch(:compression, nil)&.to_sym
42
+ end
43
+
44
+ def split(size)
45
+ Enumerator.new do |arr|
46
+ @input.each_slice(size) do |records|
47
+ local_converter = self.class.new(
48
+ records, batch_size: @batch_size, compression: @compression
49
+ )
50
+ file = local_converter.to_tmpfile
51
+ arr << file
52
+ file.close
53
+ file.unlink
54
+ end
33
55
  end
34
56
  end
35
57
 
36
- def initialize(input, options = {})
37
- @input = Parqueteur::Input.from(input, options)
38
- end
39
-
40
- def write(output)
41
- case output
42
- when :io
43
- to_io
44
- when String
45
- to_arrow_table.save(output)
46
- when StringIO, IO
47
- buffer = Arrow::ResizableBuffer.new(0)
48
- to_arrow_table.save(buffer, format: :parquet)
49
- output.write(buffer.data.to_s)
50
- output.rewind
51
- output
52
- else
53
- raise ArgumentError, "unsupported output: #{output.class}, accepted: String (filename), IO, StringIO"
58
+ def split_by_io(size)
59
+ Enumerator.new do |arr|
60
+ @input.each_slice(size) do |records|
61
+ local_converter = self.class.new(records)
62
+ arr << local_converter.to_io
63
+ end
54
64
  end
55
65
  end
56
66
 
57
- def to_s
58
- inspect
67
+ def write(path)
68
+ arrow_schema = self.class.columns.arrow_schema
69
+ writer_properties = Parquet::WriterProperties.new
70
+ writer_properties.set_compression(@compression) unless @compression.nil?
71
+
72
+ Arrow::FileOutputStream.open(path, false) do |output|
73
+ Parquet::ArrowFileWriter.open(arrow_schema, output, writer_properties) do |writer|
74
+ @input.each_slice(@batch_size) do |records|
75
+ arrow_table = build_arrow_table(records)
76
+ writer.write_table(arrow_table, 1024)
77
+ end
78
+ end
79
+ end
80
+
81
+ true
82
+ end
83
+
84
+ def to_tmpfile
85
+ tempfile = Tempfile.new
86
+ tempfile.binmode
87
+ write(tempfile.path)
88
+ tempfile.rewind
89
+ tempfile
59
90
  end
60
91
 
61
92
  def to_io
62
- write(StringIO.new)
93
+ tmpfile = to_tmpfile
94
+ strio = StringIO.new(tmpfile.read)
95
+ tmpfile.close
96
+ tmpfile.unlink
97
+ strio
98
+ end
99
+
100
+ def to_arrow_table
101
+ file = to_tmpfile
102
+ table = Arrow::Table.load(file.path, format: :parquet)
103
+ file.close
104
+ file.unlink
105
+ table
63
106
  end
64
107
 
65
108
  def to_blob
66
- write(StringIO.new).read
109
+ to_io.read
67
110
  end
68
111
 
69
- def to_arrow_table
70
- transforms = self.class.transforms
112
+ private
71
113
 
72
- chunks = {}
73
- @input.each_slice(100) do |items|
74
- values = self.class.columns.each_with_object({}) do |column, hash|
75
- hash[column.name] = []
76
- end
114
+ def build_arrow_table(records)
115
+ transforms = self.class.transforms
77
116
 
78
- items.each do |item|
79
- if transforms.length > 0
80
- transforms.each do |transform|
81
- item = \
82
- if transform.is_a?(Symbol)
83
- __send__(transform, item)
84
- else
85
- transform.call(item)
86
- end
87
- end
88
- end
117
+ values = self.class.columns.each_with_object({}) do |column, hash|
118
+ hash[column.name] = []
119
+ end
89
120
 
90
- values.each_key do |value_key|
91
- if item.key?(value_key)
92
- values[value_key] << item[value_key]
93
- else
94
- values[value_key] << nil
95
- end
121
+ records.each do |item|
122
+ if transforms.length > 0
123
+ transforms.each do |transform|
124
+ item = \
125
+ if transform.is_a?(Symbol)
126
+ __send__(transform, item)
127
+ else
128
+ transform.call(item)
129
+ end
96
130
  end
97
131
  end
98
132
 
99
- values.each_with_object(chunks) do |item, hash|
100
- column = self.class.columns.find(item[0])
101
- hash[item[0]] ||= []
102
- hash[item[0]].push(
103
- Parqueteur::ValueArrayBuilder.build(
104
- item[1], column.type, column.options
105
- )
106
- )
133
+ values.each_key do |value_key|
134
+ if item.key?(value_key)
135
+ values[value_key] << item[value_key]
136
+ else
137
+ values[value_key] << nil
138
+ end
107
139
  end
108
140
  end
109
141
 
110
142
  Arrow::Table.new(
111
- chunks.transform_values! do |value|
112
- Arrow::ChunkedArray.new(value)
143
+ values.each_with_object({}) do |item, hash|
144
+ column = self.class.columns.find(item[0])
145
+ hash[item[0]] = column.type.build_value_array(item[1])
113
146
  end
114
147
  )
115
148
  end
@@ -4,40 +4,25 @@ module Parqueteur
4
4
  class Input
5
5
  include Enumerable
6
6
 
7
- def self.from(arg, options = {})
8
- new(
9
- case arg
10
- when String
11
- if File.exist?(arg)
12
- File.new(arg, 'r')
13
- else
14
- arg.split("\n")
15
- end
16
- when Array, Enumerator
17
- arg
18
- end,
19
- options
20
- )
7
+ def self.from(arg)
8
+ return arg if arg.is_a?(self)
9
+
10
+ new(arg)
21
11
  end
22
12
 
23
- def initialize(source, options = {})
13
+ def initialize(source)
14
+ unless source.is_a?(Enumerable)
15
+ raise ArgumentError, 'Enumerable object expected'
16
+ end
17
+
24
18
  @source = source
25
- @options = options
26
19
  end
27
20
 
28
21
  def each(&block)
29
- case @source
30
- when File
31
- if @options.fetch(:json_newlines, true) == true
32
- @source.each_line do |line|
33
- yield(JSON.parse(line.strip))
34
- end
35
- else
36
- JSON.parse(@source.read).each(&block)
37
- end
38
- @source.rewind
39
- when Array, Enumerator
22
+ if block_given?
40
23
  @source.each(&block)
24
+ else
25
+ @source.to_enum(:each)
41
26
  end
42
27
  end
43
28
  end
@@ -0,0 +1,25 @@
# frozen_string_literal: true

module Parqueteur
  # Collects the fields declared by a definition block. The block is run in
  # the context of the new instance, so bare `field ...` calls inside it
  # register columns on this struct.
  class Struct
    # @yield definition block, evaluated with `self` set to this struct
    def initialize(&block)
      instance_exec(&block)
    end

    # Lazily created collection holding the declared fields.
    #
    # @return [Parqueteur::ColumnCollection]
    def fields
      @fields ||= Parqueteur::ColumnCollection.new
    end

    # Declares one field; the arguments are handed to Parqueteur::Column.new
    # unchanged and the resulting column is added to #fields.
    def field(name, type, options = {}, &block)
      declared = Parqueteur::Column.new(name, type, options, &block)
      fields.add(declared)
    end

    # @return whatever the field collection's #key? reports for `key`
    def key?(key)
      fields.key?(key)
    end

    # @return [Array] the result of #to_arrow_field for every declared field
    def to_arrow_type
      fields.map { |declared| declared.to_arrow_field }
    end
  end
end
@@ -0,0 +1,21 @@
# frozen_string_literal: true

module Parqueteur
  # Base class for column types.
  #
  # A subclass is expected to:
  # * define `arrow_type_builder`, which the constructor calls once to compute
  #   the value exposed through #arrow_type;
  # * override #build_value_array to turn Ruby values into an Arrow array.
  class Type
    attr_reader :options, :arrow_type

    # @param options [Hash] type-specific options, kept as-is in #options
    # @yield optional definition block, stored for subclasses that need it
    def initialize(options = {}, &block)
      @options = options
      @block = block
      # Computed eagerly, so `arrow_type_builder` must only depend on
      # @options and @block.
      @arrow_type = arrow_type_builder
    end

    # Abstract hook: concrete types build and return the Arrow array for
    # `values`.
    #
    # @param values [Enumerable] raw values of one column
    # @raise [RuntimeError] always, in this base implementation
    def build_value_array(values)
      # The message names the method that is actually missing.
      raise "#build_value_array must be implemented in #{self.class}"
    end

    # Resolves a nested type (e.g. the element type of a list) through the
    # shared resolver. The optional block is forwarded so that block-based
    # types can be resolved as well.
    #
    # @param type [Symbol, Class] type identifier understood by the resolver
    # @param options [Hash] options for the nested type
    # @return the object produced by Parqueteur::TypeResolver.resolve
    def resolve(type, options = {}, &block)
      Parqueteur::TypeResolver.resolve(type, options, &block)
    end
  end
end
@@ -2,60 +2,46 @@
2
2
 
3
3
  module Parqueteur
4
4
  class TypeResolver
5
- def self.resolve(*args)
6
- new.resolve(*args)
5
+ include Singleton
6
+
7
+ def self.registered_types
8
+ @registered_types ||= {
9
+ array: Parqueteur::Types::ArrayType,
10
+ bigint: Parqueteur::Types::Int64Type,
11
+ boolean: Parqueteur::Types::BooleanType,
12
+ int32: Parqueteur::Types::Int32Type,
13
+ int64: Parqueteur::Types::Int64Type,
14
+ integer: Parqueteur::Types::Int32Type,
15
+ map: Parqueteur::Types::MapType,
16
+ string: Parqueteur::Types::StringType,
17
+ struct: Parqueteur::Types::StructType,
18
+ timestamp: Parqueteur::Types::TimestampType
19
+ }
20
+ end
21
+
22
+ def self.register_type(type, klass)
23
+ registered_types[type] = klass
24
+ end
25
+
26
+ def self.resolve(*args, &block)
27
+ instance.resolve(*args, &block)
7
28
  end
8
29
 
9
- def resolve(type, options = {})
10
- case type
11
- when :array
12
- elements_opt = options.fetch(:elements)
13
- Arrow::ListDataType.new(
14
- if elements_opt.is_a?(Hash)
15
- resolve(elements_opt.fetch(:type), elements_opt)
16
- else
17
- resolve(elements_opt)
18
- end
19
- )
20
- when :boolean
21
- Arrow::BooleanDataType.new
22
- when :integer
23
- if options.fetch(:unsigned, false) == true
24
- Arrow::UInt32DataType.new
25
- else
26
- Arrow::Int32DataType.new
27
- end
28
- when :long
29
- if options.fetch(:unsigned, false) == true
30
- Arrow::UInt64DataType.new
31
- else
32
- Arrow::Int64DataType.new
33
- end
34
- when :timestamp
35
- Arrow::TimestampDataType.new(
36
- options.fetch(:unit, :second)
37
- )
38
- when :string
39
- Arrow::StringDataType.new
40
- when :map
41
- map_value = options.fetch(:value)
42
- Arrow::MapDataType.new(
43
- resolve(options.fetch(:key)),
44
- if map_value.is_a?(Hash)
45
- resolve(map_value.fetch(:type), map_value)
46
- else
47
- resolve(map_value)
48
- end
49
- )
30
+ def resolve(type, options = {}, &block)
31
+ if type.is_a?(Symbol)
32
+ resolve_from_symbol(type, options, &block)
50
33
  else
51
- raise Error, "unknown type: #{type}"
34
+ type.new(options, &block)
52
35
  end
53
36
  end
54
- end
55
- end
56
37
 
57
- private
38
+ private
58
39
 
59
- def build_arrow_type(type, options = {})
40
+ def resolve_from_symbol(type, options, &block)
41
+ type_klass = self.class.registered_types.fetch(type.to_sym, nil)
42
+ raise Parqueteur::TypeNotFound, type if type_klass.nil?
60
43
 
44
+ type_klass.new(options, &block)
45
+ end
46
+ end
61
47
  end
@@ -0,0 +1,21 @@
# frozen_string_literal: true

module Parqueteur
  module Types
    # List column type. The element type is read from options[:elements],
    # which is either a type identifier or a Hash carrying a :type entry
    # (the Hash itself is passed on as the element type's options).
    class ArrayType < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return the array built by Arrow::ListArrayBuilder for #arrow_type
      def build_value_array(values)
        Arrow::ListArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::ListDataType] list type wrapping the element's Arrow type
      def arrow_type_builder
        elements = options[:elements]
        element_type =
          if elements.is_a?(Hash)
            resolve(elements.fetch(:type), elements)
          else
            resolve(elements)
          end

        Arrow::ListDataType.new(element_type.arrow_type)
      end
    end
  end
end
@@ -0,0 +1,15 @@
# frozen_string_literal: true

module Parqueteur
  module Types
    # Boolean column type.
    class BooleanType < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return [Arrow::BooleanArray]
      def build_value_array(values)
        Arrow::BooleanArray.new(values)
      end

      # @return [Arrow::BooleanDataType]
      def arrow_type_builder
        Arrow::BooleanDataType.new
      end
    end
  end
end
@@ -0,0 +1,25 @@
# frozen_string_literal: true

module Parqueteur
  module Types
    # 32-bit integer column type. Uses the unsigned Arrow classes when the
    # :unsigned option is exactly `true`, the signed ones otherwise.
    class Int32Type < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return [Arrow::UInt32Array, Arrow::Int32Array]
      def build_value_array(values)
        array_class = unsigned? ? Arrow::UInt32Array : Arrow::Int32Array
        array_class.new(values)
      end

      # @return [Arrow::UInt32DataType, Arrow::Int32DataType]
      def arrow_type_builder
        data_type_class = unsigned? ? Arrow::UInt32DataType : Arrow::Int32DataType
        data_type_class.new
      end

      private

      # True only when options[:unsigned] is literally `true`.
      def unsigned?
        options.fetch(:unsigned, false) == true
      end
    end
  end
end
@@ -0,0 +1,25 @@
# frozen_string_literal: true

module Parqueteur
  module Types
    # 64-bit integer column type. Uses the unsigned Arrow classes when the
    # :unsigned option is exactly `true`, the signed ones otherwise.
    class Int64Type < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return [Arrow::UInt64Array, Arrow::Int64Array]
      def build_value_array(values)
        array_class = unsigned? ? Arrow::UInt64Array : Arrow::Int64Array
        array_class.new(values)
      end

      # @return [Arrow::UInt64DataType, Arrow::Int64DataType]
      def arrow_type_builder
        data_type_class = unsigned? ? Arrow::UInt64DataType : Arrow::Int64DataType
        data_type_class.new
      end

      private

      # True only when options[:unsigned] is literally `true`.
      def unsigned?
        options.fetch(:unsigned, false) == true
      end
    end
  end
end
@@ -0,0 +1,36 @@
# frozen_string_literal: true

module Parqueteur
  module Types
    # Map column type. Requires the :key and :value options; :value is either
    # a type identifier or a Hash carrying a :type entry (the Hash itself is
    # passed on as the value type's options).
    class MapType < Parqueteur::Type
      # Appends one map slot per row and fills it with the row's pairs.
      # A slot is appended before the nil check, so a nil row still occupies
      # a slot but contributes no key/item pairs.
      #
      # @param values [Array<Hash, nil>] one entry per row
      # @return the array produced by Arrow::MapArrayBuilder#finish
      def build_value_array(values)
        map_builder = Arrow::MapArrayBuilder.new(arrow_type)

        values.each do |row|
          map_builder.append_value
          row&.each do |map_key, map_item|
            map_builder.key_builder.append(map_key)
            map_builder.item_builder.append(map_item)
          end
        end

        map_builder.finish
      end

      # @return [Arrow::MapDataType] map type built from the resolved key and
      #   value Arrow types
      # @raise [KeyError] when :key or :value is missing from the options
      def arrow_type_builder
        value_spec = options.fetch(:value)
        key_arrow_type = resolve(options.fetch(:key)).arrow_type
        value_type =
          if value_spec.is_a?(Hash)
            resolve(value_spec.fetch(:type), value_spec)
          else
            resolve(value_spec)
          end

        Arrow::MapDataType.new(key_arrow_type, value_type.arrow_type)
      end
    end
  end
end

@@ -0,0 +1,20 @@
# frozen_string_literal: true

module Parqueteur
  module Types
    # String column type.
    class StringType < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return [Arrow::StringArray]
      def build_value_array(values)
        Arrow::StringArray.new(values)
      end

      # @return [Arrow::StringDataType]
      def arrow_type_builder
        Arrow::StringDataType.new
      end
    end
  end
end
@@ -0,0 +1,35 @@
# frozen_string_literal: true

module Parqueteur
  module Types
    # Struct column type. Its fields come from the definition block given
    # when the type is created; the block is evaluated by Parqueteur::Struct.
    class StructType < Parqueteur::Type
      # Checks every non-nil row for undeclared keys, then builds the array.
      #
      # @param values [Array<Hash, nil>] one entry per row
      # @return the array built by Arrow::StructArrayBuilder for #arrow_type
      # @raise [Parqueteur::Error] when a row has a key the struct does not declare
      def build_value_array(values)
        values.each do |row|
          ensure_known_fields(row) unless row.nil?
        end

        Arrow::StructArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::StructDataType] struct type made of the declared fields
      def arrow_type_builder
        Arrow::StructDataType.new(struct_object.to_arrow_type)
      end

      private

      # Raises on the first key of `row` that the struct definition lacks.
      def ensure_known_fields(row)
        row.each_key do |key|
          raise Parqueteur::Error, "Struct field '#{key}' not found" unless struct_object.key?(key)
        end
      end

      # Struct definition, built once from the stored block and memoized.
      def struct_object
        @struct_object ||= Parqueteur::Struct.new(&@block)
      end
    end
  end
end
@@ -0,0 +1,22 @@
# frozen_string_literal: true

module Parqueteur
  module Types
    # Timestamp column type. The resolution comes from the :unit option and
    # defaults to :second.
    class TimestampType < Parqueteur::Type
      # Builds the timestamp array for one column.
      #
      # The data type is passed explicitly so the values are built with the
      # same unit as the column's declared Arrow type. Red Arrow's timestamp
      # builder takes a unit or data type as its first argument, so calling
      # `Arrow::TimestampArray.new(values)` alone gives it no unit to use.
      #
      # @param values [Array] one entry per row
      # @return [Arrow::TimestampArray]
      def build_value_array(values)
        Arrow::TimestampArray.new(arrow_type, values)
      end

      # @return [Arrow::TimestampDataType] data type for the configured unit
      def arrow_type_builder
        Arrow::TimestampDataType.new(
          options.fetch(:unit, :second)
        )
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Parqueteur
4
- VERSION = '1.0.2'
4
+ VERSION = '1.2.0'
5
5
  end
data/lib/parqueteur.rb CHANGED
@@ -1,17 +1,29 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "parqueteur/version"
4
- require 'parqueteur/type_resolver'
3
+ require 'json'
4
+ require 'singleton'
5
+ require 'tempfile'
6
+
7
+ require_relative 'parqueteur/version'
5
8
  require 'parqueteur/column'
6
9
  require 'parqueteur/column_collection'
7
10
  require 'parqueteur/converter'
8
- require 'parqueteur/chunked_converter'
9
11
  require 'parqueteur/input'
10
- require 'parqueteur/value_array_builder'
11
- require 'json'
12
+ require 'parqueteur/struct'
13
+ require 'parqueteur/type'
14
+ require 'parqueteur/type_resolver'
15
+ require 'parqueteur/types/array_type'
16
+ require 'parqueteur/types/boolean_type'
17
+ require 'parqueteur/types/int32_type'
18
+ require 'parqueteur/types/int64_type'
19
+ require 'parqueteur/types/map_type'
20
+ require 'parqueteur/types/string_type'
21
+ require 'parqueteur/types/struct_type'
22
+ require 'parqueteur/types/timestamp_type'
12
23
  require 'parquet'
13
24
 
14
25
  module Parqueteur
15
26
  class Error < StandardError; end
27
+ class TypeNotFound < Error; end
16
28
  # Your code goes here...
17
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parqueteur
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julien D.
@@ -38,15 +38,27 @@ files:
38
38
  - Rakefile
39
39
  - bin/console
40
40
  - bin/setup
41
- - example.rb
41
+ - examples/convert-methods.rb
42
+ - examples/convert-to-io.rb
43
+ - examples/convert-with-chunks.rb
44
+ - examples/convert-with-compression.rb
45
+ - examples/convert-without-compression.rb
42
46
  - lib/parqueteur.rb
43
- - lib/parqueteur/chunked_converter.rb
44
47
  - lib/parqueteur/column.rb
45
48
  - lib/parqueteur/column_collection.rb
46
49
  - lib/parqueteur/converter.rb
47
50
  - lib/parqueteur/input.rb
51
+ - lib/parqueteur/struct.rb
52
+ - lib/parqueteur/type.rb
48
53
  - lib/parqueteur/type_resolver.rb
49
- - lib/parqueteur/value_array_builder.rb
54
+ - lib/parqueteur/types/array_type.rb
55
+ - lib/parqueteur/types/boolean_type.rb
56
+ - lib/parqueteur/types/int32_type.rb
57
+ - lib/parqueteur/types/int64_type.rb
58
+ - lib/parqueteur/types/map_type.rb
59
+ - lib/parqueteur/types/string_type.rb
60
+ - lib/parqueteur/types/struct_type.rb
61
+ - lib/parqueteur/types/timestamp_type.rb
50
62
  - lib/parqueteur/version.rb
51
63
  - parqueteur.gemspec
52
64
  - test.json
data/example.rb DELETED
@@ -1,20 +0,0 @@
1
- require 'bundler/setup'
2
- require 'parqueteur'
3
-
4
- class Foo < Parqueteur::Converter
5
- column :id, :long
6
- column :reference, :string
7
- column :hash, :map, key: :string, value: :string
8
- column :valid, :boolean
9
- column :total, :integer
10
- end
11
-
12
- LETTERS = ('a'..'z').to_a
13
-
14
- data = 1000.times.collect do |i|
15
- { 'id' => i + 1, 'reference' => "coucou:#{i}", 'hash' => { 'a' => LETTERS.sample }, 'valid' => rand < 0.5, 'total' => rand(100..500) }
16
- end
17
-
18
- chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
19
- pp chunked_converter.write_files('test')
20
- # puts Foo.convert(data, output: 'test.parquet')
@@ -1,28 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Parqueteur
4
- class ChunkedConverter
5
- attr_reader :schema
6
-
7
- def initialize(input, converter, chunk_size = 200)
8
- @input = Parqueteur::Input.from(input)
9
- @converter = converter
10
- @chunk_size = chunk_size
11
- end
12
-
13
- def chunks
14
- Enumerator.new do |arr|
15
- @input.each_slice(@chunk_size) do |chunk|
16
- local_converter = @converter.new(chunk)
17
- arr << local_converter.to_io
18
- end
19
- end
20
- end
21
-
22
- def write_files(prefix)
23
- chunks.each_with_index do |chunk, idx|
24
- File.write("#{prefix}.#{idx}.parquet", chunk.read)
25
- end
26
- end
27
- end
28
- end
@@ -1,59 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Parqueteur
4
- class ValueArrayBuilder
5
- attr_reader :type, :options, :arrow_type
6
-
7
- def self.build(input, type, options)
8
- new(type, options).build(input)
9
- end
10
-
11
- def initialize(type, options)
12
- @type = type
13
- @options = options
14
- @arrow_type = Parqueteur::TypeResolver.resolve(type, options)
15
- end
16
-
17
- def build(input)
18
- return if input.nil?
19
-
20
- case type
21
- when :array
22
- Arrow::ListArrayBuilder.build(arrow_type, input)
23
- when :map
24
- builder = Arrow::MapArrayBuilder.new(arrow_type)
25
- input.each do |entry|
26
- builder.append_value
27
- next if entry.nil?
28
-
29
- entry.each do |k, v|
30
- builder.key_builder.append(k)
31
- builder.item_builder.append(v)
32
- end
33
- end
34
-
35
- builder.finish
36
- when :boolean
37
- Arrow::BooleanArray.new(input)
38
- when :integer
39
- if options.fetch(:unsigned, false) == true
40
- Arrow::UInt32Array.new(input)
41
- else
42
- Arrow::Int32Array.new(input)
43
- end
44
- when :long
45
- if options.fetch(:unsigned, false) == true
46
- Arrow::UInt64Array.new(input)
47
- else
48
- Arrow::Int64Array.new(input)
49
- end
50
- when :string
51
- Arrow::StringArray.new(input)
52
- when :timestamp
53
- Arrow::TimestampArray.new(input)
54
- else
55
- raise Error, "unknown type: #{type}"
56
- end
57
- end
58
- end
59
- end