parqueteur 1.0.3 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/Gemfile.lock +1 -1
  4. data/README.md +43 -8
  5. data/examples/convert-and-compression.rb +56 -0
  6. data/examples/convert-methods.rb +54 -0
  7. data/examples/convert-to-io.rb +52 -0
  8. data/examples/convert-with-chunks.rb +54 -0
  9. data/examples/convert-without-compression.rb +52 -0
  10. data/examples/hello-world.rb +56 -0
  11. data/lib/parqueteur/column.rb +3 -20
  12. data/lib/parqueteur/column_collection.rb +8 -0
  13. data/lib/parqueteur/converter.rb +110 -67
  14. data/lib/parqueteur/input.rb +12 -27
  15. data/lib/parqueteur/struct.rb +25 -0
  16. data/lib/parqueteur/type.rb +21 -0
  17. data/lib/parqueteur/type_resolver.rb +44 -48
  18. data/lib/parqueteur/types/array_type.rb +21 -0
  19. data/lib/parqueteur/types/boolean_type.rb +15 -0
  20. data/lib/parqueteur/types/date32_type.rb +15 -0
  21. data/lib/parqueteur/types/date64_type.rb +15 -0
  22. data/lib/parqueteur/types/decimal128_type.rb +18 -0
  23. data/lib/parqueteur/types/decimal256_type.rb +18 -0
  24. data/lib/parqueteur/types/int32_type.rb +23 -0
  25. data/lib/parqueteur/types/int64_type.rb +23 -0
  26. data/lib/parqueteur/types/map_type.rb +36 -0
  27. data/lib/parqueteur/types/string_type.rb +20 -0
  28. data/lib/parqueteur/types/struct_type.rb +35 -0
  29. data/lib/parqueteur/types/time32_type.rb +19 -0
  30. data/lib/parqueteur/types/time64_type.rb +19 -0
  31. data/lib/parqueteur/types/timestamp_type.rb +24 -0
  32. data/lib/parqueteur/version.rb +1 -1
  33. data/lib/parqueteur.rb +24 -7
  34. data/parqueteur.gemspec +2 -2
  35. data/scripts/apache-arrow-ubuntu-install.sh +18 -0
  36. metadata +27 -8
  37. data/example.rb +0 -20
  38. data/lib/parqueteur/chunked_converter.rb +0 -28
  39. data/lib/parqueteur/value_array_builder.rb +0 -59
  40. data/test.json +0 -1
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Parqueteur
4
4
  class Converter
5
- attr_reader :schema
5
+ DEFAULT_BATCH_SIZE = 10
6
6
 
7
7
  def self.inline(&block)
8
8
  Class.new(self, &block)
@@ -12,8 +12,8 @@ module Parqueteur
12
12
  @columns ||= Parqueteur::ColumnCollection.new
13
13
  end
14
14
 
15
# Declares a column on this converter class.
#
# The arguments (and the optional block) are handed unchanged to
# Parqueteur::Column.new; the resulting column is then added to the
# class-level collection returned by +columns+.
#
# @param name column name
# @param type column type
# @param options [Hash] extra options forwarded to the column
# @return whatever +columns.add+ returns
def self.column(name, type, options = {}, &block)
  declared = Parqueteur::Column.new(name, type, options, &block)
  columns.add(declared)
end
18
18
 
19
19
  def self.transforms
@@ -24,92 +24,135 @@ module Parqueteur
24
24
  transforms << (method_name || block)
25
25
  end
26
26
 
27
# Converts +input+ and returns the Parquet data as an in-memory IO.
#
# @param input [Enumerable] records to convert
# @param kwargs [Hash] options forwarded to {#initialize}
# @return [StringIO] see {#to_io}
def self.convert(input, **kwargs)
  new(input, **kwargs).to_io
end

# Converts +input+ and writes the Parquet data to +output_path+.
#
# @param input [Enumerable] records to convert
# @param output_path [String] destination file path
# @param kwargs [Hash] options forwarded to {#initialize}
# @return [true]
def self.convert_to(input, output_path, **kwargs)
  new(input, **kwargs).write(output_path)
end

# @param input [Enumerable] records to convert (wrapped by Parqueteur::Input.from)
# @option kwargs [Integer] :batch_size number of records per written batch
#   (defaults to DEFAULT_BATCH_SIZE, also when nil is given)
# @option kwargs [Symbol, String, nil, false] :compression codec name, e.g. :gzip;
#   nil or false means no compression
def initialize(input, **kwargs)
  @input = Parqueteur::Input.from(input)
  @batch_size = kwargs[:batch_size] || DEFAULT_BATCH_SIZE
  # `&.` only guards against nil, so `false&.to_sym` would raise
  # NoMethodError; treat false the same as nil (no compression).
  compression = kwargs.fetch(:compression, nil)
  @compression = compression ? compression.to_sym : nil
end

# Lazily converts the input in slices of +size+ records, yielding one
# temporary Parquet file per slice.
#
# Each file is closed and deleted as soon as the consumer asks for the
# next element (or stops iterating a block-form enumeration), so it must
# be used or copied while it is the current element.
#
# @param size [Integer] number of records per generated file
# @param batch_size [Integer, nil] overrides the converter's batch size
# @param compression [Symbol, false, nil] overrides the converter's
#   compression; false disables it, nil keeps the converter's setting
# @return [Enumerator] yielding Tempfile objects
def split(size, batch_size: nil, compression: nil)
  Enumerator.new do |yielder|
    options = slice_options(batch_size, compression)
    @input.each_slice(size) do |records|
      file = self.class.new(records, **options).to_tmpfile
      begin
        yielder << file
      ensure
        # Runs even if the consumer raises or breaks out of the iteration.
        file.close!
      end
    end
  end
end

# Same as {#split}, but each slice is yielded as an in-memory StringIO
# instead of a temporary file.
#
# @param size [Integer] number of records per generated IO
# @param batch_size [Integer, nil] overrides the converter's batch size
# @param compression [Symbol, false, nil] overrides the converter's compression
# @return [Enumerator] yielding StringIO objects
def split_by_io(size, batch_size: nil, compression: nil)
  Enumerator.new do |yielder|
    options = slice_options(batch_size, compression)
    @input.each_slice(size) do |records|
      yielder << self.class.new(records, **options).to_io
    end
  end
end

# Writes the whole input to +path+ as a Parquet file, one Arrow table per
# batch of +batch_size+ records.
#
# @param path [String] destination file path
# @param batch_size [Integer, nil] records per batch; nil uses the
#   converter's batch size
# @param compression [Symbol, false, nil] codec; nil uses the converter's
#   setting, false disables compression for this call
# @return [true]
def write(path, batch_size: nil, compression: nil)
  compression = @compression if compression.nil?
  batch_size = @batch_size if batch_size.nil?
  arrow_schema = self.class.columns.arrow_schema
  writer_properties = Parquet::WriterProperties.new
  writer_properties.set_compression(compression) if compression

  # NOTE(review): the `false` argument is presumably the "append" flag of
  # Arrow::FileOutputStream, and 1024 the chunk size given to the writer --
  # confirm against the Arrow bindings.
  Arrow::FileOutputStream.open(path, false) do |output|
    Parquet::ArrowFileWriter.open(arrow_schema, output, writer_properties) do |writer|
      @input.each_slice(batch_size) do |records|
        writer.write_table(build_arrow_table(records), 1024)
      end
    end
  end

  true
end

# Writes the Parquet data to a new temporary file.
#
# The caller owns the returned file and is responsible for closing and
# unlinking it. If writing fails, the file is removed before the error
# is re-raised.
#
# @param options [Hash] keyword options forwarded to {#write}
# @return [Tempfile] binary-mode tempfile, rewound to the start
def to_tmpfile(options = {})
  tempfile = Tempfile.new
  tempfile.binmode
  write(tempfile.path, **options)
  tempfile.rewind
  tempfile
rescue StandardError
  tempfile&.close!
  raise
end

# @param options [Hash] keyword options forwarded to {#write}
# @return [StringIO] the Parquet data held in memory
def to_io(options = {})
  tmpfile = to_tmpfile(options)
  StringIO.new(tmpfile.read)
ensure
  tmpfile&.close!
end

# Round-trips the data through a temporary Parquet file and loads it back
# as an Arrow table.
#
# @param options [Hash] keyword options forwarded to {#write}
# @return the table returned by Arrow::Table.load
def to_arrow_table(options = {})
  file = to_tmpfile(options)
  Arrow::Table.load(file.path, format: :parquet)
ensure
  file&.close!
end

# @param options [Hash] keyword options forwarded to {#write}
# @return [String] the raw Parquet bytes
def to_blob(options = {})
  tmpfile = to_tmpfile(options)
  tmpfile.read
ensure
  # The temporary file used to be left open and on disk here.
  tmpfile&.close!
end

private

# Options used to build the per-slice converters of #split / #split_by_io.
# An explicit `compression: false` is kept so that it can disable the
# converter-level compression.
def slice_options(batch_size, compression)
  {
    batch_size: batch_size || @batch_size,
    compression: compression.nil? ? @compression : compression
  }
end

# Runs +item+ through every registered transform, in declaration order.
# Symbol transforms are sent to this converter; anything else is `call`ed.
def apply_transforms(item)
  self.class.transforms.reduce(item) do |current, transform|
    if transform.is_a?(Symbol)
      __send__(transform, current)
    else
      transform.call(current)
    end
  end
end

# Builds one Arrow table from a batch of records.
#
# Each record is transformed, then its values are collected per declared
# column (nil when the record has no such key) and converted with the
# column type's #build_value_array.
def build_arrow_table(records)
  columns = self.class.columns
  values = columns.each_with_object({}) do |column, hash|
    hash[column.name] = []
  end

  records.each do |record|
    item = apply_transforms(record)
    values.each do |name, column_values|
      column_values << (item.key?(name) ? item[name] : nil)
    end
  end

  Arrow::Table.new(
    values.each_with_object({}) do |(name, column_values), arrays|
      arrays[name] = columns.find(name).type.build_value_array(column_values)
    end
  )
end
@@ -4,40 +4,25 @@ module Parqueteur
4
4
  class Input
5
5
  include Enumerable
6
6
 
7
- def self.from(arg, options = {})
8
- new(
9
- case arg
10
- when String
11
- if File.exist?(arg)
12
- File.new(arg, 'r')
13
- else
14
- arg.split("\n")
15
- end
16
- when Array, Enumerator
17
- arg
18
- end,
19
- options
20
- )
7
+ def self.from(arg)
8
+ return arg if arg.is_a?(self)
9
+
10
+ new(arg)
21
11
  end
22
12
 
23
- def initialize(source, options = {})
13
+ def initialize(source)
14
+ unless source.is_a?(Enumerable)
15
+ raise ArgumentError, 'Enumerable object expected'
16
+ end
17
+
24
18
  @source = source
25
- @options = options
26
19
  end
27
20
 
28
21
  def each(&block)
29
- case @source
30
- when File
31
- if @options.fetch(:json_newlines, true) == true
32
- @source.each_line do |line|
33
- yield(JSON.parse(line.strip))
34
- end
35
- else
36
- JSON.parse(@source.read).each(&block)
37
- end
38
- @source.rewind
39
- when Array, Enumerator
22
+ if block_given?
40
23
  @source.each(&block)
24
+ else
25
+ @source.to_enum(:each)
41
26
  end
42
27
  end
43
28
  end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Collects field definitions declared inside a block.
  #
  # The block given to .new is evaluated in the context of the new
  # instance, so +field+ can be called directly inside it.
  class Struct
    def initialize(&block)
      instance_exec(&block)
    end

    # @return [Parqueteur::ColumnCollection] fields declared so far
    #   (created on first access)
    def fields
      @fields ||= Parqueteur::ColumnCollection.new
    end

    # Declares a field; the arguments and optional block are passed
    # unchanged to Parqueteur::Column.new.
    #
    # @return whatever +fields.add+ returns
    def field(name, type, options = {}, &block)
      declared = Parqueteur::Column.new(name, type, options, &block)
      fields.add(declared)
    end

    # @return the result of +fields.key?(key)+
    def key?(key)
      fields.key?(key)
    end

    # @return the +to_arrow_field+ result of every declared field
    def to_arrow_type
      fields.collect { |declared| declared.to_arrow_field }
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Base class for column types.
  #
  # A subclass provides +arrow_type_builder+ (called once from #initialize
  # to compute #arrow_type) and +build_value_array+.
  class Type
    attr_reader :options, :arrow_type

    # @param options [Hash] type options, exposed through #options
    # @param block optional block kept in @block for subclasses that need it
    def initialize(options = {}, &block)
      @options = options
      @block = block
      @arrow_type = arrow_type_builder
    end

    # Converts +values+ into an array object for this type.
    # Must be overridden by subclasses.
    #
    # @raise [RuntimeError] always, in the base class
    def build_value_array(values)
      raise "#build_value_array must be implemented in #{self.class}"
    end

    # Builds the value stored in #arrow_type.
    # Must be overridden by subclasses; defined here so that a subclass
    # that forgets it fails with an explicit message.
    #
    # @raise [RuntimeError] always, in the base class
    def arrow_type_builder
      raise "#arrow_type_builder must be implemented in #{self.class}"
    end

    # Resolves a nested type through Parqueteur::TypeResolver.
    #
    # @param type type identifier passed to the resolver
    # @param options [Hash] options passed to the resolver
    def resolve(type, options = {})
      Parqueteur::TypeResolver.resolve(type, options)
    end
  end
end
@@ -2,60 +2,56 @@
2
2
 
3
3
  module Parqueteur
4
4
  class TypeResolver
5
- def self.resolve(*args)
6
- new.resolve(*args)
5
+ include Singleton
6
+
7
+ def self.registered_types
8
+ @registered_types ||= {
9
+ array: Parqueteur::Types::ArrayType,
10
+ bigdecimal: Parqueteur::Types::Decimal256Type,
11
+ bigint: Parqueteur::Types::Int64Type,
12
+ boolean: Parqueteur::Types::BooleanType,
13
+ date: Parqueteur::Types::Date32Type,
14
+ date32: Parqueteur::Types::Date64Type,
15
+ date64: Parqueteur::Types::Date64Type,
16
+ decimal: Parqueteur::Types::Decimal128Type,
17
+ decimal128: Parqueteur::Types::Decimal128Type,
18
+ decimal256: Parqueteur::Types::Decimal256Type,
19
+ int32: Parqueteur::Types::Int32Type,
20
+ int64: Parqueteur::Types::Int64Type,
21
+ integer: Parqueteur::Types::Int32Type,
22
+ map: Parqueteur::Types::MapType,
23
+ string: Parqueteur::Types::StringType,
24
+ struct: Parqueteur::Types::StructType,
25
+ time: Parqueteur::Types::Time32Type,
26
+ time32: Parqueteur::Types::Time32Type,
27
+ time64: Parqueteur::Types::Time64Type,
28
+ timestamp: Parqueteur::Types::TimestampType
29
+ }
30
+ end
31
+
32
+ def self.register_type(type, klass)
33
+ registered_types[type] = klass
34
+ end
35
+
36
+ def self.resolve(*args, &block)
37
+ instance.resolve(*args, &block)
7
38
  end
8
39
 
9
- def resolve(type, options = {})
10
- case type
11
- when :array
12
- elements_opt = options.fetch(:elements)
13
- Arrow::ListDataType.new(
14
- if elements_opt.is_a?(Hash)
15
- resolve(elements_opt.fetch(:type), elements_opt)
16
- else
17
- resolve(elements_opt)
18
- end
19
- )
20
- when :boolean
21
- Arrow::BooleanDataType.new
22
- when :integer
23
- if options.fetch(:unsigned, false) == true
24
- Arrow::UInt32DataType.new
25
- else
26
- Arrow::Int32DataType.new
27
- end
28
- when :long
29
- if options.fetch(:unsigned, false) == true
30
- Arrow::UInt64DataType.new
31
- else
32
- Arrow::Int64DataType.new
33
- end
34
- when :timestamp
35
- Arrow::TimestampDataType.new(
36
- options.fetch(:unit, :second)
37
- )
38
- when :string
39
- Arrow::StringDataType.new
40
- when :map
41
- map_value = options.fetch(:value)
42
- Arrow::MapDataType.new(
43
- resolve(options.fetch(:key)),
44
- if map_value.is_a?(Hash)
45
- resolve(map_value.fetch(:type), map_value)
46
- else
47
- resolve(map_value)
48
- end
49
- )
40
+ def resolve(type, options = {}, &block)
41
+ if type.is_a?(Symbol)
42
+ resolve_from_symbol(type, options, &block)
50
43
  else
51
- raise Error, "unknown type: #{type}"
44
+ type.new(options, &block)
52
45
  end
53
46
  end
54
- end
55
- end
56
47
 
57
- private
48
+ private
58
49
 
59
- def build_arrow_type(type, options = {})
50
+ def resolve_from_symbol(type, options, &block)
51
+ type_klass = self.class.registered_types.fetch(type.to_sym, nil)
52
+ raise Parqueteur::TypeNotFound, type if type_klass.nil?
60
53
 
54
+ type_klass.new(options, &block)
55
+ end
56
+ end
61
57
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # List column type. The element type is taken from the required
    # :elements option, which is either a type identifier or a Hash holding
    # a :type key plus that type's own options.
    class ArrayType < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return the array built by Arrow::ListArrayBuilder for #arrow_type
      def build_value_array(values)
        Arrow::ListArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::ListDataType]
      # @raise [KeyError] when the :elements option (or the :type key of a
      #   Hash-style :elements) is missing
      def arrow_type_builder
        # fetch (as MapType does for :key / :value) reports a missing option
        # as a KeyError instead of failing later on `nil.new`.
        elements = options.fetch(:elements)
        element_type =
          if elements.is_a?(Hash)
            resolve(elements.fetch(:type), elements)
          else
            resolve(elements)
          end

        Arrow::ListDataType.new(element_type.arrow_type)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Boolean column type.
    class BooleanType < Parqueteur::Type
      # @return [Arrow::BooleanDataType]
      def arrow_type_builder
        Arrow::BooleanDataType.new
      end

      # @param values [Array] one entry per row
      # @return [Arrow::BooleanArray]
      def build_value_array(values)
        Arrow::BooleanArray.new(values)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Date column type backed by Arrow's Date32 data type.
    class Date32Type < Parqueteur::Type
      # @return [Arrow::Date32DataType]
      def arrow_type_builder
        Arrow::Date32DataType.new
      end

      # @param values [Array] one entry per row
      # @return the array built by Arrow::Date32ArrayBuilder
      def build_value_array(values)
        Arrow::Date32ArrayBuilder.build(values)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Date column type backed by Arrow's Date64 data type.
    class Date64Type < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return the array built by Arrow::Date64ArrayBuilder
      def build_value_array(values)
        # Pass the column values as-is, like Date32Type does. Wrapping them
        # in another array handed the builder a single element (the whole
        # column) instead of one element per row.
        Arrow::Date64ArrayBuilder.build(values)
      end

      # @return [Arrow::Date64DataType]
      def arrow_type_builder
        Arrow::Date64DataType.new
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Decimal column type backed by Arrow's Decimal128 data type.
    # Requires the :precision and :scale options.
    class Decimal128Type < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return the array built by Arrow::Decimal128ArrayBuilder for #arrow_type
      def build_value_array(values)
        Arrow::Decimal128ArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::Decimal128DataType]
      # @raise [KeyError] when :precision or :scale is missing
      def arrow_type_builder
        precision = options.fetch(:precision)
        scale = options.fetch(:scale)

        Arrow::Decimal128DataType.new(precision: precision, scale: scale)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Decimal column type backed by Arrow's Decimal256 data type.
    # Requires the :precision and :scale options.
    class Decimal256Type < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return the array built by Arrow::Decimal256ArrayBuilder for #arrow_type
      def build_value_array(values)
        Arrow::Decimal256ArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::Decimal256DataType]
      # @raise [KeyError] when :precision or :scale is missing
      def arrow_type_builder
        precision = options.fetch(:precision)
        scale = options.fetch(:scale)

        Arrow::Decimal256DataType.new(precision: precision, scale: scale)
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # 32-bit integer column type; uses the unsigned Arrow classes when the
    # :unsigned option is exactly true.
    class Int32Type < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return [Arrow::UInt32Array, Arrow::Int32Array]
      def build_value_array(values)
        array_class = unsigned? ? Arrow::UInt32Array : Arrow::Int32Array
        array_class.new(values)
      end

      # @return [Arrow::UInt32DataType, Arrow::Int32DataType]
      def arrow_type_builder
        data_type_class = unsigned? ? Arrow::UInt32DataType : Arrow::Int32DataType
        data_type_class.new
      end

      private

      # Only a literal `true` selects the unsigned variant.
      def unsigned?
        options.fetch(:unsigned, false) == true
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # 64-bit integer column type; uses the unsigned Arrow classes when the
    # :unsigned option is exactly true.
    class Int64Type < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return [Arrow::UInt64Array, Arrow::Int64Array]
      def build_value_array(values)
        array_class = unsigned? ? Arrow::UInt64Array : Arrow::Int64Array
        array_class.new(values)
      end

      # @return [Arrow::UInt64DataType, Arrow::Int64DataType]
      def arrow_type_builder
        data_type_class = unsigned? ? Arrow::UInt64DataType : Arrow::Int64DataType
        data_type_class.new
      end

      private

      # Only a literal `true` selects the unsigned variant.
      def unsigned?
        options.fetch(:unsigned, false) == true
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Map column type. Requires the :key and :value options; :value is
    # either a type identifier or a Hash holding a :type key plus that
    # type's own options.
    class MapType < Parqueteur::Type
      # @param values [Array] one entry per row; each entry is nil or
      #   something yielding key/value pairs from #each
      # @return the array produced by the Arrow::MapArrayBuilder
      def build_value_array(values)
        map_builder = Arrow::MapArrayBuilder.new(arrow_type)
        values.each { |entry| append_entry(map_builder, entry) }
        map_builder.finish
      end

      # @return [Arrow::MapDataType]
      # @raise [KeyError] when :key or :value is missing
      def arrow_type_builder
        value_spec = options.fetch(:value)
        key_type = resolve(options.fetch(:key)).arrow_type
        value_type =
          if value_spec.is_a?(Hash)
            resolve(value_spec.fetch(:type), value_spec).arrow_type
          else
            resolve(value_spec).arrow_type
          end

        Arrow::MapDataType.new(key_type, value_type)
      end

      private

      # Starts a new map value for one row and appends its pairs.
      # A nil entry still gets `append_value`, just without any pair.
      def append_entry(map_builder, entry)
        map_builder.append_value
        return if entry.nil?

        entry.each do |key, item|
          map_builder.key_builder.append(key)
          map_builder.item_builder.append(item)
        end
      end
    end
  end
end
36
+
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Stale leftover from the old TypeResolver `case` (unrelated to StringType): when :timestamp
4
+ # Arrow::TimestampDataType.new(
5
+ # options.fetch(:unit, :second)
6
+ # )
7
+
8
module Parqueteur
  module Types
    # String column type.
    class StringType < Parqueteur::Type
      # @return [Arrow::StringDataType]
      def arrow_type_builder
        Arrow::StringDataType.new
      end

      # @param values [Array] one entry per row
      # @return [Arrow::StringArray]
      def build_value_array(values)
        Arrow::StringArray.new(values)
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Stale leftover from the old TypeResolver `case` (unrelated to StructType): when :timestamp
4
+ # Arrow::TimestampDataType.new(
5
+ # options.fetch(:unit, :second)
6
+ # )
7
+
8
module Parqueteur
  module Types
    # Struct column type. Its fields are declared by the block given to the
    # type, which is evaluated through Parqueteur::Struct.
    class StructType < Parqueteur::Type
      # Checks that every key of every non-nil value is a declared field,
      # then builds the array.
      #
      # @param values [Array] one entry per row; each entry is nil or
      #   responds to #each_key
      # @return the array built by Arrow::StructArrayBuilder for #arrow_type
      # @raise [Parqueteur::Error] when a value has an undeclared key
      def build_value_array(values)
        values.each do |value|
          ensure_known_keys(value) unless value.nil?
        end

        Arrow::StructArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::StructDataType]
      def arrow_type_builder
        Arrow::StructDataType.new(struct_object.to_arrow_type)
      end

      private

      # Raises on the first key of +value+ that is not a declared field.
      def ensure_known_keys(value)
        value.each_key do |key|
          raise Parqueteur::Error, "Struct field '#{key}' not found" unless struct_object.key?(key)
        end
      end

      # Field definitions, built once from the block stored by Type#initialize.
      def struct_object
        @struct_object ||= Parqueteur::Struct.new(&@block)
      end
    end
  end
end