parqueteur 1.1.1 → 1.3.2

data/lib/parqueteur/converter.rb CHANGED
@@ -2,9 +2,7 @@
 
 module Parqueteur
   class Converter
-    attr_reader :schema
-
-    DEFAULT_BATCH_SIZE = 25
+    DEFAULT_BATCH_SIZE = 10
 
     def self.inline(&block)
       Class.new(self, &block)
@@ -26,106 +24,137 @@ module Parqueteur
       transforms << (method_name || block)
     end
 
-    def self.convert(input, output: nil, batch_size: DEFAULT_BATCH_SIZE)
-      converter = new(input, bactch_size: batch_size)
-      if !output.nil?
-        converter.write(output)
-      else
-        converter.to_blob
+    def self.convert(input, **kwargs)
+      new(input, **kwargs).to_io
+    end
+
+    def self.convert_to(input, output_path, **kwargs)
+      converter = new(input, **kwargs)
+      converter.write(output_path)
+    end
+
+    # @param [Enumerable] An enumerable object
+    # @option [Symbol] compression - :gzip
+    def initialize(input, **kwargs)
+      @input = Parqueteur::Input.from(input)
+      @batch_size = kwargs.fetch(:batch_size, DEFAULT_BATCH_SIZE)
+      @compression = kwargs.fetch(:compression, nil)&.to_sym
+    end
+
+    def split(size, batch_size: nil, compression: nil)
+      Enumerator.new do |arr|
+        options = {
+          batch_size: batch_size || @batch_size,
+          compression: compression || @compression
+        }
+        @input.each_slice(size) do |records|
+          local_converter = self.class.new(records, **options)
+          file = local_converter.to_tmpfile
+          arr << file
+          file.close
+          file.unlink
+        end
       end
     end
 
-    def initialize(input, options = {})
-      @input = Parqueteur::Input.from(input, options)
-      @batch_size = options.fetch(:batch_size, DEFAULT_BATCH_SIZE)
-    end
-
-    def write(output)
-      case output
-      when :io
-        to_io
-      when String
-        to_arrow_table.save(output)
-      when StringIO, IO
-        buffer = Arrow::ResizableBuffer.new(0)
-        to_arrow_table.save(buffer, format: :parquet)
-        output.write(buffer.data.to_s)
-        output.rewind
-        output
-      else
-        raise ArgumentError, "unsupported output: #{output.class}, accepted: String (filename), IO, StringIO"
+    def split_by_io(size, batch_size: nil, compression: nil)
+      Enumerator.new do |arr|
+        options = {
+          batch_size: batch_size || @batch_size,
+          compression: compression || @compression
+        }
+        @input.each_slice(size) do |records|
+          local_converter = self.class.new(records, **options)
+          arr << local_converter.to_io
+        end
+      end
+    end
+
+    def write(path, batch_size: nil, compression: nil)
+      compression = @compression if compression.nil?
+      batch_size = @batch_size if batch_size.nil?
+      arrow_schema = self.class.columns.arrow_schema
+      writer_properties = Parquet::WriterProperties.new
+      if !compression.nil? && compression != false
+        writer_properties.set_compression(compression)
+      end
+
+      Arrow::FileOutputStream.open(path, false) do |output|
+        Parquet::ArrowFileWriter.open(arrow_schema, output, writer_properties) do |writer|
+          @input.each_slice(batch_size) do |records|
+            arrow_table = build_arrow_table(records)
+            writer.write_table(arrow_table, 1024)
+          end
+        end
       end
+
+      true
+    end
+
+    def to_tmpfile(options = {})
+      tempfile = Tempfile.new
+      tempfile.binmode
+      write(tempfile.path, **options)
+      tempfile.rewind
+      tempfile
     end
 
-    def to_s
-      inspect
+    def to_io(options = {})
+      tmpfile = to_tmpfile(options)
+      strio = StringIO.new(tmpfile.read)
+      tmpfile.close
+      tmpfile.unlink
+      strio
     end
 
-    def to_io
-      write(StringIO.new)
+    def to_arrow_table(options = {})
+      file = to_tmpfile(options)
+      table = Arrow::Table.load(file.path, format: :parquet)
+      file.close
+      file.unlink
+      table
     end
 
-    def to_blob
-      write(StringIO.new).read
+    def to_blob(options = {})
+      to_tmpfile(options).read
     end
 
-    def to_arrow_table
+    private
+
+    def build_arrow_table(records)
      transforms = self.class.transforms
 
-      chunks = self.class.columns.each_with_object({}) do |column, hash|
+      values = self.class.columns.each_with_object({}) do |column, hash|
        hash[column.name] = []
      end
 
-      items_count = 0
-      @input.each_slice(@batch_size) do |items|
-        values = self.class.columns.each_with_object({}) do |column, hash|
-          hash[column.name] = []
-        end
-
-        items.each do |item|
-          if transforms.length > 0
-            transforms.each do |transform|
-              item = \
-                if transform.is_a?(Symbol)
-                  __send__(transform, item)
-                else
-                  transform.call(item)
-                end
-            end
+      records.each do |item|
+        if transforms.length > 0
+          transforms.each do |transform|
+            item = \
+              if transform.is_a?(Symbol)
+                __send__(transform, item)
+              else
+                transform.call(item)
+              end
          end
+        end
 
-          values.each_key do |value_key|
-            if item.key?(value_key)
-              values[value_key] << item[value_key]
-            else
-              values[value_key] << nil
-            end
+        values.each_key do |value_key|
+          if item.key?(value_key)
+            values[value_key] << item[value_key]
+          else
+            values[value_key] << nil
          end
        end
+      end
 
-        values.each_with_object(chunks) do |item, hash|
+      Arrow::Table.new(
+        values.each_with_object({}) do |item, hash|
          column = self.class.columns.find(item[0])
-          hash[item[0]].push(
-            column.type.build_value_array(item[1])
-          )
+          hash[item[0]] = column.type.build_value_array(item[1])
        end
-
-        items_count += items.length
-      end
-
-      if items_count > 0
-        Arrow::Table.new(
-          chunks.transform_values! do |value|
-            Arrow::ChunkedArray.new(value)
-          end
-        )
-      else
-        Arrow::Table.new(
-          self.class.columns.each_with_object({}) do |column, hash|
-            hash[column.name] = column.type.build_value_array([])
-          end
-        )
-      end
+      )
    end
  end
end
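
For reference, here is a minimal usage sketch of the reworked converter API shown above; the PeopleConverter class and sample records are illustrative and not part of the gem:

require 'parqueteur'

# Hypothetical converter, for illustration only.
class PeopleConverter < Parqueteur::Converter
  column :id, :bigint
  column :name, :string
end

records = [{ 'id' => 1, 'name' => 'Alice' }, { 'id' => 2, 'name' => 'Bob' }]

# convert now returns the Parquet payload as an in-memory StringIO.
io = PeopleConverter.convert(records, compression: :gzip)

# convert_to writes straight to a file path.
PeopleConverter.convert_to(records, 'people.parquet', compression: :gzip)

# split yields one Tempfile per slice of input records; each file is closed
# and unlinked right after it is yielded, so consume it inside the block.
PeopleConverter.new(records, batch_size: 100).split(1_000).each do |tmpfile|
  puts tmpfile.size
end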
data/lib/parqueteur/input.rb CHANGED
@@ -4,40 +4,25 @@ module Parqueteur
   class Input
     include Enumerable
 
-    def self.from(arg, options = {})
-      new(
-        case arg
-        when String
-          if File.exist?(arg)
-            File.new(arg, 'r')
-          else
-            arg.split("\n")
-          end
-        when Enumerable
-          arg
-        end,
-        options
-      )
+    def self.from(arg)
+      return arg if arg.is_a?(self)
+
+      new(arg)
    end
 
-    def initialize(source, options = {})
+    def initialize(source)
+      unless source.is_a?(Enumerable)
+        raise ArgumentError, 'Enumerable object expected'
+      end
+
      @source = source
-      @options = options
    end
 
    def each(&block)
-      case @source
-      when File
-        if @options.fetch(:json_newlines, true) == true
-          @source.each_line do |line|
-            yield(JSON.parse(line.strip))
-          end
-        else
-          JSON.parse(@source.read).each(&block)
-        end
-        @source.rewind
-      when Enumerable
+      if block_given?
        @source.each(&block)
+      else
+        @source.to_enum(:each)
      end
    end
  end
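
Input now accepts only Enumerable sources; the JSON file and newline-delimited string handling from 1.1.1 is gone. A short sketch of the new contract, based on the code above:

input = Parqueteur::Input.from([{ 'id' => 1 }, { 'id' => 2 }])
input.each { |record| p record }  # delegates to the wrapped Enumerable
input.each.to_a                   # without a block, returns an Enumerator

# A non-Enumerable source (e.g. a raw JSON string) now raises
# ArgumentError, 'Enumerable object expected':
Parqueteur::Input.from('{"id":1}')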
data/lib/parqueteur/type_resolver.rb CHANGED
@@ -7,14 +7,24 @@ module Parqueteur
     def self.registered_types
       @registered_types ||= {
         array: Parqueteur::Types::ArrayType,
+        bigdecimal: Parqueteur::Types::Decimal256Type,
         bigint: Parqueteur::Types::Int64Type,
         boolean: Parqueteur::Types::BooleanType,
+        date: Parqueteur::Types::Date32Type,
+        date32: Parqueteur::Types::Date32Type,
+        date64: Parqueteur::Types::Date64Type,
+        decimal: Parqueteur::Types::Decimal128Type,
+        decimal128: Parqueteur::Types::Decimal128Type,
+        decimal256: Parqueteur::Types::Decimal256Type,
         int32: Parqueteur::Types::Int32Type,
         int64: Parqueteur::Types::Int64Type,
         integer: Parqueteur::Types::Int32Type,
         map: Parqueteur::Types::MapType,
         string: Parqueteur::Types::StringType,
         struct: Parqueteur::Types::StructType,
+        time: Parqueteur::Types::Time32Type,
+        time32: Parqueteur::Types::Time32Type,
+        time64: Parqueteur::Types::Time64Type,
         timestamp: Parqueteur::Types::TimestampType
       }
     end
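
The new registry keys map onto the type classes added in this release. A hedged sketch of declaring columns through these aliases (the OrderConverter class is hypothetical; the precision/scale options follow the decimal type implementations shown below):

class OrderConverter < Parqueteur::Converter
  column :ordered_on, :date                                    # Date32Type
  column :total, :decimal, precision: 10, scale: 2             # Decimal128Type
  column :total_exact, :bigdecimal, precision: 40, scale: 10   # Decimal256Type
end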
data/lib/parqueteur/types/date32_type.rb ADDED
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+module Parqueteur
+  module Types
+    class Date32Type < Parqueteur::Type
+      def build_value_array(values)
+        Arrow::Date32ArrayBuilder.build(values)
+      end
+
+      def arrow_type_builder
+        Arrow::Date32DataType.new
+      end
+    end
+  end
+end
data/lib/parqueteur/types/date64_type.rb ADDED
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+module Parqueteur
+  module Types
+    class Date64Type < Parqueteur::Type
+      def build_value_array(values)
+        Arrow::Date64ArrayBuilder.build([values])
+      end
+
+      def arrow_type_builder
+        Arrow::Date64DataType.new
+      end
+    end
+  end
+end
data/lib/parqueteur/types/decimal128_type.rb ADDED
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module Parqueteur
+  module Types
+    class Decimal128Type < Parqueteur::Type
+      def initialize(options = {}, &block)
+        @scale = options.fetch(:scale)
+        @precision = options.fetch(:precision)
+        @format_str = "%.#{@scale}f"
+        super(options, &block)
+      end
+
+      def build_value_array(values)
+        Arrow::Decimal128ArrayBuilder.build(
+          @arrow_type,
+          values.map do |value|
+            Arrow::Decimal128.new(format(@format_str, BigDecimal(value)))
+          end
+        )
+      end
+
+      def arrow_type_builder
+        Arrow::Decimal128DataType.new(
+          @precision, @scale
+        )
+      end
+    end
+  end
+end
data/lib/parqueteur/types/decimal256_type.rb ADDED
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module Parqueteur
+  module Types
+    class Decimal256Type < Parqueteur::Type
+      def initialize(options = {}, &block)
+        @scale = options.fetch(:scale)
+        @precision = options.fetch(:precision)
+        @format_str = "%.#{@scale}f"
+        super(options, &block)
+      end
+
+      def build_value_array(values)
+        Arrow::Decimal256ArrayBuilder.build(
+          @arrow_type,
+          values.map do |value|
+            Arrow::Decimal256.new(format(@format_str, BigDecimal(value)))
+          end
+        )
+      end
+
+      def arrow_type_builder
+        Arrow::Decimal256DataType.new(
+          @precision, @scale
+        )
+      end
+    end
+  end
+end
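
Both decimal types normalise incoming values through BigDecimal and a printf-style format string before handing them to Arrow. A small illustration of that rounding behaviour, mirroring @format_str = "%.2f" for scale: 2:

require 'bigdecimal'

# What Decimal128Type/Decimal256Type compute per value before building the array:
format('%.2f', BigDecimal('19.999'))  # => "20.00"
format('%.2f', BigDecimal('3'))       # => "3.00"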
@@ -21,5 +21,3 @@ module Parqueteur
     end
   end
 end
-
-# when :integer
@@ -21,5 +21,3 @@ module Parqueteur
     end
   end
 end
-
-# when :integer
data/lib/parqueteur/types/time32_type.rb ADDED
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+module Parqueteur
+  module Types
+    class Time32Type < Parqueteur::Type
+      def build_value_array(values)
+        Arrow::Time32Array.new(
+          @options.fetch(:precision, :second), values
+        )
+      end
+
+      def arrow_type_builder
+        Arrow::Time32DataType.new(
+          options.fetch(:unit, :second)
+        )
+      end
+    end
+  end
+end
data/lib/parqueteur/types/time64_type.rb ADDED
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+module Parqueteur
+  module Types
+    class Time64Type < Parqueteur::Type
+      def build_value_array(values)
+        Arrow::Time64Array.new(
+          @options.fetch(:precision, :second), values
+        )
+      end
+
+      def arrow_type_builder
+        Arrow::Time64DataType.new(
+          options.fetch(:unit, :second)
+        )
+      end
+    end
+  end
+end
data/lib/parqueteur/types/timestamp_type.rb CHANGED
@@ -9,7 +9,9 @@ module Parqueteur
   module Types
     class TimestampType < Parqueteur::Type
       def build_value_array(values)
-        Arrow::TimestampArray.new(values)
+        Arrow::TimestampArray.new(
+          options.fetch(:unit, :second), values
+        )
       end
 
       def arrow_type_builder
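
Time and timestamp arrays are now built with an explicit unit read from the column options (options.fetch(:unit, :second)). A hedged sketch of declaring such columns; the EventConverter class and column names are illustrative:

class EventConverter < Parqueteur::Converter
  column :starts_at, :time32, unit: :second, precision: :second
  column :created_at, :timestamp, unit: :milli
end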
data/lib/parqueteur/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Parqueteur
-  VERSION = '1.1.1'
+  VERSION = '1.3.2'
 end
data/lib/parqueteur.rb CHANGED
@@ -2,9 +2,10 @@
 
 require 'json'
 require 'singleton'
+require 'tempfile'
+require 'parquet'
 
-require_relative "parqueteur/version"
-require 'parqueteur/chunked_converter'
+require_relative 'parqueteur/version'
 require 'parqueteur/column'
 require 'parqueteur/column_collection'
 require 'parqueteur/converter'
@@ -14,16 +15,20 @@ require 'parqueteur/type'
 require 'parqueteur/type_resolver'
 require 'parqueteur/types/array_type'
 require 'parqueteur/types/boolean_type'
+require 'parqueteur/types/date32_type'
+require 'parqueteur/types/date64_type'
+require 'parqueteur/types/decimal128_type'
+require 'parqueteur/types/decimal256_type'
 require 'parqueteur/types/int32_type'
 require 'parqueteur/types/int64_type'
 require 'parqueteur/types/map_type'
 require 'parqueteur/types/string_type'
 require 'parqueteur/types/struct_type'
+require 'parqueteur/types/time32_type'
+require 'parqueteur/types/time64_type'
 require 'parqueteur/types/timestamp_type'
-require 'parquet'
 
 module Parqueteur
   class Error < StandardError; end
   class TypeNotFound < Error; end
-  # Your code goes here...
 end
data/parqueteur.gemspec CHANGED
@@ -8,8 +8,8 @@ Gem::Specification.new do |spec|
   spec.authors = ["Julien D."]
   spec.email = ["julien@pocketsizesun.com"]
   spec.license = 'Apache-2.0'
-  spec.summary = 'Parqueteur - A Ruby gem that convert JSON to Parquet'
-  spec.description = 'Convert JSON to Parquet'
+  spec.summary = 'Parqueteur - A Ruby gem that convert data to Parquet'
+  spec.description = 'Convert data to Parquet'
   spec.homepage = 'https://github.com/pocketsizesun/parqueteur-ruby'
   spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
 
@@ -30,7 +30,7 @@ Gem::Specification.new do |spec|
 
   # Uncomment to register a new dependency of your gem
   # spec.add_dependency "example-gem", "~> 1.0"
-  spec.add_dependency 'red-parquet', '~> 5.0'
+  spec.add_dependency 'red-parquet', '~> 6.0'
 
   # For more information and examples about making a new gem, checkout our
   # guide at: https://bundler.io/guides/creating_gem.html
data/scripts/apache-arrow-ubuntu-install.sh ADDED
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+if [ $(dpkg-query -W -f='${Status}' apache-arrow-apt-source 2>/dev/null | grep -c "ok installed") -eq 1 ]
+then
+  exit 0
+fi
+
+LSB_RELEASE_CODENAME_SHORT=$(lsb_release --codename --short)
+
+apt-get update
+apt-get install -y -V ca-certificates lsb-release wget
+wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
+apt-get install -y -V ./apache-arrow-apt-source-latest-${LSB_RELEASE_CODENAME_SHORT}.deb
+rm ./apache-arrow-apt-source-latest-${LSB_RELEASE_CODENAME_SHORT}.deb
+apt-get update
+apt-get install -y libgirepository1.0-dev libarrow-dev libarrow-glib-dev libparquet-dev libparquet-glib-dev
+
+exit 0
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parqueteur
 version: !ruby/object:Gem::Version
-  version: 1.1.1
+  version: 1.3.2
 platform: ruby
 authors:
 - Julien D.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-10-02 00:00:00.000000000 Z
+date: 2021-11-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: red-parquet
@@ -16,15 +16,15 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '5.0'
+        version: '6.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '5.0'
-description: Convert JSON to Parquet
+        version: '6.0'
+description: Convert data to Parquet
 email:
 - julien@pocketsizesun.com
 executables: []
@@ -38,9 +38,15 @@ files:
 - Rakefile
 - bin/console
 - bin/setup
-- example.rb
+- examples/cars.rb
+- examples/convert-and-compression.rb
+- examples/convert-methods.rb
+- examples/convert-to-io.rb
+- examples/convert-with-chunks.rb
+- examples/convert-without-compression.rb
+- examples/hello-world.rb
+- examples/readme-example.rb
 - lib/parqueteur.rb
-- lib/parqueteur/chunked_converter.rb
 - lib/parqueteur/column.rb
 - lib/parqueteur/column_collection.rb
 - lib/parqueteur/converter.rb
@@ -50,15 +56,21 @@ files:
 - lib/parqueteur/type_resolver.rb
 - lib/parqueteur/types/array_type.rb
 - lib/parqueteur/types/boolean_type.rb
+- lib/parqueteur/types/date32_type.rb
+- lib/parqueteur/types/date64_type.rb
+- lib/parqueteur/types/decimal128_type.rb
+- lib/parqueteur/types/decimal256_type.rb
 - lib/parqueteur/types/int32_type.rb
 - lib/parqueteur/types/int64_type.rb
 - lib/parqueteur/types/map_type.rb
 - lib/parqueteur/types/string_type.rb
 - lib/parqueteur/types/struct_type.rb
+- lib/parqueteur/types/time32_type.rb
+- lib/parqueteur/types/time64_type.rb
 - lib/parqueteur/types/timestamp_type.rb
 - lib/parqueteur/version.rb
 - parqueteur.gemspec
-- test.json
+- scripts/apache-arrow-ubuntu-install.sh
 homepage: https://github.com/pocketsizesun/parqueteur-ruby
 licenses:
 - Apache-2.0
@@ -82,5 +94,5 @@ requirements: []
 rubygems_version: 3.2.3
 signing_key:
 specification_version: 4
-summary: Parqueteur - A Ruby gem that convert JSON to Parquet
+summary: Parqueteur - A Ruby gem that convert data to Parquet
 test_files: []
data/example.rb DELETED
@@ -1,39 +0,0 @@
-require 'bundler/setup'
-require 'parqueteur'
-
-class Foo < Parqueteur::Converter
-  column :id, :bigint
-  column :reference, :string
-  column :hash, :map, key: :string, value: :string
-  column :valid, :boolean
-  column :total, :integer
-  column :numbers, :array, elements: :integer
-  column :my_struct, :struct do
-    field :test, :string
-    field :mon_nombre, :integer
-  end
-end
-
-LETTERS = ('a'..'z').to_a
-
-data = 1000.times.collect do |i|
-  {
-    'id' => i + 1,
-    'reference' => "coucou:#{i}",
-    'hash' => { 'a' => LETTERS.sample },
-    'valid' => rand < 0.5,
-    'total' => rand(100..500),
-    'numbers' => [1, 2, 3],
-    'my_struct' => {
-      'test' => 'super'
-    }
-  }
-end
-
-# chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
-# pp chunked_converter.write_files('test')
-puts Foo.convert(data, output: 'tmp/test.parquet')
-table = Arrow::Table.load('tmp/test.parquet')
-table.each_record do |record|
-  puts record.to_h
-end