parqueteur 1.0.3 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 51c58dad3007e9c6a08b4673b378975c7839158240c27fb0a883d0e59c05d5d5
4
- data.tar.gz: fa63fe9cd3cfa71d587400f3672ca2697c87a77fc574d8e2463d89f0090cd686
3
+ metadata.gz: 90ffcadf5b78e4ffc3329eac9be1be34af9209a77a39a6395eefc1c56afa7ce6
4
+ data.tar.gz: 1d7f5257d3f86443e0d13b789d7449565198f6cda1563d111a36ad3728264044
5
5
  SHA512:
6
- metadata.gz: 58c98870d35b4af03bf52fea1572d26b4073d16696637efe8992eba9ddfbe8931e3b015976cd997ee85fc1a0388b031ecafe1b165b911a36d6748a90ce8af88d
7
- data.tar.gz: 288418ec8a410f94fe38e76d94bb2da4173a4760ed8a9825132a41f3d808370ff60c8975ea8e6b8b72c1198dd36e5b4ceb1682570edf4ae9fa60be93ddc3cba8
6
+ metadata.gz: 262039094dd3aa5890f9d1a87d836eb64004c51fc57456d7880a36332295ac8a447fcd045c6bd2e053986ad4e9981021448b39e20010d0e41fef6b7e233d91ca
7
+ data.tar.gz: 73988e1f836acbe22e26c20b8559d8f79e9eeda55f9612fbefae7aa5bef131187b7a0716a3bd7804db5321b29b19f54b383e8594830814ae765d2c7b30986f59
data/.gitignore CHANGED
@@ -7,3 +7,5 @@
7
7
  /spec/reports/
8
8
  /tmp/
9
9
  /*.gem
10
+ /tmp/*
11
+ !/tmp/.keep
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- parqueteur (1.0.3)
4
+ parqueteur (1.1.0)
5
5
  red-parquet (~> 5.0)
6
6
 
7
7
  GEM
data/example.rb CHANGED
@@ -2,19 +2,38 @@ require 'bundler/setup'
2
2
  require 'parqueteur'
3
3
 
4
4
  class Foo < Parqueteur::Converter
5
- column :id, :long
5
+ column :id, :bigint
6
6
  column :reference, :string
7
7
  column :hash, :map, key: :string, value: :string
8
8
  column :valid, :boolean
9
9
  column :total, :integer
10
+ column :numbers, :array, elements: :integer
11
+ column :my_struct, :struct do
12
+ field :test, :string
13
+ field :mon_nombre, :integer
14
+ end
10
15
  end
11
16
 
12
17
  LETTERS = ('a'..'z').to_a
13
18
 
14
19
  data = 1000.times.collect do |i|
15
- { 'id' => i + 1, 'reference' => "coucou:#{i}", 'hash' => { 'a' => LETTERS.sample }, 'valid' => rand < 0.5, 'total' => rand(100..500) }
20
+ {
21
+ 'id' => i + 1,
22
+ 'reference' => "coucou:#{i}",
23
+ 'hash' => { 'a' => LETTERS.sample },
24
+ 'valid' => rand < 0.5,
25
+ 'total' => rand(100..500),
26
+ 'numbers' => [1, 2, 3],
27
+ 'my_struct' => {
28
+ 'test' => 'super'
29
+ }
30
+ }
16
31
  end
17
32
 
18
- chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
19
- pp chunked_converter.write_files('test')
20
- # puts Foo.convert(data, output: 'test.parquet')
33
+ # chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
34
+ # pp chunked_converter.write_files('test')
35
+ puts Foo.convert(data, output: 'tmp/test.parquet')
36
+ table = Arrow::Table.load('tmp/test.parquet')
37
+ table.each_record do |record|
38
+ puts record.to_h
39
+ end
@@ -4,31 +4,14 @@ module Parqueteur
4
4
  class Column
5
5
  attr_reader :name, :type, :options
6
6
 
7
- def initialize(name, type, options = {})
7
+ def initialize(name, type, options = {}, &block)
8
8
  @name = name.to_s
9
- @type = type
9
+ @type = Parqueteur::TypeResolver.resolve(type, options, &block)
10
10
  @options = options
11
11
  end
12
12
 
13
13
  def arrow_type
14
- @arrow_type ||= Parqueteur::TypeResolver.resolve(@type, @options)
15
- end
16
-
17
- def cast(value)
18
- case @type
19
- when :string then value.to_s
20
- when :boolean then value == true
21
- when :integer then value.to_i
22
- when :long then value.to_i
23
- when :timestamp
24
- case value
25
- when String then Time.parse(value).to_i
26
- when Integer then value
27
- else
28
- raise ArgumentError, "Unable to cast '#{value}' to timestamp"
29
- end
30
- when :map then value
31
- end
14
+ @type.arrow_type
32
15
  end
33
16
 
34
17
  def to_arrow_field
@@ -4,11 +4,18 @@ module Parqueteur
4
4
  class ColumnCollection
5
5
  include Enumerable
6
6
 
7
+ attr_reader :column_names
8
+
7
9
  def initialize
8
10
  @columns = []
11
+ @column_names = []
9
12
  @columns_idx = {}
10
13
  end
11
14
 
15
+ def key?(key)
16
+ @columns_idx.key?(key)
17
+ end
18
+
12
19
  def each(&block)
13
20
  @columns.each(&block)
14
21
  end
@@ -17,6 +24,7 @@ module Parqueteur
17
24
  unless @columns_idx.key?(column.name)
18
25
  @columns_idx[column.name] = column
19
26
  @columns << column
27
+ @column_names << column.name
20
28
  end
21
29
 
22
30
  true
@@ -12,8 +12,8 @@ module Parqueteur
12
12
  @columns ||= Parqueteur::ColumnCollection.new
13
13
  end
14
14
 
15
- def self.column(name, type, options = {})
16
- columns.add(Parqueteur::Column.new(name, type, options))
15
+ def self.column(name, type, options = {}, &block)
16
+ columns.add(Parqueteur::Column.new(name, type, options, &block))
17
17
  end
18
18
 
19
19
  def self.transforms
@@ -69,7 +69,10 @@ module Parqueteur
69
69
  def to_arrow_table
70
70
  transforms = self.class.transforms
71
71
 
72
- chunks = {}
72
+ chunks = self.class.columns.each_with_object({}) do |column, hash|
73
+ hash[column.name] = []
74
+ end
75
+ items_count = 0
73
76
  @input.each_slice(100) do |items|
74
77
  values = self.class.columns.each_with_object({}) do |column, hash|
75
78
  hash[column.name] = []
@@ -98,20 +101,27 @@ module Parqueteur
98
101
 
99
102
  values.each_with_object(chunks) do |item, hash|
100
103
  column = self.class.columns.find(item[0])
101
- hash[item[0]] ||= []
102
104
  hash[item[0]].push(
103
- Parqueteur::ValueArrayBuilder.build(
104
- item[1], column.type, column.options
105
- )
105
+ column.type.build_value_array(item[1])
106
106
  )
107
107
  end
108
+
109
+ items_count += items.length
108
110
  end
109
111
 
110
- Arrow::Table.new(
111
- chunks.transform_values! do |value|
112
- Arrow::ChunkedArray.new(value)
113
- end
114
- )
112
+ if items_count > 0
113
+ Arrow::Table.new(
114
+ chunks.transform_values! do |value|
115
+ Arrow::ChunkedArray.new(value)
116
+ end
117
+ )
118
+ else
119
+ Arrow::Table.new(
120
+ self.class.columns.each_with_object({}) do |column, hash|
121
+ hash[column.name] = column.type.build_value_array([])
122
+ end
123
+ )
124
+ end
115
125
  end
116
126
  end
117
127
  end
@@ -13,7 +13,7 @@ module Parqueteur
13
13
  else
14
14
  arg.split("\n")
15
15
  end
16
- when Array, Enumerator
16
+ when Enumerable
17
17
  arg
18
18
  end,
19
19
  options
@@ -36,7 +36,7 @@ module Parqueteur
36
36
  JSON.parse(@source.read).each(&block)
37
37
  end
38
38
  @source.rewind
39
- when Array, Enumerator
39
+ when Enumerable
40
40
  @source.each(&block)
41
41
  end
42
42
  end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Declarative description of the fields that make up a struct column.
  #
  # The block handed to the constructor is evaluated against the new instance
  # (through +instance_exec+), so +field+ can be called bare inside it:
  #
  #   Parqueteur::Struct.new do
  #     field :test, :string
  #     field :mon_nombre, :integer
  #   end
  class Struct
    # @yield block declaring the fields; it runs with +self+ set to the new
    #   instance
    # @raise [ArgumentError] when no block is given
    def initialize(&block)
      # Without this guard instance_exec fails with an opaque LocalJumpError.
      raise ArgumentError, 'a block declaring the struct fields is required' unless block

      instance_exec(&block)
    end

    # @return [Parqueteur::ColumnCollection] the declared fields, created
    #   lazily on first access
    def fields
      @fields ||= Parqueteur::ColumnCollection.new
    end

    # Declares one field. The arguments are passed untouched to
    # Parqueteur::Column.new, including the optional block.
    #
    # @param name [#to_s] field name
    # @param type [Symbol, Class] field type
    # @param options [Hash] type options
    def field(name, type, options = {}, &block)
      fields.add(Parqueteur::Column.new(name, type, options, &block))
    end

    # @param key [Object] field name to look up
    # @return [Boolean] result of ColumnCollection#key? for +key+
    def key?(key)
      fields.key?(key)
    end

    # @return [Array] the result of +to_arrow_field+ for every declared field
    def to_arrow_type
      fields.map(&:to_arrow_field)
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Abstract base class of the column types.
  #
  # The constructor calls +arrow_type_builder+, which this class does not
  # define: every subclass has to provide it, together with
  # +build_value_array+.
  class Type
    attr_reader :options, :arrow_type

    # @param options [Hash] type options, exposed through +options+
    # @param block [Proc, nil] optional block, kept in +@block+ for subclasses
    def initialize(options = {}, &block)
      @options = options
      @block = block
      # Set last so the subclass hook can read @options and @block.
      @arrow_type = arrow_type_builder
    end

    # Builds the array holding +values+. Must be overridden.
    #
    # @param values [Enumerable] raw values of one chunk
    # @raise [RuntimeError] always, in this base implementation
    def build_value_array(values)
      raise "#build_value_array must be implemented in #{self.class}"
    end

    # Resolves a nested type through Parqueteur::TypeResolver.
    #
    # @param type [Symbol, Class] type to resolve
    # @param options [Hash] options for the resolved type
    # @param block [Proc, nil] forwarded to the resolver so that nested types
    #   defined by a block can be resolved as well
    def resolve(type, options = {}, &block)
      Parqueteur::TypeResolver.resolve(type, options, &block)
    end
  end
end
@@ -2,60 +2,46 @@
2
2
 
3
3
module Parqueteur
  # Turns a type name (or a type class) into an instance of that type.
  class TypeResolver
    include Singleton

    # @return [Hash{Symbol => Class}] built-in name-to-class table, created on
    #   first use and then extended by +register_type+
    def self.registered_types
      @registered_types ||= {
        array: Parqueteur::Types::ArrayType,
        bigint: Parqueteur::Types::Int64Type,
        boolean: Parqueteur::Types::BooleanType,
        int32: Parqueteur::Types::Int32Type,
        int64: Parqueteur::Types::Int64Type,
        integer: Parqueteur::Types::Int32Type,
        map: Parqueteur::Types::MapType,
        string: Parqueteur::Types::StringType,
        struct: Parqueteur::Types::StructType,
        timestamp: Parqueteur::Types::TimestampType
      }
    end

    # Registers (or replaces) a named type.
    #
    # @param type [Symbol, String] name of the type; stored as a Symbol
    #   because lookups are done by Symbol
    # @param klass [Class] class instantiated for that name
    def self.register_type(type, klass)
      registered_types[type.to_sym] = klass
    end

    # Class-level shortcut for +instance.resolve+.
    def self.resolve(*args, &block)
      instance.resolve(*args, &block)
    end

    # @param type [Symbol, String, Class] a registered name, or a class that
    #   is instantiated directly
    # @param options [Hash] passed to the type constructor
    # @param block [Proc, nil] passed to the type constructor
    # @return [Object] a new instance of the resolved type class
    # @raise [Parqueteur::TypeNotFound] when a name is not registered
    def resolve(type, options = {}, &block)
      case type
      when Symbol, String
        # A String name used to reach the +type.new+ branch and fail with a
        # NoMethodError.
        resolve_from_symbol(type, options, &block)
      else
        type.new(options, &block)
      end
    end

    private

    def resolve_from_symbol(type, options, &block)
      type_klass = self.class.registered_types[type.to_sym]
      raise Parqueteur::TypeNotFound, "unknown type: #{type}" if type_klass.nil?

      type_klass.new(options, &block)
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # List column type. The element type comes from the +:elements+ option,
    # given either as a type name/class or as a Hash carrying a +:type+ key
    # plus the options of that element type.
    class ArrayType < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return [Object] whatever Arrow::ListArrayBuilder.build returns
      def build_value_array(values)
        Arrow::ListArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::ListDataType]
      # @raise [KeyError] when the +:elements+ option is missing, or when it
      #   is a Hash without a +:type+ key
      def arrow_type_builder
        # fetch (not []) so a missing option fails here with a clear KeyError
        # instead of later as a NoMethodError on nil.
        elements = options.fetch(:elements)
        element_type =
          if elements.is_a?(Hash)
            resolve(elements.fetch(:type), elements)
          else
            resolve(elements)
          end

        Arrow::ListDataType.new(element_type.arrow_type)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Boolean column type.
    class BooleanType < Parqueteur::Type
      # @return [Arrow::BooleanDataType] a fresh data type instance
      def arrow_type_builder
        Arrow::BooleanDataType.new
      end

      # @param values [Array] one entry per row
      # @return [Arrow::BooleanArray]
      def build_value_array(values)
        Arrow::BooleanArray.new(values)
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # 32-bit integer column type. The unsigned Arrow classes are used only
    # when the +:unsigned+ option is exactly +true+.
    class Int32Type < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return [Arrow::UInt32Array, Arrow::Int32Array]
      def build_value_array(values)
        array_class = unsigned? ? Arrow::UInt32Array : Arrow::Int32Array
        array_class.new(values)
      end

      # @return [Arrow::UInt32DataType, Arrow::Int32DataType]
      def arrow_type_builder
        data_type_class = unsigned? ? Arrow::UInt32DataType : Arrow::Int32DataType
        data_type_class.new
      end

      private

      # Strict comparison on purpose: only a literal +true+ selects the
      # unsigned variant.
      def unsigned?
        options.fetch(:unsigned, false) == true
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # 64-bit integer column type. The unsigned Arrow classes are used only
    # when the +:unsigned+ option is exactly +true+.
    class Int64Type < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return [Arrow::UInt64Array, Arrow::Int64Array]
      def build_value_array(values)
        array_class = unsigned? ? Arrow::UInt64Array : Arrow::Int64Array
        array_class.new(values)
      end

      # @return [Arrow::UInt64DataType, Arrow::Int64DataType]
      def arrow_type_builder
        data_type_class = unsigned? ? Arrow::UInt64DataType : Arrow::Int64DataType
        data_type_class.new
      end

      private

      # Strict comparison on purpose: only a literal +true+ selects the
      # unsigned variant.
      def unsigned?
        options.fetch(:unsigned, false) == true
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Map column type. Key and value types come from the +:key+ and +:value+
    # options; +:value+ may also be a Hash carrying a +:type+ key plus the
    # options of the value type.
    class MapType < Parqueteur::Type
      # @param values [Enumerable] one entry per row; each entry is +nil+ or
      #   an object yielding key/value pairs from +each+
      # @return [Object] the result of +finish+ on the Arrow map builder
      def build_value_array(values)
        map_builder = Arrow::MapArrayBuilder.new(arrow_type)

        values.each do |pairs|
          # A slot is opened for every row, nil rows included.
          map_builder.append_value
          append_pairs(map_builder, pairs) unless pairs.nil?
        end

        map_builder.finish
      end

      # @return [Arrow::MapDataType]
      # @raise [KeyError] when +:value+ or +:key+ is missing
      def arrow_type_builder
        value_spec = options.fetch(:value)
        key_type = resolve(options.fetch(:key)).arrow_type
        resolved_value =
          if value_spec.is_a?(Hash)
            resolve(value_spec.fetch(:type), value_spec)
          else
            resolve(value_spec)
          end

        Arrow::MapDataType.new(key_type, resolved_value.arrow_type)
      end

      private

      # Appends every key/value pair of one row to the builder.
      def append_pairs(map_builder, pairs)
        pairs.each do |key, item|
          map_builder.key_builder.append(key)
          map_builder.item_builder.append(item)
        end
      end
    end
  end
end
36
+
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # String column type.
    class StringType < Parqueteur::Type
      # @return [Arrow::StringDataType] a fresh data type instance
      def arrow_type_builder
        Arrow::StringDataType.new
      end

      # @param values [Array] one entry per row
      # @return [Arrow::StringArray]
      def build_value_array(values)
        Arrow::StringArray.new(values)
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Struct column type. Its fields are declared by the block given when the
    # type is created; that block is evaluated by Parqueteur::Struct.
    class StructType < Parqueteur::Type
      # Checks that every row only uses declared fields, then builds the array.
      #
      # @param values [Enumerable] one entry per row; each entry is +nil+ or
      #   an object responding to +each_key+
      # @return [Object] whatever Arrow::StructArrayBuilder.build returns
      # @raise [Parqueteur::Error] when a row has a key that is not a
      #   declared field
      def build_value_array(values)
        values.each do |value|
          next if value.nil?

          value.each_key do |key|
            # Field names are stored as Strings (Column does name.to_s), so
            # the key is normalised before the lookup; otherwise every
            # Symbol key would be rejected.
            # NOTE(review): assumes Arrow::StructArrayBuilder also accepts
            # Symbol keys — confirm against the red-arrow version in use.
            next if struct_object.key?(key.to_s)

            raise Parqueteur::Error, "Struct field '#{key}' not found"
          end
        end

        Arrow::StructArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::StructDataType] built from the declared fields
      def arrow_type_builder
        Arrow::StructDataType.new(struct_object.to_arrow_type)
      end

      private

      # The field declaration, evaluated once and then reused.
      def struct_object
        @struct_object ||= Parqueteur::Struct.new(&@block)
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Timestamp column type. The resolution comes from the +:unit+ option
    # and defaults to +:second+.
    class TimestampType < Parqueteur::Type
      # @param values [Array] one entry per row
      # @return [Arrow::TimestampArray]
      def build_value_array(values)
        # The data type is passed along with the values: red-arrow's
        # timestamp builder takes a unit or data type as its first argument,
        # and a values-only call does not reach it.
        # NOTE(review): confirm against the red-arrow version in use.
        Arrow::TimestampArray.new(arrow_type, values)
      end

      # @return [Arrow::TimestampDataType]
      def arrow_type_builder
        Arrow::TimestampDataType.new(
          options.fetch(:unit, :second)
        )
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Parqueteur
4
- VERSION = '1.0.3'
4
+ VERSION = '1.1.0'
5
5
  end
data/lib/parqueteur.rb CHANGED
@@ -1,17 +1,29 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'json'
4
+ require 'singleton'
5
+
3
6
  require_relative "parqueteur/version"
4
- require 'parqueteur/type_resolver'
7
+ require 'parqueteur/chunked_converter'
5
8
  require 'parqueteur/column'
6
9
  require 'parqueteur/column_collection'
7
10
  require 'parqueteur/converter'
8
- require 'parqueteur/chunked_converter'
9
11
  require 'parqueteur/input'
10
- require 'parqueteur/value_array_builder'
11
- require 'json'
12
+ require 'parqueteur/struct'
13
+ require 'parqueteur/type'
14
+ require 'parqueteur/type_resolver'
15
+ require 'parqueteur/types/array_type'
16
+ require 'parqueteur/types/boolean_type'
17
+ require 'parqueteur/types/int32_type'
18
+ require 'parqueteur/types/int64_type'
19
+ require 'parqueteur/types/map_type'
20
+ require 'parqueteur/types/string_type'
21
+ require 'parqueteur/types/struct_type'
22
+ require 'parqueteur/types/timestamp_type'
12
23
  require 'parquet'
13
24
 
14
25
  module Parqueteur
15
26
  class Error < StandardError; end
27
+ class TypeNotFound < Error; end
16
28
  # Your code goes here...
17
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parqueteur
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julien D.
@@ -45,8 +45,17 @@ files:
45
45
  - lib/parqueteur/column_collection.rb
46
46
  - lib/parqueteur/converter.rb
47
47
  - lib/parqueteur/input.rb
48
+ - lib/parqueteur/struct.rb
49
+ - lib/parqueteur/type.rb
48
50
  - lib/parqueteur/type_resolver.rb
49
- - lib/parqueteur/value_array_builder.rb
51
+ - lib/parqueteur/types/array_type.rb
52
+ - lib/parqueteur/types/boolean_type.rb
53
+ - lib/parqueteur/types/int32_type.rb
54
+ - lib/parqueteur/types/int64_type.rb
55
+ - lib/parqueteur/types/map_type.rb
56
+ - lib/parqueteur/types/string_type.rb
57
+ - lib/parqueteur/types/struct_type.rb
58
+ - lib/parqueteur/types/timestamp_type.rb
50
59
  - lib/parqueteur/version.rb
51
60
  - parqueteur.gemspec
52
61
  - test.json
@@ -1,59 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Parqueteur
4
- class ValueArrayBuilder
5
- attr_reader :type, :options, :arrow_type
6
-
7
- def self.build(input, type, options)
8
- new(type, options).build(input)
9
- end
10
-
11
- def initialize(type, options)
12
- @type = type
13
- @options = options
14
- @arrow_type = Parqueteur::TypeResolver.resolve(type, options)
15
- end
16
-
17
- def build(input)
18
- return if input.nil?
19
-
20
- case type
21
- when :array
22
- Arrow::ListArrayBuilder.build(arrow_type, input)
23
- when :map
24
- builder = Arrow::MapArrayBuilder.new(arrow_type)
25
- input.each do |entry|
26
- builder.append_value
27
- next if entry.nil?
28
-
29
- entry.each do |k, v|
30
- builder.key_builder.append(k)
31
- builder.item_builder.append(v)
32
- end
33
- end
34
-
35
- builder.finish
36
- when :boolean
37
- Arrow::BooleanArray.new(input)
38
- when :integer
39
- if options.fetch(:unsigned, false) == true
40
- Arrow::UInt32Array.new(input)
41
- else
42
- Arrow::Int32Array.new(input)
43
- end
44
- when :long
45
- if options.fetch(:unsigned, false) == true
46
- Arrow::UInt64Array.new(input)
47
- else
48
- Arrow::Int64Array.new(input)
49
- end
50
- when :string
51
- Arrow::StringArray.new(input)
52
- when :timestamp
53
- Arrow::TimestampArray.new(input)
54
- else
55
- raise Error, "unknown type: #{type}"
56
- end
57
- end
58
- end
59
- end