parqueteur 1.0.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 51c58dad3007e9c6a08b4673b378975c7839158240c27fb0a883d0e59c05d5d5
4
- data.tar.gz: fa63fe9cd3cfa71d587400f3672ca2697c87a77fc574d8e2463d89f0090cd686
3
+ metadata.gz: 90ffcadf5b78e4ffc3329eac9be1be34af9209a77a39a6395eefc1c56afa7ce6
4
+ data.tar.gz: 1d7f5257d3f86443e0d13b789d7449565198f6cda1563d111a36ad3728264044
5
5
  SHA512:
6
- metadata.gz: 58c98870d35b4af03bf52fea1572d26b4073d16696637efe8992eba9ddfbe8931e3b015976cd997ee85fc1a0388b031ecafe1b165b911a36d6748a90ce8af88d
7
- data.tar.gz: 288418ec8a410f94fe38e76d94bb2da4173a4760ed8a9825132a41f3d808370ff60c8975ea8e6b8b72c1198dd36e5b4ceb1682570edf4ae9fa60be93ddc3cba8
6
+ metadata.gz: 262039094dd3aa5890f9d1a87d836eb64004c51fc57456d7880a36332295ac8a447fcd045c6bd2e053986ad4e9981021448b39e20010d0e41fef6b7e233d91ca
7
+ data.tar.gz: 73988e1f836acbe22e26c20b8559d8f79e9eeda55f9612fbefae7aa5bef131187b7a0716a3bd7804db5321b29b19f54b383e8594830814ae765d2c7b30986f59
data/.gitignore CHANGED
@@ -7,3 +7,5 @@
7
7
  /spec/reports/
8
8
  /tmp/
9
9
  /*.gem
10
+ /tmp/*
11
+ !/tmp/.keep
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- parqueteur (1.0.3)
4
+ parqueteur (1.1.0)
5
5
  red-parquet (~> 5.0)
6
6
 
7
7
  GEM
data/example.rb CHANGED
@@ -2,19 +2,38 @@ require 'bundler/setup'
2
2
  require 'parqueteur'
3
3
 
4
4
  class Foo < Parqueteur::Converter
5
- column :id, :long
5
+ column :id, :bigint
6
6
  column :reference, :string
7
7
  column :hash, :map, key: :string, value: :string
8
8
  column :valid, :boolean
9
9
  column :total, :integer
10
+ column :numbers, :array, elements: :integer
11
+ column :my_struct, :struct do
12
+ field :test, :string
13
+ field :mon_nombre, :integer
14
+ end
10
15
  end
11
16
 
12
17
  LETTERS = ('a'..'z').to_a
13
18
 
14
19
  data = 1000.times.collect do |i|
15
- { 'id' => i + 1, 'reference' => "coucou:#{i}", 'hash' => { 'a' => LETTERS.sample }, 'valid' => rand < 0.5, 'total' => rand(100..500) }
20
+ {
21
+ 'id' => i + 1,
22
+ 'reference' => "coucou:#{i}",
23
+ 'hash' => { 'a' => LETTERS.sample },
24
+ 'valid' => rand < 0.5,
25
+ 'total' => rand(100..500),
26
+ 'numbers' => [1, 2, 3],
27
+ 'my_struct' => {
28
+ 'test' => 'super'
29
+ }
30
+ }
16
31
  end
17
32
 
18
- chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
19
- pp chunked_converter.write_files('test')
20
- # puts Foo.convert(data, output: 'test.parquet')
33
+ # chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
34
+ # pp chunked_converter.write_files('test')
35
+ puts Foo.convert(data, output: 'tmp/test.parquet')
36
+ table = Arrow::Table.load('tmp/test.parquet')
37
+ table.each_record do |record|
38
+ puts record.to_h
39
+ end
@@ -4,31 +4,14 @@ module Parqueteur
4
4
  class Column
5
5
  attr_reader :name, :type, :options
6
6
 
7
- def initialize(name, type, options = {})
7
+ def initialize(name, type, options = {}, &block)
8
8
  @name = name.to_s
9
- @type = type
9
+ @type = Parqueteur::TypeResolver.resolve(type, options, &block)
10
10
  @options = options
11
11
  end
12
12
 
13
13
  def arrow_type
14
- @arrow_type ||= Parqueteur::TypeResolver.resolve(@type, @options)
15
- end
16
-
17
- def cast(value)
18
- case @type
19
- when :string then value.to_s
20
- when :boolean then value == true
21
- when :integer then value.to_i
22
- when :long then value.to_i
23
- when :timestamp
24
- case value
25
- when String then Time.parse(value).to_i
26
- when Integer then value
27
- else
28
- raise ArgumentError, "Unable to cast '#{value}' to timestamp"
29
- end
30
- when :map then value
31
- end
14
+ @type.arrow_type
32
15
  end
33
16
 
34
17
  def to_arrow_field
@@ -4,11 +4,18 @@ module Parqueteur
4
4
  class ColumnCollection
5
5
  include Enumerable
6
6
 
7
+ attr_reader :column_names
8
+
7
9
  def initialize
8
10
  @columns = []
11
+ @column_names = []
9
12
  @columns_idx = {}
10
13
  end
11
14
 
15
+ def key?(key)
16
+ @columns_idx.key?(key)
17
+ end
18
+
12
19
  def each(&block)
13
20
  @columns.each(&block)
14
21
  end
@@ -17,6 +24,7 @@ module Parqueteur
17
24
  unless @columns_idx.key?(column.name)
18
25
  @columns_idx[column.name] = column
19
26
  @columns << column
27
+ @column_names << column.name
20
28
  end
21
29
 
22
30
  true
@@ -12,8 +12,8 @@ module Parqueteur
12
12
  @columns ||= Parqueteur::ColumnCollection.new
13
13
  end
14
14
 
15
- def self.column(name, type, options = {})
16
- columns.add(Parqueteur::Column.new(name, type, options))
15
+ def self.column(name, type, options = {}, &block)
16
+ columns.add(Parqueteur::Column.new(name, type, options, &block))
17
17
  end
18
18
 
19
19
  def self.transforms
@@ -69,7 +69,10 @@ module Parqueteur
69
69
  def to_arrow_table
70
70
  transforms = self.class.transforms
71
71
 
72
- chunks = {}
72
+ chunks = self.class.columns.each_with_object({}) do |column, hash|
73
+ hash[column.name] = []
74
+ end
75
+ items_count = 0
73
76
  @input.each_slice(100) do |items|
74
77
  values = self.class.columns.each_with_object({}) do |column, hash|
75
78
  hash[column.name] = []
@@ -98,20 +101,27 @@ module Parqueteur
98
101
 
99
102
  values.each_with_object(chunks) do |item, hash|
100
103
  column = self.class.columns.find(item[0])
101
- hash[item[0]] ||= []
102
104
  hash[item[0]].push(
103
- Parqueteur::ValueArrayBuilder.build(
104
- item[1], column.type, column.options
105
- )
105
+ column.type.build_value_array(item[1])
106
106
  )
107
107
  end
108
+
109
+ items_count += items.length
108
110
  end
109
111
 
110
- Arrow::Table.new(
111
- chunks.transform_values! do |value|
112
- Arrow::ChunkedArray.new(value)
113
- end
114
- )
112
+ if items_count > 0
113
+ Arrow::Table.new(
114
+ chunks.transform_values! do |value|
115
+ Arrow::ChunkedArray.new(value)
116
+ end
117
+ )
118
+ else
119
+ Arrow::Table.new(
120
+ self.class.columns.each_with_object({}) do |column, hash|
121
+ hash[column.name] = column.type.build_value_array([])
122
+ end
123
+ )
124
+ end
115
125
  end
116
126
  end
117
127
  end
@@ -13,7 +13,7 @@ module Parqueteur
13
13
  else
14
14
  arg.split("\n")
15
15
  end
16
- when Array, Enumerator
16
+ when Enumerable
17
17
  arg
18
18
  end,
19
19
  options
@@ -36,7 +36,7 @@ module Parqueteur
36
36
  JSON.parse(@source.read).each(&block)
37
37
  end
38
38
  @source.rewind
39
- when Array, Enumerator
39
+ when Enumerable
40
40
  @source.each(&block)
41
41
  end
42
42
  end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Describes the members of a struct column through a small DSL.
  #
  # The block given to the constructor is evaluated with +self+ set to the new
  # instance, so each +field+ call inside it registers one member:
  #
  #   Parqueteur::Struct.new do
  #     field :test, :string
  #     field :mon_nombre, :integer
  #   end
  class Struct
    # @yield the definition block, evaluated against the new instance
    # @raise [ArgumentError] when no block is given
    def initialize(&block)
      # Without this guard instance_exec fails with an opaque LocalJumpError.
      raise ArgumentError, 'a block declaring the struct fields is required' unless block

      instance_exec(&block)
    end

    # @return [Parqueteur::ColumnCollection] the registered fields (created lazily)
    def fields
      @fields ||= Parqueteur::ColumnCollection.new
    end

    # Registers one member of the struct.
    #
    # @param name [Symbol, String] field name
    # @param type [Symbol, Class] field type, handed to Parqueteur::Column
    # @param options [Hash] type options, handed to Parqueteur::Column
    # @yield optional block forwarded to Parqueteur::Column
    # @return the result of ColumnCollection#add
    def field(name, type, options = {}, &block)
      fields.add(Parqueteur::Column.new(name, type, options, &block))
    end

    # @param key [Object] field name to look up
    # @return [Boolean] whether a field is registered under +key+
    def key?(key)
      fields.key?(key)
    end

    # @return [Array] the +to_arrow_field+ result of every field, in
    #   registration order
    def to_arrow_type
      fields.map(&:to_arrow_field)
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Abstract base class of every column type.
  #
  # A subclass must implement:
  # * +arrow_type_builder+ - returns the Arrow data type; it is called once
  #   from the constructor and the result is exposed as +arrow_type+;
  # * +build_value_array+  - turns a list of raw values into an Arrow array.
  class Type
    attr_reader :options, :arrow_type

    # @param options [Hash] type-specific options
    # @yield optional block kept in +@block+ for subclasses that need it
    def initialize(options = {}, &block)
      @options = options
      @block = block
      # Must run last: the builder may read @options and @block.
      @arrow_type = arrow_type_builder
    end

    # Builds the Arrow array holding +values+. Abstract.
    #
    # @param values [Array] raw values of one chunk
    # @raise [RuntimeError] always, unless overridden by the subclass
    def build_value_array(values)
      raise "#build_value_array must be implemented in #{self.class}"
    end

    # Builds the Arrow data type of this column type. Abstract.
    #
    # @raise [RuntimeError] always, unless overridden by the subclass
    def arrow_type_builder
      raise "#arrow_type_builder must be implemented in #{self.class}"
    end

    # Resolves another type through the shared resolver.
    #
    # @param type [Symbol, Class] type identifier
    # @param options [Hash] options for the resolved type
    # @yield optional block forwarded to the resolved type
    # @return whatever Parqueteur::TypeResolver.resolve returns
    def resolve(type, options = {}, &block)
      Parqueteur::TypeResolver.resolve(type, options, &block)
    end
  end
end
@@ -2,60 +2,46 @@
2
2
 
3
3
module Parqueteur
  # Maps type identifiers (+:integer+, +:map+, ...) to type classes and
  # instantiates them.
  class TypeResolver
    include Singleton

    # @return [Hash{Symbol => Class}] the identifier-to-class registry, built on
    #   first use so the type classes only need to be loaded by then
    def self.registered_types
      @registered_types ||= {
        array: Parqueteur::Types::ArrayType,
        bigint: Parqueteur::Types::Int64Type,
        boolean: Parqueteur::Types::BooleanType,
        int32: Parqueteur::Types::Int32Type,
        int64: Parqueteur::Types::Int64Type,
        integer: Parqueteur::Types::Int32Type,
        map: Parqueteur::Types::MapType,
        string: Parqueteur::Types::StringType,
        struct: Parqueteur::Types::StructType,
        timestamp: Parqueteur::Types::TimestampType
      }
    end

    # Adds or replaces a registry entry.
    #
    # @param type [Symbol, String] identifier; stored as a Symbol because
    #   lookups symbolize the requested name
    # @param klass [Class] class instantiated for that identifier
    def self.register_type(type, klass)
      registered_types[type.to_sym] = klass
    end

    # Class-level shortcut for {#resolve} on the singleton instance.
    def self.resolve(*args, &block)
      instance.resolve(*args, &block)
    end

    # Instantiates the type designated by +type+.
    #
    # @param type [Symbol, String, Class] a registered identifier, or an object
    #   responding to +new(options, &block)+
    # @param options [Hash] options handed to the type constructor
    # @yield optional block handed to the type constructor
    # @return the new type instance
    # @raise [Parqueteur::TypeNotFound] when the identifier is not registered
    def resolve(type, options = {}, &block)
      case type
      when Symbol, String
        resolve_from_symbol(type, options, &block)
      else
        type.new(options, &block)
      end
    end

    private

    # Looks the identifier up in the registry and instantiates the match.
    def resolve_from_symbol(type, options, &block)
      type_klass = self.class.registered_types[type.to_sym]
      raise Parqueteur::TypeNotFound, "unknown type: #{type}" if type_klass.nil?

      type_klass.new(options, &block)
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # List column type. The element type comes from the mandatory +:elements+
    # option, given either as a type identifier (+elements: :integer+) or as a
    # Hash holding a +:type+ key plus that type's own options.
    class ArrayType < Parqueteur::Type
      # @param values [Array] raw values of one chunk
      # @return the array built by Arrow::ListArrayBuilder for +arrow_type+
      def build_value_array(values)
        Arrow::ListArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::ListDataType] list type wrapping the element's Arrow type
      # @raise [KeyError] when the +:elements+ option (or +:type+ inside an
      #   +:elements+ Hash) is missing
      def arrow_type_builder
        Arrow::ListDataType.new(element_type.arrow_type)
      end

      private

      # Resolves the element type. +fetch+ makes a missing :elements option fail
      # here with a KeyError instead of a nil dereference inside the resolver.
      def element_type
        elements = options.fetch(:elements)
        return resolve(elements.fetch(:type), elements) if elements.is_a?(Hash)

        resolve(elements)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Boolean column type; takes no options.
    class BooleanType < Parqueteur::Type
      # @return [Arrow::BooleanDataType] Arrow data type of the column
      def arrow_type_builder
        Arrow::BooleanDataType.new
      end

      # @param values [Array] raw values of one chunk
      # @return [Arrow::BooleanArray] array built from +values+
      def build_value_array(values)
        Arrow::BooleanArray.new(values)
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # 32-bit integer column type. Passing +unsigned: true+ selects the
    # unsigned Arrow variants.
    class Int32Type < Parqueteur::Type
      # @param values [Array] raw values of one chunk
      # @return [Arrow::UInt32Array, Arrow::Int32Array] array built from +values+
      def build_value_array(values)
        array_class = unsigned? ? Arrow::UInt32Array : Arrow::Int32Array
        array_class.new(values)
      end

      # @return [Arrow::UInt32DataType, Arrow::Int32DataType] Arrow data type
      def arrow_type_builder
        data_type_class = unsigned? ? Arrow::UInt32DataType : Arrow::Int32DataType
        data_type_class.new
      end

      private

      # Only a literal +true+ enables the unsigned variant.
      def unsigned?
        options.fetch(:unsigned, false) == true
      end
    end
  end
end
24
+
25
+ # when :integer
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # 64-bit integer column type. Passing +unsigned: true+ selects the
    # unsigned Arrow variants.
    class Int64Type < Parqueteur::Type
      # @param values [Array] raw values of one chunk
      # @return [Arrow::UInt64Array, Arrow::Int64Array] array built from +values+
      def build_value_array(values)
        array_class = unsigned? ? Arrow::UInt64Array : Arrow::Int64Array
        array_class.new(values)
      end

      # @return [Arrow::UInt64DataType, Arrow::Int64DataType] Arrow data type
      def arrow_type_builder
        data_type_class = unsigned? ? Arrow::UInt64DataType : Arrow::Int64DataType
        data_type_class.new
      end

      private

      # Only a literal +true+ enables the unsigned variant.
      def unsigned?
        options.fetch(:unsigned, false) == true
      end
    end
  end
end
24
+
25
+ # when :integer
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Map column type. Requires the +:key+ and +:value+ options; +:value+ may
    # be a type identifier or a Hash holding a +:type+ key plus that type's
    # own options.
    class MapType < Parqueteur::Type
      # @param values [Enumerable] one Hash-like entry (or nil) per row
      # @return the array produced by +Arrow::MapArrayBuilder#finish+
      def build_value_array(values)
        map_builder = Arrow::MapArrayBuilder.new(arrow_type)

        values.each do |entry|
          # A slot is opened for every row; a nil entry simply gets no pairs.
          map_builder.append_value
          append_pairs(map_builder, entry) unless entry.nil?
        end

        map_builder.finish
      end

      # @return [Arrow::MapDataType] map type built from the key and value types
      # @raise [KeyError] when +:value+ or +:key+ is missing
      def arrow_type_builder
        value_spec = options.fetch(:value)
        key_arrow_type = resolve(options.fetch(:key)).arrow_type
        value_arrow_type =
          if value_spec.is_a?(Hash)
            resolve(value_spec.fetch(:type), value_spec).arrow_type
          else
            resolve(value_spec).arrow_type
          end

        Arrow::MapDataType.new(key_arrow_type, value_arrow_type)
      end

      private

      # Appends each key/item pair of +entry+ to the current map slot.
      def append_pairs(map_builder, entry)
        entry.each do |key, item|
          map_builder.key_builder.append(key)
          map_builder.item_builder.append(item)
        end
      end
    end
  end
end
36
+
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ # when :timestamp
4
+ # Arrow::TimestampDataType.new(
5
+ # options.fetch(:unit, :second)
6
+ # )
7
+
8
module Parqueteur
  module Types
    # String column type; takes no options.
    class StringType < Parqueteur::Type
      # @return [Arrow::StringDataType] Arrow data type of the column
      def arrow_type_builder
        Arrow::StringDataType.new
      end

      # @param values [Array] raw values of one chunk
      # @return [Arrow::StringArray] array built from +values+
      def build_value_array(values)
        Arrow::StringArray.new(values)
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ # when :timestamp
4
+ # Arrow::TimestampDataType.new(
5
+ # options.fetch(:unit, :second)
6
+ # )
7
+
8
module Parqueteur
  module Types
    # Struct column type. Its members are declared by the block given at
    # construction, which is evaluated by Parqueteur::Struct.
    class StructType < Parqueteur::Type
      # Checks that every non-nil value only uses declared field names, then
      # builds the Arrow array.
      #
      # @param values [Enumerable] one Hash-like value (or nil) per row
      # @return the array built by Arrow::StructArrayBuilder for +arrow_type+
      # @raise [Parqueteur::Error] when a value carries an undeclared key
      def build_value_array(values)
        values.each { |value| ensure_known_keys(value) unless value.nil? }

        Arrow::StructArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::StructDataType] struct type built from the declared fields
      def arrow_type_builder
        Arrow::StructDataType.new(struct_object.to_arrow_type)
      end

      private

      # Raises on the first key of +value+ that is not a declared field.
      def ensure_known_keys(value)
        value.each_key do |key|
          raise Parqueteur::Error, "Struct field '#{key}' not found" unless struct_object.key?(key)
        end
      end

      # Field definitions, built once from the constructor block.
      def struct_object
        @struct_object ||= Parqueteur::Struct.new(&@block)
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ # when :timestamp
4
+ # Arrow::TimestampDataType.new(
5
+ # options.fetch(:unit, :second)
6
+ # )
7
+
8
module Parqueteur
  module Types
    # Timestamp column type. The +:unit+ option sets the resolution and
    # defaults to +:second+.
    class TimestampType < Parqueteur::Type
      # @param values [Array] raw values of one chunk
      # @return the timestamp array built for +arrow_type+
      def build_value_array(values)
        # A timestamp array cannot be built from values alone: the builder
        # needs the data type (which carries the unit), as the list and
        # struct types already pass it to their builders.
        Arrow::TimestampArrayBuilder.build(arrow_type, values)
      end

      # @return [Arrow::TimestampDataType] timestamp type for the configured unit
      def arrow_type_builder
        Arrow::TimestampDataType.new(
          options.fetch(:unit, :second)
        )
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Parqueteur
4
- VERSION = '1.0.3'
4
+ VERSION = '1.1.0'
5
5
  end
data/lib/parqueteur.rb CHANGED
@@ -1,17 +1,29 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'json'
4
+ require 'singleton'
5
+
3
6
  require_relative "parqueteur/version"
4
- require 'parqueteur/type_resolver'
7
+ require 'parqueteur/chunked_converter'
5
8
  require 'parqueteur/column'
6
9
  require 'parqueteur/column_collection'
7
10
  require 'parqueteur/converter'
8
- require 'parqueteur/chunked_converter'
9
11
  require 'parqueteur/input'
10
- require 'parqueteur/value_array_builder'
11
- require 'json'
12
+ require 'parqueteur/struct'
13
+ require 'parqueteur/type'
14
+ require 'parqueteur/type_resolver'
15
+ require 'parqueteur/types/array_type'
16
+ require 'parqueteur/types/boolean_type'
17
+ require 'parqueteur/types/int32_type'
18
+ require 'parqueteur/types/int64_type'
19
+ require 'parqueteur/types/map_type'
20
+ require 'parqueteur/types/string_type'
21
+ require 'parqueteur/types/struct_type'
22
+ require 'parqueteur/types/timestamp_type'
12
23
  require 'parquet'
13
24
 
14
25
  module Parqueteur
15
26
  class Error < StandardError; end
27
+ class TypeNotFound < Error; end
16
28
  # Your code goes here...
17
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parqueteur
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julien D.
@@ -45,8 +45,17 @@ files:
45
45
  - lib/parqueteur/column_collection.rb
46
46
  - lib/parqueteur/converter.rb
47
47
  - lib/parqueteur/input.rb
48
+ - lib/parqueteur/struct.rb
49
+ - lib/parqueteur/type.rb
48
50
  - lib/parqueteur/type_resolver.rb
49
- - lib/parqueteur/value_array_builder.rb
51
+ - lib/parqueteur/types/array_type.rb
52
+ - lib/parqueteur/types/boolean_type.rb
53
+ - lib/parqueteur/types/int32_type.rb
54
+ - lib/parqueteur/types/int64_type.rb
55
+ - lib/parqueteur/types/map_type.rb
56
+ - lib/parqueteur/types/string_type.rb
57
+ - lib/parqueteur/types/struct_type.rb
58
+ - lib/parqueteur/types/timestamp_type.rb
50
59
  - lib/parqueteur/version.rb
51
60
  - parqueteur.gemspec
52
61
  - test.json
@@ -1,59 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Parqueteur
4
- class ValueArrayBuilder
5
- attr_reader :type, :options, :arrow_type
6
-
7
- def self.build(input, type, options)
8
- new(type, options).build(input)
9
- end
10
-
11
- def initialize(type, options)
12
- @type = type
13
- @options = options
14
- @arrow_type = Parqueteur::TypeResolver.resolve(type, options)
15
- end
16
-
17
- def build(input)
18
- return if input.nil?
19
-
20
- case type
21
- when :array
22
- Arrow::ListArrayBuilder.build(arrow_type, input)
23
- when :map
24
- builder = Arrow::MapArrayBuilder.new(arrow_type)
25
- input.each do |entry|
26
- builder.append_value
27
- next if entry.nil?
28
-
29
- entry.each do |k, v|
30
- builder.key_builder.append(k)
31
- builder.item_builder.append(v)
32
- end
33
- end
34
-
35
- builder.finish
36
- when :boolean
37
- Arrow::BooleanArray.new(input)
38
- when :integer
39
- if options.fetch(:unsigned, false) == true
40
- Arrow::UInt32Array.new(input)
41
- else
42
- Arrow::Int32Array.new(input)
43
- end
44
- when :long
45
- if options.fetch(:unsigned, false) == true
46
- Arrow::UInt64Array.new(input)
47
- else
48
- Arrow::Int64Array.new(input)
49
- end
50
- when :string
51
- Arrow::StringArray.new(input)
52
- when :timestamp
53
- Arrow::TimestampArray.new(input)
54
- else
55
- raise Error, "unknown type: #{type}"
56
- end
57
- end
58
- end
59
- end