parqueteur 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: b60aefc8e90564af2abeeb01a36aaaabe8cf08c65efeebdbf6f034372c8100c1
4
+ data.tar.gz: b4f93aeac25321e50fdf8d2a8c8e0216feb4ec68d0ac356d2ed98118820c61a4
5
+ SHA512:
6
+ metadata.gz: 1dbb0d1a870ff2291909014f595a6c63a35c11c80adc4e1972e20852ac87ba20447ce31dee9feb2f11dcea1a2ec5276ad4a05ca3eb406ab26a2b606ea369b809
7
+ data.tar.gz: ef4dec6ca81564972112468dc1729053c9c03bdc61a947a01990c1a870dc45e8136561553390fbfbb1b49d22ca1f8221a94cb8403a7ecfc6bf18f10ba07acd9b
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in parqueteur.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
data/Gemfile.lock ADDED
@@ -0,0 +1,39 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ parqueteur (1.0.2)
5
+ red-parquet (~> 5.0)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ bigdecimal (3.0.0)
11
+ extpp (0.0.9)
12
+ gio2 (3.4.6)
13
+ gobject-introspection (= 3.4.6)
14
+ glib2 (3.4.6)
15
+ native-package-installer (>= 1.0.3)
16
+ pkg-config (>= 1.3.5)
17
+ gobject-introspection (3.4.6)
18
+ glib2 (= 3.4.6)
19
+ native-package-installer (1.1.1)
20
+ pkg-config (1.4.6)
21
+ rake (13.0.6)
22
+ red-arrow (5.0.0)
23
+ bigdecimal (>= 2.0.3)
24
+ extpp (>= 0.0.7)
25
+ gio2 (>= 3.4.5)
26
+ native-package-installer
27
+ pkg-config
28
+ red-parquet (5.0.0)
29
+ red-arrow (= 5.0.0)
30
+
31
+ PLATFORMS
32
+ ruby
33
+
34
+ DEPENDENCIES
35
+ parqueteur!
36
+ rake (~> 13.0)
37
+
38
+ BUNDLED WITH
39
+ 2.2.3
data/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # Parqueteur
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to package your Ruby library as a gem. Put your Ruby code in the file `lib/parqueteur.rb` (and in files under `lib/parqueteur/`). To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'parqueteur'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle install
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install parqueteur
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/pocketsizesun/parqueteur-ruby.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ task default: %i[]
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "parqueteur"
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require "irb"
15
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/example.rb ADDED
@@ -0,0 +1,20 @@
1
require 'bundler/setup'
require 'parqueteur'

# Sample converter used by this script: declares the five columns that the
# generated rows below provide.
class Foo < Parqueteur::Converter
  column :id, :long
  column :reference, :string
  column :hash, :map, key: :string, value: :string
  column :valid, :boolean
  column :total, :integer
end

LETTERS = ('a'..'z').to_a

# Generate 1000 rows with string keys matching the column names above.
data = Array.new(1000) do |index|
  {
    'id' => index + 1,
    'reference' => "coucou:#{index}",
    'hash' => { 'a' => LETTERS.sample },
    'valid' => rand < 0.5,
    'total' => rand(100..500)
  }
end

# Convert the rows in chunks and write one file per chunk, prefixed "test".
chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
pp chunked_converter.write_files('test')
# puts Foo.convert(data, output: 'test.parquet')
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Splits an input into fixed-size slices and converts every slice
  # independently with the given converter class, producing one serialized
  # payload per slice.
  class ChunkedConverter
    # NOTE(review): @schema is never assigned in this class, so this reader
    # always returns nil; it is kept only so existing callers do not break.
    attr_reader :schema

    # @param input anything accepted by Parqueteur::Input.from
    # @param converter [Class] class instantiated with one slice of rows; its
    #   instances must respond to #to_io
    # @param chunk_size [Integer] maximum number of rows per slice
    def initialize(input, converter, chunk_size = 200)
      @input = Parqueteur::Input.from(input)
      @converter = converter
      @chunk_size = chunk_size
    end

    # Lazily converts the input slice by slice.
    #
    # @return [Enumerator] yields the #to_io result of the converter for each
    #   slice of at most chunk_size rows
    def chunks
      Enumerator.new do |yielder|
        @input.each_slice(@chunk_size) do |slice|
          yielder << @converter.new(slice).to_io
        end
      end
    end

    # Writes every chunk to "<prefix>.<index>.parquet" (index starts at 0).
    #
    # The payload is binary, so it is written with File.binwrite: text-mode
    # File.write may apply newline/encoding conversion on some platforms and
    # corrupt the file.
    #
    # @param prefix [String] path prefix of the generated files
    # @return [Enumerator] the enumerator returned by #chunks
    def write_files(prefix)
      chunks.each_with_index do |chunk, idx|
        File.binwrite("#{prefix}.#{idx}.parquet", chunk.read)
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
require 'time' # Time.parse lives in the stdlib "time" extension, not in core

module Parqueteur
  # Describes one output column: its name, its logical type symbol and the
  # type options forwarded to Parqueteur::TypeResolver.
  class Column
    attr_reader :name, :type, :options

    # @param name [#to_s] column name, stored as a String
    # @param type [Symbol] logical type (e.g. :string, :long, :map)
    # @param options [Hash] type options passed to the type resolver
    def initialize(name, type, options = {})
      @name = name.to_s
      @type = type
      @options = options
    end

    # @return the Arrow data type resolved (and memoized) for this column
    def arrow_type
      @arrow_type ||= Parqueteur::TypeResolver.resolve(@type, @options)
    end

    # Coerces a raw value to the Ruby representation of the column type.
    #
    # :map and :array values are passed through untouched (previously :array
    # fell through the case and was silently turned into nil). Types not
    # listed here still return nil.
    #
    # @param value [Object] raw value
    # @return [Object, nil] the coerced value
    # @raise [ArgumentError] when a :timestamp value is neither a String, an
    #   Integer nor a Time
    def cast(value)
      case @type
      when :string then value.to_s
      when :boolean then value == true
      when :integer, :long then value.to_i
      when :timestamp then cast_timestamp(value)
      when :map, :array then value
      end
    end

    # @return [Arrow::Field] field built from the column name and Arrow type
    def to_arrow_field
      Arrow::Field.new(name, arrow_type)
    end

    private

    # Converts a timestamp-like value to an Integer via #to_i (epoch seconds
    # for String and Time inputs).
    # NOTE(review): the :unit option is not consulted here — confirm whether
    # non-second units need scaling.
    def cast_timestamp(value)
      case value
      when String then Time.parse(value).to_i
      when Integer then value
      when Time then value.to_i
      else
        raise ArgumentError, "Unable to cast '#{value}' to timestamp"
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Ordered collection of columns with lookup by name. The first column
  # registered under a given name wins; later duplicates are ignored.
  class ColumnCollection
    include Enumerable

    def initialize
      @columns = []
      @columns_idx = {}
    end

    # Iterates over the columns in insertion order.
    def each(&block)
      @columns.each(&block)
    end

    # Registers a column unless one with the same name already exists.
    #
    # @param column [#name] column to register
    # @return [true] always, whether or not the column was added
    def add(column)
      unless @columns_idx.key?(column.name)
        @columns_idx[column.name] = column
        @columns << column
        # The memoized schema no longer matches the column list.
        @arrow_schema = nil
      end

      true
    end

    # Looks a column up by name. Note that this replaces Enumerable#find.
    #
    # The exact key is tried first; if it is missing, its String form is
    # tried, so symbol names match columns registered under string names.
    #
    # @param name [String, Symbol] column name
    # @return the matching column, or nil when none is registered
    def find(name)
      @columns_idx.fetch(name) { @columns_idx[name.to_s] }
    end

    # @return [Arrow::Schema] schema built from every column's Arrow field,
    #   memoized until the next successful #add
    def arrow_schema
      @arrow_schema ||= Arrow::Schema.new(@columns.collect(&:to_arrow_field))
    end
  end
end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Base class for converters. Subclasses declare their columns and optional
  # per-row transforms with the class-level DSL, then turn an input into an
  # Arrow table that can be serialized.
  class Converter
    # Number of rows grouped into one Arrow array of each column.
    ROWS_PER_ARRAY = 100

    # NOTE(review): @schema is never assigned in this class, so this reader
    # always returns nil; kept for backward compatibility.
    attr_reader :schema

    # Builds an anonymous subclass configured by the given block.
    #
    # @return [Class] a new subclass of the receiver
    def self.inline(&block)
      Class.new(self, &block)
    end

    # @return [Parqueteur::ColumnCollection] columns declared on this class
    def self.columns
      @columns ||= Parqueteur::ColumnCollection.new
    end

    # Declares a column.
    #
    # @param name [#to_s] column name
    # @param type [Symbol] logical type
    # @param options [Hash] type options
    def self.column(name, type, options = {})
      columns.add(Parqueteur::Column.new(name, type, options))
    end

    # @return [Array<Symbol, Proc>] transforms declared on this class, in
    #   declaration order
    def self.transforms
      @transforms ||= []
    end

    # Registers a per-row transform: either the name of an instance method or
    # a block. Each transform receives the row and must return the row to
    # hand to the next step.
    #
    # method_name is optional so that `transform { |row| ... }` works; when
    # both are given the method name wins, as before.
    #
    # @param method_name [Symbol, nil] instance method to call with the row
    # @raise [ArgumentError] when neither a method name nor a block is given
    def self.transform(method_name = nil, &block)
      callable = method_name || block
      raise ArgumentError, 'transform requires a method name or a block' if callable.nil?

      transforms << callable
    end

    # One-shot conversion.
    #
    # @param input anything accepted by Parqueteur::Input.from
    # @param output see #write; when nil the serialized bytes are returned
    def self.convert(input, output: nil)
      converter = new(input)
      if output.nil?
        converter.to_blob
      else
        converter.write(output)
      end
    end

    # @param input anything accepted by Parqueteur::Input.from
    # @param options [Hash] forwarded to Parqueteur::Input.from
    def initialize(input, options = {})
      @input = Parqueteur::Input.from(input, options)
    end

    # Serializes the converted table.
    #
    # @param output [:io, String, StringIO, IO]
    #   :io       -> same as #to_io
    #   String    -> passed as a path to the Arrow table's #save (no explicit
    #                format is given on this path)
    #   StringIO/IO -> the table is saved as parquet into a buffer, written
    #                to the object, which is then rewound and returned
    # @raise [ArgumentError] for any other kind of output
    def write(output)
      case output
      when :io
        to_io
      when String
        to_arrow_table.save(output)
      when StringIO, IO
        buffer = Arrow::ResizableBuffer.new(0)
        to_arrow_table.save(buffer, format: :parquet)
        output.write(buffer.data.to_s)
        output.rewind
        output
      else
        raise ArgumentError, "unsupported output: #{output.class}, accepted: String (filename), IO, StringIO"
      end
    end

    def to_s
      inspect
    end

    # @return [StringIO] rewound StringIO holding the serialized table
    def to_io
      write(StringIO.new)
    end

    # @return [String] the serialized table
    def to_blob
      write(StringIO.new).read
    end

    # Builds the Arrow table: rows are read in groups of ROWS_PER_ARRAY, each
    # group yields one array per column, and the arrays of a column are
    # combined into one chunked array.
    #
    # @return [Arrow::Table]
    def to_arrow_table
      columns = self.class.columns
      chunks = {}

      @input.each_slice(ROWS_PER_ARRAY) do |items|
        collect_column_values(items).each do |column_name, column_values|
          column = columns.find(column_name)
          (chunks[column_name] ||= []) << Parqueteur::ValueArrayBuilder.build(
            column_values, column.type, column.options
          )
        end
      end

      Arrow::Table.new(
        chunks.transform_values! { |arrays| Arrow::ChunkedArray.new(arrays) }
      )
    end

    private

    # Runs every declared transform on the row, feeding each result to the
    # next transform; Symbol transforms are dispatched to instance methods.
    def apply_transforms(item)
      self.class.transforms.reduce(item) do |current, transform|
        if transform.is_a?(Symbol)
          __send__(transform, current)
        else
          transform.call(current)
        end
      end
    end

    # Pivots rows into a { column name => values } hash; a row that lacks a
    # column's key contributes nil for that column.
    def collect_column_values(items)
      values = self.class.columns.each_with_object({}) do |column, acc|
        acc[column.name] = []
      end

      items.each do |item|
        row = apply_transforms(item)
        values.each do |column_name, bucket|
          bucket << (row.key?(column_name) ? row[column_name] : nil)
        end
      end

      values
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Uniform, enumerable view over the supported row sources: in-memory
  # collections, JSON files and newline-separated strings.
  class Input
    include Enumerable

    # Wraps +arg+ in an Input.
    #
    # @param arg [String, Array, Enumerator, File]
    #   - String naming an existing file: the file is opened for reading
    #   - any other String: split on "\n"; the pieces are yielded as-is (they
    #     are not JSON-parsed)
    #   - Array / Enumerator / File: used directly
    # @param options [Hash] :json_newlines (default true) — when the source
    #   is a File, parse one JSON document per line instead of parsing the
    #   whole file as a single JSON document
    # @return [Parqueteur::Input]
    # @raise [ArgumentError] for any other argument; such arguments used to
    #   be accepted and silently produced an input with no rows
    def self.from(arg, options = {})
      source =
        case arg
        when String
          File.exist?(arg) ? File.new(arg, 'r') : arg.split("\n")
        when Array, Enumerator, File
          arg
        else
          raise ArgumentError,
                "unsupported input: #{arg.class}, accepted: String, Array, Enumerator, File"
        end

      new(source, options)
    end

    def initialize(source, options = {})
      @source = source
      @options = options
    end

    # Yields every row of the source. File sources are rewound afterwards so
    # the input can be enumerated again.
    #
    # @return [Enumerator] when no block is given
    def each(&block)
      return enum_for(:each) unless block

      case @source
      when File
        if @options.fetch(:json_newlines, true) == true
          @source.each_line do |line|
            stripped = line.strip
            # Blank lines (e.g. a trailing newline) are not JSON documents.
            next if stripped.empty?

            yield(JSON.parse(stripped))
          end
        else
          JSON.parse(@source.read).each(&block)
        end
        @source.rewind
      when Array, Enumerator
        @source.each(&block)
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Maps the gem's logical type symbols (plus their options) to Arrow data
  # type instances.
  class TypeResolver
    # Convenience wrapper around TypeResolver.new.resolve.
    def self.resolve(*args)
      new.resolve(*args)
    end

    # @param type [Symbol] one of :array, :boolean, :integer, :long,
    #   :timestamp, :string, :map
    # @param options [Hash] type options:
    #   - :array     requires :elements (a type symbol, or a Hash with :type
    #                 and that type's own options)
    #   - :integer / :long  optional :unsigned (default false)
    #   - :timestamp optional :unit (default :second)
    #   - :map       requires :key (type symbol) and :value (type symbol, or
    #                 a Hash with :type and options)
    # @return the Arrow data type instance for the logical type
    # @raise [KeyError] when a required option is missing
    # @raise [Parqueteur::Error] when the type is unknown
    def resolve(type, options = {})
      case type
      when :array
        Arrow::ListDataType.new(resolve_nested(options.fetch(:elements)))
      when :boolean
        Arrow::BooleanDataType.new
      when :integer
        if options.fetch(:unsigned, false) == true
          Arrow::UInt32DataType.new
        else
          Arrow::Int32DataType.new
        end
      when :long
        if options.fetch(:unsigned, false) == true
          Arrow::UInt64DataType.new
        else
          Arrow::Int64DataType.new
        end
      when :timestamp
        Arrow::TimestampDataType.new(options.fetch(:unit, :second))
      when :string
        Arrow::StringDataType.new
      when :map
        map_value = options.fetch(:value)
        Arrow::MapDataType.new(
          resolve(options.fetch(:key)),
          resolve_nested(map_value)
        )
      else
        raise Error, "unknown type: #{type}"
      end
    end

    private

    # Resolves a nested type given either as a bare type symbol or as a Hash
    # carrying :type together with that type's options.
    def resolve_nested(spec)
      if spec.is_a?(Hash)
        resolve(spec.fetch(:type), spec)
      else
        resolve(spec)
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Builds the Arrow array holding one column's values for a group of rows.
  class ValueArrayBuilder
    attr_reader :type, :options, :arrow_type

    # Shortcut for new(type, options).build(input).
    def self.build(input, type, options)
      new(type, options).build(input)
    end

    # @param type [Symbol] logical column type
    # @param options [Hash] type options (forwarded to the type resolver)
    def initialize(type, options)
      @type = type
      @options = options
      @arrow_type = Parqueteur::TypeResolver.resolve(type, options)
    end

    # @param input [Array, nil] the column values
    # @return the Arrow array for the values, or nil when input is nil
    # @raise [Parqueteur::Error] when the type is unknown
    def build(input)
      return if input.nil?

      case type
      when :array
        Arrow::ListArrayBuilder.build(arrow_type, input)
      when :map
        build_map(input)
      when :boolean
        Arrow::BooleanArray.new(input)
      when :integer
        (unsigned? ? Arrow::UInt32Array : Arrow::Int32Array).new(input)
      when :long
        (unsigned? ? Arrow::UInt64Array : Arrow::Int64Array).new(input)
      when :string
        Arrow::StringArray.new(input)
      when :timestamp
        # A timestamp array cannot be built from values alone: the builder
        # needs the unit / data type, which is carried by the resolved type.
        Arrow::TimestampArrayBuilder.build(arrow_type, input)
      else
        raise Error, "unknown type: #{type}"
      end
    end

    private

    # @return [Boolean] true only when the :unsigned option is exactly true
    def unsigned?
      options.fetch(:unsigned, false) == true
    end

    # Appends one map slot per entry, then that entry's key/value pairs.
    # NOTE(review): a nil entry still gets a slot appended before being
    # skipped, so it presumably ends up as an empty map rather than a null —
    # confirm this is intended.
    def build_map(input)
      builder = Arrow::MapArrayBuilder.new(arrow_type)
      input.each do |entry|
        builder.append_value
        next if entry.nil?

        entry.each do |key, value|
          builder.key_builder.append(key)
          builder.item_builder.append(value)
        end
      end

      builder.finish
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parqueteur
4
+ VERSION = '1.0.2'
5
+ end
data/lib/parqueteur.rb ADDED
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "parqueteur/version"
4
+ require 'parqueteur/type_resolver'
5
+ require 'parqueteur/column'
6
+ require 'parqueteur/column_collection'
7
+ require 'parqueteur/converter'
8
+ require 'parqueteur/chunked_converter'
9
+ require 'parqueteur/input'
10
+ require 'parqueteur/value_array_builder'
11
+ require 'json'
12
+ require 'parquet'
13
+
14
+ module Parqueteur
15
+ class Error < StandardError; end
16
+ # Your code goes here...
17
+ end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/parqueteur/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "parqueteur"
7
+ spec.version = Parqueteur::VERSION
8
+ spec.authors = ["Julien D."]
9
+ spec.email = ["julien@pocketsizesun.com"]
10
+ spec.license = 'Apache-2.0'
11
+ spec.summary = 'Parqueteur - A Ruby gem that convert JSON to Parquet'
12
+ spec.description = 'Convert JSON to Parquet'
13
+ spec.homepage = 'https://github.com/pocketsizesun/parqueteur-ruby'
14
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
15
+
16
+ spec.metadata["allowed_push_host"] = "https://rubygems.org"
17
+
18
+ # spec.metadata["homepage_uri"] = spec.homepage
19
+ # spec.metadata["source_code_uri"] = "Put your gem's public repo URL here."
20
+ # spec.metadata["changelog_uri"] = "Put your gem's CHANGELOG.md URL here."
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
24
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
25
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
26
+ end
27
+ spec.bindir = "exe"
28
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ["lib"]
30
+
31
+ # Uncomment to register a new dependency of your gem
32
+ # spec.add_dependency "example-gem", "~> 1.0"
33
+ spec.add_dependency 'red-parquet', '~> 5.0'
34
+
35
+ # For more information and examples about making a new gem, checkout our
36
+ # guide at: https://bundler.io/guides/creating_gem.html
37
+ end
data/test.json ADDED
@@ -0,0 +1 @@
1
+ [{"id":1,"reference":"coucou","hash":{"a":"b"},"valid":true,"hash2":{},"numbers":[1,2,3],"map_array":[]},{"id":2,"reference":"coucou","hash":{"c":"d"},"valid":false,"hash2":{},"numbers":[4,5,6],"map_array":[]},{"id":3,"reference":"coucou","hash":{"e":"f"},"valid":true,"hash2":{"x":[1,2,3]},"numbers":[7,8,9],"map_array":[{"x":"y"}]}]
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parqueteur
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Julien D.
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-10-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: red-parquet
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '5.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '5.0'
27
+ description: Convert JSON to Parquet
28
+ email:
29
+ - julien@pocketsizesun.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - ".gitignore"
35
+ - Gemfile
36
+ - Gemfile.lock
37
+ - README.md
38
+ - Rakefile
39
+ - bin/console
40
+ - bin/setup
41
+ - example.rb
42
+ - lib/parqueteur.rb
43
+ - lib/parqueteur/chunked_converter.rb
44
+ - lib/parqueteur/column.rb
45
+ - lib/parqueteur/column_collection.rb
46
+ - lib/parqueteur/converter.rb
47
+ - lib/parqueteur/input.rb
48
+ - lib/parqueteur/type_resolver.rb
49
+ - lib/parqueteur/value_array_builder.rb
50
+ - lib/parqueteur/version.rb
51
+ - parqueteur.gemspec
52
+ - test.json
53
+ homepage: https://github.com/pocketsizesun/parqueteur-ruby
54
+ licenses:
55
+ - Apache-2.0
56
+ metadata:
57
+ allowed_push_host: https://rubygems.org
58
+ post_install_message:
59
+ rdoc_options: []
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: 2.3.0
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubygems_version: 3.2.3
74
+ signing_key:
75
+ specification_version: 4
76
+ summary: Parqueteur - A Ruby gem that convert JSON to Parquet
77
+ test_files: []