parqueteur 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: b60aefc8e90564af2abeeb01a36aaaabe8cf08c65efeebdbf6f034372c8100c1
4
+ data.tar.gz: b4f93aeac25321e50fdf8d2a8c8e0216feb4ec68d0ac356d2ed98118820c61a4
5
+ SHA512:
6
+ metadata.gz: 1dbb0d1a870ff2291909014f595a6c63a35c11c80adc4e1972e20852ac87ba20447ce31dee9feb2f11dcea1a2ec5276ad4a05ca3eb406ab26a2b606ea369b809
7
+ data.tar.gz: ef4dec6ca81564972112468dc1729053c9c03bdc61a947a01990c1a870dc45e8136561553390fbfbb1b49d22ca1f8221a94cb8403a7ecfc6bf18f10ba07acd9b
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in parqueteur.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
data/Gemfile.lock ADDED
@@ -0,0 +1,39 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ parqueteur (1.0.2)
5
+ red-parquet (~> 5.0)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ bigdecimal (3.0.0)
11
+ extpp (0.0.9)
12
+ gio2 (3.4.6)
13
+ gobject-introspection (= 3.4.6)
14
+ glib2 (3.4.6)
15
+ native-package-installer (>= 1.0.3)
16
+ pkg-config (>= 1.3.5)
17
+ gobject-introspection (3.4.6)
18
+ glib2 (= 3.4.6)
19
+ native-package-installer (1.1.1)
20
+ pkg-config (1.4.6)
21
+ rake (13.0.6)
22
+ red-arrow (5.0.0)
23
+ bigdecimal (>= 2.0.3)
24
+ extpp (>= 0.0.7)
25
+ gio2 (>= 3.4.5)
26
+ native-package-installer
27
+ pkg-config
28
+ red-parquet (5.0.0)
29
+ red-arrow (= 5.0.0)
30
+
31
+ PLATFORMS
32
+ ruby
33
+
34
+ DEPENDENCIES
35
+ parqueteur!
36
+ rake (~> 13.0)
37
+
38
+ BUNDLED WITH
39
+ 2.2.3
data/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # Parqueteur
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/parqueteur`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'parqueteur'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle install
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install parqueteur
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/pocketsizesun/parqueteur-ruby.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ task default: %i[]
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "parqueteur"
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require "irb"
15
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/example.rb ADDED
@@ -0,0 +1,20 @@
1
+ require 'bundler/setup'
2
+ require 'parqueteur'
3
+
4
# Example converter: declares the Parquet columns written for each record.
class Foo < Parqueteur::Converter
  column :id, :long
  column :reference, :string
  column :hash, :map, key: :string, value: :string
  column :valid, :boolean
  column :total, :integer
end

LETTERS = ('a'..'z').to_a

# Build 1000 sample records keyed by column name (string keys).
data = Array.new(1000) do |i|
  {
    'id' => i + 1,
    'reference' => "coucou:#{i}",
    'hash' => { 'a' => LETTERS.sample },
    'valid' => rand < 0.5,
    'total' => rand(100..500)
  }
end

# Convert the records in chunks and write one file per chunk with the
# 'test' prefix, then print whatever write_files returns.
chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
pp chunked_converter.write_files('test')
# puts Foo.convert(data, output: 'test.parquet')
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Splits an input into slices of at most +chunk_size+ items and converts
  # each slice independently with the given converter class.
  class ChunkedConverter
    # NOTE(review): @schema is never assigned in this class, so this reader
    # always returns nil. Kept so the public interface is unchanged.
    attr_reader :schema

    # @param input [Object] anything accepted by Parqueteur::Input.from
    # @param converter [Class] class instantiated with one slice of items;
    #   its instances must respond to #to_io
    # @param chunk_size [Integer] maximum number of items per chunk
    def initialize(input, converter, chunk_size = 200)
      @input = Parqueteur::Input.from(input)
      @converter = converter
      @chunk_size = chunk_size
    end

    # Lazily converts the input slice by slice.
    #
    # @return [Enumerator] yields the #to_io result of a fresh converter
    #   instance for each slice
    def chunks
      Enumerator.new do |yielder|
        @input.each_slice(@chunk_size) do |slice|
          yielder << @converter.new(slice).to_io
        end
      end
    end

    # Writes every chunk to "<prefix>.<index>.parquet" (index starts at 0).
    #
    # The payload is written in binary mode: it is binary data, and a
    # text-mode write may apply newline/encoding conversion on some platforms.
    #
    # @param prefix [String] path prefix of the generated files
    # @return [Enumerator] the chunk enumerator that was iterated
    def write_files(prefix)
      chunks.each_with_index do |chunk, idx|
        File.binwrite("#{prefix}.#{idx}.parquet", chunk.read)
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
require 'time' # Time.parse is provided by the 'time' standard library

module Parqueteur
  # Describes one output column: its name, its logical type symbol and the
  # type options passed on to the type resolver.
  class Column
    attr_reader :name, :type, :options

    # @param name [#to_s] column name (stored as a String)
    # @param type [Symbol] logical type, e.g. :string, :long, :map
    # @param options [Hash] type-specific options
    def initialize(name, type, options = {})
      @name = name.to_s
      @type = type
      @options = options
    end

    # @return [Object] the resolved (and memoized) Arrow data type
    def arrow_type
      @arrow_type ||= Parqueteur::TypeResolver.resolve(@type, @options)
    end

    # Coerces a raw value according to the column type.
    #
    # :map and :array values are passed through untouched. Types not listed
    # here yield nil, as before.
    #
    # @param value [Object] raw value
    # @return [Object, nil] the coerced value
    # @raise [ArgumentError] for a :timestamp value that is neither a String
    #   nor an Integer
    def cast(value)
      case @type
      when :string then value.to_s
      when :boolean then value == true
      when :integer, :long then value.to_i
      when :timestamp then cast_timestamp(value)
      when :map, :array then value
      end
    end

    # @return [Arrow::Field] field built from the column name and Arrow type
    def to_arrow_field
      Arrow::Field.new(name, arrow_type)
    end

    private

    # Strings are parsed with Time.parse and converted with Time#to_i;
    # Integers are returned as given.
    def cast_timestamp(value)
      case value
      when String then Time.parse(value).to_i
      when Integer then value
      else
        raise ArgumentError, "Unable to cast '#{value}' to timestamp"
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Ordered, name-indexed set of columns.
  class ColumnCollection
    include Enumerable

    def initialize
      @columns = []
      @columns_idx = {}
    end

    # Iterates over the columns in insertion order.
    def each(&block)
      @columns.each(&block)
    end

    # Registers a column unless one with the same name is already present
    # (later duplicates are ignored).
    #
    # @param column [#name] column to register
    # @return [true]
    def add(column)
      key = column.name.to_s
      unless @columns_idx.key?(key)
        @columns_idx[key] = column
        @columns << column
        # The memoized schema no longer reflects the column list.
        @arrow_schema = nil
      end

      true
    end

    # Looks a column up by name. Names are compared as Strings, so Symbols
    # are accepted too. When a block is given the call behaves like
    # Enumerable#find over the columns.
    #
    # @param name [#to_s, nil] column name
    # @return [Object, nil] the matching column, or nil
    def find(name = nil, &block)
      return @columns.find(&block) if block

      @columns_idx[name.to_s]
    end

    # @return [Arrow::Schema] schema built from every column's arrow field;
    #   memoized until the next successful #add
    def arrow_schema
      @arrow_schema ||= Arrow::Schema.new(@columns.map(&:to_arrow_field))
    end
  end
end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
require 'stringio' # StringIO is used by #write, #to_io and #to_blob

module Parqueteur
  # Base class for converters. Subclasses declare their columns (and optional
  # per-item transforms) with the class-level DSL, then turn an input into an
  # Arrow table / Parquet payload.
  class Converter
    # NOTE(review): @schema is never assigned in this class, so this reader
    # always returns nil. Kept so the public interface is unchanged.
    attr_reader :schema

    # Builds an anonymous subclass; the block is evaluated as its class body.
    def self.inline(&block)
      Class.new(self, &block)
    end

    # @return [Parqueteur::ColumnCollection] columns declared on this class
    def self.columns
      @columns ||= Parqueteur::ColumnCollection.new
    end

    # Declares a column.
    #
    # @param name [#to_s] column name
    # @param type [Symbol] logical type
    # @param options [Hash] type options
    def self.column(name, type, options = {})
      columns.add(Parqueteur::Column.new(name, type, options))
    end

    # @return [Array] transforms declared on this class, in declaration order
    def self.transforms
      @transforms ||= []
    end

    # Registers a per-item transform, given either as the name of an instance
    # method or as a block. The name takes precedence when both are given.
    #
    # @param method_name [Symbol, String, nil] instance method to call
    # @raise [ArgumentError] when neither a name nor a block is given
    def self.transform(method_name = nil, &block)
      callable = method_name || block
      raise ArgumentError, 'transform requires a method name or a block' if callable.nil?

      transforms << callable
    end

    # Converts +input+ in one call.
    #
    # @param input [Object] anything accepted by Parqueteur::Input.from
    # @param output [Object, nil] target passed to #write; when nil the
    #   payload is returned as a String (see #to_blob)
    def self.convert(input, output: nil)
      converter = new(input)
      output.nil? ? converter.to_blob : converter.write(output)
    end

    # @param input [Object] anything accepted by Parqueteur::Input.from
    # @param options [Hash] options forwarded to Parqueteur::Input.from
    def initialize(input, options = {})
      @input = Parqueteur::Input.from(input, options)
    end

    # Writes the converted table to +output+.
    #
    # @param output [:io, String, StringIO, IO] :io returns a new StringIO;
    #   a String is handed to Arrow::Table#save as the destination; an IO
    #   receives the Parquet bytes and is rewound
    # @return [Object] the IO for IO-like targets; otherwise the result of
    #   Arrow::Table#save
    # @raise [ArgumentError] for any other kind of output
    def write(output)
      case output
      when :io
        to_io
      when String
        to_arrow_table.save(output)
      when StringIO, IO
        buffer = Arrow::ResizableBuffer.new(0)
        to_arrow_table.save(buffer, format: :parquet)
        output.write(buffer.data.to_s)
        output.rewind
        output
      else
        raise ArgumentError, "unsupported output: #{output.class}, accepted: String (filename), IO, StringIO"
      end
    end

    def to_s
      inspect
    end

    # @return [StringIO] rewound in-memory IO holding the Parquet payload
    def to_io
      write(StringIO.new)
    end

    # @return [String] the Parquet payload
    def to_blob
      write(StringIO.new).read
    end

    # Builds the Arrow table. Items are processed in slices of 100; each
    # slice contributes one array per declared column, and the arrays of a
    # column are combined into one chunked array.
    #
    # @return [Arrow::Table]
    def to_arrow_table
      columns = self.class.columns
      chunks = {}

      @input.each_slice(100) do |items|
        values = columns.each_with_object({}) { |column, hash| hash[column.name] = [] }

        items.each do |raw_item|
          item = apply_transforms(raw_item)
          # Keys absent from the item are recorded as nil.
          values.each do |name, list|
            list << (item.key?(name) ? item[name] : nil)
          end
        end

        values.each do |name, list|
          column = columns.find(name)
          (chunks[name] ||= []) <<
            Parqueteur::ValueArrayBuilder.build(list, column.type, column.options)
        end
      end

      Arrow::Table.new(
        chunks.transform_values! { |arrays| Arrow::ChunkedArray.new(arrays) }
      )
    end

    private

    # Runs every declared transform in order, feeding each one the result of
    # the previous one. Symbols/Strings are sent to this instance; anything
    # else is invoked with #call.
    def apply_transforms(item)
      self.class.transforms.reduce(item) do |current, transform|
        case transform
        when Symbol, String then __send__(transform, current)
        else transform.call(current)
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Enumerable wrapper around the supported input sources.
  class Input
    include Enumerable

    # Builds an Input from a raw argument.
    #
    # A String naming an existing file is opened for reading; any other
    # String is split on newlines into an Array of lines.
    # NOTE(review): those lines are yielded by #each as plain Strings (they
    # are not JSON-parsed here) - confirm this is what callers expect.
    #
    # @param arg [String, Array, Enumerator, File] input source
    # @param options [Hash] options stored on the instance (see #each)
    # @return [Parqueteur::Input]
    # @raise [ArgumentError] when +arg+ is of an unsupported type
    def self.from(arg, options = {})
      source =
        case arg
        when String
          File.exist?(arg) ? File.new(arg, 'r') : arg.split("\n")
        when Array, Enumerator, File
          arg
        else
          raise ArgumentError,
                "unsupported input: #{arg.class}, accepted: String, Array, Enumerator, File"
        end

      new(source, options)
    end

    # @param source [File, Array, Enumerator] underlying source
    # @param options [Hash] :json_newlines (default true) selects
    #   one-JSON-document-per-line parsing for File sources
    def initialize(source, options = {})
      @source = source
      @options = options
    end

    # Yields each item of the source.
    #
    # File sources are JSON-parsed: line by line when :json_newlines is true
    # (blank lines are skipped), otherwise as a single JSON document whose
    # elements are yielded. The file is rewound afterwards, even when the
    # iteration is interrupted, so that the next pass starts from the top.
    #
    # @return [Enumerator] when called without a block
    def each(&block)
      return enum_for(:each) unless block

      case @source
      when File
        begin
          if @options.fetch(:json_newlines, true) == true
            @source.each_line do |line|
              payload = line.strip
              next if payload.empty?

              yield(JSON.parse(payload))
            end
          else
            JSON.parse(@source.read).each(&block)
          end
        ensure
          @source.rewind
        end
      when Array, Enumerator
        @source.each(&block)
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Maps a logical type symbol (plus options) to an Arrow data type object.
  class TypeResolver
    # Convenience wrapper around #resolve on a fresh instance.
    def self.resolve(*args)
      new.resolve(*args)
    end

    # @param type [Symbol] one of :array, :boolean, :integer, :long,
    #   :timestamp, :string, :map
    # @param options [Hash] type options:
    #   - :array requires :elements (a type symbol, or a Hash with :type plus
    #     that type's own options)
    #   - :integer / :long honour unsigned: true
    #   - :timestamp honours :unit (default :second)
    #   - :map requires :key (type symbol) and :value (same shape as :elements)
    # @return [Object] a new Arrow data type instance
    # @raise [KeyError] when a required option is missing
    # @raise [Parqueteur::Error] when +type+ is unknown
    def resolve(type, options = {})
      case type
      when :array
        Arrow::ListDataType.new(resolve_nested(options.fetch(:elements)))
      when :boolean
        Arrow::BooleanDataType.new
      when :integer
        unsigned?(options) ? Arrow::UInt32DataType.new : Arrow::Int32DataType.new
      when :long
        unsigned?(options) ? Arrow::UInt64DataType.new : Arrow::Int64DataType.new
      when :timestamp
        Arrow::TimestampDataType.new(options.fetch(:unit, :second))
      when :string
        Arrow::StringDataType.new
      when :map
        value_spec = options.fetch(:value)
        Arrow::MapDataType.new(
          resolve(options.fetch(:key)),
          resolve_nested(value_spec)
        )
      else
        raise Error, "unknown type: #{type}"
      end
    end

    private

    # Only a literal `true` selects the unsigned variant.
    def unsigned?(options)
      options.fetch(:unsigned, false) == true
    end

    # Resolves a nested type given either as a bare type symbol or as a Hash
    # carrying :type along with the options of that type.
    def resolve_nested(spec)
      spec.is_a?(Hash) ? resolve(spec.fetch(:type), spec) : resolve(spec)
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  # Builds the Arrow array holding the values of one column.
  class ValueArrayBuilder
    attr_reader :type, :options, :arrow_type

    # Shortcut for new(type, options).build(input).
    def self.build(input, type, options)
      new(type, options).build(input)
    end

    # @param type [Symbol] logical column type
    # @param options [Hash] type options (forwarded to the type resolver)
    def initialize(type, options)
      @type = type
      @options = options
      @arrow_type = Parqueteur::TypeResolver.resolve(type, options)
    end

    # @param input [Array, nil] the column values
    # @return [Object, nil] the built Arrow array, or nil when +input+ is nil
    # @raise [Parqueteur::Error] when the type is unknown
    def build(input)
      return if input.nil?

      case type
      when :array
        Arrow::ListArrayBuilder.build(arrow_type, input)
      when :map
        build_map(input)
      when :boolean
        Arrow::BooleanArray.new(input)
      when :integer
        unsigned? ? Arrow::UInt32Array.new(input) : Arrow::Int32Array.new(input)
      when :long
        unsigned? ? Arrow::UInt64Array.new(input) : Arrow::Int64Array.new(input)
      when :string
        Arrow::StringArray.new(input)
      when :timestamp
        # A timestamp array cannot be built from values alone: the resolved
        # data type (which carries the unit) has to be supplied as well.
        Arrow::TimestampArray.new(arrow_type, input)
      else
        raise Error, "unknown type: #{type}"
      end
    end

    private

    # Only a literal `true` selects the unsigned variant.
    def unsigned?
      options.fetch(:unsigned, false) == true
    end

    # Appends one map per entry, then each key/value pair of that entry.
    # NOTE(review): a nil entry still gets an append_value call with no
    # pairs, i.e. it appears to be stored as an empty map rather than a
    # null - confirm that this is intended.
    def build_map(input)
      builder = Arrow::MapArrayBuilder.new(arrow_type)
      input.each do |entry|
        builder.append_value
        next if entry.nil?

        entry.each do |key, value|
          builder.key_builder.append(key)
          builder.item_builder.append(value)
        end
      end

      builder.finish
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parqueteur
4
+ VERSION = '1.0.2'
5
+ end
data/lib/parqueteur.rb ADDED
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "parqueteur/version"
4
+ require 'parqueteur/type_resolver'
5
+ require 'parqueteur/column'
6
+ require 'parqueteur/column_collection'
7
+ require 'parqueteur/converter'
8
+ require 'parqueteur/chunked_converter'
9
+ require 'parqueteur/input'
10
+ require 'parqueteur/value_array_builder'
11
+ require 'json'
12
+ require 'parquet'
13
+
14
+ module Parqueteur
15
+ class Error < StandardError; end
16
+ # Your code goes here...
17
+ end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/parqueteur/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "parqueteur"
7
+ spec.version = Parqueteur::VERSION
8
+ spec.authors = ["Julien D."]
9
+ spec.email = ["julien@pocketsizesun.com"]
10
+ spec.license = 'Apache-2.0'
11
+ spec.summary = 'Parqueteur - A Ruby gem that convert JSON to Parquet'
12
+ spec.description = 'Convert JSON to Parquet'
13
+ spec.homepage = 'https://github.com/pocketsizesun/parqueteur-ruby'
14
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
15
+
16
+ spec.metadata["allowed_push_host"] = "https://rubygems.org"
17
+
18
+ # spec.metadata["homepage_uri"] = spec.homepage
19
+ # spec.metadata["source_code_uri"] = "Put your gem's public repo URL here."
20
+ # spec.metadata["changelog_uri"] = "Put your gem's CHANGELOG.md URL here."
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
24
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
25
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
26
+ end
27
+ spec.bindir = "exe"
28
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ["lib"]
30
+
31
+ # Uncomment to register a new dependency of your gem
32
+ # spec.add_dependency "example-gem", "~> 1.0"
33
+ spec.add_dependency 'red-parquet', '~> 5.0'
34
+
35
+ # For more information and examples about making a new gem, checkout our
36
+ # guide at: https://bundler.io/guides/creating_gem.html
37
+ end
data/test.json ADDED
@@ -0,0 +1 @@
1
+ [{"id":1,"reference":"coucou","hash":{"a":"b"},"valid":true,"hash2":{},"numbers":[1,2,3],"map_array":[]},{"id":2,"reference":"coucou","hash":{"c":"d"},"valid":false,"hash2":{},"numbers":[4,5,6],"map_array":[]},{"id":3,"reference":"coucou","hash":{"e":"f"},"valid":true,"hash2":{"x":[1,2,3]},"numbers":[7,8,9],"map_array":[{"x":"y"}]}]
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parqueteur
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Julien D.
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-10-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: red-parquet
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '5.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '5.0'
27
+ description: Convert JSON to Parquet
28
+ email:
29
+ - julien@pocketsizesun.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - ".gitignore"
35
+ - Gemfile
36
+ - Gemfile.lock
37
+ - README.md
38
+ - Rakefile
39
+ - bin/console
40
+ - bin/setup
41
+ - example.rb
42
+ - lib/parqueteur.rb
43
+ - lib/parqueteur/chunked_converter.rb
44
+ - lib/parqueteur/column.rb
45
+ - lib/parqueteur/column_collection.rb
46
+ - lib/parqueteur/converter.rb
47
+ - lib/parqueteur/input.rb
48
+ - lib/parqueteur/type_resolver.rb
49
+ - lib/parqueteur/value_array_builder.rb
50
+ - lib/parqueteur/version.rb
51
+ - parqueteur.gemspec
52
+ - test.json
53
+ homepage: https://github.com/pocketsizesun/parqueteur-ruby
54
+ licenses:
55
+ - Apache-2.0
56
+ metadata:
57
+ allowed_push_host: https://rubygems.org
58
+ post_install_message:
59
+ rdoc_options: []
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: 2.3.0
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubygems_version: 3.2.3
74
+ signing_key:
75
+ specification_version: 4
76
+ summary: Parqueteur - A Ruby gem that convert JSON to Parquet
77
+ test_files: []