parqueteur 1.0.3 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/Gemfile.lock +1 -1
  4. data/README.md +43 -8
  5. data/examples/convert-and-compression.rb +56 -0
  6. data/examples/convert-methods.rb +54 -0
  7. data/examples/convert-to-io.rb +52 -0
  8. data/examples/convert-with-chunks.rb +54 -0
  9. data/examples/convert-without-compression.rb +52 -0
  10. data/examples/hello-world.rb +56 -0
  11. data/lib/parqueteur/column.rb +3 -20
  12. data/lib/parqueteur/column_collection.rb +8 -0
  13. data/lib/parqueteur/converter.rb +110 -67
  14. data/lib/parqueteur/input.rb +12 -27
  15. data/lib/parqueteur/struct.rb +25 -0
  16. data/lib/parqueteur/type.rb +21 -0
  17. data/lib/parqueteur/type_resolver.rb +44 -48
  18. data/lib/parqueteur/types/array_type.rb +21 -0
  19. data/lib/parqueteur/types/boolean_type.rb +15 -0
  20. data/lib/parqueteur/types/date32_type.rb +15 -0
  21. data/lib/parqueteur/types/date64_type.rb +15 -0
  22. data/lib/parqueteur/types/decimal128_type.rb +18 -0
  23. data/lib/parqueteur/types/decimal256_type.rb +18 -0
  24. data/lib/parqueteur/types/int32_type.rb +23 -0
  25. data/lib/parqueteur/types/int64_type.rb +23 -0
  26. data/lib/parqueteur/types/map_type.rb +36 -0
  27. data/lib/parqueteur/types/string_type.rb +20 -0
  28. data/lib/parqueteur/types/struct_type.rb +35 -0
  29. data/lib/parqueteur/types/time32_type.rb +19 -0
  30. data/lib/parqueteur/types/time64_type.rb +19 -0
  31. data/lib/parqueteur/types/timestamp_type.rb +24 -0
  32. data/lib/parqueteur/version.rb +1 -1
  33. data/lib/parqueteur.rb +24 -7
  34. data/parqueteur.gemspec +2 -2
  35. data/scripts/apache-arrow-ubuntu-install.sh +18 -0
  36. metadata +27 -8
  37. data/example.rb +0 -20
  38. data/lib/parqueteur/chunked_converter.rb +0 -28
  39. data/lib/parqueteur/value_array_builder.rb +0 -59
  40. data/test.json +0 -1
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Time column type built on Arrow::Time32Array and Arrow::Time32DataType.
    #
    # The time unit is read from the +:unit+ option and defaults to +:second+.
    # Both methods read the same option key, so the value array and the
    # declared data type are always built with the same unit.
    class Time32Type < Parqueteur::Type
      # Builds the Arrow array holding this column's values.
      #
      # @param values the raw values, handed to Arrow::Time32Array.new unchanged
      # @return the array created by Arrow::Time32Array.new
      def build_value_array(values)
        Arrow::Time32Array.new(options.fetch(:unit, :second), values)
      end

      # Builds the Arrow data type describing this column.
      #
      # @return the data type created by Arrow::Time32DataType.new
      def arrow_type_builder
        Arrow::Time32DataType.new(options.fetch(:unit, :second))
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Parqueteur
  module Types
    # Time column type built on Arrow::Time64Array and Arrow::Time64DataType.
    #
    # The time unit is read from the +:unit+ option. Both methods read the
    # same option key, so the value array and the declared data type are
    # always built with the same unit.
    class Time64Type < Parqueteur::Type
      # Unit used when the +:unit+ option is absent.
      # NOTE(review): Arrow documents 64-bit time as microsecond or nanosecond
      # resolution only, so :second is not expected to be accepted here;
      # confirm against the Arrow version in use.
      DEFAULT_UNIT = :micro

      # Builds the Arrow array holding this column's values.
      #
      # @param values the raw values, handed to Arrow::Time64Array.new unchanged
      # @return the array created by Arrow::Time64Array.new
      def build_value_array(values)
        Arrow::Time64Array.new(options.fetch(:unit, DEFAULT_UNIT), values)
      end

      # Builds the Arrow data type describing this column.
      #
      # @return the data type created by Arrow::Time64DataType.new
      def arrow_type_builder
        Arrow::Time64DataType.new(options.fetch(:unit, DEFAULT_UNIT))
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ # when :timestamp
4
+ # Arrow::TimestampDataType.new(
5
+ # options.fetch(:unit, :second)
6
+ # )
7
+
8
module Parqueteur
  module Types
    # Timestamp column type built on Arrow::TimestampArray and
    # Arrow::TimestampDataType. The unit comes from the +:unit+ option and
    # falls back to +:second+ when the option is not given.
    class TimestampType < Parqueteur::Type
      # Wraps the given values in an Arrow timestamp array.
      #
      # @param values the raw values, passed through to Arrow unchanged
      # @return the array created by Arrow::TimestampArray.new
      def build_value_array(values)
        unit = options.fetch(:unit, :second)
        Arrow::TimestampArray.new(unit, values)
      end

      # Creates the Arrow data type for this column.
      #
      # @return the data type created by Arrow::TimestampDataType.new
      def arrow_type_builder
        unit = options.fetch(:unit, :second)
        Arrow::TimestampDataType.new(unit)
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Parqueteur
4
- VERSION = '1.0.3'
4
+ VERSION = '1.3.0'
5
5
  end
data/lib/parqueteur.rb CHANGED
@@ -1,17 +1,34 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "parqueteur/version"
4
- require 'parqueteur/type_resolver'
3
+ require 'json'
4
+ require 'singleton'
5
+ require 'tempfile'
6
+ require 'parquet'
7
+
8
+ require_relative 'parqueteur/version'
5
9
  require 'parqueteur/column'
6
10
  require 'parqueteur/column_collection'
7
11
  require 'parqueteur/converter'
8
- require 'parqueteur/chunked_converter'
9
12
  require 'parqueteur/input'
10
- require 'parqueteur/value_array_builder'
11
- require 'json'
12
- require 'parquet'
13
+ require 'parqueteur/struct'
14
+ require 'parqueteur/type'
15
+ require 'parqueteur/type_resolver'
16
+ require 'parqueteur/types/array_type'
17
+ require 'parqueteur/types/boolean_type'
18
+ require 'parqueteur/types/date32_type'
19
+ require 'parqueteur/types/date64_type'
20
+ require 'parqueteur/types/decimal128_type'
21
+ require 'parqueteur/types/decimal256_type'
22
+ require 'parqueteur/types/int32_type'
23
+ require 'parqueteur/types/int64_type'
24
+ require 'parqueteur/types/map_type'
25
+ require 'parqueteur/types/string_type'
26
+ require 'parqueteur/types/struct_type'
27
+ require 'parqueteur/types/time32_type'
28
+ require 'parqueteur/types/time64_type'
29
+ require 'parqueteur/types/timestamp_type'
13
30
 
14
31
  module Parqueteur
15
32
  class Error < StandardError; end
16
- # Your code goes here...
33
+ class TypeNotFound < Error; end
17
34
  end
data/parqueteur.gemspec CHANGED
@@ -8,8 +8,8 @@ Gem::Specification.new do |spec|
8
8
  spec.authors = ["Julien D."]
9
9
  spec.email = ["julien@pocketsizesun.com"]
10
10
  spec.license = 'Apache-2.0'
11
- spec.summary = 'Parqueteur - A Ruby gem that convert JSON to Parquet'
12
- spec.description = 'Convert JSON to Parquet'
11
+ spec.summary = 'Parqueteur - A Ruby gem that convert data to Parquet'
12
+ spec.description = 'Convert data to Parquet'
13
13
  spec.homepage = 'https://github.com/pocketsizesun/parqueteur-ruby'
14
14
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
15
15
 
@@ -0,0 +1,18 @@
1
#!/bin/sh
# Installs the Apache Arrow APT source package, then the Arrow and Parquet
# development packages, using apt-get. Does nothing when dpkg already reports
# apache-arrow-apt-source as installed.

# Skip the whole installation when the APT source package is already present.
if [ "$(dpkg-query -W -f='${Status}' apache-arrow-apt-source 2>/dev/null | grep -c "ok installed")" -eq 1 ]
then
  exit 0
fi

apt-get update
apt-get install -y -V ca-certificates lsb-release wget

# Query the distribution only now: lsb_release is provided by the lsb-release
# package installed just above, so asking earlier can yield empty values.
LSB_RELEASE_ID_SHORT=$(lsb_release --id --short | tr 'A-Z' 'a-z')
LSB_RELEASE_CODENAME_SHORT=$(lsb_release --codename --short)

# One file name shared by the download, install and cleanup steps.
APT_SOURCE_DEB="apache-arrow-apt-source-latest-${LSB_RELEASE_CODENAME_SHORT}.deb"

wget "https://apache.jfrog.io/artifactory/arrow/${LSB_RELEASE_ID_SHORT}/${APT_SOURCE_DEB}"
apt-get install -y -V "./${APT_SOURCE_DEB}"
rm "./${APT_SOURCE_DEB}"
apt-get update
apt-get install -y libgirepository1.0-dev libarrow-dev libarrow-glib-dev libparquet-dev libparquet-glib-dev

exit 0
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parqueteur
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Julien D.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-10-02 00:00:00.000000000 Z
11
+ date: 2021-10-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: red-parquet
@@ -24,7 +24,7 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '5.0'
27
- description: Convert JSON to Parquet
27
+ description: Convert data to Parquet
28
28
  email:
29
29
  - julien@pocketsizesun.com
30
30
  executables: []
@@ -38,18 +38,37 @@ files:
38
38
  - Rakefile
39
39
  - bin/console
40
40
  - bin/setup
41
- - example.rb
41
+ - examples/convert-and-compression.rb
42
+ - examples/convert-methods.rb
43
+ - examples/convert-to-io.rb
44
+ - examples/convert-with-chunks.rb
45
+ - examples/convert-without-compression.rb
46
+ - examples/hello-world.rb
42
47
  - lib/parqueteur.rb
43
- - lib/parqueteur/chunked_converter.rb
44
48
  - lib/parqueteur/column.rb
45
49
  - lib/parqueteur/column_collection.rb
46
50
  - lib/parqueteur/converter.rb
47
51
  - lib/parqueteur/input.rb
52
+ - lib/parqueteur/struct.rb
53
+ - lib/parqueteur/type.rb
48
54
  - lib/parqueteur/type_resolver.rb
49
- - lib/parqueteur/value_array_builder.rb
55
+ - lib/parqueteur/types/array_type.rb
56
+ - lib/parqueteur/types/boolean_type.rb
57
+ - lib/parqueteur/types/date32_type.rb
58
+ - lib/parqueteur/types/date64_type.rb
59
+ - lib/parqueteur/types/decimal128_type.rb
60
+ - lib/parqueteur/types/decimal256_type.rb
61
+ - lib/parqueteur/types/int32_type.rb
62
+ - lib/parqueteur/types/int64_type.rb
63
+ - lib/parqueteur/types/map_type.rb
64
+ - lib/parqueteur/types/string_type.rb
65
+ - lib/parqueteur/types/struct_type.rb
66
+ - lib/parqueteur/types/time32_type.rb
67
+ - lib/parqueteur/types/time64_type.rb
68
+ - lib/parqueteur/types/timestamp_type.rb
50
69
  - lib/parqueteur/version.rb
51
70
  - parqueteur.gemspec
52
- - test.json
71
+ - scripts/apache-arrow-ubuntu-install.sh
53
72
  homepage: https://github.com/pocketsizesun/parqueteur-ruby
54
73
  licenses:
55
74
  - Apache-2.0
@@ -73,5 +92,5 @@ requirements: []
73
92
  rubygems_version: 3.2.3
74
93
  signing_key:
75
94
  specification_version: 4
76
- summary: Parqueteur - A Ruby gem that convert JSON to Parquet
95
+ summary: Parqueteur - A Ruby gem that convert data to Parquet
77
96
  test_files: []
data/example.rb DELETED
@@ -1,20 +0,0 @@
1
- require 'bundler/setup'
2
- require 'parqueteur'
3
-
4
- class Foo < Parqueteur::Converter
5
- column :id, :long
6
- column :reference, :string
7
- column :hash, :map, key: :string, value: :string
8
- column :valid, :boolean
9
- column :total, :integer
10
- end
11
-
12
- LETTERS = ('a'..'z').to_a
13
-
14
- data = 1000.times.collect do |i|
15
- { 'id' => i + 1, 'reference' => "coucou:#{i}", 'hash' => { 'a' => LETTERS.sample }, 'valid' => rand < 0.5, 'total' => rand(100..500) }
16
- end
17
-
18
- chunked_converter = Parqueteur::ChunkedConverter.new(data, Foo)
19
- pp chunked_converter.write_files('test')
20
- # puts Foo.convert(data, output: 'test.parquet')
@@ -1,28 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Parqueteur
4
- class ChunkedConverter
5
- attr_reader :schema
6
-
7
- def initialize(input, converter, chunk_size = 200)
8
- @input = Parqueteur::Input.from(input)
9
- @converter = converter
10
- @chunk_size = chunk_size
11
- end
12
-
13
- def chunks
14
- Enumerator.new do |arr|
15
- @input.each_slice(@chunk_size) do |chunk|
16
- local_converter = @converter.new(chunk)
17
- arr << local_converter.to_io
18
- end
19
- end
20
- end
21
-
22
- def write_files(prefix)
23
- chunks.each_with_index do |chunk, idx|
24
- File.write("#{prefix}.#{idx}.parquet", chunk.read)
25
- end
26
- end
27
- end
28
- end
@@ -1,59 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Parqueteur
4
- class ValueArrayBuilder
5
- attr_reader :type, :options, :arrow_type
6
-
7
- def self.build(input, type, options)
8
- new(type, options).build(input)
9
- end
10
-
11
- def initialize(type, options)
12
- @type = type
13
- @options = options
14
- @arrow_type = Parqueteur::TypeResolver.resolve(type, options)
15
- end
16
-
17
- def build(input)
18
- return if input.nil?
19
-
20
- case type
21
- when :array
22
- Arrow::ListArrayBuilder.build(arrow_type, input)
23
- when :map
24
- builder = Arrow::MapArrayBuilder.new(arrow_type)
25
- input.each do |entry|
26
- builder.append_value
27
- next if entry.nil?
28
-
29
- entry.each do |k, v|
30
- builder.key_builder.append(k)
31
- builder.item_builder.append(v)
32
- end
33
- end
34
-
35
- builder.finish
36
- when :boolean
37
- Arrow::BooleanArray.new(input)
38
- when :integer
39
- if options.fetch(:unsigned, false) == true
40
- Arrow::UInt32Array.new(input)
41
- else
42
- Arrow::Int32Array.new(input)
43
- end
44
- when :long
45
- if options.fetch(:unsigned, false) == true
46
- Arrow::UInt64Array.new(input)
47
- else
48
- Arrow::Int64Array.new(input)
49
- end
50
- when :string
51
- Arrow::StringArray.new(input)
52
- when :timestamp
53
- Arrow::TimestampArray.new(input)
54
- else
55
- raise Error, "unknown type: #{type}"
56
- end
57
- end
58
- end
59
- end
data/test.json DELETED
@@ -1 +0,0 @@
1
- [{"id":1,"reference":"coucou","hash":{"a":"b"},"valid":true,"hash2":{},"numbers":[1,2,3],"map_array":[]},{"id":2,"reference":"coucou","hash":{"c":"d"},"valid":false,"hash2":{},"numbers":[4,5,6],"map_array":[]},{"id":3,"reference":"coucou","hash":{"e":"f"},"valid":true,"hash2":{"x":[1,2,3]},"numbers":[7,8,9],"map_array":[{"x":"y"}]}]