red-parquet 16.1.0 → 18.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/parquet/arrow-file-writer.rb +98 -0
- data/lib/parquet/loader.rb +1 -0
- data/lib/parquet/version.rb +1 -1
- data/test/test-arrow-file-writer.rb +76 -0
- metadata +11 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cecbea49f27646046853df48ab3cbee973a252a949e0307611555222c2d8d726
|
4
|
+
data.tar.gz: c9d0cd4b119c142ef03ce6c41d4c82bc3ad645d80fede4a13ac2f54bc6559af4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3c5ef482cf2b632964849e84a4ed00feb59895c234270814ef812690a81cf46ba1311d53a4473f7903d7508a41199f82d3f58b2d01f3f4ae1efedfb55ed3a003
|
7
|
+
data.tar.gz: 1309ee59d3088712864bbf6783ddd07a13dbf7014091d2c523f81c05ec69e166d9c5449bdde6ffc3f84308c3abedc4e7e558d58135c5d51efd9daa0d0f9a0aad
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
module Parquet
|
19
|
+
class ArrowFileWriter
|
20
|
+
# Write data to Apache Parquet.
|
21
|
+
#
|
22
|
+
# @return [void]
|
23
|
+
#
|
24
|
+
# @overload write(record_batch)
|
25
|
+
#
|
26
|
+
# @param record_batch [Arrow::RecordBatch] The record batch to
|
27
|
+
# be written.
|
28
|
+
#
|
29
|
+
# @example Write a record batch
|
30
|
+
# record_batch = Arrow::RecordBatch.new(enabled: [true, false])
|
31
|
+
# schema = record_batch.schema
|
32
|
+
# Parquet::ArrowFileWriter.open(schema, "data.parquet") do |writer|
|
33
|
+
# writer.write(record_batch)
|
34
|
+
# end
|
35
|
+
#
|
36
|
+
# @overload write(table, chunk_size: nil)
|
37
|
+
#
|
38
|
+
# @param table [Arrow::Table] The table to be written.
|
39
|
+
#
|
40
|
+
# @param chunk_size [nil, Integer] (nil) The maximum number of
|
41
|
+
# rows to write per row group.
|
42
|
+
#
|
43
|
+
# If this is `nil`, the default value (`1024 * 1024`) is used.
|
44
|
+
#
|
45
|
+
# @example Write a table with the default chunk size
|
46
|
+
# table = Arrow::Table.new(enabled: [true, false])
|
47
|
+
# schema = table.schema
|
48
|
+
# Parquet::ArrowFileWriter.open(schema, "data.parquet") do |writer|
|
49
|
+
# writer.write(table)
|
50
|
+
# end
|
51
|
+
#
|
52
|
+
# @example Write a table with the specified chunk size
|
53
|
+
# table = Arrow::Table.new(enabled: [true, false])
|
54
|
+
# schema = table.schema
|
55
|
+
# Parquet::ArrowFileWriter.open(schema, "data.parquet") do |writer|
|
56
|
+
# writer.write(table, chunk_size: 1)
|
57
|
+
# end
|
58
|
+
#
|
59
|
+
# @overload write(raw_records)
|
60
|
+
#
|
61
|
+
# @param raw_records [Array<Hash>, Array<Array>] The data to be written
|
62
|
+
# as primitive Ruby objects.
|
63
|
+
#
|
64
|
+
# @example Write a record batch with Array<Array> based data
|
65
|
+
# schema = Arrow::Schema.new(enabled: :boolean)
|
66
|
+
# raw_records = [
|
67
|
+
# [true],
|
68
|
+
# [false],
|
69
|
+
# ]
|
70
|
+
# Parquet::ArrowFileWriter.open(schema, "data.parquet") do |writer|
|
71
|
+
# writer.write(raw_records)
|
72
|
+
# end
|
73
|
+
#
|
74
|
+
# @example Write a record batch with Array<Hash> based data
|
75
|
+
# schema = Arrow::Schema.new(enabled: :boolean)
|
76
|
+
# raw_columns = [
|
77
|
+
# enabled: [true, false],
|
78
|
+
# ]
|
79
|
+
# Parquet::ArrowFileWriter.open(schema, "data.parquet") do |writer|
|
80
|
+
# writer.write(raw_columns)
|
81
|
+
# end
|
82
|
+
#
|
83
|
+
# @since 18.0.0
|
84
|
+
def write(target, chunk_size: nil)
|
85
|
+
case target
|
86
|
+
when Arrow::RecordBatch
|
87
|
+
write_record_batch(target)
|
88
|
+
when Arrow::Table
|
89
|
+
# Same as parquet::DEFAULT_MAX_ROW_GROUP_LENGTH in C++
|
90
|
+
chunk_size ||= 1024 * 1024
|
91
|
+
write_table(target, chunk_size)
|
92
|
+
else
|
93
|
+
record_batch = Arrow::RecordBatch.new(schema, target)
|
94
|
+
write_record_batch(record_batch)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
data/lib/parquet/loader.rb
CHANGED
data/lib/parquet/version.rb
CHANGED
@@ -0,0 +1,76 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
class TestArrowFileWriter < Test::Unit::TestCase
|
19
|
+
def open_buffer_output_stream
|
20
|
+
buffer = Arrow::ResizableBuffer.new(4096)
|
21
|
+
Arrow::BufferOutputStream.open(buffer) do |output|
|
22
|
+
yield(output)
|
23
|
+
end
|
24
|
+
buffer
|
25
|
+
end
|
26
|
+
|
27
|
+
sub_test_case("#write") do
|
28
|
+
test("RecordBatch") do
|
29
|
+
schema = Arrow::Schema.new(visible: :boolean)
|
30
|
+
record_batch = Arrow::RecordBatch.new(schema, [[true], [false]])
|
31
|
+
buffer = open_buffer_output_stream do |output|
|
32
|
+
Parquet::ArrowFileWriter.open(record_batch.schema, output) do |writer|
|
33
|
+
writer.write(record_batch)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
assert_equal(record_batch.to_table,
|
37
|
+
Arrow::Table.load(buffer, format: :parquet))
|
38
|
+
end
|
39
|
+
|
40
|
+
test("Table") do
|
41
|
+
schema = Arrow::Schema.new(visible: :boolean)
|
42
|
+
table = Arrow::Table.new(schema, [[true], [false]])
|
43
|
+
buffer = open_buffer_output_stream do |output|
|
44
|
+
Parquet::ArrowFileWriter.open(table.schema, output) do |writer|
|
45
|
+
writer.write(table)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
assert_equal(table,
|
49
|
+
Arrow::Table.load(buffer, format: :parquet))
|
50
|
+
end
|
51
|
+
|
52
|
+
test("[[]]") do
|
53
|
+
schema = Arrow::Schema.new(visible: :boolean)
|
54
|
+
raw_records = [[true], [false]]
|
55
|
+
buffer = open_buffer_output_stream do |output|
|
56
|
+
Parquet::ArrowFileWriter.open(schema, output) do |writer|
|
57
|
+
writer.write(raw_records)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
assert_equal(Arrow::RecordBatch.new(schema, raw_records).to_table,
|
61
|
+
Arrow::Table.load(buffer, format: :parquet))
|
62
|
+
end
|
63
|
+
|
64
|
+
test("[{}]") do
|
65
|
+
schema = Arrow::Schema.new(visible: :boolean)
|
66
|
+
raw_columns = [visible: [true, false]]
|
67
|
+
buffer = open_buffer_output_stream do |output|
|
68
|
+
Parquet::ArrowFileWriter.open(schema, output) do |writer|
|
69
|
+
writer.write(raw_columns)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
assert_equal(Arrow::RecordBatch.new(schema, raw_columns).to_table,
|
73
|
+
Arrow::Table.load(buffer, format: :parquet))
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 16.1.0
|
4
|
+
version: 18.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Apache Arrow Developers
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: red-arrow
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - '='
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 18.0.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - '='
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 18.0.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -82,6 +82,7 @@ files:
|
|
82
82
|
- dependency-check/Rakefile
|
83
83
|
- lib/parquet.rb
|
84
84
|
- lib/parquet/arrow-file-reader.rb
|
85
|
+
- lib/parquet/arrow-file-writer.rb
|
85
86
|
- lib/parquet/arrow-table-loadable.rb
|
86
87
|
- lib/parquet/arrow-table-savable.rb
|
87
88
|
- lib/parquet/loader.rb
|
@@ -91,13 +92,14 @@ files:
|
|
91
92
|
- test/helper.rb
|
92
93
|
- test/run-test.rb
|
93
94
|
- test/test-arrow-file-reader.rb
|
95
|
+
- test/test-arrow-file-writer.rb
|
94
96
|
- test/test-arrow-table.rb
|
95
97
|
- test/test-boolean-statistics.rb
|
96
98
|
homepage: https://arrow.apache.org/
|
97
99
|
licenses:
|
98
100
|
- Apache-2.0
|
99
101
|
metadata: {}
|
100
|
-
post_install_message:
|
102
|
+
post_install_message:
|
101
103
|
rdoc_options: []
|
102
104
|
require_paths:
|
103
105
|
- lib
|
@@ -112,13 +114,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
112
114
|
- !ruby/object:Gem::Version
|
113
115
|
version: '0'
|
114
116
|
requirements: []
|
115
|
-
rubygems_version: 3.
|
116
|
-
signing_key:
|
117
|
+
rubygems_version: 3.4.20
|
118
|
+
signing_key:
|
117
119
|
specification_version: 4
|
118
120
|
summary: Red Parquet is the Ruby bindings of Apache Parquet
|
119
121
|
test_files:
|
120
122
|
- test/helper.rb
|
121
123
|
- test/run-test.rb
|
122
124
|
- test/test-arrow-file-reader.rb
|
125
|
+
- test/test-arrow-file-writer.rb
|
123
126
|
- test/test-arrow-table.rb
|
124
127
|
- test/test-boolean-statistics.rb
|