red-arrow-dataset 5.0.0 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bd2475450c901efd8a4a2e311459285098b42a714ada39e41959e8c7d7f477c8
4
- data.tar.gz: 7d696272844899dbdedf1d1e4ba379a54c91a1187da33c97a6d5eddb89c5f588
3
+ metadata.gz: 197357bda9355ce343ee13115d4850fcee6730df92d47c6b3def322533317a10
4
+ data.tar.gz: 45c1e7544c1323d6b7f342ea528372e414e10a0349e296bcc57c0200b1dd5e17
5
5
  SHA512:
6
- metadata.gz: 61868f12b9d4b607ebf3e408a81e576c0684a433b9ffbb2245abc34dd5a4f797a78ed05544c74a0bdd0a8caa7da01cc80e9885f32f90c63c788f7b1380d04df1
7
- data.tar.gz: dd3022730a0d70182217dc80a7364db74eaba3bab783bb673b0776e1c5b562c8200ec6c10d81fda61627aca3ce458d287141f0c97e2b1e706ddda752deab2b35
6
+ metadata.gz: f9f1d0f69620be9967ca0b8b9115eb5ead5791ad1cd81e324047951ac80e5583b0568ae521abd8c491a32b422d8232bed20f1ea42fbc0c480786538f20c43bad
7
+ data.tar.gz: b12912dd69dd26eb3c3754ce466623b81e86bf6664433feb6ebd95e252e539144250cdffc8d615d3b5b070ae63f82d9f5a0e507be8b13c7f1114797e13140e4d
@@ -0,0 +1,61 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module ArrowDataset
  # Mixin for Arrow::TableLoader that loads a table through the Arrow
  # Dataset API. It reads the including object's @input and @options.
  module ArrowTableLoadable
    private

    # Turns a filesystem path into a file:// URI.
    #
    # The path is expanded to an absolute path first. When the expanded
    # path does not begin with "/", an extra "/" is inserted so the URI
    # still has an empty authority part.
    def path_to_uri(path)
      expanded_path = ::File.expand_path(path)
      separator = expanded_path.start_with?("/") ? "" : "/"
      URI("file://#{separator}#{expanded_path}")
    end

    # Treats @input as a local path: converts it to a file:// URI and
    # loads from that URI.
    def load_from_directory
      internal_load_from_uri(path_to_uri(@input))
    end

    # Loads from @input as is; @input is passed on as the URI.
    def load_from_uri
      internal_load_from_uri(@input)
    end

    # Builds a FileSystemDataset for +uri+ in the format named by
    # @options[:format], forwards the remaining options to the scanner
    # builder and returns the scan result as a table.
    def internal_load_from_uri(uri)
      file_format = FileFormat.resolve(@options[:format])
      dataset = FileSystemDataset.build(file_format) do |factory|
        factory.file_system_uri = uri
      end
      builder = dataset.begin_scan
      @options.each do |name, value|
        # :format was consumed above and nil means "not specified".
        next if name == :format || value.nil?

        # Only options the scanner builder has a writer for are applied;
        # everything else is silently skipped.
        writer = "#{name}="
        builder.public_send(writer, value) if builder.respond_to?(writer)
      end
      builder.finish.to_table
    end
  end
end

module Arrow
  class TableLoader
    include ArrowDataset::ArrowTableLoadable
  end
end
@@ -0,0 +1,69 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module ArrowDataset
  # Mixin for Arrow::TableSaver that writes @table to the location named
  # by the @output URI through the Arrow Dataset API.
  module ArrowTableSavable
    private

    # Saves @table to @output in the format named by @options[:format].
    #
    # With @options[:partitioning] set, the table is written with
    # FileSystemDataset.write_scanner, using the directory part of the
    # output path as the base directory and its file name as the base
    # name template. Otherwise a single file is written, and its parent
    # directory is created first when File.exist? reports it missing.
    def save_to_uri
      file_format = FileFormat.resolve(@options[:format])
      write_options = FileSystemDatasetWriteOptions.new
      write_options.file_write_options = file_format.default_write_options
      output_path = @output.path
      if @output.scheme.nil?
        file_system = Arrow::LocalFileSystem.new
      else
        file_system = Arrow::FileSystem.create(@output.to_s)
        # /C:/... -> C:/...
        unless File.expand_path(".").start_with?("/")
          output_path = output_path.gsub(/\A\//, "")
        end
      end
      write_options.file_system = file_system

      partitioning = @options[:partitioning]
      if partitioning
        # TODO
        write_options.base_dir = File.dirname(output_path)
        write_options.base_name_template = File.basename(output_path)
        write_options.partitioning = Partitioning.resolve(partitioning)
        builder = ScannerBuilder.new(@table)
        builder.use_async(true)
        return FileSystemDataset.write_scanner(builder.finish, write_options)
      end

      parent_dir = File.dirname(output_path)
      file_system.create_dir(parent_dir, true) unless File.exist?(parent_dir)
      file_system.open_output_stream(output_path) do |output_stream|
        file_format.open_writer(output_stream,
                                file_system,
                                output_path,
                                @table.schema,
                                file_format.default_write_options) do |writer|
          writer.write_record_batch_reader(Arrow::TableBatchReader.new(@table))
        end
      end
    end
  end
end

module Arrow
  class TableSaver
    include ArrowDataset::ArrowTableSavable
  end
end
@@ -0,0 +1,59 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module ArrowDataset
19
+ class FileFormat
20
+ class << self
21
# Creates the dataset file format object that corresponds to +format+.
#
# format:: one of :arrow, :arrow_file, :arrow_streaming (all mapped to
#          IPCFileFormat), :parquet (ParquetFileFormat) or
#          :csv (CSVFileFormat).
#
# Returns a new instance of the matching file format class.
#
# Raises ArgumentError when +format+ is none of the values above. The
# message lists the accepted formats and the rejected value.
def resolve(format)
  case format
  when :arrow, :arrow_file, :arrow_streaming
    IPCFileFormat.new
  when :parquet
    ParquetFileFormat.new
  when :csv
    CSVFileFormat.new
  else
    available_formats = [
      :arrow,
      :arrow_file,
      :arrow_streaming,
      :parquet,
      :csv,
    ]
    # Report the argument itself. This method has no @options of its
    # own, so looking the value up through @options[:format] raised
    # NoMethodError on nil instead of the intended ArgumentError.
    message = "Arrow::Table load format must be one of [" \
              "#{available_formats.join(", ")}" \
              "]: #{format.inspect}"
    raise ArgumentError, message
  end
end
43
+ end
44
+
45
+ alias_method :open_writer_raw, :open_writer
46
# Opens a writer through open_writer_raw, passing all arguments along
# unchanged.
#
# Without a block the writer is returned and the caller is responsible
# for finishing it. With a block the writer is yielded, finish is
# called on it afterwards (even when the block raises) and the block's
# value is returned.
def open_writer(destination, file_system, path, schema, options)
  writer = open_writer_raw(destination, file_system, path, schema, options)
  return writer unless block_given?

  begin
    yield(writer)
  ensure
    writer.finish
  end
end
58
+ end
59
+ end
@@ -0,0 +1,39 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module ArrowDataset
19
+ class FileSystemDatasetFactory
20
+ alias_method :set_file_system_uri_raw, :set_file_system_uri
21
# Sets the file system URI, accepting URI objects as well as whatever
# set_file_system_uri_raw accepts.
#
# A URI object is converted to a string before it is handed to
# set_file_system_uri_raw. If it has no scheme, a copy is made whose
# path is expanded to an absolute path (prefixed with "/" when the
# expanded path does not start with one) and whose scheme is "file";
# the caller's URI object is left untouched. Any other value is passed
# through unchanged.
def set_file_system_uri(uri)
  return set_file_system_uri_raw(uri) unless uri.is_a?(URI)

  unless uri.scheme
    uri = uri.dup
    expanded_path = File.expand_path(uri.path)
    uri.path = expanded_path.start_with?("/") ? expanded_path : "/#{expanded_path}"
    uri.scheme = "file"
  end
  set_file_system_uri_raw(uri.to_s)
end
37
+ alias_method :file_system_uri=, :set_file_system_uri
38
+ end
39
+ end
@@ -29,7 +29,11 @@ module ArrowDataset
29
29
  end
30
30
 
31
31
  def require_libraries
32
+ require "arrow-dataset/arrow-table-loadable"
33
+ require "arrow-dataset/arrow-table-savable"
32
34
  require "arrow-dataset/dataset"
35
+ require "arrow-dataset/file-format"
36
+ require "arrow-dataset/file-system-dataset-factory"
33
37
  end
34
38
  end
35
39
  end
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module ArrowDataset
19
- VERSION = "5.0.0"
19
+ VERSION = "6.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
@@ -0,0 +1,80 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
# Round-trip tests for Arrow::Table#save and Arrow::Table.load when the
# destination or source is given as a URI or a directory.
class TestArrowTable < Test::Unit::TestCase
  # Runs each test inside a temporary directory and prepares two small
  # tables together with the paths they are saved to.
  def setup
    Dir.mktmpdir do |tmpdir|
      @dir = tmpdir
      data_dir = File.join(@dir, "data")
      @path1 = File.join(data_dir, "table1.arrow")
      @table1 = Arrow::Table.new(visible: [true, false, true],
                                 point: [1, 2, 3])
      @path2 = File.join(data_dir, "table2.arrow")
      @table2 = Arrow::Table.new(visible: [true],
                                 point: [10])
      yield
    end
  end

  # Builds a file:// URI for +path+, inserting an extra "/" when the
  # expanded path does not start with one.
  def build_file_uri(path)
    expanded_path = File.expand_path(path)
    prefix = expanded_path.start_with?("/") ? "file://" : "file:///"
    URI("#{prefix}#{expanded_path}")
  end

  # Saves both fixture tables to their file:// URIs.
  def save_tables
    @table1.save(build_file_uri(@path1))
    @table2.save(build_file_uri(@path2))
  end

  sub_test_case("load") do
    def test_no_scheme
      Dir.chdir(@dir) do
        relative_uri = URI(File.basename(@path1))
        @table1.save(relative_uri)
        assert_equal(@table1, Arrow::Table.load(relative_uri))
      end
    end

    def test_file
      file_uri = build_file_uri(@path1)
      @table1.save(file_uri)
      assert_equal(@table1, Arrow::Table.load(file_uri))
    end

    def test_directory_uri
      directory_uri = build_file_uri(@dir)
      save_tables
      assert_equal(@table1.concatenate([@table2]),
                   Arrow::Table.load(directory_uri))
    end

    def test_directory_path
      save_tables
      assert_equal(@table1.concatenate([@table2]),
                   Arrow::Table.load(@dir))
    end

    def test_filter
      save_tables
      expected = Arrow::Table.new(visible: [true, true, true],
                                  point: [1, 3, 10])
      assert_equal(expected,
                   Arrow::Table.load(@dir,
                                     filter: ["equal", :visible, true]))
    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-arrow-dataset
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.0.0
4
+ version: 6.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Apache Arrow Developers
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-07-28 00:00:00.000000000 Z
11
+ date: 2021-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: red-arrow
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - '='
18
18
  - !ruby/object:Gem::Version
19
- version: 5.0.0
19
+ version: 6.0.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - '='
25
25
  - !ruby/object:Gem::Version
26
- version: 5.0.0
26
+ version: 6.0.0
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -82,18 +82,23 @@ files:
82
82
  - Rakefile
83
83
  - dependency-check/Rakefile
84
84
  - lib/arrow-dataset.rb
85
+ - lib/arrow-dataset/arrow-table-loadable.rb
86
+ - lib/arrow-dataset/arrow-table-savable.rb
85
87
  - lib/arrow-dataset/dataset.rb
88
+ - lib/arrow-dataset/file-format.rb
89
+ - lib/arrow-dataset/file-system-dataset-factory.rb
86
90
  - lib/arrow-dataset/loader.rb
87
91
  - lib/arrow-dataset/version.rb
88
92
  - red-arrow-dataset.gemspec
89
93
  - test/helper.rb
90
94
  - test/run-test.rb
95
+ - test/test-arrow-table.rb
91
96
  - test/test-file-system-dataset.rb
92
97
  homepage: https://arrow.apache.org/
93
98
  licenses:
94
99
  - Apache-2.0
95
100
  metadata: {}
96
- post_install_message:
101
+ post_install_message:
97
102
  rdoc_options: []
98
103
  require_paths:
99
104
  - lib
@@ -108,11 +113,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
108
113
  - !ruby/object:Gem::Version
109
114
  version: '0'
110
115
  requirements: []
111
- rubygems_version: 3.2.22
112
- signing_key:
116
+ rubygems_version: 3.2.27
117
+ signing_key:
113
118
  specification_version: 4
114
119
  summary: Red Arrow Dataset is the Ruby bindings of Apache Arrow Dataset
115
120
  test_files:
116
121
  - test/helper.rb
117
122
  - test/run-test.rb
123
+ - test/test-arrow-table.rb
118
124
  - test/test-file-system-dataset.rb