red-arrow-dataset 4.0.1 → 7.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 00cf96f680d62ad003d931b3628313f83dd351b34222a59e5d1f85018ea20d7e
4
- data.tar.gz: 399966d45ae4bb31868c9ae21df1b5ee6e6738297597b8536718db398f2a927b
3
+ metadata.gz: 65cd29db2fd974c5f9f645dbb15fc9b60fc18795888ed2625238b7af530b39b2
4
+ data.tar.gz: b4097a34d26ff22b1d42aa1d88e81aa499f55de83f1a68cb75fddc71aa47747a
5
5
  SHA512:
6
- metadata.gz: c61e5f5ecd8ed9b027091a55ca9213e206c71c7d13c6f11714ec4ad9cdf58c45b4811a46315e267a9fcbf7d5d7a893af2b2dd935dce23c6c898d0c4ae37f4a83
7
- data.tar.gz: 0acea5814073860de6780405ff597227dd9fc0971b1312f2e51af2e43c710650968b301160ba4542498189728fb87f2a66cc9344293d80ea71efaba8fe25166c
6
+ metadata.gz: 207d36ef73e6a044375f16fdcc752c126b404cc9e49c573d226cabe1a380897bce63babd1ad229354f212d4ec2e8532f0cddfac1de894b9aed0239db8714eceb
7
+ data.tar.gz: b446b7e0b2fb95b80b77a7106d8de68f0bb9092e3228fa10bfe7f2cd8d90d58621f413e876cfd23f4b63c586a1ecb6a0c4607bcc2796dde90f6bc51586eeb497
@@ -0,0 +1,61 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
# Reviewer annotations are the rows starting with "#"; diff rows are unchanged.
+ module ArrowDataset
19
# Private helpers that load a table through FileSystemDataset. The methods
# read @input and @options from the object that includes this module.
+ module ArrowTableLoadable
20
+ private
21
# Builds a file:// URI for a local path.
# The path is first expanded to an absolute path. When the absolute path
# does not start with "/", an extra "/" is inserted after "file://".
# NOTE(review): presumably the non-"/" case is a Windows drive path such
# as C:/... - confirm. The path is interpolated without escaping, so
# characters that are not valid in a URI are passed to URI() as-is.
+ def path_to_uri(path)
22
+ absolute_path = ::File.expand_path(path)
23
+ if absolute_path.start_with?("/")
24
+ URI("file://#{absolute_path}")
25
+ else
26
+ URI("file:///#{absolute_path}")
27
+ end
28
+ end
29
+
30
# Loads from @input treated as a local path (converted by path_to_uri).
+ def load_from_directory
31
+ internal_load_from_uri(path_to_uri(@input))
32
+ end
33
+
34
# Loads from @input passed through unchanged as the URI.
+ def load_from_uri
35
+ internal_load_from_uri(@input)
36
+ end
37
+
38
# Scans the dataset found at uri and returns the result of scanner.to_table.
# The file format comes from @options[:format]. Every other option with a
# non-nil value is forwarded to the scanner builder through its "key="
# setter; options the builder has no setter for are skipped silently.
+ def internal_load_from_uri(uri)
39
+ format = FileFormat.resolve(@options[:format])
40
+ dataset = FileSystemDataset.build(format) do |factory|
41
+ factory.file_system_uri = uri
42
+ end
43
+ scanner_builder = dataset.begin_scan
44
+ @options.each do |key, value|
45
# :format was already consumed above to pick the file format.
+ next if key == :format
46
+ next if value.nil?
47
+ setter = "#{key}="
48
+ next unless scanner_builder.respond_to?(setter)
49
+ scanner_builder.public_send(setter, value)
50
+ end
51
+ scanner = scanner_builder.finish
52
+ scanner.to_table
53
+ end
54
+ end
55
+ end
56
+
57
# Mixes the ArrowDataset load helpers into Arrow::TableLoader.
# NOTE(review): TableLoader itself is presumably defined by red-arrow; only
# the include is visible here.
+ module Arrow
58
+ class TableLoader
59
+ include ArrowDataset::ArrowTableLoadable
60
+ end
61
+ end
@@ -0,0 +1,69 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
# Reviewer annotations are the rows starting with "#"; diff rows are unchanged.
+ module ArrowDataset
19
# Private helper that saves a table through the dataset writer. The method
# reads @output, @options and @table from the object that includes this module.
+ module ArrowTableSavable
20
+ private
21
# Writes @table to the location described by @output.
# @output must respond to path, scheme and to_s (a URI in the tests).
# With @options[:partitioning] set, the table is written as a partitioned
# dataset; otherwise it is written as a single file at the URI path.
+ def save_to_uri
22
+ format = FileFormat.resolve(@options[:format])
23
+ options = FileSystemDatasetWriteOptions.new
24
+ options.file_write_options = format.default_write_options
25
+ path = @output.path
26
# A URI without a scheme is written through the local file system.
+ if @output.scheme.nil?
27
+ options.file_system = Arrow::LocalFileSystem.new
28
+ else
29
+ options.file_system = Arrow::FileSystem.create(@output.to_s)
30
+ # /C:/... -> C:/...
31
# The leading "/" is stripped only when the current directory's absolute
# path does not start with "/".
+ unless File.expand_path(".").start_with?("/")
32
+ path = path.gsub(/\A\//, "")
33
+ end
34
+ end
35
+ partitioning = @options[:partitioning]
36
+ if partitioning
37
+ # TODO
38
# The directory part of the path becomes the dataset base directory and the
# file name part becomes the base name template.
+ options.base_dir = File.dirname(path)
39
+ options.base_name_template = File.basename(path)
40
+ options.partitioning = Partitioning.resolve(@options[:partitioning])
41
+ scanner_builder = ScannerBuilder.new(@table)
42
+ scanner_builder.use_async(true)
43
+ scanner = scanner_builder.finish
44
+ FileSystemDataset.write_scanner(scanner, options)
45
+ else
46
+ dir = File.dirname(path)
47
# NOTE(review): File.exist? looks at the local disk even when
# options.file_system was created from the URI above; confirm this is
# intended for non-local file systems.
+ unless File.exist?(dir)
48
+ options.file_system.create_dir(dir, true)
49
+ end
50
+ options.file_system.open_output_stream(path) do |output_stream|
51
# The block form of open_writer is used; FileFormat#open_writer finishes
# the writer when the block exits.
+ format.open_writer(output_stream,
52
+ options.file_system,
53
+ path,
54
+ @table.schema,
55
+ format.default_write_options) do |writer|
56
+ reader = Arrow::TableBatchReader.new(@table)
57
+ writer.write_record_batch_reader(reader)
58
+ end
59
+ end
60
+ end
61
+ end
62
+ end
63
+ end
64
+
65
# Mixes the ArrowDataset save helper into Arrow::TableSaver.
# NOTE(review): TableSaver itself is presumably defined by red-arrow; only
# the include is visible here.
+ module Arrow
66
+ class TableSaver
67
+ include ArrowDataset::ArrowTableSavable
68
+ end
69
+ end
@@ -16,21 +16,13 @@
16
16
  # under the License.
17
17
 
18
18
  module ArrowDataset
19
- class ScanOptions
19
+ class Dataset
20
20
  class << self
21
- def try_convert(value)
22
- case value
23
- when Hash
24
- return nil unless value.key?(:schema)
25
- options = new(value[:schema])
26
- value.each do |name, value|
27
- next if name == :schema
28
- options.__send__("#{name}=", value)
29
- end
30
- options
31
- else
32
- nil
33
- end
21
# Builds a dataset through its matching factory class.
# The factory constant is looked up under ArrowDataset by appending
# "Factory" to the receiver class's name. The factory is created with
# args, yielded to the caller's block for configuration, and the result of
# factory.finish is returned. The yield is unconditional, so a block is
# required.
+ def build(*args)
22
+ factory_class = ArrowDataset.const_get("#{name}Factory")
23
+ factory = factory_class.new(*args)
24
+ yield(factory)
25
+ factory.finish
34
26
  end
35
27
  end
36
28
  end
@@ -0,0 +1,59 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module ArrowDataset
19
+ class FileFormat
20
+ class << self
21
# Maps a format name to a new file format object.
#
# @param format [Symbol] one of :arrow, :arrow_file, :arrow_streaming,
#   :parquet or :csv.
# @return [IPCFileFormat, ParquetFileFormat, CSVFileFormat] a new instance
#   of the format class selected by +format+.
# @raise [ArgumentError] if +format+ is not one of the supported names.
def resolve(format)
  case format
  when :arrow, :arrow_file, :arrow_streaming
    IPCFileFormat.new
  when :parquet
    ParquetFileFormat.new
  when :csv
    CSVFileFormat.new
  else
    available_formats = [
      :arrow,
      :arrow_file,
      :arrow_streaming,
      :parquet,
      :csv,
    ]
    # Report the rejected argument itself. This is a singleton method, so
    # @options is never assigned here; reading @options[:format] raised
    # NoMethodError on nil and hid the intended ArgumentError.
    message = "Arrow::Table load format must be one of [" \
              "#{available_formats.join(", ")}]: #{format.inspect}"
    raise ArgumentError, message
  end
end
43
+ end
44
+
45
# Keeps the existing open_writer reachable as open_writer_raw so the
# wrapper below can call it.
# NOTE(review): the aliased method is presumably provided by the generated
# bindings; it is not defined in the visible code.
+ alias_method :open_writer_raw, :open_writer
46
# Opens a writer and optionally scopes it to a block.
# With a block: yields the writer, always calls writer.finish afterwards
# (also when the block raises), and returns the block's value.
# Without a block: returns the writer; finishing it is left to the caller.
+ def open_writer(destination, file_system, path, schema, options)
47
+ writer = open_writer_raw(destination, file_system, path, schema, options)
48
+ if block_given?
49
+ begin
50
+ yield(writer)
51
+ ensure
52
+ writer.finish
53
+ end
54
+ else
55
+ writer
56
+ end
57
+ end
58
+ end
59
+ end
@@ -16,17 +16,24 @@
16
16
  # under the License.
17
17
 
18
18
  module ArrowDataset
19
- class InMemoryFragment
20
- alias_method :initialize_raw, :initialize
21
- private :initialize_raw
22
- def initialize(schema, record_batches)
23
- record_batches = record_batches.collect do |record_batch|
24
- unless record_batch.is_a?(Arrow::RecordBatch)
25
- record_batch = Arrow::RecordBatch.new(record_batch)
19
+ class FileSystemDatasetFactory
20
# Keeps the existing setter reachable as set_file_system_uri_raw.
# The "-" rows inside this hunk are removed lines of the old
# InMemoryFragment#initialize, not part of the new method.
+ alias_method :set_file_system_uri_raw, :set_file_system_uri
21
# Accepts a URI object as well as whatever set_file_system_uri_raw accepts.
# A URI is converted to a string before it is passed on. A URI without a
# scheme is treated as a local path: a copy is made (the caller's URI is
# not mutated), its path is expanded to an absolute path (prefixed with
# "/" when it does not already start with one), and its scheme is set to
# "file". Any non-URI value is passed through unchanged.
+ def set_file_system_uri(uri)
22
+ if uri.is_a?(URI)
23
+ if uri.scheme.nil?
24
+ uri = uri.dup
25
+ absolute_path = File.expand_path(uri.path)
26
+ if absolute_path.start_with?("/")
27
+ uri.path = absolute_path
28
+ else
29
+ uri.path = "/#{absolute_path}"
30
+ end
31
+ uri.scheme = "file"
26
32
  end
27
- record_batch
33
+ uri = uri.to_s
28
34
  end
29
- initialize_raw(schema, record_batches)
35
+ set_file_system_uri_raw(uri)
30
36
  end
37
# Re-points the file_system_uri= writer at the wrapper defined above.
+ alias_method :file_system_uri=, :set_file_system_uri
31
38
  end
32
39
  end
@@ -29,8 +29,11 @@ module ArrowDataset
29
29
  end
30
30
 
31
31
  def require_libraries
32
- require "arrow-dataset/in-memory-scan-task"
33
- require "arrow-dataset/scan-options"
32
+ require "arrow-dataset/arrow-table-loadable"
33
+ require "arrow-dataset/arrow-table-savable"
34
+ require "arrow-dataset/dataset"
35
+ require "arrow-dataset/file-format"
36
+ require "arrow-dataset/file-system-dataset-factory"
34
37
  end
35
38
  end
36
39
  end
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module ArrowDataset
19
- VERSION = "4.0.1"
19
+ VERSION = "7.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/test/helper.rb CHANGED
@@ -17,4 +17,6 @@
17
17
 
18
18
  require "arrow-dataset"
19
19
 
20
+ require "tmpdir"
21
+
20
22
  require "test-unit"
@@ -0,0 +1,80 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
# Round-trip tests for Arrow::Table#save and Arrow::Table.load using URIs
# and directories. Reviewer annotations are the rows starting with "#".
+ class TestArrowTable < Test::Unit::TestCase
19
# Runs each test inside the Dir.mktmpdir block (the yield is the test
# body), so the temporary directory only exists for the duration of one
# test. Both table paths live under a "data" subdirectory that setup does
# not create; the save calls are expected to create it.
+ def setup
20
+ Dir.mktmpdir do |tmpdir|
21
+ @dir = tmpdir
22
+ @path1 = File.join(@dir, "data", "table1.arrow")
23
+ @table1 = Arrow::Table.new(visible: [true, false, true],
24
+ point: [1, 2, 3])
25
+ @path2 = File.join(@dir, "data", "table2.arrow")
26
+ @table2 = Arrow::Table.new(visible: [true],
27
+ point: [10])
28
+ yield
29
+ end
30
+ end
31
+
32
# Builds a file:// URI for a path after expanding it to an absolute path;
# an extra "/" is inserted when the absolute path does not start with "/".
+ def build_file_uri(path)
33
+ absolute_path = File.expand_path(path)
34
+ if absolute_path.start_with?("/")
35
+ URI("file://#{absolute_path}")
36
+ else
37
+ URI("file:///#{absolute_path}")
38
+ end
39
+ end
40
+
41
+ sub_test_case("load") do
42
# Scheme-less relative URI, resolved against the temporary directory
# because the test changes into it first.
+ def test_no_scheme
43
+ Dir.chdir(@dir) do
44
+ uri = URI(File.basename(@path1))
45
+ @table1.save(uri)
46
+ assert_equal(@table1, Arrow::Table.load(uri))
47
+ end
48
+ end
49
+
50
# Absolute file:// URI of a single file.
+ def test_file
51
+ uri = build_file_uri(@path1)
52
+ @table1.save(uri)
53
+ assert_equal(@table1, Arrow::Table.load(uri))
54
+ end
55
+
56
# Loading the directory URI is expected to return the rows of both saved
# files, table1 first.
# NOTE(review): the expected order presumably relies on the dataset
# listing files in name order - confirm.
+ def test_directory_uri
57
+ uri = build_file_uri(@dir)
58
+ @table1.save(build_file_uri(@path1))
59
+ @table2.save(build_file_uri(@path2))
60
+ assert_equal(@table1.concatenate([@table2]),
61
+ Arrow::Table.load(uri))
62
+ end
63
+
64
# Same as test_directory_uri but the directory is passed as a plain path.
+ def test_directory_path
65
+ @table1.save(build_file_uri(@path1))
66
+ @table2.save(build_file_uri(@path2))
67
+ assert_equal(@table1.concatenate([@table2]),
68
+ Arrow::Table.load(@dir))
69
+ end
70
+
71
# The filter option keeps only rows whose visible column is true across
# both files.
+ def test_filter
72
+ @table1.save(build_file_uri(@path1))
73
+ @table2.save(build_file_uri(@path2))
74
+ assert_equal(Arrow::Table.new(visible: [true, true, true],
75
+ point: [1, 3, 10]),
76
+ Arrow::Table.load(@dir,
77
+ filter: ["equal", :visible, true]))
78
+ end
79
+ end
80
+ end
@@ -15,22 +15,24 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- class TestScanOptions < Test::Unit::TestCase
18
+ class TestFileSystemDataset < Test::Unit::TestCase
19
19
  def setup
20
- @record_batches = [
21
- Arrow::RecordBatch.new(visible: [true, false, true],
22
- point: [1, 2, 3]),
23
- ]
24
- @schema = @record_batches.first.schema
20
+ Dir.mktmpdir do |tmpdir|
21
+ @dir = tmpdir
22
+ @path = File.join(@dir, "table.arrow")
23
+ @table = Arrow::Table.new(visible: [true, false, true],
24
+ point: [1, 2, 3])
25
+ @table.save(@path)
26
+ @format = ArrowDataset::IPCFileFormat.new
27
+ yield
28
+ end
25
29
  end
26
30
 
27
- sub_test_case(".try_convert") do
28
- def test_hash
29
- batch_size = 1024
30
- context = ArrowDataset::ScanOptions.try_convert(schema: @schema,
31
- batch_size: batch_size)
32
- assert_equal([@schema, batch_size],
33
- [context.schema, context.batch_size])
31
+ test(".build") do
32
+ dataset = ArrowDataset::FileSystemDataset.build(@format) do |factory|
33
+ factory.file_system = Arrow::LocalFileSystem.new
34
+ factory.add_path(File.expand_path(@path))
34
35
  end
36
+ assert_equal(@table, dataset.to_table)
35
37
  end
36
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-arrow-dataset
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.0.1
4
+ version: 7.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Apache Arrow Developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-05-27 00:00:00.000000000 Z
11
+ date: 2022-02-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: red-arrow
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - '='
18
18
  - !ruby/object:Gem::Version
19
- version: 4.0.1
19
+ version: 7.0.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - '='
25
25
  - !ruby/object:Gem::Version
26
- version: 4.0.1
26
+ version: 7.0.0
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -82,16 +82,18 @@ files:
82
82
  - Rakefile
83
83
  - dependency-check/Rakefile
84
84
  - lib/arrow-dataset.rb
85
- - lib/arrow-dataset/in-memory-fragment.rb
86
- - lib/arrow-dataset/in-memory-scan-task.rb
85
+ - lib/arrow-dataset/arrow-table-loadable.rb
86
+ - lib/arrow-dataset/arrow-table-savable.rb
87
+ - lib/arrow-dataset/dataset.rb
88
+ - lib/arrow-dataset/file-format.rb
89
+ - lib/arrow-dataset/file-system-dataset-factory.rb
87
90
  - lib/arrow-dataset/loader.rb
88
- - lib/arrow-dataset/scan-options.rb
89
91
  - lib/arrow-dataset/version.rb
90
92
  - red-arrow-dataset.gemspec
91
93
  - test/helper.rb
92
94
  - test/run-test.rb
93
- - test/test-in-memory-scan-task.rb
94
- - test/test-scan-options.rb
95
+ - test/test-arrow-table.rb
96
+ - test/test-file-system-dataset.rb
95
97
  homepage: https://arrow.apache.org/
96
98
  licenses:
97
99
  - Apache-2.0
@@ -111,12 +113,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
111
113
  - !ruby/object:Gem::Version
112
114
  version: '0'
113
115
  requirements: []
114
- rubygems_version: 3.2.5
116
+ rubygems_version: 3.4.0.dev
115
117
  signing_key:
116
118
  specification_version: 4
117
119
  summary: Red Arrow Dataset is the Ruby bindings of Apache Arrow Dataset
118
120
  test_files:
119
121
  - test/helper.rb
120
122
  - test/run-test.rb
121
- - test/test-scan-options.rb
122
- - test/test-in-memory-scan-task.rb
123
+ - test/test-arrow-table.rb
124
+ - test/test-file-system-dataset.rb
@@ -1,35 +0,0 @@
1
- # Licensed to the Apache Software Foundation (ASF) under one
2
- # or more contributor license agreements. See the NOTICE file
3
- # distributed with this work for additional information
4
- # regarding copyright ownership. The ASF licenses this file
5
- # to you under the Apache License, Version 2.0 (the
6
- # "License"); you may not use this file except in compliance
7
- # with the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing,
12
- # software distributed under the License is distributed on an
13
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
- # KIND, either express or implied. See the License for the
15
- # specific language governing permissions and limitations
16
- # under the License.
17
-
18
- module ArrowDataset
19
- class InMemoryScanTask
20
- alias_method :initialize_raw, :initialize
21
- private :initialize_raw
22
- def initialize(record_batches, **options)
23
- record_batches = record_batches.collect do |record_batch|
24
- unless record_batch.is_a?(Arrow::RecordBatch)
25
- record_batch = Arrow::RecordBatch.new(record_batch)
26
- end
27
- record_batch
28
- end
29
- options[:schema] ||= record_batches.first.schema
30
- fragment = options.delete(:fragment)
31
- fragment ||= InMemoryFragment.new(options[:schema], record_batches)
32
- initialize_raw(record_batches, options, fragment)
33
- end
34
- end
35
- end
@@ -1,33 +0,0 @@
1
- # Licensed to the Apache Software Foundation (ASF) under one
2
- # or more contributor license agreements. See the NOTICE file
3
- # distributed with this work for additional information
4
- # regarding copyright ownership. The ASF licenses this file
5
- # to you under the Apache License, Version 2.0 (the
6
- # "License"); you may not use this file except in compliance
7
- # with the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing,
12
- # software distributed under the License is distributed on an
13
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
- # KIND, either express or implied. See the License for the
15
- # specific language governing permissions and limitations
16
- # under the License.
17
-
18
- class TestInMemoryScanTask < Test::Unit::TestCase
19
- def setup
20
- @record_batches = [
21
- Arrow::RecordBatch.new(visible: [true, false, true],
22
- point: [1, 2, 3]),
23
- ]
24
- end
25
-
26
- sub_test_case(".new") do
27
- test("[[Arrow::RecordBatch]]") do
28
- scan_task = ArrowDataset::InMemoryScanTask.new(@record_batches)
29
- assert_equal(@record_batches,
30
- scan_task.execute.to_a)
31
- end
32
- end
33
- end