red-arrow-dataset 3.0.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 60801af8ebd03566fa8fd59bf971834c9d10040a9487d91a39fbb80d8fcb2918
4
- data.tar.gz: '07699ad51ea6c39283f090a3683ebd1c70b4f92bd6c0350ba146db94d7bb75db'
3
+ metadata.gz: 197357bda9355ce343ee13115d4850fcee6730df92d47c6b3def322533317a10
4
+ data.tar.gz: 45c1e7544c1323d6b7f342ea528372e414e10a0349e296bcc57c0200b1dd5e17
5
5
  SHA512:
6
- metadata.gz: 201295a8e1ee873a50e34d8fd33d69c30a8bdf375d814430cbc2da1aec79c072f15a22f06ebda632a77b7430a4c5af1f7f54a2178e625e51b07d57ab1be58d14
7
- data.tar.gz: b073742b5d1c89a751e264dd342ffd950195075d0d53e87c8fc8cdbdbe68feb5da69415ecdea0855088caf39493e534a02e86193b2ba23491e2c09d778fe33ce
6
+ metadata.gz: f9f1d0f69620be9967ca0b8b9115eb5ead5791ad1cd81e324047951ac80e5583b0568ae521abd8c491a32b422d8232bed20f1ea42fbc0c480786538f20c43bad
7
+ data.tar.gz: b12912dd69dd26eb3c3754ce466623b81e86bf6664433feb6ebd95e252e539144250cdffc8d615d3b5b070ae63f82d9f5a0e507be8b13c7f1114797e13140e4d
@@ -0,0 +1,61 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module ArrowDataset
  # Loading helpers that are mixed into Arrow::TableLoader.
  #
  # They read the includer's @input (a path or URI) and @options (a
  # Hash) and build the resulting table through a file system dataset.
  module ArrowTableLoadable
    private

    # Turns a local path into a file:// URI object.
    def path_to_uri(path)
      expanded = ::File.expand_path(path)
      # An expanded path that does not begin with "/" (such as "C:/...")
      # needs one more "/" to separate it from the empty authority.
      prefix = expanded.start_with?("/") ? "file://" : "file:///"
      URI("#{prefix}#{expanded}")
    end

    # Loads @input, treated as a local path.
    def load_from_directory
      internal_load_from_uri(path_to_uri(@input))
    end

    # Loads @input, treated as a URI.
    def load_from_uri
      internal_load_from_uri(@input)
    end

    # Scans the dataset at +uri+ and returns the scanned table.
    #
    # Every entry of @options other than :format is forwarded to the
    # scanner builder, but only when its value is not nil and the
    # builder has a matching "<key>=" setter.
    def internal_load_from_uri(uri)
      file_format = FileFormat.resolve(@options[:format])
      dataset = FileSystemDataset.build(file_format) do |factory|
        factory.file_system_uri = uri
      end
      builder = dataset.begin_scan
      @options.each do |key, value|
        next if key == :format || value.nil?
        writer_name = "#{key}="
        builder.public_send(writer_name, value) if builder.respond_to?(writer_name)
      end
      builder.finish.to_table
    end
  end
end
56
+
57
# Reopen Arrow::TableLoader to mix in the dataset based loading helpers.
module Arrow
  class TableLoader
    include(ArrowDataset::ArrowTableLoadable)
  end
end
@@ -0,0 +1,69 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module ArrowDataset
  # Saving helper that is mixed into Arrow::TableSaver.
  #
  # It reads the includer's @table, @output (a URI) and @options.
  module ArrowTableSavable
    private

    # Writes @table to the location described by @output.
    #
    # With the :partitioning option the table is written as a dataset:
    # the directory part of the path becomes the base directory and the
    # file part becomes the base name template. Without it a single file
    # is written through the writer of the resolved file format.
    def save_to_uri
      format = FileFormat.resolve(@options[:format])
      write_options = FileSystemDatasetWriteOptions.new
      write_options.file_write_options = format.default_write_options
      path = @output.path
      if @output.scheme.nil?
        write_options.file_system = Arrow::LocalFileSystem.new
      else
        write_options.file_system = Arrow::FileSystem.create(@output.to_s)
        # /C:/... -> C:/...
        path = path.gsub(/\A\//, "") unless File.expand_path(".").start_with?("/")
      end

      partitioning = @options[:partitioning]
      if partitioning
        # TODO
        write_options.base_dir = File.dirname(path)
        write_options.base_name_template = File.basename(path)
        write_options.partitioning = Partitioning.resolve(partitioning)
        builder = ScannerBuilder.new(@table)
        builder.use_async(true)
        return FileSystemDataset.write_scanner(builder.finish, write_options)
      end

      file_system = write_options.file_system
      parent = File.dirname(path)
      file_system.create_dir(parent, true) unless File.exist?(parent)
      file_system.open_output_stream(path) do |output_stream|
        format.open_writer(output_stream,
                           write_options.file_system,
                           path,
                           @table.schema,
                           format.default_write_options) do |writer|
          writer.write_record_batch_reader(Arrow::TableBatchReader.new(@table))
        end
      end
    end
  end
end
64
+
65
# Reopen Arrow::TableSaver to mix in the dataset based saving helper.
module Arrow
  class TableSaver
    include(ArrowDataset::ArrowTableSavable)
  end
end
@@ -16,21 +16,13 @@
16
16
  # under the License.
17
17
 
18
18
  module ArrowDataset
19
- class ScanOptions
19
+ class Dataset
20
20
  class << self
21
- def try_convert(value)
22
- case value
23
- when Hash
24
- return nil unless value.key?(:schema)
25
- options = new(value[:schema])
26
- value.each do |name, value|
27
- next if name == :schema
28
- options.__send__("#{name}=", value)
29
- end
30
- options
31
- else
32
- nil
33
- end
21
+ def build(*args)
22
+ factory_class = ArrowDataset.const_get("#{name}Factory")
23
+ factory = factory_class.new(*args)
24
+ yield(factory)
25
+ factory.finish
34
26
  end
35
27
  end
36
28
  end
@@ -0,0 +1,59 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module ArrowDataset
  class FileFormat
    class << self
      # Builds the file format object that matches a format name.
      #
      # @param format [Symbol] :arrow, :arrow_file, :arrow_streaming,
      #   :parquet or :csv.
      # @return [IPCFileFormat, ParquetFileFormat, CSVFileFormat]
      # @raise [ArgumentError] if +format+ is none of the names above.
      def resolve(format)
        case format
        when :arrow, :arrow_file, :arrow_streaming
          IPCFileFormat.new
        when :parquet
          ParquetFileFormat.new
        when :csv
          CSVFileFormat.new
        else
          available_formats = [
            :arrow,
            :arrow_file,
            :arrow_streaming,
            :parquet,
            :csv,
          ]
          # Report the argument itself. This is a class-level method, so
          # @options is not set here; reading @options[:format] called
          # [] on nil and raised NoMethodError before ArgumentError
          # could be raised.
          message = "Arrow::Table load format must be one of [" \
                    "#{available_formats.join(", ")}]: #{format.inspect}"
          raise ArgumentError, message
        end
      end
    end

    alias_method :open_writer_raw, :open_writer
    # Opens a writer through the original open_writer.
    #
    # With a block the writer is yielded and then finished, even when
    # the block raises, and the block's value is returned. Without a
    # block the writer itself is returned and the caller has to finish
    # it.
    def open_writer(destination, file_system, path, schema, options)
      writer = open_writer_raw(destination, file_system, path, schema, options)
      if block_given?
        begin
          yield(writer)
        ensure
          writer.finish
        end
      else
        writer
      end
    end
  end
end
@@ -16,19 +16,24 @@
16
16
  # under the License.
17
17
 
18
18
  module ArrowDataset
19
- class InMemoryScanTask
20
- alias_method :initialize_raw, :initialize
21
- private :initialize_raw
22
- def initialize(record_batches, **options)
23
- record_batches = record_batches.collect do |record_batch|
24
- unless record_batch.is_a?(Arrow::RecordBatch)
25
- record_batch = Arrow::RecordBatch.new(record_batch)
19
+ class FileSystemDatasetFactory
20
+ alias_method :set_file_system_uri_raw, :set_file_system_uri
21
+ def set_file_system_uri(uri)
22
+ if uri.is_a?(URI)
23
+ if uri.scheme.nil?
24
+ uri = uri.dup
25
+ absolute_path = File.expand_path(uri.path)
26
+ if absolute_path.start_with?("/")
27
+ uri.path = absolute_path
28
+ else
29
+ uri.path = "/#{absolute_path}"
30
+ end
31
+ uri.scheme = "file"
26
32
  end
27
- record_batch
33
+ uri = uri.to_s
28
34
  end
29
- context = options.delete(:context) || ScanContext.new
30
- options[:schema] ||= record_batches.first.schema
31
- initialize_raw(record_batches, options, context)
35
+ set_file_system_uri_raw(uri)
32
36
  end
37
+ alias_method :file_system_uri=, :set_file_system_uri
33
38
  end
34
39
  end
@@ -29,8 +29,11 @@ module ArrowDataset
29
29
  end
30
30
 
31
31
  def require_libraries
32
- require "arrow-dataset/in-memory-scan-task"
33
- require "arrow-dataset/scan-options"
32
+ require "arrow-dataset/arrow-table-loadable"
33
+ require "arrow-dataset/arrow-table-savable"
34
+ require "arrow-dataset/dataset"
35
+ require "arrow-dataset/file-format"
36
+ require "arrow-dataset/file-system-dataset-factory"
34
37
  end
35
38
  end
36
39
  end
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module ArrowDataset
19
- VERSION = "3.0.0"
19
+ VERSION = "6.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/test/helper.rb CHANGED
@@ -17,4 +17,6 @@
17
17
 
18
18
  require "arrow-dataset"
19
19
 
20
+ require "tmpdir"
21
+
20
22
  require "test-unit"
@@ -0,0 +1,80 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
# Round trip tests for Arrow::Table#save and Arrow::Table.load with
# paths and URIs.
class TestArrowTable < Test::Unit::TestCase
  # Runs each test while a temporary directory exists. Nothing is
  # written here; the tests save the tables to @path1 and @path2
  # themselves.
  def setup
    Dir.mktmpdir do |tmpdir|
      @dir = tmpdir
      data_dir = File.join(@dir, "data")
      @path1 = File.join(data_dir, "table1.arrow")
      @path2 = File.join(data_dir, "table2.arrow")
      @table1 = Arrow::Table.new(visible: [true, false, true],
                                 point: [1, 2, 3])
      @table2 = Arrow::Table.new(visible: [true],
                                 point: [10])
      yield
    end
  end

  # Builds a file:// URI for +path+. An expanded path without a leading
  # "/" gets an additional "/" in front of it.
  def build_file_uri(path)
    expanded = File.expand_path(path)
    prefix = expanded.start_with?("/") ? "file://" : "file:///"
    URI("#{prefix}#{expanded}")
  end

  sub_test_case("load") do
    def test_no_scheme
      Dir.chdir(@dir) do
        relative_uri = URI(File.basename(@path1))
        @table1.save(relative_uri)
        loaded = Arrow::Table.load(relative_uri)
        assert_equal(@table1, loaded)
      end
    end

    def test_file
      file_uri = build_file_uri(@path1)
      @table1.save(file_uri)
      loaded = Arrow::Table.load(file_uri)
      assert_equal(@table1, loaded)
    end

    def test_directory_uri
      directory_uri = build_file_uri(@dir)
      @table1.save(build_file_uri(@path1))
      @table2.save(build_file_uri(@path2))
      expected = @table1.concatenate([@table2])
      assert_equal(expected, Arrow::Table.load(directory_uri))
    end

    def test_directory_path
      @table1.save(build_file_uri(@path1))
      @table2.save(build_file_uri(@path2))
      expected = @table1.concatenate([@table2])
      assert_equal(expected, Arrow::Table.load(@dir))
    end

    def test_filter
      @table1.save(build_file_uri(@path1))
      @table2.save(build_file_uri(@path2))
      expected = Arrow::Table.new(visible: [true, true, true],
                                  point: [1, 3, 10])
      loaded = Arrow::Table.load(@dir,
                                 filter: ["equal", :visible, true])
      assert_equal(expected, loaded)
    end
  end
end
@@ -15,19 +15,24 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- class TestInMemoryScanTask < Test::Unit::TestCase
18
+ class TestFileSystemDataset < Test::Unit::TestCase
19
19
  def setup
20
- @record_batches = [
21
- Arrow::RecordBatch.new(visible: [true, false, true],
22
- point: [1, 2, 3]),
23
- ]
20
+ Dir.mktmpdir do |tmpdir|
21
+ @dir = tmpdir
22
+ @path = File.join(@dir, "table.arrow")
23
+ @table = Arrow::Table.new(visible: [true, false, true],
24
+ point: [1, 2, 3])
25
+ @table.save(@path)
26
+ @format = ArrowDataset::IPCFileFormat.new
27
+ yield
28
+ end
24
29
  end
25
30
 
26
- sub_test_case(".new") do
27
- test("[[Arrow::RecordBatch]]") do
28
- scan_task = ArrowDataset::InMemoryScanTask.new(@record_batches)
29
- assert_equal(@record_batches,
30
- scan_task.execute.to_a)
31
+ test(".build") do
32
+ dataset = ArrowDataset::FileSystemDataset.build(@format) do |factory|
33
+ factory.file_system = Arrow::LocalFileSystem.new
34
+ factory.add_path(File.expand_path(@path))
31
35
  end
36
+ assert_equal(@table, dataset.to_table)
32
37
  end
33
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-arrow-dataset
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.0
4
+ version: 6.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Apache Arrow Developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-25 00:00:00.000000000 Z
11
+ date: 2021-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: red-arrow
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - '='
18
18
  - !ruby/object:Gem::Version
19
- version: 3.0.0
19
+ version: 6.0.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - '='
25
25
  - !ruby/object:Gem::Version
26
- version: 3.0.0
26
+ version: 6.0.0
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -82,15 +82,18 @@ files:
82
82
  - Rakefile
83
83
  - dependency-check/Rakefile
84
84
  - lib/arrow-dataset.rb
85
- - lib/arrow-dataset/in-memory-scan-task.rb
85
+ - lib/arrow-dataset/arrow-table-loadable.rb
86
+ - lib/arrow-dataset/arrow-table-savable.rb
87
+ - lib/arrow-dataset/dataset.rb
88
+ - lib/arrow-dataset/file-format.rb
89
+ - lib/arrow-dataset/file-system-dataset-factory.rb
86
90
  - lib/arrow-dataset/loader.rb
87
- - lib/arrow-dataset/scan-options.rb
88
91
  - lib/arrow-dataset/version.rb
89
92
  - red-arrow-dataset.gemspec
90
93
  - test/helper.rb
91
94
  - test/run-test.rb
92
- - test/test-in-memory-scan-task.rb
93
- - test/test-scan-options.rb
95
+ - test/test-arrow-table.rb
96
+ - test/test-file-system-dataset.rb
94
97
  homepage: https://arrow.apache.org/
95
98
  licenses:
96
99
  - Apache-2.0
@@ -110,12 +113,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
110
113
  - !ruby/object:Gem::Version
111
114
  version: '0'
112
115
  requirements: []
113
- rubygems_version: 3.2.5
116
+ rubygems_version: 3.2.27
114
117
  signing_key:
115
118
  specification_version: 4
116
119
  summary: Red Arrow Dataset is the Ruby bindings of Apache Arrow Dataset
117
120
  test_files:
118
121
  - test/helper.rb
119
122
  - test/run-test.rb
120
- - test/test-scan-options.rb
121
- - test/test-in-memory-scan-task.rb
123
+ - test/test-arrow-table.rb
124
+ - test/test-file-system-dataset.rb
@@ -1,36 +0,0 @@
1
- # Licensed to the Apache Software Foundation (ASF) under one
2
- # or more contributor license agreements. See the NOTICE file
3
- # distributed with this work for additional information
4
- # regarding copyright ownership. The ASF licenses this file
5
- # to you under the Apache License, Version 2.0 (the
6
- # "License"); you may not use this file except in compliance
7
- # with the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing,
12
- # software distributed under the License is distributed on an
13
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
- # KIND, either express or implied. See the License for the
15
- # specific language governing permissions and limitations
16
- # under the License.
17
-
18
- class TestScanOptions < Test::Unit::TestCase
19
- def setup
20
- @record_batches = [
21
- Arrow::RecordBatch.new(visible: [true, false, true],
22
- point: [1, 2, 3]),
23
- ]
24
- @schema = @record_batches.first.schema
25
- end
26
-
27
- sub_test_case(".try_convert") do
28
- def test_hash
29
- batch_size = 1024
30
- context = ArrowDataset::ScanOptions.try_convert(schema: @schema,
31
- batch_size: batch_size)
32
- assert_equal([@schema, batch_size],
33
- [context.schema, context.batch_size])
34
- end
35
- end
36
- end