torchdata 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 3417d94393b7c9636a0c3e0935ecc62ddb2306ff51dcaf3a91f7118859ce8154
4
+ data.tar.gz: 3ad87bb333a0103db4ec5ff861f088d33471029dc5ae8739e6af27f325efe907
5
+ SHA512:
6
+ metadata.gz: e3be548b4ee8575da3b9ed68b748bbf9b6f472472c903529ffaf65703582a9ee0e2c4f827e6383b318e12dfe4200b2533a118884beff406d6b98cdfaff886f19
7
+ data.tar.gz: '0797aabe0fa0975545f2650960012badccf7458208c2e0cbbcad88869000cafcce4465e389536ec3554f9d22f0322dea0ae674453e8887787fc3758ba14fa142'
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ## 0.0.1 (2023-01-30)
2
+
3
+ - First release
data/LICENSE.txt ADDED
@@ -0,0 +1,30 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2021-present, Facebook, Inc.
4
+ Copyright (c) 2023, Andrew Kane.
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from
19
+ this software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,55 @@
1
+ # TorchData Ruby
2
+
3
+ Composable data loading for Ruby
4
+
5
+ [![Build Status](https://github.com/ankane/torchdata-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/torchdata-ruby/actions)
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application’s Gemfile:
10
+
11
+ ```ruby
12
+ gem "torchdata"
13
+ ```
14
+
15
+ ## Getting Started
16
+
17
+ This library follows the [Python API](https://pytorch.org/data/). Many methods and options are missing at the moment. PRs welcome!
18
+
19
+ ```ruby
20
+ folder = "path/to/csv/folder"
21
+ datapipe = TorchData::DataPipes::Iter::FileLister.new([folder]).filter { |filename| filename.end_with?(".csv") }
22
+ datapipe = TorchData::DataPipes::Iter::FileOpener.new(datapipe, mode: "rt")
23
+ datapipe = datapipe.parse_csv(delimiter: ",")
24
+ train, valid = datapipe.random_split(total_length: 10000, weights: {train: 0.5, valid: 0.5}, seed: 0)
25
+
26
+ train.each do |x|
27
+ # code
28
+ end
29
+
30
+ valid.each do |y|
31
+ # code
32
+ end
33
+ ```
34
+
35
+ ## History
36
+
37
+ View the [changelog](CHANGELOG.md)
38
+
39
+ ## Contributing
40
+
41
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
42
+
43
+ - [Report bugs](https://github.com/ankane/torchdata-ruby/issues)
44
+ - Fix bugs and [submit pull requests](https://github.com/ankane/torchdata-ruby/pulls)
45
+ - Write, clarify, or fix documentation
46
+ - Suggest or add new features
47
+
48
+ To get started with development:
49
+
50
+ ```sh
51
+ git clone https://github.com/ankane/torchdata-ruby.git
52
+ cd torchdata-ruby
53
+ bundle install
54
+ bundle exec rake test
55
+ ```
data/lib/torchdata/data_pipes/iter/util/csv_parser.rb ADDED
@@ -0,0 +1,45 @@
1
module TorchData
  module DataPipes
    module Iter
      module Util
        # Iterable datapipe that parses the files produced by an upstream
        # datapipe as CSV and yields the parsed rows one by one.
        #
        # Declared with `functional_datapipe :parse_csv`; the README calls it
        # as `datapipe.parse_csv(delimiter: ",")`.
        class CSVParser < IterDataPipe
          functional_datapipe :parse_csv

          # @param source_datapipe [#each] upstream datapipe whose `each`
          #   yields `path, file` pairs
          # @param delimiter [String] column separator, forwarded to CSV as
          #   the `col_sep` option
          def initialize(source_datapipe, delimiter: ",")
            @source_datapipe = source_datapipe
            @helper = PlainTextReaderHelper.new
            @fmtparams = {col_sep: delimiter}
          end

          # Parses every upstream file with `CSV.parse` and passes each parsed
          # row to the block.
          #
          # NOTE(review): `file` is handed to `CSV.parse` unchanged, so it is
          # presumably an IO or a String - confirm against the upstream pipe.
          #
          # @yieldparam row [Array] one parsed CSV row
          # @return [Enumerator] when called without a block
          def each(&block)
            # Without a block the rows would be parsed and then thrown away;
            # hand back an enumerator instead.
            return enum_for(:each) unless block

            @source_datapipe.each do |path, file|
              stream = @helper.skip_lines(file)
              stream = @helper.decode(stream)
              # CSV.parse (no block) reads the whole input into an array of rows.
              stream = CSV.parse(stream, **@fmtparams)
              stream = @helper.as_tuple(stream)
              @helper.return_path(stream, path: path).each(&block)
            end
          end
        end

        # Processing hooks used by CSVParser. Every hook currently returns its
        # input unchanged; they only mark where each processing step happens.
        class PlainTextReaderHelper
          # @param file [Object] the file handed over by the upstream datapipe
          # @return [Object] `file`, unchanged
          def skip_lines(file)
            file
          end

          # @param stream [Object]
          # @return [Object] `stream`, unchanged
          def decode(stream)
            stream
          end

          # @param stream [Object]
          # @param path [Object, nil] accepted but currently ignored
          # @return [Object] `stream`, unchanged
          def return_path(stream, path: nil)
            stream
          end

          # @param stream [Object]
          # @return [Object] `stream`, unchanged
          def as_tuple(stream)
            stream
          end
        end
      end
    end
  end
end
data/lib/torchdata/data_pipes/iter/util/random_splitter.rb ADDED
@@ -0,0 +1,114 @@
1
module TorchData
  module DataPipes
    module Iter
      module Util
        # Randomly distributes the samples of a source datapipe over several
        # child datapipes according to the given weights.
        #
        # Declared with `functional_datapipe :random_split`; the README calls
        # it as `datapipe.random_split(total_length:, weights:, seed:)`.
        class RandomSplitter < IterDataPipe
          functional_datapipe :random_split

          # Builds the split datapipes. This overrides `new`, so the result is
          # never a RandomSplitter instance.
          #
          # @param source_datapipe [#each] datapipe whose samples are split
          # @param weights [Hash] maps each split key to its relative weight
          # @param seed [Integer] seed passed to `Random.new`
          # @param total_length [Integer, nil] number of samples in the source;
          #   read from `source_datapipe.length` when nil
          # @param target [Object, nil] when given, only the split for this
          #   key is returned
          # @return [Array<SplitterIterator>] one iterator per key of
          #   `weights`, in key order, when `target` is nil
          # @return [SplitterIterator] the iterator for `target` otherwise
          # @raise [TypeError] if `total_length` is nil and the source does not
          #   respond to `length`
          # @raise [KeyError] if `target` is not a key of `weights`
          def self.new(source_datapipe, weights:, seed:, total_length: nil, target: nil)
            if total_length.nil?
              # Ask the object instead of rescuing NoMethodError, which would
              # also hide unrelated NoMethodErrors raised inside `length`.
              unless source_datapipe.respond_to?(:length)
                raise TypeError, "RandomSplitter needs `total_length`, but it is unable to infer it from the `source_datapipe`: #{source_datapipe}."
              end

              total_length = source_datapipe.length
            end

            container = InternalRandomSplitterIterDataPipe.new(source_datapipe, total_length, weights, seed)

            return weights.keys.map { |key| SplitterIterator.new(container, key) } if target.nil?

            unless weights.key?(target)
              raise KeyError, "`target=#{target.inspect}` does not match any key in `weights`."
            end

            SplitterIterator.new(container, target)
          end
        end

        # Shared state behind all SplitterIterators of one split: the source
        # datapipe, the random number generator and the remaining per-key
        # weights.
        #
        # All iterators of a split share this single object, and every
        # SplitterIterator#each resets it, so the splits are meant to be
        # iterated one after another rather than interleaved.
        class InternalRandomSplitterIterDataPipe < IterDataPipe
          attr_reader :source_datapipe

          # @param source_datapipe [#each] datapipe whose samples are split
          # @param total_length [Integer] number of samples expected from the source
          # @param weights [Hash] maps each split key to its relative weight
          # @param seed [Integer] seed passed to `Random.new`
          def initialize(source_datapipe, total_length, weights, seed)
            @source_datapipe = source_datapipe
            @total_length = total_length
            @remaining_length = @total_length
            @seed = seed
            @keys = weights.keys
            @key_to_index = @keys.each_with_index.to_h
            # Weights are scaled so that they sum to total_length, i.e. each
            # weight is the expected number of samples for its key.
            @norm_weights = self.class.normalize_weights(@keys.map { |k| weights[k] }, total_length)
            @weights = @norm_weights.dup
            @rng = Random.new(@seed)
            @lengths = []
          end

          # Picks the key for the next sample, with probability proportional
          # to the remaining weight of each key, and decrements that weight.
          #
          # NOTE(review): when more samples are drawn than `total_length`, the
          # remaining weights can reach zero (and NaN after renormalizing), in
          # which case the last key is chosen - confirm whether that should
          # raise instead.
          #
          # @return [Object] the selected key
          def draw
            selected_key = choices(@rng, @keys, @weights)
            index = @key_to_index[selected_key]
            @weights[index] -= 1
            @remaining_length -= 1
            if @weights[index] < 0
              # The key is used up: clamp it to zero and rescale the others so
              # they again sum to the number of samples still expected.
              @weights[index] = 0
              @weights = self.class.normalize_weights(@weights, @remaining_length)
            end
            selected_key
          end

          # Scales `weights` so that they sum to `total_length`.
          #
          # @param weights [Array<Numeric>]
          # @param total_length [Numeric]
          # @return [Array<Float>]
          def self.normalize_weights(weights, total_length)
            total_weight = weights.sum
            weights.map { |w| w.to_f * total_length / total_weight }
          end

          # Restores the initial state (generator, weights, remaining length)
          # so that a new pass reproduces the same assignment for the
          # current seed.
          def reset
            @rng = Random.new(@seed)
            @weights = @norm_weights.dup
            @remaining_length = @total_length
          end

          # Stores a new seed; it takes effect at the next `reset`.
          #
          # @param seed [Integer]
          # @return [self]
          def override_seed(seed)
            @seed = seed
            self
          end

          # Not implemented yet.
          #
          # @raise [RuntimeError] always
          def get_length(target)
            raise "todo"
          end

          private

          # Weighted random choice: returns the key whose cumulative weight
          # interval contains a uniformly drawn point. Falls back to the last
          # key when no interval matches.
          def choices(rng, keys, weights)
            total = weights.sum
            x = rng.rand * total
            weights.each_with_index do |w, i|
              return keys[i] if x < w
              x -= w
            end
            keys[-1]
          end
        end

        # Datapipe for a single split: yields the source samples that the
        # shared container assigns to `target`.
        class SplitterIterator < IterDataPipe
          # @param main_datapipe [InternalRandomSplitterIterDataPipe] shared split state
          # @param target [Object] key of the split this iterator yields
          def initialize(main_datapipe, target)
            @main_datapipe = main_datapipe
            @target = target
          end

          # Resets the shared container, then walks the whole source and
          # yields the samples drawn for this iterator's key.
          #
          # @yieldparam sample [Object]
          # @return [Enumerator] when called without a block
          def each
            return enum_for(:each) unless block_given?

            @main_datapipe.reset
            @main_datapipe.source_datapipe.each do |sample|
              yield sample if @main_datapipe.draw == @target
            end
          end

          # Changes the seed used from the next iteration on. The seed lives
          # in the shared container, so it applies to every split of it.
          #
          # @param seed [Integer]
          # @return [self] this iterator, so that chained calls stay on the
          #   split instead of the internal container
          def override_seed(seed)
            @main_datapipe.override_seed(seed)
            self
          end

          # Delegates to the container's `get_length`, which is not
          # implemented yet and currently always raises.
          def length
            @main_datapipe.get_length(@target)
          end
        end
      end
    end
  end
end
data/lib/torchdata/version.rb ADDED
@@ -0,0 +1,3 @@
1
module TorchData
  # Version of the torchdata gem. Frozen so the constant cannot be mutated
  # in place.
  VERSION = "0.0.1".freeze
end
data/lib/torchdata.rb ADDED
@@ -0,0 +1,23 @@
1
# dependencies
require "torch"

# stdlib
require "csv"

# modules
require_relative "torchdata/version"

module TorchData
  # Library-level error class.
  class Error < StandardError; end

  module DataPipes
    # Iterable-style datapipes. The constants below alias the torch-rb
    # classes so they can be referenced under the TorchData namespace.
    module Iter
      IterDataPipe = Torch::Utils::Data::DataPipes::IterDataPipe
      FileLister = Torch::Utils::Data::DataPipes::Iter::FileLister
      FileOpener = Torch::Utils::Data::DataPipes::Iter::FileOpener
    end
  end
end

# Required after the aliases above are defined: the classes in both files
# subclass IterDataPipe, which they reference without a namespace prefix.
require_relative "torchdata/data_pipes/iter/util/csv_parser"
require_relative "torchdata/data_pipes/iter/util/random_splitter"
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: torchdata
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Kane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-01-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: torch-rb
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.12.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.12.2
27
+ description:
28
+ email: andrew@ankane.org
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - CHANGELOG.md
34
+ - LICENSE.txt
35
+ - README.md
36
+ - lib/torchdata.rb
37
+ - lib/torchdata/data_pipes/iter/util/csv_parser.rb
38
+ - lib/torchdata/data_pipes/iter/util/random_splitter.rb
39
+ - lib/torchdata/version.rb
40
+ homepage: https://github.com/ankane/torchdata-ruby
41
+ licenses:
42
+ - BSD-3-Clause
43
+ metadata: {}
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '2.7'
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ requirements: []
59
+ rubygems_version: 3.4.1
60
+ signing_key:
61
+ specification_version: 4
62
+ summary: Composable data loading for Ruby
63
+ test_files: []