torchdata 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 3417d94393b7c9636a0c3e0935ecc62ddb2306ff51dcaf3a91f7118859ce8154
4
+ data.tar.gz: 3ad87bb333a0103db4ec5ff861f088d33471029dc5ae8739e6af27f325efe907
5
+ SHA512:
6
+ metadata.gz: e3be548b4ee8575da3b9ed68b748bbf9b6f472472c903529ffaf65703582a9ee0e2c4f827e6383b318e12dfe4200b2533a118884beff406d6b98cdfaff886f19
7
+ data.tar.gz: '0797aabe0fa0975545f2650960012badccf7458208c2e0cbbcad88869000cafcce4465e389536ec3554f9d22f0322dea0ae674453e8887787fc3758ba14fa142'
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ## 0.0.1 (2023-01-30)
2
+
3
+ - First release
data/LICENSE.txt ADDED
@@ -0,0 +1,30 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2021-present, Facebook, Inc.
4
+ Copyright (c) 2023, Andrew Kane.
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from
19
+ this software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,55 @@
1
+ # TorchData Ruby
2
+
3
+ Composable data loading for Ruby
4
+
5
+ [![Build Status](https://github.com/ankane/torchdata-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/torchdata-ruby/actions)
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application’s Gemfile:
10
+
11
+ ```ruby
12
+ gem "torchdata"
13
+ ```
14
+
15
+ ## Getting Started
16
+
17
+ This library follows the [Python API](https://pytorch.org/data/). Many methods and options are missing at the moment. PRs welcome!
18
+
19
+ ```ruby
20
+ folder = "path/to/csv/folder"
21
+ datapipe = TorchData::DataPipes::Iter::FileLister.new([folder]).filter { |filename| filename.end_with?(".csv") }
22
+ datapipe = TorchData::DataPipes::Iter::FileOpener.new(datapipe, mode: "rt")
23
+ datapipe = datapipe.parse_csv(delimiter: ",")
24
+ train, valid = datapipe.random_split(total_length: 10000, weights: {train: 0.5, valid: 0.5}, seed: 0)
25
+
26
+ train.each do |x|
27
+ # code
28
+ end
29
+
30
+ valid.each do |y|
31
+ # code
32
+ end
33
+ ```
34
+
35
+ ## History
36
+
37
+ View the [changelog](CHANGELOG.md)
38
+
39
+ ## Contributing
40
+
41
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
42
+
43
+ - [Report bugs](https://github.com/ankane/torchdata-ruby/issues)
44
+ - Fix bugs and [submit pull requests](https://github.com/ankane/torchdata-ruby/pulls)
45
+ - Write, clarify, or fix documentation
46
+ - Suggest or add new features
47
+
48
+ To get started with development:
49
+
50
+ ```sh
51
+ git clone https://github.com/ankane/torchdata-ruby.git
52
+ cd torchdata-ruby
53
+ bundle install
54
+ bundle exec rake test
55
+ ```
@@ -0,0 +1,45 @@
1
+ module TorchData
2
+ module DataPipes
3
+ module Iter
4
+ module Util
5
# Datapipe that parses the files produced by a source datapipe as CSV and
# hands the parsed rows to the caller one at a time.
#
# Declared with +functional_datapipe :parse_csv+ (the README calls it as
# +datapipe.parse_csv(delimiter: ",")+).
class CSVParser < IterDataPipe
  functional_datapipe :parse_csv

  # @param source_datapipe [#each] datapipe yielding +path, file+ pairs
  # @param delimiter [String] column separator, forwarded to CSV as +col_sep+
  def initialize(source_datapipe, delimiter: ",")
    @source_datapipe = source_datapipe
    @helper = PlainTextReaderHelper.new
    @fmtparams = {col_sep: delimiter}
  end

  # Parses every file of the source datapipe and passes each parsed row to
  # the block.
  #
  # @yield [row] one parsed CSV row
  # @return [Enumerator] when called without a block
  def each(&block)
    # Without a block there is nobody to receive the rows; return a lazy
    # Enumerator instead of parsing every file and discarding the result.
    return to_enum(:each) unless block

    @source_datapipe.each do |path, file|
      # The helper hooks are applied in a fixed order around the CSV parse.
      stream = @helper.skip_lines(file)
      stream = @helper.decode(stream)
      stream = CSV.parse(stream, **@fmtparams)
      stream = @helper.as_tuple(stream)
      @helper.return_path(stream, path: path).each(&block)
    end
  end
end
24
+
25
# Set of hooks that plain-text parsers run over each stream. Every hook
# currently returns its argument unchanged.
#
# The methods are listed in the order CSVParser#each invokes them.
class PlainTextReaderHelper
  # Hook applied to the raw file before decoding.
  #
  # @return the +file+ argument, untouched
  def skip_lines(file)
    file
  end

  # Hook applied after +skip_lines+ and before parsing.
  #
  # @return the +stream+ argument, untouched
  def decode(stream)
    stream
  end

  # Hook applied to the parsed rows.
  #
  # @return the +stream+ argument, untouched
  def as_tuple(stream)
    stream
  end

  # Final hook; receives the originating +path+ but does not use it.
  #
  # @return the +stream+ argument, untouched
  def return_path(stream, path: nil)
    stream
  end
end
42
+ end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,114 @@
1
+ module TorchData
2
+ module DataPipes
3
+ module Iter
4
+ module Util
5
# Randomly distributes the samples of a source datapipe over several child
# datapipes, one per key of +weights+.
#
# Declared with +functional_datapipe :random_split+ (the README calls it as
# +datapipe.random_split(...)+).
class RandomSplitter < IterDataPipe
  functional_datapipe :random_split

  # Builds the split datapipes. This overrides +new+ and never returns a
  # RandomSplitter instance.
  #
  # @param source_datapipe datapipe whose samples are split
  # @param weights [Hash] maps each split key to its relative weight
  # @param seed seed forwarded to the internal splitter datapipe
  # @param total_length [Integer, nil] number of samples in the source; when
  #   nil it is read from +source_datapipe.length+
  # @param target [Object, nil] key of a single split to return; when nil all
  #   splits are returned
  # @return [Array<SplitterIterator>] one iterator per key, in +weights+
  #   order, when +target+ is nil
  # @return [SplitterIterator] the iterator for +target+ otherwise
  # @raise [TypeError] if +total_length+ is nil and asking the source for its
  #   length raises NoMethodError
  # @raise [KeyError] if +target+ is given but is not a key of +weights+
  def self.new(source_datapipe, weights:, seed:, total_length: nil, target: nil)
    if total_length.nil?
      begin
        total_length = source_datapipe.length
      rescue NoMethodError
        raise TypeError, "RandomSplitter needs `total_length`, but it is unable to infer it from the `source_datapipe`: #{source_datapipe}."
      end
    end

    # All returned iterators share this container, which owns the RNG state.
    container = InternalRandomSplitterIterDataPipe.new(source_datapipe, total_length, weights, seed)

    return weights.keys.map { |key| SplitterIterator.new(container, key) } if target.nil?

    unless weights.key?(target)
      raise KeyError, "`target=#{target.inspect}` does not match any key in `weights`."
    end

    SplitterIterator.new(container, target)
  end
end
26
+
27
# Shared state behind the iterators produced by RandomSplitter: it holds the
# source datapipe, the seeded random number generator and the per-key weights
# that decide which split each sample goes to.
class InternalRandomSplitterIterDataPipe < IterDataPipe
  attr_reader :source_datapipe

  # @param source_datapipe datapipe whose samples are split
  # @param total_length [Integer] expected number of samples in the source
  # @param weights [Hash] maps each split key to its relative weight
  # @param seed seed passed to +Random.new+
  def initialize(source_datapipe, total_length, weights, seed)
    @source_datapipe = source_datapipe
    @total_length = total_length
    @remaining_length = @total_length
    @seed = seed
    @keys = weights.keys
    @key_to_index = @keys.map.with_index.to_h
    # Weights are rescaled so they sum to total_length, i.e. each weight is
    # the expected number of samples for its key.
    @norm_weights = self.class.normalize_weights(@keys.map { |k| weights[k] }, total_length)
    @weights = @norm_weights.dup
    @rng = Random.new(@seed)
    # Cache of per-key split lengths, filled lazily by get_length.
    @lengths = []
  end

  # Picks the split key for the next sample and updates the remaining
  # weights so that each key's share shrinks as it receives samples.
  #
  # @return the selected key
  def draw
    selected_key = choices(@rng, @keys, @weights)
    index = @key_to_index[selected_key]
    @weights[index] -= 1
    @remaining_length -= 1
    if @weights[index] < 0
      # The key has used up its share: clamp it to zero and rescale the
      # remaining weights to the number of samples still expected.
      @weights[index] = 0
      @weights = self.class.normalize_weights(@weights, @remaining_length)
    end
    selected_key
  end

  # Rescales +weights+ so that they sum to +total_length+.
  #
  # NOTE: the division is done in floating point, so a zero total weight
  # yields NaN/Infinity entries rather than raising.
  #
  # @param weights [Array<Numeric>]
  # @param total_length [Numeric]
  # @return [Array<Float>]
  def self.normalize_weights(weights, total_length)
    total_weight = weights.sum
    weights.map { |w| w.to_f * total_length / total_weight }
  end

  # Restores the generator, weights and remaining length to their initial
  # state, using the current seed.
  def reset
    @rng = Random.new(@seed)
    @weights = @norm_weights.dup
    @remaining_length = @total_length
  end

  # Replaces the seed. The generator itself is only rebuilt by +reset+.
  #
  # @return [self]
  def override_seed(seed)
    @seed = seed
    self
  end

  # Number of samples assigned to the split identified by +target+.
  #
  # The lengths are only known up front when every normalized weight is a
  # whole number and they add up to +total_length+. The result also presumes
  # that the source really yields +total_length+ samples.
  #
  # @param target key of the split
  # @return [Integer]
  # @raise [TypeError] if the split lengths cannot be determined in advance
  # @raise [KeyError] if +target+ is not one of the split keys
  def get_length(target)
    if @lengths.empty?
      whole = @norm_weights.all? { |w| w.finite? && w == w.floor }
      unless whole && @norm_weights.sum == @total_length
        raise TypeError, "Lengths of the split cannot be known in advance. Please supply integer `weights` that sum up to `total_length`."
      end
      @lengths = @norm_weights.map(&:to_i)
    end
    @lengths.fetch(@key_to_index.fetch(target))
  end

  private

  # Weighted random choice: scales one uniform random number to the total
  # weight and walks the weights until the number falls inside a key's span.
  # Falls back to the last key if no span matched.
  def choices(rng, keys, weights)
    total = weights.sum
    x = rng.rand * total
    weights.each_with_index do |w, i|
      return keys[i] if x < w
      x -= w
    end
    keys[-1]
  end
end
87
+
88
# One split of a randomly split datapipe. It iterates the shared source and
# keeps only the samples that the shared internal datapipe assigns to its
# target key.
class SplitterIterator < IterDataPipe
  # @param main_datapipe shared internal splitter datapipe; must respond to
  #   +reset+, +draw+, +source_datapipe+, +override_seed+ and +get_length+
  # @param target key identifying this split
  def initialize(main_datapipe, target)
    @main_datapipe = main_datapipe
    @target = target
  end

  # Yields the samples belonging to this split.
  #
  # @yield [sample]
  # @return [Enumerator] when called without a block
  def each
    # A bare +yield+ without a block would raise LocalJumpError on the first
    # matching sample, so hand back an Enumerator instead.
    return to_enum(:each) unless block_given?

    # Every pass restarts the random sequence from the seed.
    @main_datapipe.reset
    @main_datapipe.source_datapipe.each do |sample|
      yield sample if @main_datapipe.draw == @target
    end
  end

  # Changes the seed of the shared internal datapipe.
  #
  # @return [self] so calls can be chained on this iterator rather than on
  #   the internal datapipe
  def override_seed(seed)
    @main_datapipe.override_seed(seed)
    self
  end

  # Delegates to the shared datapipe's +get_length+ for this split's key.
  def length
    @main_datapipe.get_length(@target)
  end
end
111
+ end
112
+ end
113
+ end
114
+ end
@@ -0,0 +1,3 @@
1
module TorchData
  # Version of the gem. Frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/torchdata.rb ADDED
@@ -0,0 +1,23 @@
1
+ # dependencies
2
+ require "torch"
3
+
4
+ # stdlib
5
+ require "csv"
6
+
7
+ # modules
8
+ require_relative "torchdata/version"
9
+
10
+ module TorchData
11
+ class Error < StandardError; end
12
+
13
+ module DataPipes
14
+ module Iter
15
+ IterDataPipe = Torch::Utils::Data::DataPipes::IterDataPipe
16
+ FileLister = Torch::Utils::Data::DataPipes::Iter::FileLister
17
+ FileOpener = Torch::Utils::Data::DataPipes::Iter::FileOpener
18
+ end
19
+ end
20
+ end
21
+
22
+ require_relative "torchdata/data_pipes/iter/util/csv_parser"
23
+ require_relative "torchdata/data_pipes/iter/util/random_splitter"
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: torchdata
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Kane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-01-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: torch-rb
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.12.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.12.2
27
+ description:
28
+ email: andrew@ankane.org
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - CHANGELOG.md
34
+ - LICENSE.txt
35
+ - README.md
36
+ - lib/torchdata.rb
37
+ - lib/torchdata/data_pipes/iter/util/csv_parser.rb
38
+ - lib/torchdata/data_pipes/iter/util/random_splitter.rb
39
+ - lib/torchdata/version.rb
40
+ homepage: https://github.com/ankane/torchdata-ruby
41
+ licenses:
42
+ - BSD-3-Clause
43
+ metadata: {}
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '2.7'
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ requirements: []
59
+ rubygems_version: 3.4.1
60
+ signing_key:
61
+ specification_version: 4
62
+ summary: Composable data loading for Ruby
63
+ test_files: []