RubyGems - torchdata - Versions diffs - 0.0.1 - Mend

torchdata 0.0.1

Files changed (9) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/LICENSE.txt +30 -0
data/README.md +55 -0
data/lib/torchdata/data_pipes/iter/util/csv_parser.rb +45 -0
data/lib/torchdata/data_pipes/iter/util/random_splitter.rb +114 -0
data/lib/torchdata/version.rb +3 -0
data/lib/torchdata.rb +23 -0
metadata +63 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 3417d94393b7c9636a0c3e0935ecc62ddb2306ff51dcaf3a91f7118859ce8154
+  data.tar.gz: 3ad87bb333a0103db4ec5ff861f088d33471029dc5ae8739e6af27f325efe907
+SHA512:
+  metadata.gz: e3be548b4ee8575da3b9ed68b748bbf9b6f472472c903529ffaf65703582a9ee0e2c4f827e6383b318e12dfe4200b2533a118884beff406d6b98cdfaff886f19
+  data.tar.gz: '0797aabe0fa0975545f2650960012badccf7458208c2e0cbbcad88869000cafcce4465e389536ec3554f9d22f0322dea0ae674453e8887787fc3758ba14fa142'

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,3 @@
+## 0.0.1 (2023-01-30)
+- First release

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,30 @@
+BSD 3-Clause License
+Copyright (c) 2021-present, Facebook, Inc.
+Copyright (c) 2023, Andrew Kane.
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

data/README.md ADDED Viewed

@@ -0,0 +1,55 @@
+# TorchData Ruby
+Composable data loading for Ruby
+[![Build Status](https://github.com/ankane/torchdata-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/torchdata-ruby/actions)
+## Installation
+Add this line to your application’s Gemfile:
+```ruby
+gem "torchdata"
+```
+## Getting Started
+This library follows the [Python API](https://pytorch.org/data/). Many methods and options are missing at the moment. PRs welcome!
+```ruby
+folder = "path/to/csv/folder"
+datapipe = TorchData::DataPipes::Iter::FileLister.new([folder]).filter { |filename| filename.end_with?(".csv") }
+datapipe = TorchData::DataPipes::Iter::FileOpener.new(datapipe, mode: "rt")
+datapipe = datapipe.parse_csv(delimiter: ",")
+train, valid = datapipe.random_split(total_length: 10000, weights: {train: 0.5, valid: 0.5}, seed: 0)
+train.each do |x|
+  # code
+end
+valid.each do |y|
+  # code
+end
+```
+## History
+View the [changelog](CHANGELOG.md)
+## Contributing
+Everyone is encouraged to help improve this project. Here are a few ways you can help:
+- [Report bugs](https://github.com/ankane/torchdata-ruby/issues)
+- Fix bugs and [submit pull requests](https://github.com/ankane/torchdata-ruby/pulls)
+- Write, clarify, or fix documentation
+- Suggest or add new features
+To get started with development:
+```sh
+git clone https://github.com/ankane/torchdata-ruby.git
+cd torchdata-ruby
+bundle install
+bundle exec rake test
+```

data/lib/torchdata/data_pipes/iter/util/csv_parser.rb ADDED Viewed

@@ -0,0 +1,45 @@
+module TorchData
+  module DataPipes
+    module Iter
+      module Util
+        class CSVParser < IterDataPipe
+          functional_datapipe :parse_csv
+          def initialize(source_datapipe, delimiter: ",")
+            @source_datapipe = source_datapipe
+            @helper = PlainTextReaderHelper.new
+            @fmtparams = {col_sep: delimiter}
+          end
+          def each(&block)
+            @source_datapipe.each do |path, file|
+              stream = @helper.skip_lines(file)
+              stream = @helper.decode(stream)
+              stream = CSV.parse(stream, **@fmtparams)
+              stream = @helper.as_tuple(stream)
+              @helper.return_path(stream, path: path).each(&block)
+            end
+          end
+        end
+        class PlainTextReaderHelper
+          def skip_lines(file)
+            file
+          end
+          def decode(stream)
+            stream
+          end
+          def return_path(stream, path: nil)
+            stream
+          end
+          def as_tuple(stream)
+            stream
+          end
+        end
+      end
+    end
+  end
+end

data/lib/torchdata/data_pipes/iter/util/random_splitter.rb ADDED Viewed

@@ -0,0 +1,114 @@
+module TorchData
+  module DataPipes
+    module Iter
+      module Util
+        class RandomSplitter < IterDataPipe
+          functional_datapipe :random_split
+          def self.new(source_datapipe, weights:, seed:, total_length: nil, target: nil)
+            if total_length.nil?
+              begin
+                total_length = source_datapipe.length
+              rescue NoMethodError
+                raise TypeError, "RandomSplitter needs `total_length`, but it is unable to infer it from the `source_datapipe`: #{source_datapipe}."
+              end
+            end
+            container = InternalRandomSplitterIterDataPipe.new(source_datapipe, total_length, weights, seed)
+            if target.nil?
+              weights.map { |k, _| SplitterIterator.new(container, k) }
+            else
+              raise "todo"
+            end
+          end
+        end
+        class InternalRandomSplitterIterDataPipe < IterDataPipe
+          attr_reader :source_datapipe
+          def initialize(source_datapipe, total_length, weights, seed)
+            @source_datapipe = source_datapipe
+            @total_length = total_length
+            @remaining_length = @total_length
+            @seed = seed
+            @keys = weights.keys
+            @key_to_index = @keys.map.with_index.to_h
+            @norm_weights = self.class.normalize_weights(@keys.map { |k| weights[k] }, total_length)
+            @weights = @norm_weights.dup
+            @rng = Random.new(@seed)
+            @lengths = []
+          end
+          def draw
+            selected_key = choices(@rng, @keys, @weights)
+            index = @key_to_index[selected_key]
+            @weights[index] -= 1
+            @remaining_length -= 1
+            if @weights[index] < 0
+              @weights[index] = 0
+              @weights = self.class.normalize_weights(@weights, @remaining_length)
+            end
+            selected_key
+          end
+          def self.normalize_weights(weights, total_length)
+            total_weight = weights.sum
+            weights.map { |w| w.to_f * total_length / total_weight }
+          end
+          def reset
+            @rng = Random.new(@seed)
+            @weights = @norm_weights.dup
+            @remaining_length = @total_length
+          end
+          def override_seed(seed)
+            @seed = seed
+            self
+          end
+          def get_length(target)
+            raise "todo"
+          end
+          private
+          def choices(rng, keys, weights)
+            total = weights.sum
+            x = rng.rand * total
+            weights.each_with_index do |w, i|
+              return keys[i] if x < w
+              x -= w
+            end
+            keys[-1]
+          end
+        end
+        class SplitterIterator < IterDataPipe
+          def initialize(main_datapipe, target)
+            @main_datapipe = main_datapipe
+            @target = target
+          end
+          def each
+            @main_datapipe.reset
+            @main_datapipe.source_datapipe.each do |sample|
+              if @main_datapipe.draw == @target
+                yield sample
+              end
+            end
+          end
+          def override_seed(seed)
+            @main_datapipe.override_seed(seed)
+          end
+          def length
+            @main_datapipe.get_length(@target)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/torchdata/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module TorchData
+  VERSION = "0.0.1"
+end

data/lib/torchdata.rb ADDED Viewed

@@ -0,0 +1,23 @@
+# dependencies
+require "torch"
+# stdlib
+require "csv"
+# modules
+require_relative "torchdata/version"
+module TorchData
+  class Error < StandardError; end
+  module DataPipes
+    module Iter
+      IterDataPipe = Torch::Utils::Data::DataPipes::IterDataPipe
+      FileLister = Torch::Utils::Data::DataPipes::Iter::FileLister
+      FileOpener = Torch::Utils::Data::DataPipes::Iter::FileOpener
+    end
+  end
+end
+require_relative "torchdata/data_pipes/iter/util/csv_parser"
+require_relative "torchdata/data_pipes/iter/util/random_splitter"

metadata ADDED Viewed

@@ -0,0 +1,63 @@
+--- !ruby/object:Gem::Specification
+name: torchdata
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Andrew Kane
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2023-01-31 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: torch-rb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.12.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.12.2
+description:
+email: andrew@ankane.org
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- CHANGELOG.md
+- LICENSE.txt
+- README.md
+- lib/torchdata.rb
+- lib/torchdata/data_pipes/iter/util/csv_parser.rb
+- lib/torchdata/data_pipes/iter/util/random_splitter.rb
+- lib/torchdata/version.rb
+homepage: https://github.com/ankane/torchdata-ruby
+licenses:
+- BSD-3-Clause
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '2.7'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.4.1
+signing_key:
+specification_version: 4
+summary: Composable data loading for Ruby
+test_files: []