torchtext 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +34 -1
- data/lib/torchtext.rb +4 -0
- data/lib/torchtext/data/metrics.rb +68 -0
- data/lib/torchtext/datasets/text_classification.rb +1 -1
- data/lib/torchtext/nn/in_proj_container.rb +16 -0
- data/lib/torchtext/nn/multihead_attention_container.rb +50 -0
- data/lib/torchtext/nn/scaled_dot_product.rb +72 -0
- data/lib/torchtext/version.rb +1 -1
- metadata +11 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d5c6ff21a492fa03ce88ed7b823d29310e67b3d0c86825ccd2b6092f330bdbc1
+  data.tar.gz: 7df74bb05bb110ae37e9d962adc9dc020088203ed1b9e920fe3c4e9421f8e714
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e2d0da285afa14f72380ad688ce63c316a6e46e0cceb7f332b15d298253ffe05d9f6bf9244aa630bc20719a794fe6b316ac7861788e7775ac104e7741dcf167b
+  data.tar.gz: '0202023381da0eefdd6c27587ad1798e9052a8798d3d32f8e71b19d7a5303aaf5a01f16c77d486d1d8249ef89cf4009f146855f6e3d1f4cf2c7e3fc9e66ce830'
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -2,6 +2,8 @@
 
 :fire: Data loaders and abstractions for text and NLP - for Ruby
 
+[](https://github.com/ankane/torchtext/actions)
+
 ## Installation
 
 Add this line to your application’s Gemfile:
@@ -19,7 +21,7 @@ This library follows the [Python API](https://pytorch.org/text/). Many methods a
 Text classification
 
 - [PyTorch tutorial](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html)
-- [Ruby code](examples/text_classification)
+- [Ruby code](examples/text_classification.rb)
 
 ## Datasets
 
@@ -33,6 +35,37 @@ Supported datasets are:
 
 - [AG_NEWS](http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html)
 
+## Data Utils
+
+Supports:
+
+- tokenizer
+- ngrams_iterator
+
+## Data Metrics
+
+Compute the BLEU score
+
+```ruby
+candidate_corpus = [["My", "full", "pytorch", "test"], ["Another", "Sentence"]]
+references_corpus = [[["My", "full", "pytorch", "test"], ["Completely", "Different"]], [["No", "Match"]]]
+TorchText::Data::Metrics.bleu_score(candidate_corpus, references_corpus)
+```
+
+## NN
+
+Supports:
+
+- InProjContainer
+- MultiheadAttentionContainer
+- ScaledDotProduct
+
+## Vocab
+
+Supports:
+
+- Vocab
+
 ## Disclaimer
 
 This library downloads and prepares public datasets. We don’t host any datasets. Be sure to adhere to the license for each dataset.
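As a quick illustration of the "Data Utils" entries listed above, here is a minimal sketch. The `TorchText::Data::Utils.tokenizer("basic_english")` factory name is an assumption modeled on the Python API; `TorchText::Data::Utils.ngrams_iterator(tokens, max_n)` is the call the new metrics.rb file below actually uses, and it yields space-joined n-gram strings.

```ruby
require "torchtext"

# assumed helper: returns a callable basic English tokenizer (Python API analogue)
tokenizer = TorchText::Data::Utils.tokenizer("basic_english")
tokens = tokenizer.call("You can now install TorchText for Ruby")

# grounded by metrics.rb: enumerate unigrams and bigrams as strings
TorchText::Data::Utils.ngrams_iterator(tokens, 2).each { |ngram| puts ngram }
```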
data/lib/torchtext.rb
CHANGED
@@ -9,8 +9,12 @@ require "set"
 
 # modules
 require "torchtext/data/utils"
+require "torchtext/data/metrics"
 require "torchtext/datasets/text_classification"
 require "torchtext/datasets/text_classification_dataset"
+require "torchtext/nn/in_proj_container"
+require "torchtext/nn/multihead_attention_container"
+require "torchtext/nn/scaled_dot_product"
 require "torchtext/vocab"
 require "torchtext/version"
 
data/lib/torchtext/data/metrics.rb
ADDED
@@ -0,0 +1,68 @@
+module TorchText
+  module Data
+    module Metrics
+      class << self
+        def bleu_score(candidate_corpus, references_corpus, max_n: 4, weights: [0.25] * 4)
+          unless max_n == weights.length
+            raise "Length of the \"weights\" list has be equal to max_n"
+          end
+          unless candidate_corpus.length == references_corpus.length
+            raise "The length of candidate and reference corpus should be the same"
+          end
+
+          clipped_counts = Torch.zeros(max_n)
+          total_counts = Torch.zeros(max_n)
+          weights = Torch.tensor(weights)
+
+          candidate_len = 0.0
+          refs_len = 0.0
+
+          candidate_corpus.zip(references_corpus) do |candidate, refs|
+            candidate_len += candidate.length
+
+            # Get the length of the reference that's closest in length to the candidate
+            refs_len_list = refs.map { |ref| ref.length.to_f }
+            refs_len += refs_len_list.min_by { |x| (candidate.length - x).abs }
+
+            reference_counters = compute_ngram_counter(refs[0], max_n)
+            refs[1..-1].each do |ref|
+              reference_counters = reference_counters.merge(compute_ngram_counter(ref, max_n)) { |_, v1, v2| v1 > v2 ? v1 : v2 }
+            end
+
+            candidate_counter = compute_ngram_counter(candidate, max_n)
+
+            shared_keys = candidate_counter.keys & reference_counters.keys
+            clipped_counter = candidate_counter.slice(*shared_keys).merge(reference_counters.slice(*shared_keys)) { |_, v1, v2| v1 < v2 ? v1 : v2 }
+
+            clipped_counter.each_key do |ngram|
+              clipped_counts[ngram.length - 1] += clipped_counter[ngram]
+            end
+
+            candidate_counter.each_key do |ngram|
+              total_counts[ngram.length - 1] += candidate_counter[ngram]
+            end
+          end
+
+          if clipped_counts.to_a.min == 0
+            0.0
+          else
+            pn = clipped_counts / total_counts
+            log_pn = weights * Torch.log(pn)
+            score = Torch.exp(log_pn.sum)
+
+            bp = Math.exp([1 - refs_len / candidate_len, 0].min)
+
+            bp * score.item
+          end
+        end
+
+        private
+
+        def compute_ngram_counter(tokens, max_n)
+          raise "Failed assert" unless max_n > 0
+          Hash[TorchText::Data::Utils.ngrams_iterator(tokens, max_n).map { |x| x.split(" ") }.group_by { |v| v }.map { |k, v| [k, v.size] }]
+        end
+      end
+    end
+  end
+end
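The README example earlier uses the default arguments; the signature above also exposes `max_n` and `weights`, which must have matching lengths or the method raises. A small sketch with illustrative values:

```ruby
# Bigram-only BLEU: max_n must equal weights.length
candidate_corpus = [["the", "cat", "sat"]]
references_corpus = [[["the", "cat", "sat", "down"]]]

TorchText::Data::Metrics.bleu_score(candidate_corpus, references_corpus, max_n: 2, weights: [0.5, 0.5])
```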
data/lib/torchtext/nn/in_proj_container.rb
ADDED
@@ -0,0 +1,16 @@
+module TorchText
+  module NN
+    class InProjContainer < Torch::NN::Module
+      def initialize(query_proj, key_proj, value_proj)
+        super()
+        @query_proj = query_proj
+        @key_proj = key_proj
+        @value_proj = value_proj
+      end
+
+      def forward(query, key, value)
+        [@query_proj.call(query), @key_proj.call(key), @value_proj.call(value)]
+      end
+    end
+  end
+end
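`InProjContainer` simply applies one projection per input and returns the triple, so any callable torch-rb modules work. A minimal sketch, assuming `Torch::NN::Linear` projections and an illustrative embed_dim of 10:

```ruby
embed_dim = 10

container = TorchText::NN::InProjContainer.new(
  Torch::NN::Linear.new(embed_dim, embed_dim),
  Torch::NN::Linear.new(embed_dim, embed_dim),
  Torch::NN::Linear.new(embed_dim, embed_dim)
)

# shapes: (seq_len, batch_size, embed_dim)
query = Torch.rand(21, 16, embed_dim)
key = value = Torch.rand(16, 16, embed_dim)
q, k, v = container.call(query, key, value)
```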
data/lib/torchtext/nn/multihead_attention_container.rb
ADDED
@@ -0,0 +1,50 @@
+module TorchText
+  module NN
+    class MultiheadAttentionContainer < Torch::NN::Module
+      def initialize(nhead, in_proj_container, attention_layer, out_proj, batch_first: false)
+        super()
+        @nhead = nhead
+        @in_proj_container = in_proj_container
+        @attention_layer = attention_layer
+        @out_proj = out_proj
+        @batch_first = batch_first
+      end
+
+      def forward(query, key, value, attn_mask: nil, bias_k: nil, bias_v: nil)
+        if @batch_first
+          query, key, value = query.transpose(-3, -2), key.transpose(-3, -2), value.transpose(-3, -2)
+        end
+
+        tgt_len, src_len, bsz, embed_dim = query.size(-3), key.size(-3), query.size(-2), query.size(-1)
+        q, k, v = @in_proj_container.call(query, key, value)
+        unless q.size(-1) % @nhead == 0
+          raise "query's embed_dim must be divisible by the number of heads"
+        end
+        head_dim = q.size(-1).div(@nhead)
+        q = q.reshape(tgt_len, bsz * @nhead, head_dim)
+
+        unless k.size(-1) % @nhead == 0
+          raise "key's embed_dim must be divisible by the number of heads"
+        end
+        head_dim = k.size(-1).div(@nhead)
+        k = k.reshape(src_len, bsz * @nhead, head_dim)
+
+        unless v.size(-1) % @nhead == 0
+          raise "value's embed_dim must be divisible by the number of heads"
+        end
+        head_dim = v.size(-1).div(@nhead)
+        v = v.reshape(src_len, bsz * @nhead, head_dim)
+
+        attn_output, attn_output_weights = @attention_layer.call(q, k, v, attn_mask: attn_mask, bias_k: bias_k, bias_v: bias_v)
+        attn_output = attn_output.reshape(tgt_len, bsz, embed_dim)
+        attn_output = @out_proj.call(attn_output)
+
+        if @batch_first
+          attn_output = attn_output.transpose(-3, -2)
+        end
+
+        [attn_output, attn_output_weights]
+      end
+    end
+  end
+end
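The container expects an in-projection, an attention layer, and an output projection; it splits the projected tensors into `nhead` heads before calling the attention layer and merges them back afterwards. A minimal assembly sketch with assumed dimensions (embed_dim divisible by nhead, inputs shaped (seq_len, batch_size, embed_dim)):

```ruby
embed_dim, nhead = 10, 5

mha = TorchText::NN::MultiheadAttentionContainer.new(
  nhead,
  TorchText::NN::InProjContainer.new(
    Torch::NN::Linear.new(embed_dim, embed_dim),
    Torch::NN::Linear.new(embed_dim, embed_dim),
    Torch::NN::Linear.new(embed_dim, embed_dim)
  ),
  TorchText::NN::ScaledDotProduct.new,
  Torch::NN::Linear.new(embed_dim, embed_dim)
)

# (tgt_len, batch_size, embed_dim) and (src_len, batch_size, embed_dim)
query = Torch.rand(21, 64, embed_dim)
key = value = Torch.rand(16, 64, embed_dim)

attn_output, attn_weights = mha.call(query, key, value)
attn_output.size # => [21, 64, 10]
```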
data/lib/torchtext/nn/scaled_dot_product.rb
ADDED
@@ -0,0 +1,72 @@
+module TorchText
+  module NN
+    class ScaledDotProduct < Torch::NN::Module
+      def initialize(dropout: 0.0, batch_first: false)
+        super()
+        @dropout = dropout
+        @batch_first = batch_first
+      end
+
+      def forward(query, key, value, attn_mask: nil, bias_k: nil, bias_v: nil)
+        if @batch_first
+          query, key, value = query.transpose(-3, -2), key.transpose(-3, -2), value.transpose(-3, -2)
+        end
+
+        if !bias_k.nil? && !bias_v.nil?
+          unless key.size(-1) == bias_k.size(-1) && key.size(-2) == bias_k.size(-2) && bias_k.size(-3) == 1
+            raise "Shape of bias_k is not supported"
+          end
+          unless value.size(-1) == bias_v.size(-1) && value.size(-2) == bias_v.size(-2) && bias_v.size(-3) == 1
+            raise "Shape of bias_v is not supported"
+          end
+          key = Torch.cat([key, bias_k])
+          value = Torch.cat([value, bias_v])
+          if !attn_mask.nil?
+            attn_mask = Torch::NN::Functional.pad(attn_mask, [0, 1])
+          end
+        end
+
+        tgt_len, head_dim = query.size(-3), query.size(-1)
+        unless query.size(-1) == key.size(-1) && key.size(-1) == value.size(-1)
+          raise "The feature dim of query, key, value must be equal."
+        end
+        unless key.size() == value.size()
+          raise "Shape of key, value must match"
+        end
+        src_len = key.size(-3)
+        batch_heads = [query.size(-2), key.size(-2)].max
+
+        # Scale query
+        query, key, value = query.transpose(-2, -3), key.transpose(-2, -3), value.transpose(-2, -3)
+        query = query * (head_dim.to_f ** -0.5)
+        if !attn_mask.nil?
+          if attn_mask.dim() != 3
+            raise RuntimeError, "attn_mask must be a 3D tensor."
+          end
+          if (attn_mask.size(-1) != src_len) || (attn_mask.size(-2) != tgt_len) || (attn_mask.size(-3) != 1 && attn_mask.size(-3) != batch_heads)
+            raise RuntimeError, "The size of the attn_mask is not correct."
+          end
+          if attn_mask.dtype != :bool
+            raise RuntimeError, "Only bool tensor is supported for attn_mask"
+          end
+        end
+
+        # Dot product of q, k
+        attn_output_weights = Torch.matmul(query, key.transpose(-2, -1))
+        if !attn_mask.nil?
+          # TODO confirm last argument
+          attn_output_weights.masked_fill!(attn_mask, -1e8, nil)
+        end
+        attn_output_weights = Torch::NN::Functional.softmax(attn_output_weights, dim: -1)
+        attn_output_weights = Torch::NN::Functional.dropout(attn_output_weights, p: @dropout, training: @training)
+        attn_output = Torch.matmul(attn_output_weights, value)
+
+        if @batch_first
+          [attn_output, attn_output_weights]
+        else
+          [attn_output.transpose(-3, -2), attn_output_weights]
+        end
+      end
+    end
+  end
+end
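`ScaledDotProduct` can also be used on its own; its inputs are already split into heads, i.e. shaped (seq_len, batch_size * nhead, head_dim), and the constructor options control dropout and whether the batch dimension comes first. A minimal sketch with assumed shapes:

```ruby
sdp = TorchText::NN::ScaledDotProduct.new(dropout: 0.1)

# (seq_len, batch_size * nhead, head_dim)
q = Torch.rand(21, 256, 3)
k = v = Torch.rand(16, 256, 3)

attn_output, attn_weights = sdp.call(q, k, v)
attn_output.size  # => [21, 256, 3]
attn_weights.size # => [256, 21, 16]
```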
data/lib/torchtext/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: torchtext
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Andrew Kane
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2021-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: torch-rb
@@ -66,7 +66,7 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '5'
-description:
+description:
 email: andrew@chartkick.com
 executables: []
 extensions: []
@@ -76,16 +76,20 @@ files:
 - LICENSE.txt
 - README.md
 - lib/torchtext.rb
+- lib/torchtext/data/metrics.rb
 - lib/torchtext/data/utils.rb
 - lib/torchtext/datasets/text_classification.rb
 - lib/torchtext/datasets/text_classification_dataset.rb
+- lib/torchtext/nn/in_proj_container.rb
+- lib/torchtext/nn/multihead_attention_container.rb
+- lib/torchtext/nn/scaled_dot_product.rb
 - lib/torchtext/version.rb
 - lib/torchtext/vocab.rb
 homepage: https://github.com/ankane/torchtext
 licenses:
 - BSD-3-Clause
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -100,8 +104,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.2.22
+signing_key:
 specification_version: 4
 summary: Data loaders and abstractions for text and NLP
 test_files: []