torchtext 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 86469f8148e519b940a643f81b5317d3e180d6ebc031da14cb0599b48e3f6556
4
+ data.tar.gz: 499079c8a32de3ea6704b58a04ad8511f7a6784cc138b08e87696c69d7835863
5
+ SHA512:
6
+ metadata.gz: e3ea0d3719d35a58b757ac3d11adeda30912f35f69f7de37047ef702c556e5384862f950e055565db8396d8495a760b4919fd416affbdc0fd815dc14ed02e3a3
7
+ data.tar.gz: 16d2817864dc4bba2d54ca4a7288bc609b95ab5d59da51c67eccf70107fd78e67af141b765d01b8eea82c0a112b3dc779c174d7864afee6fb5651a5d787df7c5
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (2020-08-24)
2
+
3
+ - First release
@@ -0,0 +1,30 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) James Bradbury and Soumith Chintala 2016,
4
+ Copyright (c) Andrew Kane 2020,
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from
19
+ this software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,62 @@
1
+ # TorchText
2
+
3
+ :fire: Data loaders and abstractions for text and NLP - for Ruby
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application’s Gemfile:
8
+
9
+ ```ruby
10
+ gem 'torchtext'
11
+ ```
12
+
13
+ ## Getting Started
14
+
15
+ This library follows the [Python API](https://pytorch.org/text/). Many methods and options are missing at the moment. PRs welcome!
16
+
17
+ ## Examples
18
+
19
+ Text classification
20
+
21
+ - [PyTorch tutorial](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html)
22
+ - [Ruby code](examples/text_classification)
23
+
24
+ ## Datasets
25
+
26
+ Load a dataset
27
+
28
+ ```ruby
29
+ train_dataset, test_dataset = TorchText::Datasets::AG_NEWS.load(root: ".data", ngrams: 2)
30
+ ```
31
+
32
+ Supported datasets are:
33
+
34
+ - [AG_NEWS](http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html)
35
+
36
+ ## Disclaimer
37
+
38
+ This library downloads and prepares public datasets. We don’t host any datasets. Be sure to adhere to the license for each dataset.
39
+
40
+ If you’re a dataset owner and wish to update any details or remove it from this project, let us know.
41
+
42
+ ## History
43
+
44
+ View the [changelog](https://github.com/ankane/torchtext/blob/master/CHANGELOG.md)
45
+
46
+ ## Contributing
47
+
48
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
49
+
50
+ - [Report bugs](https://github.com/ankane/torchtext/issues)
51
+ - Fix bugs and [submit pull requests](https://github.com/ankane/torchtext/pulls)
52
+ - Write, clarify, or fix documentation
53
+ - Suggest or add new features
54
+
55
+ To get started with development:
56
+
57
+ ```sh
58
+ git clone https://github.com/ankane/torchtext.git
59
+ cd torchtext
60
+ bundle install
61
+ bundle exec rake test
62
+ ```
@@ -0,0 +1,19 @@
1
# dependencies
require "torch"

# stdlib
require "csv"
require "fileutils"
require "net/http" # used by Datasets::TextClassification for downloads (also pulls in uri)
require "rubygems/package"
require "set"
require "tmpdir" # Dir.tmpdir is used when streaming downloads to a temp file

# modules
require "torchtext/data/utils"
require "torchtext/datasets/text_classification"
require "torchtext/datasets/text_classification_dataset"
require "torchtext/vocab"
require "torchtext/version"

module TorchText
  # Base error class for all errors raised by this gem.
  class Error < StandardError; end
end
@@ -0,0 +1,60 @@
1
module TorchText
  module Data
    # Tokenization helpers, ported from Python torchtext's
    # torchtext.data.utils module.
    module Utils
      # Returns a callable tokenizer (a Method object responding to #call).
      #
      # tokenizer - nil for plain whitespace splitting, or "basic_english"
      #             for lowercasing plus basic punctuation splitting
      # language  - only "en" is supported for "basic_english"
      #
      # Raises ArgumentError for "basic_english" with a non-English language,
      # and RuntimeError for any other tokenizer name (not implemented yet).
      def tokenizer(tokenizer, language: "en")
        return method(:split_tokenizer) if tokenizer.nil?

        if tokenizer == "basic_english"
          if language != "en"
            raise ArgumentError, "Basic normalization is only available for English(en)"
          end
          return method(:basic_english_normalize)
        end

        raise "Not implemented yet"
      end

      # Yields each token of token_list, then every n-gram (tokens joined
      # with a single space) for n = 2..ngrams. Returns an Enumerator when
      # no block is given.
      def ngrams_iterator(token_list, ngrams)
        return enum_for(:ngrams_iterator, token_list, ngrams) unless block_given?

        get_ngrams = lambda do |n|
          (token_list.size - n + 1).times.map { |i| token_list[i...(i + n)] }
        end

        token_list.each do |x|
          yield x
        end
        2.upto(ngrams) do |n|
          get_ngrams.call(n).each do |x|
            yield x.join(" ")
          end
        end
      end

      private

      # Plain whitespace tokenizer.
      def split_tokenizer(x)
        x.split
      end

      _patterns = [%r{\'}, %r{\"}, %r{\.}, %r{<br \/>}, %r{,}, %r{\(}, %r{\)}, %r{\!}, %r{\?}, %r{\;}, %r{\:}, %r{\s+}]
      _replacements = [" \' ", "", " . ", " ", " , ", " ( ", " ) ", " ! ", " ? ", " ", " ", " "]

      # Ordered [pattern, replacement] pairs applied by basic_english_normalize.
      PATTERNS_DICT = _patterns.zip(_replacements)

      # Lowercases line, separates basic punctuation into their own tokens,
      # and returns the resulting token array.
      def basic_english_normalize(line)
        line = line.downcase

        PATTERNS_DICT.each do |pattern_re, replaced_str|
          # gsub! (not sub!) so EVERY occurrence is replaced, matching the
          # re.sub behavior of the Python implementation being ported
          line.gsub!(pattern_re, replaced_str)
        end
        line.split
      end

      extend self
    end

    # TODO only tokenizer method
    extend Utils
  end
end
@@ -0,0 +1,166 @@
1
+ module TorchText
2
+ module Datasets
3
+ module TextClassification
4
+ URLS = {
5
+ "AG_NEWS" => "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbUDNpeUdjb0wxRms"
6
+ }
7
+ PATHS = {
8
+ "AG_NEWS" => "ag_news_csv"
9
+ }
10
+ FILENAMES = {
11
+ "AG_NEWS" => "ag_news_csv.tar.gz"
12
+ }
13
+
14
+ class << self
15
+ def ag_news(*args, **kwargs)
16
+ setup_datasets("AG_NEWS", *args, **kwargs)
17
+ end
18
+
19
+ private
20
+
21
+ def setup_datasets(dataset_name, root: ".data", ngrams: 1, vocab: nil, include_unk: false)
22
+ dataset_tar = download_from_url(URLS[dataset_name], root: root, filename: FILENAMES[dataset_name])
23
+ to_path = extract_archive(dataset_tar)
24
+ extracted_files = Dir["#{to_path}/#{PATHS[dataset_name]}/*"]
25
+
26
+ train_csv_path = nil
27
+ test_csv_path = nil
28
+ extracted_files.each do |fname|
29
+ if fname.end_with?("train.csv")
30
+ train_csv_path = fname
31
+ elsif fname.end_with?("test.csv")
32
+ test_csv_path = fname
33
+ end
34
+ end
35
+
36
+ if vocab.nil?
37
+ vocab = Vocab.build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
38
+ else
39
+ unless vocab.is_a?(Vocab)
40
+ raise ArgumentError, "Passed vocabulary is not of type Vocab"
41
+ end
42
+ end
43
+ train_data, train_labels = _create_data_from_iterator(vocab, _csv_iterator(train_csv_path, ngrams, yield_cls: true), include_unk)
44
+ test_data, test_labels = _create_data_from_iterator(vocab, _csv_iterator(test_csv_path, ngrams, yield_cls: true), include_unk)
45
+ if (train_labels ^ test_labels).length > 0
46
+ raise ArgumentError, "Training and test labels don't match"
47
+ end
48
+
49
+ [
50
+ TextClassificationDataset.new(vocab, train_data, train_labels),
51
+ TextClassificationDataset.new(vocab, test_data, test_labels)
52
+ ]
53
+ end
54
+
55
+ def _csv_iterator(data_path, ngrams, yield_cls: false)
56
+ return enum_for(:_csv_iterator, data_path, ngrams, yield_cls: yield_cls) unless block_given?
57
+
58
+ tokenizer = Data.tokenizer("basic_english")
59
+ CSV.foreach(data_path) do |row|
60
+ tokens = row[1..-1].join(" ")
61
+ tokens = tokenizer.call(tokens)
62
+ if yield_cls
63
+ yield row[0].to_i - 1, Data::Utils.ngrams_iterator(tokens, ngrams)
64
+ else
65
+ yield Data::Utils.ngrams_iterator(tokens, ngrams)
66
+ end
67
+ end
68
+ end
69
+
70
+ def _create_data_from_iterator(vocab, iterator, include_unk)
71
+ data = []
72
+ labels = []
73
+ iterator.each do |cls, tokens|
74
+ if include_unk
75
+ tokens = Torch.tensor(tokens.map { |token| vocab[token] })
76
+ else
77
+ token_ids = tokens.map { |token| vocab[token] }.select { |x| x != Vocab::UNK }
78
+ tokens = Torch.tensor(token_ids)
79
+ end
80
+ data << [cls, tokens]
81
+ labels << cls
82
+ end
83
+ [data, Set.new(labels)]
84
+ end
85
+
86
+ # extra filename parameter
87
+ def download_from_url(url, root:, filename:)
88
+ path = File.join(root, filename)
89
+ return path if File.exist?(path)
90
+
91
+ FileUtils.mkdir_p(root)
92
+
93
+ puts "Downloading #{url}..."
94
+ download_url_to_file(url, path)
95
+ end
96
+
97
+ # follows redirects
98
+ def download_url_to_file(url, dst)
99
+ uri = URI(url)
100
+ tmp = nil
101
+ location = nil
102
+
103
+ Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
104
+ request = Net::HTTP::Get.new(uri)
105
+
106
+ http.request(request) do |response|
107
+ case response
108
+ when Net::HTTPRedirection
109
+ location = response["location"]
110
+ when Net::HTTPSuccess
111
+ tmp = "#{Dir.tmpdir}/#{Time.now.to_f}" # TODO better name
112
+ File.open(tmp, "wb") do |f|
113
+ response.read_body do |chunk|
114
+ f.write(chunk)
115
+ end
116
+ end
117
+ else
118
+ raise Error, "Bad response"
119
+ end
120
+ end
121
+ end
122
+
123
+ if location
124
+ download_url_to_file(location, dst)
125
+ else
126
+ FileUtils.mv(tmp, dst)
127
+ dst
128
+ end
129
+ end
130
+
131
+ # extract_tar_gz doesn't list files, so just return to_path
132
+ def extract_archive(from_path, to_path: nil, overwrite: nil)
133
+ to_path ||= File.dirname(from_path)
134
+
135
+ if from_path.end_with?(".tar.gz") || from_path.end_with?(".tgz")
136
+ File.open(from_path, "rb") do |io|
137
+ Gem::Package.new("").extract_tar_gz(io, to_path)
138
+ end
139
+ return to_path
140
+ end
141
+
142
+ raise "Not implemented yet"
143
+ end
144
+ end
145
+
146
+ DATASETS = {
147
+ "AG_NEWS" => method(:ag_news)
148
+ }
149
+
150
+ LABELS = {
151
+ "AG_NEWS" => {
152
+ 0 => "World",
153
+ 1 => "Sports",
154
+ 2 => "Business",
155
+ 3 => "Sci/Tech"
156
+ }
157
+ }
158
+ end
159
+
160
+ class AG_NEWS
161
+ def self.load(*args, **kwargs)
162
+ TextClassification.ag_news(*args, **kwargs)
163
+ end
164
+ end
165
+ end
166
+ end
@@ -0,0 +1,29 @@
1
+ module TorchText
2
+ module Datasets
3
+ class TextClassificationDataset < Torch::Utils::Data::Dataset
4
+ attr_reader :labels, :vocab
5
+
6
+ def initialize(vocab, data, labels)
7
+ super()
8
+ @data = data
9
+ @labels = labels
10
+ @vocab = vocab
11
+ end
12
+
13
+ def [](i)
14
+ @data[i]
15
+ end
16
+
17
+ def length
18
+ @data.length
19
+ end
20
+ alias_method :size, :length
21
+
22
+ def each
23
+ @data.each do |x|
24
+ yield x
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,3 @@
1
module TorchText
  # Gem version, following semantic versioning.
  VERSION = "0.1.0"
end
@@ -0,0 +1,87 @@
1
module TorchText
  # Token-to-id vocabulary, ported from Python torchtext's Vocab.
  class Vocab
    # Token used as the fallback for out-of-vocabulary lookups.
    UNK = "<unk>"

    # counter        - Hash of token => frequency
    # max_size       - maximum vocabulary size (nil for unlimited)
    # min_freq       - minimum frequency for a token to be kept (floored at 1)
    # specials       - special tokens always included in the vocabulary
    # vectors, unk_init, vectors_cache - pretrained-vector options; passing
    #                  vectors raises (not implemented yet), the others must
    #                  stay nil
    # specials_first - place specials at the start (true) or end of the index
    def initialize(
      counter, max_size: nil, min_freq: 1, specials: ["<unk>", "<pad>"],
      vectors: nil, unk_init: nil, vectors_cache: nil, specials_first: true
    )

      @freqs = counter
      counter = counter.dup
      min_freq = [min_freq, 1].max

      @itos = []
      @unk_index = nil

      if specials_first
        # dup so appending words below never mutates the caller's array
        # (the Python original does list(specials) for the same reason)
        @itos = specials.dup
        # only extend max size if specials are prepended
        max_size += specials.size if max_size
      end

      # frequencies of special tokens are not counted when building vocabulary
      # in frequency order
      specials.each do |tok|
        counter.delete(tok)
      end

      # sort by frequency, then alphabetically
      words_and_frequencies = counter.sort_by { |k, v| [-v, k] }

      words_and_frequencies.each do |word, freq|
        break if freq < min_freq || @itos.length == max_size
        @itos << word
      end

      if specials.include?(UNK) # hard-coded for now
        unk_index = specials.index(UNK) # position in list
        # account for ordering of specials, set variable
        @unk_index = specials_first ? unk_index : @itos.length + unk_index
        @stoi = Hash.new(@unk_index)
      else
        @stoi = {}
      end

      if !specials_first
        @itos.concat(specials)
      end

      # stoi is simply a reverse dict for itos
      @itos.each_with_index do |tok, i|
        @stoi[tok] = i
      end

      @vectors = nil
      if !vectors.nil?
        # self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache)
        raise "Not implemented yet"
      else
        raise "Failed assertion" unless unk_init.nil?
        raise "Failed assertion" unless vectors_cache.nil?
      end
    end

    # Returns the id for token, falling back to the <unk> id when the token
    # is unknown. Raises KeyError if the token is unknown and <unk> is not
    # in the vocabulary.
    def [](token)
      @stoi.fetch(token, @stoi.fetch(UNK))
    end

    # Number of tokens in the vocabulary (including specials).
    def length
      @itos.length
    end
    alias_method :size, :length

    # Builds a Vocab (with default options) by tallying token frequencies
    # from an iterator of token collections.
    def self.build_vocab_from_iterator(iterator)
      counter = Hash.new(0)
      i = 0
      iterator.each do |tokens|
        tokens.each do |token|
          counter[token] += 1
        end
        i += 1
        puts "Processed #{i}" if i % 10000 == 0
      end
      Vocab.new(counter)
    end
  end
end
metadata ADDED
@@ -0,0 +1,107 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: torchtext
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Kane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-08-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: torch-rb
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.3.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.3.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '5'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '5'
69
+ description:
70
+ email: andrew@chartkick.com
71
+ executables: []
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - CHANGELOG.md
76
+ - LICENSE.txt
77
+ - README.md
78
+ - lib/torchtext.rb
79
+ - lib/torchtext/data/utils.rb
80
+ - lib/torchtext/datasets/text_classification.rb
81
+ - lib/torchtext/datasets/text_classification_dataset.rb
82
+ - lib/torchtext/version.rb
83
+ - lib/torchtext/vocab.rb
84
+ homepage: https://github.com/ankane/torchtext
85
+ licenses:
86
+ - BSD-3-Clause
87
+ metadata: {}
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '2.5'
97
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ requirements: []
103
+ rubygems_version: 3.1.2
104
+ signing_key:
105
+ specification_version: 4
106
+ summary: Data loaders and abstractions for text and NLP
107
+ test_files: []