torchtext 0.1.0

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 86469f8148e519b940a643f81b5317d3e180d6ebc031da14cb0599b48e3f6556
+   data.tar.gz: 499079c8a32de3ea6704b58a04ad8511f7a6784cc138b08e87696c69d7835863
+ SHA512:
+   metadata.gz: e3ea0d3719d35a58b757ac3d11adeda30912f35f69f7de37047ef702c556e5384862f950e055565db8396d8495a760b4919fd416affbdc0fd815dc14ed02e3a3
+   data.tar.gz: 16d2817864dc4bba2d54ca4a7288bc609b95ab5d59da51c67eccf70107fd78e67af141b765d01b8eea82c0a112b3dc779c174d7864afee6fb5651a5d787df7c5
CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
+ ## 0.1.0 (2020-08-24)
+
+ - First release
LICENSE.txt ADDED
@@ -0,0 +1,30 @@
+ BSD 3-Clause License
+
+ Copyright (c) James Bradbury and Soumith Chintala 2016,
+ Copyright (c) Andrew Kane 2020,
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md ADDED
@@ -0,0 +1,62 @@
+ # TorchText
+
+ :fire: Data loaders and abstractions for text and NLP - for Ruby
+
+ ## Installation
+
+ Add this line to your application’s Gemfile:
+
+ ```ruby
+ gem 'torchtext'
+ ```
+
+ ## Getting Started
+
+ This library follows the [Python API](https://pytorch.org/text/). Many methods and options are missing at the moment. PRs welcome!
+
+ ## Examples
+
+ Text classification
+
+ - [PyTorch tutorial](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html)
+ - [Ruby code](examples/text_classification)
+
+ ## Datasets
+
+ Load a dataset
+
+ ```ruby
+ train_dataset, test_dataset = TorchText::Datasets::AG_NEWS.load(root: ".data", ngrams: 2)
+ ```
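The two returned objects are `TextClassificationDataset` instances (defined in `lib/torchtext/datasets/text_classification_dataset.rb` later in this diff). A minimal sketch of inspecting them; the values in the comments are illustrative only:

```ruby
train_dataset.length      # number of examples
train_dataset.vocab.size  # vocabulary size, including the "<unk>" and "<pad>" specials
train_dataset.labels      # Set of class ids, e.g. {0, 1, 2, 3} for AG_NEWS
train_dataset[0]          # a [label, tensor_of_token_ids] pair
```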
+
+ Supported datasets are:
+
+ - [AG_NEWS](http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html)
+
+ ## Disclaimer
+
+ This library downloads and prepares public datasets. We don’t host any datasets. Be sure to adhere to the license for each dataset.
+
+ If you’re a dataset owner and wish to update any details or remove it from this project, let us know.
+
+ ## History
+
+ View the [changelog](https://github.com/ankane/torchtext/blob/master/CHANGELOG.md)
+
+ ## Contributing
+
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
+
+ - [Report bugs](https://github.com/ankane/torchtext/issues)
+ - Fix bugs and [submit pull requests](https://github.com/ankane/torchtext/pulls)
+ - Write, clarify, or fix documentation
+ - Suggest or add new features
+
+ To get started with development:
+
+ ```sh
+ git clone https://github.com/ankane/torchtext.git
+ cd torchtext
+ bundle install
+ bundle exec rake test
+ ```
lib/torchtext.rb ADDED
@@ -0,0 +1,19 @@
+ # dependencies
+ require "torch"
+
+ # stdlib
+ require "csv"
+ require "fileutils"
+ require "net/http" # used by the dataset download helper
+ require "rubygems/package"
+ require "set"
+ require "tmpdir" # Dir.tmpdir is used when downloading datasets
+
+ # modules
+ require "torchtext/data/utils"
+ require "torchtext/datasets/text_classification"
+ require "torchtext/datasets/text_classification_dataset"
+ require "torchtext/vocab"
+ require "torchtext/version"
+
+ module TorchText
+   class Error < StandardError; end
+ end
lib/torchtext/data/utils.rb ADDED
@@ -0,0 +1,60 @@
+ module TorchText
+   module Data
+     module Utils
+       def tokenizer(tokenizer, language: "en")
+         return method(:split_tokenizer) if tokenizer.nil?
+
+         if tokenizer == "basic_english"
+           if language != "en"
+             raise ArgumentError, "Basic normalization is only available for English(en)"
+           end
+           return method(:basic_english_normalize)
+         end
+
+         raise "Not implemented yet"
+       end
+
+       def ngrams_iterator(token_list, ngrams)
+         return enum_for(:ngrams_iterator, token_list, ngrams) unless block_given?
+
+         get_ngrams = lambda do |n|
+           (token_list.size - n + 1).times.map { |i| token_list[i...(i + n)] }
+         end
+
+         token_list.each do |x|
+           yield x
+         end
+         2.upto(ngrams) do |n|
+           get_ngrams.call(n).each do |x|
+             yield x.join(" ")
+           end
+         end
+       end
+
+       private
+
+       def split_tokenizer(x)
+         x.split
+       end
+
+       _patterns = [%r{\'}, %r{\"}, %r{\.}, %r{<br \/>}, %r{,}, %r{\(}, %r{\)}, %r{\!}, %r{\?}, %r{\;}, %r{\:}, %r{\s+}]
+       _replacements = [" \' ", "", " . ", " ", " , ", " ( ", " ) ", " ! ", " ? ", " ", " ", " "]
+
+       PATTERNS_DICT = _patterns.zip(_replacements)
+
+       def basic_english_normalize(line)
+         line = line.downcase
+
+         PATTERNS_DICT.each do |pattern_re, replaced_str|
+           line.gsub!(pattern_re, replaced_str) # gsub! so every occurrence is replaced, not just the first
+         end
+         line.split
+       end
+
+       extend self
+     end
+
+     # TODO only tokenizer method
+     extend Utils
+   end
+ end
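A quick usage sketch of the two public helpers above; the sample sentence and the expected outputs are illustrative only:

```ruby
require "torchtext"

# "basic_english" lowercases and pads punctuation before splitting on whitespace
tokenize = TorchText::Data.tokenizer("basic_english")
tokens = tokenize.call("You can now install TorchText!")
# => ["you", "can", "now", "install", "torchtext", "!"]

# ngrams_iterator yields the original tokens first, then the joined higher-order n-grams
TorchText::Data::Utils.ngrams_iterator(tokens, 2).to_a.last(2)
# => ["install torchtext", "torchtext !"]
```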
lib/torchtext/datasets/text_classification.rb ADDED
@@ -0,0 +1,166 @@
+ module TorchText
+   module Datasets
+     module TextClassification
+       URLS = {
+         "AG_NEWS" => "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbUDNpeUdjb0wxRms"
+       }
+       PATHS = {
+         "AG_NEWS" => "ag_news_csv"
+       }
+       FILENAMES = {
+         "AG_NEWS" => "ag_news_csv.tar.gz"
+       }
+
+       class << self
+         def ag_news(*args, **kwargs)
+           setup_datasets("AG_NEWS", *args, **kwargs)
+         end
+
+         private
+
+         def setup_datasets(dataset_name, root: ".data", ngrams: 1, vocab: nil, include_unk: false)
+           dataset_tar = download_from_url(URLS[dataset_name], root: root, filename: FILENAMES[dataset_name])
+           to_path = extract_archive(dataset_tar)
+           extracted_files = Dir["#{to_path}/#{PATHS[dataset_name]}/*"]
+
+           train_csv_path = nil
+           test_csv_path = nil
+           extracted_files.each do |fname|
+             if fname.end_with?("train.csv")
+               train_csv_path = fname
+             elsif fname.end_with?("test.csv")
+               test_csv_path = fname
+             end
+           end
+
+           if vocab.nil?
+             vocab = Vocab.build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
+           else
+             unless vocab.is_a?(Vocab)
+               raise ArgumentError, "Passed vocabulary is not of type Vocab"
+             end
+           end
+           train_data, train_labels = _create_data_from_iterator(vocab, _csv_iterator(train_csv_path, ngrams, yield_cls: true), include_unk)
+           test_data, test_labels = _create_data_from_iterator(vocab, _csv_iterator(test_csv_path, ngrams, yield_cls: true), include_unk)
+           if (train_labels ^ test_labels).length > 0
+             raise ArgumentError, "Training and test labels don't match"
+           end
+
+           [
+             TextClassificationDataset.new(vocab, train_data, train_labels),
+             TextClassificationDataset.new(vocab, test_data, test_labels)
+           ]
+         end
+
+         def _csv_iterator(data_path, ngrams, yield_cls: false)
+           return enum_for(:_csv_iterator, data_path, ngrams, yield_cls: yield_cls) unless block_given?
+
+           tokenizer = Data.tokenizer("basic_english")
+           CSV.foreach(data_path) do |row|
+             tokens = row[1..-1].join(" ")
+             tokens = tokenizer.call(tokens)
+             if yield_cls
+               yield row[0].to_i - 1, Data::Utils.ngrams_iterator(tokens, ngrams)
+             else
+               yield Data::Utils.ngrams_iterator(tokens, ngrams)
+             end
+           end
+         end
+
71
+ data = []
72
+ labels = []
73
+ iterator.each do |cls, tokens|
74
+ if include_unk
75
+ tokens = Torch.tensor(tokens.map { |token| vocab[token] })
76
+ else
77
+ token_ids = tokens.map { |token| vocab[token] }.select { |x| x != Vocab::UNK }
78
+ tokens = Torch.tensor(token_ids)
79
+ end
80
+ data << [cls, tokens]
81
+ labels << cls
82
+ end
83
+ [data, Set.new(labels)]
84
+ end
+
+         # extra filename parameter
+         def download_from_url(url, root:, filename:)
+           path = File.join(root, filename)
+           return path if File.exist?(path)
+
+           FileUtils.mkdir_p(root)
+
+           puts "Downloading #{url}..."
+           download_url_to_file(url, path)
+         end
+
+         # follows redirects
+         def download_url_to_file(url, dst)
+           uri = URI(url)
+           tmp = nil
+           location = nil
+
+           Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
+             request = Net::HTTP::Get.new(uri)
+
+             http.request(request) do |response|
+               case response
+               when Net::HTTPRedirection
+                 location = response["location"]
+               when Net::HTTPSuccess
+                 tmp = "#{Dir.tmpdir}/#{Time.now.to_f}" # TODO better name
+                 File.open(tmp, "wb") do |f|
+                   response.read_body do |chunk|
+                     f.write(chunk)
+                   end
+                 end
+               else
+                 raise Error, "Bad response"
+               end
+             end
+           end
+
+           if location
+             download_url_to_file(location, dst)
+           else
+             FileUtils.mv(tmp, dst)
+             dst
+           end
+         end
+
+         # extract_tar_gz doesn't list files, so just return to_path
+         def extract_archive(from_path, to_path: nil, overwrite: nil)
+           to_path ||= File.dirname(from_path)
+
+           if from_path.end_with?(".tar.gz") || from_path.end_with?(".tgz")
+             File.open(from_path, "rb") do |io|
+               Gem::Package.new("").extract_tar_gz(io, to_path)
+             end
+             return to_path
+           end
+
+           raise "Not implemented yet"
+         end
+       end
+
+       DATASETS = {
+         "AG_NEWS" => method(:ag_news)
+       }
+
+       LABELS = {
+         "AG_NEWS" => {
+           0 => "World",
+           1 => "Sports",
+           2 => "Business",
+           3 => "Sci/Tech"
+         }
+       }
+     end
+
+     class AG_NEWS
+       def self.load(*args, **kwargs)
+         TextClassification.ag_news(*args, **kwargs)
+       end
+     end
+   end
+ end
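Putting the pieces above together: each dataset element is a `[label, token_id_tensor]` pair, and `LABELS` maps the zero-based class index back to a name. A minimal sketch; the indexing and output shown are illustrative:

```ruby
train_dataset, test_dataset = TorchText::Datasets::AG_NEWS.load(root: ".data", ngrams: 2)

# each element is a [label, tensor of token ids] pair (see _create_data_from_iterator above)
label, token_ids = train_dataset[0]

# LABELS maps the zero-based class index back to a readable name
puts TorchText::Datasets::TextClassification::LABELS["AG_NEWS"][label]
# => "World", "Sports", "Business", or "Sci/Tech"
```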
lib/torchtext/datasets/text_classification_dataset.rb ADDED
@@ -0,0 +1,29 @@
+ module TorchText
+   module Datasets
+     class TextClassificationDataset < Torch::Utils::Data::Dataset
+       attr_reader :labels, :vocab
+
+       def initialize(vocab, data, labels)
+         super()
+         @data = data
+         @labels = labels
+         @vocab = vocab
+       end
+
+       def [](i)
+         @data[i]
+       end
+
+       def length
+         @data.length
+       end
+       alias_method :size, :length
+
+       def each
+         @data.each do |x|
+           yield x
+         end
+       end
+     end
+   end
+ end
lib/torchtext/version.rb ADDED
@@ -0,0 +1,3 @@
+ module TorchText
+   VERSION = "0.1.0"
+ end
lib/torchtext/vocab.rb ADDED
@@ -0,0 +1,87 @@
+ module TorchText
+   class Vocab
+     UNK = "<unk>"
+
+     def initialize(
+       counter, max_size: nil, min_freq: 1, specials: ["<unk>", "<pad>"],
+       vectors: nil, unk_init: nil, vectors_cache: nil, specials_first: true
+     )
+
+       @freqs = counter
+       counter = counter.dup
+       min_freq = [min_freq, 1].max
+
+       @itos = []
+       @unk_index = nil
+
+       if specials_first
+         @itos = specials.dup # copy so the caller's specials array is not mutated
+         # only extend max size if specials are prepended
+         max_size += specials.size if max_size
+       end
+
+       # frequencies of special tokens are not counted when building vocabulary
+       # in frequency order
+       specials.each do |tok|
+         counter.delete(tok)
+       end
+
+       # sort by frequency, then alphabetically
+       words_and_frequencies = counter.sort_by { |k, v| [-v, k] }
+
+       words_and_frequencies.each do |word, freq|
+         break if freq < min_freq || @itos.length == max_size
+         @itos << word
+       end
+
+       if specials.include?(UNK) # hard-coded for now
+         unk_index = specials.index(UNK) # position in list
+         # account for ordering of specials, set variable
+         @unk_index = specials_first ? unk_index : @itos.length + unk_index
+         @stoi = Hash.new(@unk_index)
+       else
+         @stoi = {}
+       end
+
+       if !specials_first
+         @itos.concat(specials)
+       end
+
+       # stoi is simply a reverse dict for itos
+       @itos.each_with_index do |tok, i|
+         @stoi[tok] = i
+       end
+
+       @vectors = nil
+       if !vectors.nil?
+         # self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache)
+         raise "Not implemented yet"
+       else
+         raise "Failed assertion" unless unk_init.nil?
+         raise "Failed assertion" unless vectors_cache.nil?
+       end
+     end
+
+     def [](token)
+       @stoi.fetch(token, @stoi.fetch(UNK))
+     end
+
+     def length
+       @itos.length
+     end
+     alias_method :size, :length
+
+     def self.build_vocab_from_iterator(iterator)
+       counter = Hash.new(0)
+       i = 0
+       iterator.each do |tokens|
+         tokens.each do |token|
+           counter[token] += 1
+         end
+         i += 1
+         puts "Processed #{i}" if i % 10000 == 0
+       end
+       Vocab.new(counter)
+     end
+   end
+ end
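A small, self-contained sketch of how this `Vocab` behaves; the token counts below are made up for illustration:

```ruby
counter = {"the" => 5, "cat" => 3, "sat" => 1}
vocab = TorchText::Vocab.new(counter, min_freq: 2)

vocab.size        # => 4 ("<unk>", "<pad>", "the", "cat"; "sat" is below min_freq)
vocab["the"]      # => 2
vocab["aardvark"] # => 0, the <unk> index, since unknown tokens fall back to <unk>

# build_vocab_from_iterator counts tokens from an enumerable of token arrays
vocab2 = TorchText::Vocab.build_vocab_from_iterator([%w[the cat sat], %w[the cat]])
vocab2.size # => 5
```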
metadata ADDED
@@ -0,0 +1,107 @@
+ --- !ruby/object:Gem::Specification
+ name: torchtext
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Andrew Kane
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2020-08-24 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: torch-rb
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.3.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.3.2
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: minitest
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '5'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '5'
+ description:
+ email: andrew@chartkick.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - CHANGELOG.md
+ - LICENSE.txt
+ - README.md
+ - lib/torchtext.rb
+ - lib/torchtext/data/utils.rb
+ - lib/torchtext/datasets/text_classification.rb
+ - lib/torchtext/datasets/text_classification_dataset.rb
+ - lib/torchtext/version.rb
+ - lib/torchtext/vocab.rb
+ homepage: https://github.com/ankane/torchtext
+ licenses:
+ - BSD-3-Clause
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '2.5'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.1.2
+ signing_key:
+ specification_version: 4
+ summary: Data loaders and abstractions for text and NLP
+ test_files: []