tensor_stream 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +10 -3
- data/lib/tensor_stream.rb +1 -0
- data/lib/tensor_stream/evaluator/base_evaluator.rb +6 -0
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +60 -0
- data/lib/tensor_stream/evaluator/ruby/array_ops.rb +53 -1
- data/lib/tensor_stream/evaluator/ruby/math_ops.rb +42 -5
- data/lib/tensor_stream/generated_stub/ops.rb +61 -5
- data/lib/tensor_stream/helpers/tensor_mixins.rb +10 -1
- data/lib/tensor_stream/math/math_ops.rb +22 -0
- data/lib/tensor_stream/math_gradients.rb +15 -1
- data/lib/tensor_stream/nn/embedding_lookup.rb +114 -0
- data/lib/tensor_stream/nn/nn_ops.rb +3 -0
- data/lib/tensor_stream/op_maker.rb +15 -3
- data/lib/tensor_stream/ops.rb +12 -0
- data/lib/tensor_stream/ops/rsqrt.rb +11 -0
- data/lib/tensor_stream/ops/strided_slice.rb +24 -0
- data/lib/tensor_stream/ops/sum.rb +4 -2
- data/lib/tensor_stream/ops/top_k.rb +23 -0
- data/lib/tensor_stream/session.rb +3 -0
- data/lib/tensor_stream/tensor_shape.rb +32 -1
- data/lib/tensor_stream/train/saver.rb +2 -2
- data/lib/tensor_stream/utils.rb +8 -0
- data/lib/tensor_stream/utils/py_ports.rb +11 -0
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/word_embeddings/word_embedding_1.rb +192 -0
- data/samples/word_embeddings/word_embedding_2.rb +203 -0
- data/tensor_stream.gemspec +3 -0
- metadata +40 -4
- data/samples/neural_networks/lstm.rb +0 -22
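
Among the files above are three new public ops (ops/rsqrt.rb, ops/strided_slice.rb, ops/top_k.rb) and an embedding_lookup helper used by the new word-embedding samples below. A minimal sketch of the new ops follows — the rsqrt and strided_slice signatures are assumed to mirror their TensorFlow namesakes, and the top_k call follows the usage in word_embedding_2.rb below:

    require "tensor_stream"

    ts = TensorStream
    sess = ts.session

    x = ts.constant([1.0, 4.0, 9.0, 16.0])

    # rsqrt computes 1 / sqrt(x) elementwise.
    puts sess.run(ts.rsqrt(x)).inspect                    # => [1.0, 0.5, 0.333.., 0.25]

    # top_k yields the k largest values and their indices.
    top_values, top_indices = ts.top_k(x, 2, sorted: true)
    puts sess.run([top_values, top_indices]).inspect      # => [[16.0, 9.0], [3, 2]]

    # strided_slice (assumed [begin, end, strides] form) takes every other element.
    puts sess.run(ts.strided_slice(x, [0], [4], [2])).inspect  # => [1.0, 9.0]
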
data/samples/word_embeddings/word_embedding_2.rb
ADDED
@@ -0,0 +1,203 @@
+#
+# A ruby port of https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer by Guillaume Chevalier
+#
+# This is a port so some weird python like conventions may have been left behind
+require "bundler/setup"
+require "tensor_stream"
+require "chakin-rb/chakin"
+# require 'pry-byebug'
+require 'zip'
+
+tf = TensorStream
+
+batch_size = nil # Any size is accepted
+word_representations_dimensions = 25 # Embedding of size (vocab_len, nb_dimensions)
+
+
+DATA_FOLDER = "embeddings"
+SUBFOLDER_NAME = "glove.twitter.27B"
+TF_EMBEDDING_FILE_NAME = "#{SUBFOLDER_NAME}.ckpt"
+SUFFIX = SUBFOLDER_NAME + "." + word_representations_dimensions.to_s
+TF_EMBEDDINGS_FILE_PATH = File.join(DATA_FOLDER, SUFFIX + "d.ckpt")
+DICT_WORD_TO_INDEX_FILE_NAME = File.join(DATA_FOLDER, SUFFIX + "d.json")
+
+# Load a `word_to_index` dict mapping words to their id, with a default value
+# of pointing to the last index when not found, which is the unknown word.
+def load_word_to_index(dict_word_to_index_file_name)
+  word_to_index = JSON.parse(File.read(dict_word_to_index_file_name))
+  _LAST_INDEX = word_to_index.size - 1
+  puts "word_to_index dict restored from '#{dict_word_to_index_file_name}'."
+  word_to_index = Hash.new(_LAST_INDEX).merge(word_to_index)
+  word_to_index
+end
+
+# Define the embedding tf.Variable and load it.
+def load_embedding_tf(sess, word_to_index, tf_embeddings_file_path, nb_dims)
+
+  # 1. Define the variable that will hold the embedding:
+  tf_embedding = TensorStream.variable(
+    TensorStream.constant(0.0, shape: [word_to_index.size-1, nb_dims]),
+    trainable: false,
+    name: "Embedding"
+  )
+
+  # 2. Restore the embedding from disks to TensorFlow, GPU (or CPU if GPU unavailable):
+  variables_to_restore = [tf_embedding]
+  embedding_saver = TensorStream::Train::Saver.new(variables_to_restore)
+  embedding_saver.restore(sess, tf_embeddings_file_path)
+  puts "TF embeddings restored from '#{tf_embeddings_file_path}'."
+
+  tf_embedding
+end
+
+
+# Returns the `cosine_similarity = cos(angle_between_a_and_b_in_space)`
+# for the two word A to all the words B.
+# The first input word must be a 1D Tensors (word_representation).
+# The second input words must be 2D Tensors (batch_size, word_representation).
+# The result is a tf tensor that must be fetched with `sess.run`.
+def cosine_similarity_tensorflow(tf_word_representation_A, tf_words_representation_B)
+  a_normalized = TensorStream.nn.l2_normalize(tf_word_representation_A, axis: -1)
+  b_normalized = TensorStream.nn.l2_normalize(tf_words_representation_B, axis: -1)
+  TensorStream.reduce_sum(
+    TensorStream.multiply(a_normalized, b_normalized),
+    axis: -1
+  )
+end
+
+# In case you didn't do the "%reset":
+tf.reset_default_graph
+sess = tf.session
+
+# Load the embedding matrix in tf
+word_to_index = load_word_to_index(
+  DICT_WORD_TO_INDEX_FILE_NAME)
+tf_embedding = load_embedding_tf(sess,
+  word_to_index,
+  TF_EMBEDDINGS_FILE_PATH,
+  word_representations_dimensions)
+
+
+# Input to the graph where word IDs can be sent in batch. Look at the "shape" args:
+@tf_word_A_id = tf.placeholder(:int32, shape: [1])
+@tf_words_B_ids = tf.placeholder(:int32, shape: [batch_size])
+
+# Conversion of words to a representation
+tf_word_representation_A = tf.nn.embedding_lookup(tf_embedding, @tf_word_A_id)
+tf_words_representation_B = tf.nn.embedding_lookup(tf_embedding, @tf_words_B_ids)
+
+# The graph output are the "cosine_similarities" which we want to fetch in sess.run(...).
+@cosine_similarities = cosine_similarity_tensorflow(tf_word_representation_A, tf_words_representation_B)
+
+print("Model created.")
+
+# Note: there might be a better way to split sentences for GloVe.
+# Please look at the documentation or open an issue to suggest a fix.
+def sentence_to_word_ids(sentence, word_to_index)
+  punctuation = ['.', '!', '?', ',', ':', ';', "'", '"', '(', ')']
+  # Separating punctuation from words:
+  punctuation.each do |punctuation_character|
+    sentence.gsub!(punctuation_character, " #{punctuation_character} ")
+  end
+  # Removing double spaces and lowercasing:
+  sentence = sentence.downcase.squeeze(" ").strip
+
+  # Splitting on every space:
+  split_sentence = sentence.split(" ")
+  ids = split_sentence.map { |w| word_to_index[w.strip] }
+  # Converting to IDs:
+  ids = split_sentence.map { |w| word_to_index[w.strip] }
+  [ids, split_sentence]
+end
+
+# Use the model in sess to predict cosine similarities.
+def predict_cosine_similarities(sess, word_to_index, word_A, words_B)
+  word_A_id, _ = sentence_to_word_ids(word_A, word_to_index)
+  words_B_ids, split_sentence = sentence_to_word_ids(words_B, word_to_index)
+
+  evaluated_cos_similarities = sess.run(
+    @cosine_similarities,
+    feed_dict: {
+      @tf_word_A_id => word_A_id,
+      @tf_words_B_ids => words_B_ids
+    }
+  )
+  [evaluated_cos_similarities, split_sentence]
+end
+
+word_A = "Science"
+words_B = "Hello internet, a vocano erupt like the bitcoin out of the blue and there is an unknownWord00!"
+
+evaluated_cos_similarities, splitted = predict_cosine_similarities(sess, word_to_index, word_A, words_B)
+
+puts "Cosine similarities with \"#{word_A}\":"
+splitted.zip(evaluated_cos_similarities).each do |word, similarity|
+  puts "  #{(word+":").ljust(15)}#{similarity}"
+end
+
+tf.reset_default_graph()
+
+
+# Transpose word_to_index dict:
+index_to_word = word_to_index.invert
+
+# New graph
+tf.reset_default_graph()
+sess = tf.session
+
+# Load the embedding matrix in tf
+tf_word_to_index = load_word_to_index(
+  DICT_WORD_TO_INDEX_FILE_NAME)
+
+tf_embedding = load_embedding_tf(sess,
+  tf_word_to_index,
+  TF_EMBEDDINGS_FILE_PATH,
+  word_representations_dimensions)
+
+# An input word
+tf_word_id = tf.placeholder(:int32, shape: [1])
+tf_word_representation = tf.nn.embedding_lookup(tf_embedding, tf_word_id)
+
+# An input
+tf_nb_similar_words_to_get = tf.placeholder(:int32)
+
+# Dot the word to every embedding
+tf_all_cosine_similarities = cosine_similarity_tensorflow(
+  tf_word_representation,
+  tf_embedding)
+
+# Getting the top cosine similarities.
+tf_top_cosine_similarities, tf_top_word_indices = tf.top_k(
+  tf_all_cosine_similarities,
+  tf_nb_similar_words_to_get + 1,
+  sorted: true
+)
+
+# Discard the first word because it's the input word itself:
+tf_top_cosine_similarities = tf_top_cosine_similarities[1..nil]
+tf_top_word_indices = tf_top_word_indices[1..nil]
+
+# Get the top words' representations by fetching
+# tf_top_words_representation = "tf_embedding[tf_top_word_indices]":
+tf_top_words_representation = tf.gather(tf_embedding, tf_top_word_indices)
+
+# Fetch 10 similar words:
+nb_similar_words_to_get = 10
+
+
+word = "king"
+word_id = word_to_index[word]
+
+top_cosine_similarities, top_word_indices, top_words_representation = sess.run(
+  [tf_top_cosine_similarities, tf_top_word_indices, tf_top_words_representation],
+  feed_dict: {
+    tf_word_id => [word_id],
+    tf_nb_similar_words_to_get => nb_similar_words_to_get
+  }
+)
+
+puts "Top similar words to \"#{word}\":\n"
+top_cosine_similarities.zip(top_word_indices).zip(top_words_representation).each do |w, word_repr|
+  cos_sim, word_id = w
+  puts "#{(index_to_word[word_id]+ ":").ljust(15)}#{(cos_sim.to_s + ",").ljust(15)}#{Vector::elements(word_repr).norm}"
+end
data/tensor_stream.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
   # to allow pushing to a single host or delete this section to allow pushing to any host.
   if spec.respond_to?(:metadata)
     spec.metadata["allowed_push_host"] = "https://rubygems.org"
+    spec.metadata["changelog_uri"] = "https://github.com/jedld/tensor_stream/blob/master/CHANGELOG.md"
   else
     raise "RubyGems 2.0 or newer is required to protect against " \
       "public gem pushes."
@@ -42,8 +43,10 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "colorize"
   spec.add_development_dependency "rspec_junit_formatter"
   spec.add_development_dependency "mnist-learn"
+  spec.add_development_dependency "chakin-rb"
   spec.add_development_dependency "simplecov"
   spec.add_development_dependency "standard"
+  spec.add_development_dependency "rubyzip"
   spec.add_dependency "deep_merge"
   spec.add_dependency "concurrent-ruby"
   spec.add_dependency "chunky_png"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream
 version: !ruby/object:Gem::Version
-  version: 1.0.6
+  version: 1.0.7
 platform: ruby
 authors:
 - Joseph Emmanuel Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-
+date: 2019-04-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -164,6 +164,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: chakin-rb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: simplecov
   requirement: !ruby/object:Gem::Requirement
@@ -192,6 +206,20 @@ dependencies:
     - - ">="
      - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: rubyzip
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: deep_merge
   requirement: !ruby/object:Gem::Requirement
@@ -304,12 +332,14 @@ files:
 - lib/tensor_stream/helpers/tensor_mixins.rb
 - lib/tensor_stream/images.rb
 - lib/tensor_stream/initializer.rb
+- lib/tensor_stream/math/math_ops.rb
 - lib/tensor_stream/math_gradients.rb
 - lib/tensor_stream/monkey_patches/array.rb
 - lib/tensor_stream/monkey_patches/float.rb
 - lib/tensor_stream/monkey_patches/integer.rb
 - lib/tensor_stream/monkey_patches/op_patch.rb
 - lib/tensor_stream/monkey_patches/patch.rb
+- lib/tensor_stream/nn/embedding_lookup.rb
 - lib/tensor_stream/nn/nn_ops.rb
 - lib/tensor_stream/op_maker.rb
 - lib/tensor_stream/operation.rb
@@ -349,16 +379,19 @@ files:
 - lib/tensor_stream/ops/rank.rb
 - lib/tensor_stream/ops/reshape.rb
 - lib/tensor_stream/ops/round.rb
+- lib/tensor_stream/ops/rsqrt.rb
 - lib/tensor_stream/ops/shape.rb
 - lib/tensor_stream/ops/sigmoid.rb
 - lib/tensor_stream/ops/sign.rb
 - lib/tensor_stream/ops/sin.rb
 - lib/tensor_stream/ops/size.rb
+- lib/tensor_stream/ops/strided_slice.rb
 - lib/tensor_stream/ops/sub.rb
 - lib/tensor_stream/ops/sum.rb
 - lib/tensor_stream/ops/tan.rb
 - lib/tensor_stream/ops/tanh.rb
 - lib/tensor_stream/ops/tile.rb
+- lib/tensor_stream/ops/top_k.rb
 - lib/tensor_stream/ops/zeros.rb
 - lib/tensor_stream/placeholder.rb
 - lib/tensor_stream/profile/report_tool.rb
@@ -381,25 +414,28 @@ files:
 - lib/tensor_stream/utils.rb
 - lib/tensor_stream/utils/data_type_utils.rb
 - lib/tensor_stream/utils/freezer.rb
+- lib/tensor_stream/utils/py_ports.rb
 - lib/tensor_stream/variable.rb
 - lib/tensor_stream/variable_scope.rb
 - lib/tensor_stream/version.rb
 - samples/datasets/iris.data
 - samples/jupyter_notebooks/linear_regression.ipynb
 - samples/neural_networks/iris.rb
-- samples/neural_networks/lstm.rb
 - samples/neural_networks/mnist_data.rb
 - samples/neural_networks/raw_neural_net_sample.rb
 - samples/neural_networks/rnn.rb
 - samples/others/nearest_neighbor.rb
 - samples/regression/linear_regression.rb
 - samples/regression/logistic_regression.rb
+- samples/word_embeddings/word_embedding_1.rb
+- samples/word_embeddings/word_embedding_2.rb
 - tensor_stream.gemspec
 homepage: http://www.github.com/jedld/tensor_stream
 licenses:
 - MIT
 metadata:
   allowed_push_host: https://rubygems.org
+  changelog_uri: https://github.com/jedld/tensor_stream/blob/master/CHANGELOG.md
 post_install_message:
 rdoc_options: []
 require_paths:
@@ -415,7 +451,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.
+rubygems_version: 3.0.1
 signing_key:
 specification_version: 4
 summary: A Pure ruby tensorflow implementation
data/samples/neural_networks/lstm.rb
DELETED
@@ -1,22 +0,0 @@
-# A ruby port of the example code discussed by Martin Gorner in
-# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
-#
-# https://www.youtube.com/watch?v=u4alGiomYP4
-#
-# Requirements:
-#   mnist-learn gem
-#   opencl_ruby_ffi gem
-require "bundler/setup"
-require "tensor_stream"
-require "mnist-learn"
-
-# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
-# gem install tensor_stream-opencl
-require 'tensor_stream/opencl'
-
-tf = TensorStream
-
-# Import MNIST data
-puts "downloading minst data"
-mnist = Mnist.read_data_sets("/tmp/data", one_hot: true)
-puts "downloading finished"