tensor_stream 1.0.6 → 1.0.7
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +10 -3
- data/lib/tensor_stream.rb +1 -0
- data/lib/tensor_stream/evaluator/base_evaluator.rb +6 -0
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +60 -0
- data/lib/tensor_stream/evaluator/ruby/array_ops.rb +53 -1
- data/lib/tensor_stream/evaluator/ruby/math_ops.rb +42 -5
- data/lib/tensor_stream/generated_stub/ops.rb +61 -5
- data/lib/tensor_stream/helpers/tensor_mixins.rb +10 -1
- data/lib/tensor_stream/math/math_ops.rb +22 -0
- data/lib/tensor_stream/math_gradients.rb +15 -1
- data/lib/tensor_stream/nn/embedding_lookup.rb +114 -0
- data/lib/tensor_stream/nn/nn_ops.rb +3 -0
- data/lib/tensor_stream/op_maker.rb +15 -3
- data/lib/tensor_stream/ops.rb +12 -0
- data/lib/tensor_stream/ops/rsqrt.rb +11 -0
- data/lib/tensor_stream/ops/strided_slice.rb +24 -0
- data/lib/tensor_stream/ops/sum.rb +4 -2
- data/lib/tensor_stream/ops/top_k.rb +23 -0
- data/lib/tensor_stream/session.rb +3 -0
- data/lib/tensor_stream/tensor_shape.rb +32 -1
- data/lib/tensor_stream/train/saver.rb +2 -2
- data/lib/tensor_stream/utils.rb +8 -0
- data/lib/tensor_stream/utils/py_ports.rb +11 -0
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/word_embeddings/word_embedding_1.rb +192 -0
- data/samples/word_embeddings/word_embedding_2.rb +203 -0
- data/tensor_stream.gemspec +3 -0
- metadata +40 -4
- data/samples/neural_networks/lstm.rb +0 -22
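For orientation, the headline additions in 1.0.7 are new ops (`rsqrt`, `strided_slice`, `top_k`), an `embedding_lookup` implementation under `tf.nn`, and two word-embedding samples that exercise them. A minimal sketch of how the new ops are called, assuming the generated stubs follow the gem's usual TensorFlow v1 naming (the exact signatures live in the files listed above):

```ruby
require "tensor_stream"

tf = TensorStream
sess = tf.session

# rsqrt: element-wise reciprocal square root, 1 / sqrt(x)
x = tf.constant([1.0, 4.0, 16.0])
puts sess.run(tf.rsqrt(x)).inspect # => [1.0, 0.5, 0.25]

# top_k: values and indices of the k largest entries, largest first
values, indices = tf.top_k(tf.constant([1.0, 3.0, 2.0]), 2, sorted: true)
puts sess.run([values, indices]).inspect # values [3.0, 2.0], indices [1, 2]

# embedding_lookup: gather rows of an embedding matrix by integer id
embeddings = tf.constant([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
puts sess.run(tf.nn.embedding_lookup(embeddings, [2, 0])).inspect
# rows 2 and 0: [[0.5, 0.6], [0.1, 0.2]]
```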
data/samples/word_embeddings/word_embedding_2.rb
ADDED
@@ -0,0 +1,203 @@
+#
+# A ruby port of https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer by Guillaume Chevalier
+#
+# This is a port, so some weird Python-like conventions may have been left behind
+require "bundler/setup"
+require "tensor_stream"
+require "chakin-rb/chakin"
+# require 'pry-byebug'
+require 'zip'
+
+tf = TensorStream
+
+batch_size = nil # Any size is accepted
+word_representations_dimensions = 25 # Embedding of size (vocab_len, nb_dimensions)
+
+
+DATA_FOLDER = "embeddings"
+SUBFOLDER_NAME = "glove.twitter.27B"
+TF_EMBEDDING_FILE_NAME = "#{SUBFOLDER_NAME}.ckpt"
+SUFFIX = SUBFOLDER_NAME + "." + word_representations_dimensions.to_s
+TF_EMBEDDINGS_FILE_PATH = File.join(DATA_FOLDER, SUFFIX + "d.ckpt")
+DICT_WORD_TO_INDEX_FILE_NAME = File.join(DATA_FOLDER, SUFFIX + "d.json")
+
+# Load a `word_to_index` dict mapping words to their id, with a default value
+# of pointing to the last index when not found, which is the unknown word.
+def load_word_to_index(dict_word_to_index_file_name)
+  word_to_index = JSON.parse(File.read(dict_word_to_index_file_name))
+  _LAST_INDEX = word_to_index.size - 1
+  puts "word_to_index dict restored from '#{dict_word_to_index_file_name}'."
+  word_to_index = Hash.new(_LAST_INDEX).merge(word_to_index)
+  word_to_index
+end
+
+# Define the embedding tf.Variable and load it.
+def load_embedding_tf(sess, word_to_index, tf_embeddings_file_path, nb_dims)
+
+  # 1. Define the variable that will hold the embedding:
+  tf_embedding = TensorStream.variable(
+    TensorStream.constant(0.0, shape: [word_to_index.size - 1, nb_dims]),
+    trainable: false,
+    name: "Embedding"
+  )
+
+  # 2. Restore the embedding from disk to TensorFlow, GPU (or CPU if GPU unavailable):
+  variables_to_restore = [tf_embedding]
+  embedding_saver = TensorStream::Train::Saver.new(variables_to_restore)
+  embedding_saver.restore(sess, tf_embeddings_file_path)
+  puts "TF embeddings restored from '#{tf_embeddings_file_path}'."
+
+  tf_embedding
+end
+
+
+# Returns the `cosine_similarity = cos(angle_between_a_and_b_in_space)`
+# for the word A against all the words B.
+# The first input must be a 1D Tensor (word_representation).
+# The second input must be a 2D Tensor (batch_size, word_representation).
+# The result is a tf tensor that must be fetched with `sess.run`.
+def cosine_similarity_tensorflow(tf_word_representation_A, tf_words_representation_B)
+  a_normalized = TensorStream.nn.l2_normalize(tf_word_representation_A, axis: -1)
+  b_normalized = TensorStream.nn.l2_normalize(tf_words_representation_B, axis: -1)
+  TensorStream.reduce_sum(
+    TensorStream.multiply(a_normalized, b_normalized),
+    axis: -1
+  )
+end
+
+# In case you didn't do the "%reset":
+tf.reset_default_graph
+sess = tf.session
+
+# Load the embedding matrix in tf
+word_to_index = load_word_to_index(
+  DICT_WORD_TO_INDEX_FILE_NAME)
+tf_embedding = load_embedding_tf(sess,
+  word_to_index,
+  TF_EMBEDDINGS_FILE_PATH,
+  word_representations_dimensions)
+
+
+# Input to the graph where word IDs can be sent in batch. Look at the "shape" args:
+@tf_word_A_id = tf.placeholder(:int32, shape: [1])
+@tf_words_B_ids = tf.placeholder(:int32, shape: [batch_size])
+
+# Conversion of words to a representation
+tf_word_representation_A = tf.nn.embedding_lookup(tf_embedding, @tf_word_A_id)
+tf_words_representation_B = tf.nn.embedding_lookup(tf_embedding, @tf_words_B_ids)
+
+# The graph output is the "cosine_similarities", which we want to fetch in sess.run(...).
+@cosine_similarities = cosine_similarity_tensorflow(tf_word_representation_A, tf_words_representation_B)
+
+print("Model created.")
+
+# Note: there might be a better way to split sentences for GloVe.
+# Please look at the documentation or open an issue to suggest a fix.
+def sentence_to_word_ids(sentence, word_to_index)
+  punctuation = ['.', '!', '?', ',', ':', ';', "'", '"', '(', ')']
+  # Separating punctuation from words:
+  punctuation.each do |punctuation_character|
+    sentence.gsub!(punctuation_character, " #{punctuation_character} ")
+  end
+  # Removing double spaces and lowercasing:
+  sentence = sentence.downcase.squeeze(" ").strip
+
+  # Splitting on every space:
+  split_sentence = sentence.split(" ")
+  # Converting to IDs:
+  ids = split_sentence.map { |w| word_to_index[w.strip] }
+  [ids, split_sentence]
+end
+
+# Use the model in sess to predict cosine similarities.
+def predict_cosine_similarities(sess, word_to_index, word_A, words_B)
+  word_A_id, _ = sentence_to_word_ids(word_A, word_to_index)
+  words_B_ids, split_sentence = sentence_to_word_ids(words_B, word_to_index)
+
+  evaluated_cos_similarities = sess.run(
+    @cosine_similarities,
+    feed_dict: {
+      @tf_word_A_id => word_A_id,
+      @tf_words_B_ids => words_B_ids
+    }
+  )
+  [evaluated_cos_similarities, split_sentence]
+end
+
+word_A = "Science"
+words_B = "Hello internet, a vocano erupt like the bitcoin out of the blue and there is an unknownWord00!"
+
+evaluated_cos_similarities, splitted = predict_cosine_similarities(sess, word_to_index, word_A, words_B)
+
+puts "Cosine similarities with \"#{word_A}\":"
+splitted.zip(evaluated_cos_similarities).each do |word, similarity|
+  puts "  #{(word + ":").ljust(15)}#{similarity}"
+end
+
+tf.reset_default_graph()
+
+
+# Transpose word_to_index dict:
+index_to_word = word_to_index.invert
+
+# New graph
+tf.reset_default_graph()
+sess = tf.session
+
+# Load the embedding matrix in tf
+tf_word_to_index = load_word_to_index(
+  DICT_WORD_TO_INDEX_FILE_NAME)
+
+tf_embedding = load_embedding_tf(sess,
+  tf_word_to_index,
+  TF_EMBEDDINGS_FILE_PATH,
+  word_representations_dimensions)
+
+# An input word
+tf_word_id = tf.placeholder(:int32, shape: [1])
+tf_word_representation = tf.nn.embedding_lookup(tf_embedding, tf_word_id)
+
+# An input
+tf_nb_similar_words_to_get = tf.placeholder(:int32)
+
+# Dot the word to every embedding
+tf_all_cosine_similarities = cosine_similarity_tensorflow(
+  tf_word_representation,
+  tf_embedding)
+
+# Getting the top cosine similarities.
+tf_top_cosine_similarities, tf_top_word_indices = tf.top_k(
+  tf_all_cosine_similarities,
+  tf_nb_similar_words_to_get + 1,
+  sorted: true
+)
+
+# Discard the first word because it's the input word itself:
+tf_top_cosine_similarities = tf_top_cosine_similarities[1..nil]
+tf_top_word_indices = tf_top_word_indices[1..nil]
+
+# Get the top words' representations by fetching
+# tf_top_words_representation = "tf_embedding[tf_top_word_indices]":
+tf_top_words_representation = tf.gather(tf_embedding, tf_top_word_indices)
+
+# Fetch 10 similar words:
+nb_similar_words_to_get = 10
+
+
+word = "king"
+word_id = word_to_index[word]
+
+top_cosine_similarities, top_word_indices, top_words_representation = sess.run(
+  [tf_top_cosine_similarities, tf_top_word_indices, tf_top_words_representation],
+  feed_dict: {
+    tf_word_id => [word_id],
+    tf_nb_similar_words_to_get => nb_similar_words_to_get
+  }
+)
+
+puts "Top similar words to \"#{word}\":\n"
+top_cosine_similarities.zip(top_word_indices).zip(top_words_representation).each do |w, word_repr|
+  cos_sim, word_id = w
+  puts "#{(index_to_word[word_id] + ":").ljust(15)}#{(cos_sim.to_s + ",").ljust(15)}#{Vector::elements(word_repr).norm}"
+end
data/tensor_stream.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
   # to allow pushing to a single host or delete this section to allow pushing to any host.
   if spec.respond_to?(:metadata)
     spec.metadata["allowed_push_host"] = "https://rubygems.org"
+    spec.metadata["changelog_uri"] = "https://github.com/jedld/tensor_stream/blob/master/CHANGELOG.md"
   else
     raise "RubyGems 2.0 or newer is required to protect against " \
       "public gem pushes."
@@ -42,8 +43,10 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "colorize"
   spec.add_development_dependency "rspec_junit_formatter"
   spec.add_development_dependency "mnist-learn"
+  spec.add_development_dependency "chakin-rb"
   spec.add_development_dependency "simplecov"
   spec.add_development_dependency "standard"
+  spec.add_development_dependency "rubyzip"
   spec.add_dependency "deep_merge"
   spec.add_dependency "concurrent-ruby"
   spec.add_dependency "chunky_png"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream
 version: !ruby/object:Gem::Version
-  version: 1.0.6
+  version: 1.0.7
 platform: ruby
 authors:
 - Joseph Emmanuel Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-
+date: 2019-04-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -164,6 +164,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: chakin-rb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: simplecov
   requirement: !ruby/object:Gem::Requirement
@@ -192,6 +206,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: rubyzip
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: deep_merge
   requirement: !ruby/object:Gem::Requirement
@@ -304,12 +332,14 @@ files:
 - lib/tensor_stream/helpers/tensor_mixins.rb
 - lib/tensor_stream/images.rb
 - lib/tensor_stream/initializer.rb
+- lib/tensor_stream/math/math_ops.rb
 - lib/tensor_stream/math_gradients.rb
 - lib/tensor_stream/monkey_patches/array.rb
 - lib/tensor_stream/monkey_patches/float.rb
 - lib/tensor_stream/monkey_patches/integer.rb
 - lib/tensor_stream/monkey_patches/op_patch.rb
 - lib/tensor_stream/monkey_patches/patch.rb
+- lib/tensor_stream/nn/embedding_lookup.rb
 - lib/tensor_stream/nn/nn_ops.rb
 - lib/tensor_stream/op_maker.rb
 - lib/tensor_stream/operation.rb
@@ -349,16 +379,19 @@ files:
 - lib/tensor_stream/ops/rank.rb
 - lib/tensor_stream/ops/reshape.rb
 - lib/tensor_stream/ops/round.rb
+- lib/tensor_stream/ops/rsqrt.rb
 - lib/tensor_stream/ops/shape.rb
 - lib/tensor_stream/ops/sigmoid.rb
 - lib/tensor_stream/ops/sign.rb
 - lib/tensor_stream/ops/sin.rb
 - lib/tensor_stream/ops/size.rb
+- lib/tensor_stream/ops/strided_slice.rb
 - lib/tensor_stream/ops/sub.rb
 - lib/tensor_stream/ops/sum.rb
 - lib/tensor_stream/ops/tan.rb
 - lib/tensor_stream/ops/tanh.rb
 - lib/tensor_stream/ops/tile.rb
+- lib/tensor_stream/ops/top_k.rb
 - lib/tensor_stream/ops/zeros.rb
 - lib/tensor_stream/placeholder.rb
 - lib/tensor_stream/profile/report_tool.rb
@@ -381,25 +414,28 @@ files:
 - lib/tensor_stream/utils.rb
 - lib/tensor_stream/utils/data_type_utils.rb
 - lib/tensor_stream/utils/freezer.rb
+- lib/tensor_stream/utils/py_ports.rb
 - lib/tensor_stream/variable.rb
 - lib/tensor_stream/variable_scope.rb
 - lib/tensor_stream/version.rb
 - samples/datasets/iris.data
 - samples/jupyter_notebooks/linear_regression.ipynb
 - samples/neural_networks/iris.rb
-- samples/neural_networks/lstm.rb
 - samples/neural_networks/mnist_data.rb
 - samples/neural_networks/raw_neural_net_sample.rb
 - samples/neural_networks/rnn.rb
 - samples/others/nearest_neighbor.rb
 - samples/regression/linear_regression.rb
 - samples/regression/logistic_regression.rb
+- samples/word_embeddings/word_embedding_1.rb
+- samples/word_embeddings/word_embedding_2.rb
 - tensor_stream.gemspec
 homepage: http://www.github.com/jedld/tensor_stream
 licenses:
 - MIT
 metadata:
   allowed_push_host: https://rubygems.org
+  changelog_uri: https://github.com/jedld/tensor_stream/blob/master/CHANGELOG.md
 post_install_message:
 rdoc_options: []
 require_paths:
@@ -415,7 +451,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.0.
+rubygems_version: 3.0.1
 signing_key:
 specification_version: 4
 summary: A Pure ruby tensorflow implementation
data/samples/neural_networks/lstm.rb
DELETED
@@ -1,22 +0,0 @@
-# A ruby port of the example code discussed by Martin Gorner in
-# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)"
-#
-# https://www.youtube.com/watch?v=u4alGiomYP4
-#
-# Requirements:
-#   mnist-learn gem
-#   opencl_ruby_ffi gem
-require "bundler/setup"
-require "tensor_stream"
-require "mnist-learn"
-
-# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
-# gem install tensor_stream-opencl
-require 'tensor_stream/opencl'
-
-tf = TensorStream
-
-# Import MNIST data
-puts "downloading minst data"
-mnist = Mnist.read_data_sets("/tmp/data", one_hot: true)
-puts "downloading finished"