tensor_stream 1.0.6 → 1.0.7

@@ -0,0 +1,203 @@
+ #
+ # A Ruby port of https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer by Guillaume Chevalier
+ #
+ # This is a port, so some Python-like conventions may have been left behind.
+ require "bundler/setup"
+ require "tensor_stream"
+ require "chakin-rb/chakin"
+ # require 'pry-byebug'
+ require 'zip'
+ require 'json'   # JSON.parse is used to load the word-to-index dict
+ require 'matrix' # Vector#norm is used when printing the top similar words
+
+ tf = TensorStream
+
+ batch_size = nil # Any size is accepted
+ word_representations_dimensions = 25 # Embedding of size (vocab_len, nb_dimensions)
+
+ DATA_FOLDER = "embeddings"
+ SUBFOLDER_NAME = "glove.twitter.27B"
+ TF_EMBEDDING_FILE_NAME = "#{SUBFOLDER_NAME}.ckpt"
+ SUFFIX = SUBFOLDER_NAME + "." + word_representations_dimensions.to_s
+ TF_EMBEDDINGS_FILE_PATH = File.join(DATA_FOLDER, SUFFIX + "d.ckpt")
+ DICT_WORD_TO_INDEX_FILE_NAME = File.join(DATA_FOLDER, SUFFIX + "d.json")
+
+ # Load a `word_to_index` dict mapping words to their id, with a default
+ # value pointing to the last index when a word is not found (the unknown word).
+ def load_word_to_index(dict_word_to_index_file_name)
+   word_to_index = JSON.parse(File.read(dict_word_to_index_file_name))
+   _LAST_INDEX = word_to_index.size - 1
+   puts "word_to_index dict restored from '#{dict_word_to_index_file_name}'."
+   word_to_index = Hash.new(_LAST_INDEX).merge(word_to_index)
+   word_to_index
+ end
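+ # Note: Hash.new(default) returns the default for keys it does not contain,
+ # e.g. Hash.new(3)["unseen"] #=> 3, so any out-of-vocabulary word resolves
+ # to the last embedding row, the one reserved for the unknown word.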
+
+ # Define the embedding tf.Variable and load it.
+ def load_embedding_tf(sess, word_to_index, tf_embeddings_file_path, nb_dims)
+   # 1. Define the variable that will hold the embedding:
+   tf_embedding = TensorStream.variable(
+     TensorStream.constant(0.0, shape: [word_to_index.size - 1, nb_dims]),
+     trainable: false,
+     name: "Embedding"
+   )
+
+   # 2. Restore the embedding from disk into the session (GPU, or CPU if no
+   # GPU backend is available):
+   variables_to_restore = [tf_embedding]
+   embedding_saver = TensorStream::Train::Saver.new(variables_to_restore)
+   embedding_saver.restore(sess, tf_embeddings_file_path)
+   puts "TF embeddings restored from '#{tf_embeddings_file_path}'."
+
+   tf_embedding
+ end
+
+ # Returns the `cosine_similarity = cos(angle_between_a_and_b_in_space)`
+ # for word A against each of the words B.
+ # The first input must be a 1D tensor (word_representation).
+ # The second input must be a 2D tensor (batch_size, word_representation).
+ # The result is a tf tensor that must be fetched with `sess.run`.
+ def cosine_similarity_tensorflow(tf_word_representation_A, tf_words_representation_B)
+   a_normalized = TensorStream.nn.l2_normalize(tf_word_representation_A, axis: -1)
+   b_normalized = TensorStream.nn.l2_normalize(tf_words_representation_B, axis: -1)
+   TensorStream.reduce_sum(
+     TensorStream.multiply(a_normalized, b_normalized),
+     axis: -1
+   )
+ end
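+ # Equivalently: cos(a, b) = (a . b) / (||a|| * ||b||). L2-normalizing both
+ # inputs first reduces the similarity to a plain dot product, which the
+ # element-wise multiply followed by reduce_sum along the last axis computes.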
+
+ # In case you didn't do the "%reset":
+ tf.reset_default_graph
+ sess = tf.session
+
+ # Load the embedding matrix in tf
+ word_to_index = load_word_to_index(DICT_WORD_TO_INDEX_FILE_NAME)
+ tf_embedding = load_embedding_tf(sess,
+                                  word_to_index,
+                                  TF_EMBEDDINGS_FILE_PATH,
+                                  word_representations_dimensions)
+
+ # Input to the graph where word IDs can be sent in batch. Look at the "shape" args:
+ @tf_word_A_id = tf.placeholder(:int32, shape: [1])
+ @tf_words_B_ids = tf.placeholder(:int32, shape: [batch_size])
+
+ # Conversion of words to a representation
+ tf_word_representation_A = tf.nn.embedding_lookup(tf_embedding, @tf_word_A_id)
+ tf_words_representation_B = tf.nn.embedding_lookup(tf_embedding, @tf_words_B_ids)
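+ # (embedding_lookup(params, ids) selects the rows of params indexed by ids,
+ # i.e. the GloVe vectors for the given word IDs.)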
+
+ # The graph outputs are the "cosine_similarities", which we fetch with sess.run(...):
+ @cosine_similarities = cosine_similarity_tensorflow(tf_word_representation_A, tf_words_representation_B)
+
+ puts "Model created."
+
+ # Note: there might be a better way to split sentences for GloVe.
+ # Please look at the documentation or open an issue to suggest a fix.
+ def sentence_to_word_ids(sentence, word_to_index)
+   punctuation = ['.', '!', '?', ',', ':', ';', "'", '"', '(', ')']
+   # Separate punctuation from words:
+   punctuation.each do |punctuation_character|
+     sentence.gsub!(punctuation_character, " #{punctuation_character} ")
+   end
+   # Remove double spaces and lowercase:
+   sentence = sentence.downcase.squeeze(" ").strip
+
+   # Split on every space:
+   split_sentence = sentence.split(" ")
+   # Convert words to IDs:
+   ids = split_sentence.map { |w| word_to_index[w.strip] }
+   [ids, split_sentence]
+ end
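+ # For example, sentence_to_word_ids("Hello, world!", word_to_index) splits
+ # the input into ["hello", ",", "world", "!"] and maps each token to its ID,
+ # unseen tokens falling back to the unknown-word index via the Hash default.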
+
+ # Use the model in sess to predict cosine similarities.
+ def predict_cosine_similarities(sess, word_to_index, word_A, words_B)
+   word_A_id, _ = sentence_to_word_ids(word_A, word_to_index)
+   words_B_ids, split_sentence = sentence_to_word_ids(words_B, word_to_index)
+
+   evaluated_cos_similarities = sess.run(
+     @cosine_similarities,
+     feed_dict: {
+       @tf_word_A_id => word_A_id,
+       @tf_words_B_ids => words_B_ids
+     }
+   )
+   [evaluated_cos_similarities, split_sentence]
+ end
+
+ word_A = "Science"
+ words_B = "Hello internet, a vocano erupt like the bitcoin out of the blue and there is an unknownWord00!"
+
+ evaluated_cos_similarities, splitted = predict_cosine_similarities(sess, word_to_index, word_A, words_B)
+
+ puts "Cosine similarities with \"#{word_A}\":"
+ splitted.zip(evaluated_cos_similarities).each do |word, similarity|
+   puts "  #{(word + ":").ljust(15)}#{similarity}"
+ end
+
+ tf.reset_default_graph
+
+ # Invert the word_to_index dict to map IDs back to words:
+ index_to_word = word_to_index.invert
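+ # (Note that Hash#invert drops the Hash.new default, so index_to_word has
+ # no unknown-word fallback; look-ups are by exact index only.)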
+
+ # New graph
+ tf.reset_default_graph
+ sess = tf.session
+
+ # Load the embedding matrix in tf
+ tf_word_to_index = load_word_to_index(DICT_WORD_TO_INDEX_FILE_NAME)
+ tf_embedding = load_embedding_tf(sess,
+                                  tf_word_to_index,
+                                  TF_EMBEDDINGS_FILE_PATH,
+                                  word_representations_dimensions)
+
+ # An input word
+ tf_word_id = tf.placeholder(:int32, shape: [1])
+ tf_word_representation = tf.nn.embedding_lookup(tf_embedding, tf_word_id)
+
+ # An input: how many similar words to fetch
+ tf_nb_similar_words_to_get = tf.placeholder(:int32)
+
+ # Dot the word against every embedding
+ tf_all_cosine_similarities = cosine_similarity_tensorflow(
+   tf_word_representation,
+   tf_embedding)
+
+ # Get the top cosine similarities; fetch one extra because the top match
+ # will be the input word itself:
+ tf_top_cosine_similarities, tf_top_word_indices = tf.top_k(
+   tf_all_cosine_similarities,
+   tf_nb_similar_words_to_get + 1,
+   sorted: true
+ )
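+ # For example, top_k([0.1, 0.9, 0.4], 2) yields values [0.9, 0.4] and
+ # indices [1, 2]: the largest entries in descending order.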
+
+ # Discard the first word because it's the input word itself:
+ tf_top_cosine_similarities = tf_top_cosine_similarities[1..nil]
+ tf_top_word_indices = tf_top_word_indices[1..nil]
+
+ # Get the top words' representations by fetching
+ # tf_top_words_representation = "tf_embedding[tf_top_word_indices]":
+ tf_top_words_representation = tf.gather(tf_embedding, tf_top_word_indices)
+
+ # Fetch 10 similar words:
+ nb_similar_words_to_get = 10
+
+ word = "king"
+ word_id = word_to_index[word]
+
+ top_cosine_similarities, top_word_indices, top_words_representation = sess.run(
+   [tf_top_cosine_similarities, tf_top_word_indices, tf_top_words_representation],
+   feed_dict: {
+     tf_word_id => [word_id],
+     tf_nb_similar_words_to_get => nb_similar_words_to_get
+   }
+ )
+
+ puts "Top similar words to \"#{word}\":"
+ top_cosine_similarities.zip(top_word_indices).zip(top_words_representation).each do |w, word_repr|
+   cos_sim, word_id = w
+   puts "#{(index_to_word[word_id] + ":").ljust(15)}#{(cos_sim.to_s + ",").ljust(15)}#{Vector.elements(word_repr).norm}"
+ end
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
    # to allow pushing to a single host or delete this section to allow pushing to any host.
    if spec.respond_to?(:metadata)
      spec.metadata["allowed_push_host"] = "https://rubygems.org"
+     spec.metadata["changelog_uri"] = "https://github.com/jedld/tensor_stream/blob/master/CHANGELOG.md"
    else
      raise "RubyGems 2.0 or newer is required to protect against " \
        "public gem pushes."
@@ -42,8 +43,10 @@ Gem::Specification.new do |spec|
    spec.add_development_dependency "colorize"
    spec.add_development_dependency "rspec_junit_formatter"
    spec.add_development_dependency "mnist-learn"
+   spec.add_development_dependency "chakin-rb"
    spec.add_development_dependency "simplecov"
    spec.add_development_dependency "standard"
+   spec.add_development_dependency "rubyzip"
    spec.add_dependency "deep_merge"
    spec.add_dependency "concurrent-ruby"
    spec.add_dependency "chunky_png"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tensor_stream
  version: !ruby/object:Gem::Version
-   version: 1.0.6
+   version: 1.0.7
  platform: ruby
  authors:
  - Joseph Emmanuel Dayo
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-03-23 00:00:00.000000000 Z
+ date: 2019-04-08 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -164,6 +164,20 @@ dependencies:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+ - !ruby/object:Gem::Dependency
+   name: chakin-rb
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
  - !ruby/object:Gem::Dependency
    name: simplecov
    requirement: !ruby/object:Gem::Requirement
@@ -192,6 +206,20 @@ dependencies:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rubyzip
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
  - !ruby/object:Gem::Dependency
    name: deep_merge
    requirement: !ruby/object:Gem::Requirement
@@ -304,12 +332,14 @@ files:
  - lib/tensor_stream/helpers/tensor_mixins.rb
  - lib/tensor_stream/images.rb
  - lib/tensor_stream/initializer.rb
+ - lib/tensor_stream/math/math_ops.rb
  - lib/tensor_stream/math_gradients.rb
  - lib/tensor_stream/monkey_patches/array.rb
  - lib/tensor_stream/monkey_patches/float.rb
  - lib/tensor_stream/monkey_patches/integer.rb
  - lib/tensor_stream/monkey_patches/op_patch.rb
  - lib/tensor_stream/monkey_patches/patch.rb
+ - lib/tensor_stream/nn/embedding_lookup.rb
  - lib/tensor_stream/nn/nn_ops.rb
  - lib/tensor_stream/op_maker.rb
  - lib/tensor_stream/operation.rb
@@ -349,16 +379,19 @@ files:
  - lib/tensor_stream/ops/rank.rb
  - lib/tensor_stream/ops/reshape.rb
  - lib/tensor_stream/ops/round.rb
+ - lib/tensor_stream/ops/rsqrt.rb
  - lib/tensor_stream/ops/shape.rb
  - lib/tensor_stream/ops/sigmoid.rb
  - lib/tensor_stream/ops/sign.rb
  - lib/tensor_stream/ops/sin.rb
  - lib/tensor_stream/ops/size.rb
+ - lib/tensor_stream/ops/strided_slice.rb
  - lib/tensor_stream/ops/sub.rb
  - lib/tensor_stream/ops/sum.rb
  - lib/tensor_stream/ops/tan.rb
  - lib/tensor_stream/ops/tanh.rb
  - lib/tensor_stream/ops/tile.rb
+ - lib/tensor_stream/ops/top_k.rb
  - lib/tensor_stream/ops/zeros.rb
  - lib/tensor_stream/placeholder.rb
  - lib/tensor_stream/profile/report_tool.rb
@@ -381,25 +414,28 @@ files:
  - lib/tensor_stream/utils.rb
  - lib/tensor_stream/utils/data_type_utils.rb
  - lib/tensor_stream/utils/freezer.rb
+ - lib/tensor_stream/utils/py_ports.rb
  - lib/tensor_stream/variable.rb
  - lib/tensor_stream/variable_scope.rb
  - lib/tensor_stream/version.rb
  - samples/datasets/iris.data
  - samples/jupyter_notebooks/linear_regression.ipynb
  - samples/neural_networks/iris.rb
- - samples/neural_networks/lstm.rb
  - samples/neural_networks/mnist_data.rb
  - samples/neural_networks/raw_neural_net_sample.rb
  - samples/neural_networks/rnn.rb
  - samples/others/nearest_neighbor.rb
  - samples/regression/linear_regression.rb
  - samples/regression/logistic_regression.rb
+ - samples/word_embeddings/word_embedding_1.rb
+ - samples/word_embeddings/word_embedding_2.rb
  - tensor_stream.gemspec
  homepage: http://www.github.com/jedld/tensor_stream
  licenses:
  - MIT
  metadata:
    allowed_push_host: https://rubygems.org
+   changelog_uri: https://github.com/jedld/tensor_stream/blob/master/CHANGELOG.md
  post_install_message:
  rdoc_options: []
  require_paths:
@@ -415,7 +451,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubygems_version: 3.0.3
+ rubygems_version: 3.0.1
  signing_key:
  specification_version: 4
  summary: A Pure ruby tensorflow implementation
@@ -1,22 +0,0 @@
- # A ruby port of the example code discussed by Martin Gorner in
- # "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)"
- #
- # https://www.youtube.com/watch?v=u4alGiomYP4
- #
- # Requirements:
- #   mnist-learn gem
- #   opencl_ruby_ffi gem
- require "bundler/setup"
- require "tensor_stream"
- require "mnist-learn"
-
- # Enable OpenCL hardware accelerated computation; not using OpenCL can be very slow
- # gem install tensor_stream-opencl
- require 'tensor_stream/opencl'
-
- tf = TensorStream
-
- # Import MNIST data
- puts "downloading mnist data"
- mnist = Mnist.read_data_sets("/tmp/data", one_hot: true)
- puts "downloading finished"