tensor_stream 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,203 @@
+ #
+ # A ruby port of https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer by Guillaume Chevalier
+ #
+ # This is a port, so some weird Python-like conventions may have been left behind
+ require "bundler/setup"
+ require "tensor_stream"
+ require "chakin-rb/chakin"
+ require "json"   # word_to_index is stored as JSON
+ require "matrix" # provides Vector, used to print representation norms at the end
+ # require 'pry-byebug'
+ require 'zip'
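+ # NOTE: chakin-rb is a downloader for pretrained word vectors, and 'zip' here is
+ # the rubyzip gem; presumably they are used to fetch and unpack the
+ # "glove.twitter.27B" archive into DATA_FOLDER before this script is run.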
+
+ tf = TensorStream
+
+ batch_size = nil # Any size is accepted
+ word_representations_dimensions = 25 # Embedding of size (vocab_len, nb_dimensions)
+
+
+ DATA_FOLDER = "embeddings"
+ SUBFOLDER_NAME = "glove.twitter.27B"
+ TF_EMBEDDING_FILE_NAME = "#{SUBFOLDER_NAME}.ckpt"
+ SUFFIX = SUBFOLDER_NAME + "." + word_representations_dimensions.to_s
+ TF_EMBEDDINGS_FILE_PATH = File.join(DATA_FOLDER, SUFFIX + "d.ckpt")
+ DICT_WORD_TO_INDEX_FILE_NAME = File.join(DATA_FOLDER, SUFFIX + "d.json")
+
+ # Load a `word_to_index` dict mapping words to their id, with a default value
+ # pointing to the last index when a word is not found, which is the unknown word.
+ def load_word_to_index(dict_word_to_index_file_name)
+   word_to_index = JSON.parse(File.read(dict_word_to_index_file_name))
+   last_index = word_to_index.size - 1
+   puts "word_to_index dict restored from '#{dict_word_to_index_file_name}'."
+   word_to_index = Hash.new(last_index).merge(word_to_index)
+   word_to_index
+ end
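+ # For example (a sketch with a made-up 3-word vocab; the real one comes from the
+ # JSON file), the Hash default maps any out-of-vocabulary token to the last id:
+ #   w2i = Hash.new(3).merge("the" => 0, "cat" => 1, "sat" => 2)
+ #   w2i["cat"]           # => 1
+ #   w2i["unknownWord00"] # => 3 (the unknown-word id)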
+
+ # Define the embedding tf.Variable and load it.
+ def load_embedding_tf(sess, word_to_index, tf_embeddings_file_path, nb_dims)
+
+   # 1. Define the variable that will hold the embedding:
+   tf_embedding = TensorStream.variable(
+     TensorStream.constant(0.0, shape: [word_to_index.size - 1, nb_dims]),
+     trainable: false,
+     name: "Embedding"
+   )
+
+   # 2. Restore the embedding from disk to TensorFlow, GPU (or CPU if GPU unavailable):
+   variables_to_restore = [tf_embedding]
+   embedding_saver = TensorStream::Train::Saver.new(variables_to_restore)
+   embedding_saver.restore(sess, tf_embeddings_file_path)
+   puts "TF embeddings restored from '#{tf_embeddings_file_path}'."
+
+   tf_embedding
+ end
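+ # (`trainable: false` above keeps the restored GloVe vectors out of any
+ # optimizer's variable list, so later training in the same graph would leave
+ # the embedding fixed.)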
+
+
+ # Returns the `cosine_similarity = cos(angle_between_a_and_b_in_space)`
+ # of the single word A against each of the words B.
+ # The first input must be a 1D tensor (word_representation).
+ # The second input must be a 2D tensor (batch_size, word_representation).
+ # The result is a tf tensor that must be fetched with `sess.run`.
+ def cosine_similarity_tensorflow(tf_word_representation_A, tf_words_representation_B)
+   a_normalized = TensorStream.nn.l2_normalize(tf_word_representation_A, axis: -1)
+   b_normalized = TensorStream.nn.l2_normalize(tf_words_representation_B, axis: -1)
+   TensorStream.reduce_sum(
+     TensorStream.multiply(a_normalized, b_normalized),
+     axis: -1
+   )
+ end
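+ # Why this works: l2_normalize scales each vector to unit length, so the
+ # row-wise sum of element-wise products is exactly the cosine:
+ #   cos(a, b_i) = (a . b_i) / (||a|| * ||b_i||) = a . b_i   when ||a|| = ||b_i|| = 1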
+
+ # In case you didn't do the "%reset":
+ tf.reset_default_graph
+ sess = tf.session
+
+ # Load the embedding matrix in tf
+ word_to_index = load_word_to_index(
+   DICT_WORD_TO_INDEX_FILE_NAME)
+ tf_embedding = load_embedding_tf(sess,
+   word_to_index,
+   TF_EMBEDDINGS_FILE_PATH,
+   word_representations_dimensions)
+
+
+ # Input to the graph where word IDs can be sent in batch. Look at the "shape" args:
+ @tf_word_A_id = tf.placeholder(:int32, shape: [1])
+ @tf_words_B_ids = tf.placeholder(:int32, shape: [batch_size])
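+ # (batch_size is nil, so shape: [nil] leaves the batch dimension unconstrained:
+ # any number of word ids can be fed for B in a single run.)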
+
+ # Conversion of words to a representation
+ tf_word_representation_A = tf.nn.embedding_lookup(tf_embedding, @tf_word_A_id)
+ tf_words_representation_B = tf.nn.embedding_lookup(tf_embedding, @tf_words_B_ids)
+
+ # The graph outputs are the "cosine_similarities", which we want to fetch in sess.run(...).
+ @cosine_similarities = cosine_similarity_tensorflow(tf_word_representation_A, tf_words_representation_B)
+
+ print("Model created.")
+
+ # Note: there might be a better way to split sentences for GloVe.
+ # Please look at the documentation or open an issue to suggest a fix.
+ def sentence_to_word_ids(sentence, word_to_index)
+   punctuation = ['.', '!', '?', ',', ':', ';', "'", '"', '(', ')']
+   # Separating punctuation from words:
+   punctuation.each do |punctuation_character|
+     sentence.gsub!(punctuation_character, " #{punctuation_character} ")
+   end
+   # Removing double spaces and lowercasing:
+   sentence = sentence.downcase.squeeze(" ").strip
+
+   # Splitting on every space:
+   split_sentence = sentence.split(" ")
+   # Converting to IDs:
+   ids = split_sentence.map { |w| word_to_index[w.strip] }
+   [ids, split_sentence]
+ end
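+ # A quick illustration of the tokenization above (the ids are hypothetical,
+ # they depend on the loaded vocabulary):
+ #   sentence_to_word_ids("Hello, world!", word_to_index)
+ #   # => [[12, 7, 95, 4], ["hello", ",", "world", "!"]]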
+
+ # Use the model in sess to predict cosine similarities.
+ def predict_cosine_similarities(sess, word_to_index, word_A, words_B)
+   word_A_id, _ = sentence_to_word_ids(word_A, word_to_index)
+   words_B_ids, split_sentence = sentence_to_word_ids(words_B, word_to_index)
+
+   evaluated_cos_similarities = sess.run(
+     @cosine_similarities,
+     feed_dict: {
+       @tf_word_A_id => word_A_id,
+       @tf_words_B_ids => words_B_ids
+     }
+   )
+   [evaluated_cos_similarities, split_sentence]
+ end
+
+ word_A = "Science"
+ words_B = "Hello internet, a vocano erupt like the bitcoin out of the blue and there is an unknownWord00!"
+
+ evaluated_cos_similarities, splitted = predict_cosine_similarities(sess, word_to_index, word_A, words_B)
+
+ puts "Cosine similarities with \"#{word_A}\":"
+ splitted.zip(evaluated_cos_similarities).each do |word, similarity|
+   puts " #{(word + ":").ljust(15)}#{similarity}"
+ end
+
+ tf.reset_default_graph()
+
+
+ # Transpose the word_to_index dict:
+ index_to_word = word_to_index.invert
+
+ # New graph
+ tf.reset_default_graph()
+ sess = tf.session
+
+ # Load the embedding matrix in tf
+ tf_word_to_index = load_word_to_index(
+   DICT_WORD_TO_INDEX_FILE_NAME)
+
+ tf_embedding = load_embedding_tf(sess,
+   tf_word_to_index,
+   TF_EMBEDDINGS_FILE_PATH,
+   word_representations_dimensions)
+
+ # An input word
+ tf_word_id = tf.placeholder(:int32, shape: [1])
+ tf_word_representation = tf.nn.embedding_lookup(tf_embedding, tf_word_id)
+
+ # An input: how many similar words to retrieve
+ tf_nb_similar_words_to_get = tf.placeholder(:int32)
+
+ # Dot the word against every embedding
+ tf_all_cosine_similarities = cosine_similarity_tensorflow(
+   tf_word_representation,
+   tf_embedding)
+
+ # Getting the top cosine similarities.
+ tf_top_cosine_similarities, tf_top_word_indices = tf.top_k(
+   tf_all_cosine_similarities,
+   tf_nb_similar_words_to_get + 1,
+   sorted: true
+ )
+
+ # Discard the first word because it's the input word itself:
+ tf_top_cosine_similarities = tf_top_cosine_similarities[1..nil]
+ tf_top_word_indices = tf_top_word_indices[1..nil]
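+ # (`[1..nil]` is equivalent to the endless range `[1..]` in Ruby >= 2.6:
+ # keep everything from index 1 onward.)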
+
+ # Get the top words' representations by fetching
+ # tf_top_words_representation = "tf_embedding[tf_top_word_indices]":
+ tf_top_words_representation = tf.gather(tf_embedding, tf_top_word_indices)
+
+ # Fetch 10 similar words:
+ nb_similar_words_to_get = 10
+
+
+ word = "king"
+ word_id = word_to_index[word]
+
+ top_cosine_similarities, top_word_indices, top_words_representation = sess.run(
+   [tf_top_cosine_similarities, tf_top_word_indices, tf_top_words_representation],
+   feed_dict: {
+     tf_word_id => [word_id],
+     tf_nb_similar_words_to_get => nb_similar_words_to_get
+   }
+ )
+
+ puts "Top similar words to \"#{word}\":\n"
+ top_cosine_similarities.zip(top_word_indices).zip(top_words_representation).each do |w, word_repr|
+   cos_sim, word_id = w
+   puts "#{(index_to_word[word_id] + ":").ljust(15)}#{(cos_sim.to_s + ",").ljust(15)}#{Vector.elements(word_repr).norm}"
+ end
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
   # to allow pushing to a single host or delete this section to allow pushing to any host.
   if spec.respond_to?(:metadata)
     spec.metadata["allowed_push_host"] = "https://rubygems.org"
+    spec.metadata["changelog_uri"] = "https://github.com/jedld/tensor_stream/blob/master/CHANGELOG.md"
   else
     raise "RubyGems 2.0 or newer is required to protect against " \
       "public gem pushes."
@@ -42,8 +43,10 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "colorize"
   spec.add_development_dependency "rspec_junit_formatter"
   spec.add_development_dependency "mnist-learn"
+  spec.add_development_dependency "chakin-rb"
   spec.add_development_dependency "simplecov"
   spec.add_development_dependency "standard"
+  spec.add_development_dependency "rubyzip"
   spec.add_dependency "deep_merge"
   spec.add_dependency "concurrent-ruby"
   spec.add_dependency "chunky_png"
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream
 version: !ruby/object:Gem::Version
-  version: 1.0.6
+  version: 1.0.7
 platform: ruby
 authors:
 - Joseph Emmanuel Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-03-23 00:00:00.000000000 Z
+date: 2019-04-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -164,6 +164,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: chakin-rb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: simplecov
   requirement: !ruby/object:Gem::Requirement
@@ -192,6 +206,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: rubyzip
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: deep_merge
   requirement: !ruby/object:Gem::Requirement
@@ -304,12 +332,14 @@ files:
 - lib/tensor_stream/helpers/tensor_mixins.rb
 - lib/tensor_stream/images.rb
 - lib/tensor_stream/initializer.rb
+- lib/tensor_stream/math/math_ops.rb
 - lib/tensor_stream/math_gradients.rb
 - lib/tensor_stream/monkey_patches/array.rb
 - lib/tensor_stream/monkey_patches/float.rb
 - lib/tensor_stream/monkey_patches/integer.rb
 - lib/tensor_stream/monkey_patches/op_patch.rb
 - lib/tensor_stream/monkey_patches/patch.rb
+- lib/tensor_stream/nn/embedding_lookup.rb
 - lib/tensor_stream/nn/nn_ops.rb
 - lib/tensor_stream/op_maker.rb
 - lib/tensor_stream/operation.rb
@@ -349,16 +379,19 @@ files:
 - lib/tensor_stream/ops/rank.rb
 - lib/tensor_stream/ops/reshape.rb
 - lib/tensor_stream/ops/round.rb
+- lib/tensor_stream/ops/rsqrt.rb
 - lib/tensor_stream/ops/shape.rb
 - lib/tensor_stream/ops/sigmoid.rb
 - lib/tensor_stream/ops/sign.rb
 - lib/tensor_stream/ops/sin.rb
 - lib/tensor_stream/ops/size.rb
+- lib/tensor_stream/ops/strided_slice.rb
 - lib/tensor_stream/ops/sub.rb
 - lib/tensor_stream/ops/sum.rb
 - lib/tensor_stream/ops/tan.rb
 - lib/tensor_stream/ops/tanh.rb
 - lib/tensor_stream/ops/tile.rb
+- lib/tensor_stream/ops/top_k.rb
 - lib/tensor_stream/ops/zeros.rb
 - lib/tensor_stream/placeholder.rb
 - lib/tensor_stream/profile/report_tool.rb
@@ -381,25 +414,28 @@ files:
 - lib/tensor_stream/utils.rb
 - lib/tensor_stream/utils/data_type_utils.rb
 - lib/tensor_stream/utils/freezer.rb
+- lib/tensor_stream/utils/py_ports.rb
 - lib/tensor_stream/variable.rb
 - lib/tensor_stream/variable_scope.rb
 - lib/tensor_stream/version.rb
 - samples/datasets/iris.data
 - samples/jupyter_notebooks/linear_regression.ipynb
 - samples/neural_networks/iris.rb
-- samples/neural_networks/lstm.rb
 - samples/neural_networks/mnist_data.rb
 - samples/neural_networks/raw_neural_net_sample.rb
 - samples/neural_networks/rnn.rb
 - samples/others/nearest_neighbor.rb
 - samples/regression/linear_regression.rb
 - samples/regression/logistic_regression.rb
+- samples/word_embeddings/word_embedding_1.rb
+- samples/word_embeddings/word_embedding_2.rb
 - tensor_stream.gemspec
 homepage: http://www.github.com/jedld/tensor_stream
 licenses:
 - MIT
 metadata:
   allowed_push_host: https://rubygems.org
+  changelog_uri: https://github.com/jedld/tensor_stream/blob/master/CHANGELOG.md
 post_install_message:
 rdoc_options: []
 require_paths:
@@ -415,7 +451,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.0.3
+rubygems_version: 3.0.1
 signing_key:
 specification_version: 4
 summary: A Pure ruby tensorflow implementation
@@ -1,22 +0,0 @@
- # A ruby port of the example code discussed by Martin Gorner in
- # "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)"
- #
- # https://www.youtube.com/watch?v=u4alGiomYP4
- #
- # Requirements:
- #   mnist-learn gem
- #   opencl_ruby_ffi gem
- require "bundler/setup"
- require "tensor_stream"
- require "mnist-learn"
-
- # Enable OpenCL hardware accelerated computation; not using OpenCL can be very slow
- # gem install tensor_stream-opencl
- require 'tensor_stream/opencl'
-
- tf = TensorStream
-
- # Import MNIST data
- puts "downloading mnist data"
- mnist = Mnist.read_data_sets("/tmp/data", one_hot: true)
- puts "downloading finished"