tensor_stream 1.0.4 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +12 -2
- data/Dockerfile +1 -1
- data/USAGE_GUIDE.md +68 -0
- data/lib/tensor_stream.rb +1 -0
- data/lib/tensor_stream/evaluator/base_evaluator.rb +21 -1
- data/lib/tensor_stream/evaluator/evaluator.rb +1 -0
- data/lib/tensor_stream/evaluator/evaluator_utils.rb +20 -0
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +60 -0
- data/lib/tensor_stream/evaluator/ruby/array_ops.rb +53 -1
- data/lib/tensor_stream/evaluator/ruby/images_ops.rb +26 -0
- data/lib/tensor_stream/evaluator/ruby/math_ops.rb +60 -5
- data/lib/tensor_stream/evaluator/ruby/nn_ops.rb +25 -29
- data/lib/tensor_stream/evaluator/ruby/random_ops.rb +7 -11
- data/lib/tensor_stream/evaluator/ruby/storage_manager.rb +40 -0
- data/lib/tensor_stream/evaluator/ruby/variable_ops.rb +74 -0
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +31 -77
- data/lib/tensor_stream/generated_stub/ops.rb +256 -166
- data/lib/tensor_stream/generated_stub/stub_file.erb +4 -4
- data/lib/tensor_stream/graph.rb +3 -3
- data/lib/tensor_stream/graph_deserializers/yaml_loader.rb +4 -6
- data/lib/tensor_stream/helpers/infer_shape.rb +1 -7
- data/lib/tensor_stream/helpers/tensor_mixins.rb +10 -1
- data/lib/tensor_stream/images.rb +4 -0
- data/lib/tensor_stream/math/math_ops.rb +22 -0
- data/lib/tensor_stream/math_gradients.rb +15 -1
- data/lib/tensor_stream/nn/embedding_lookup.rb +114 -0
- data/lib/tensor_stream/nn/nn_ops.rb +16 -0
- data/lib/tensor_stream/op_maker.rb +36 -3
- data/lib/tensor_stream/operation.rb +8 -20
- data/lib/tensor_stream/ops.rb +14 -11
- data/lib/tensor_stream/ops/bias_add.rb +16 -0
- data/lib/tensor_stream/ops/equal.rb +4 -0
- data/lib/tensor_stream/ops/greater.rb +4 -0
- data/lib/tensor_stream/ops/greater_equal.rb +4 -0
- data/lib/tensor_stream/ops/less.rb +19 -0
- data/lib/tensor_stream/ops/less_equal.rb +4 -0
- data/lib/tensor_stream/ops/not_equal.rb +19 -0
- data/lib/tensor_stream/ops/rsqrt.rb +11 -0
- data/lib/tensor_stream/ops/strided_slice.rb +24 -0
- data/lib/tensor_stream/ops/sum.rb +4 -2
- data/lib/tensor_stream/ops/top_k.rb +23 -0
- data/lib/tensor_stream/session.rb +6 -12
- data/lib/tensor_stream/tensor.rb +1 -0
- data/lib/tensor_stream/tensor_shape.rb +32 -1
- data/lib/tensor_stream/train/saver.rb +2 -3
- data/lib/tensor_stream/utils.rb +18 -13
- data/lib/tensor_stream/utils/freezer.rb +5 -1
- data/lib/tensor_stream/utils/py_ports.rb +11 -0
- data/lib/tensor_stream/variable.rb +9 -6
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/word_embeddings/word_embedding_1.rb +192 -0
- data/samples/word_embeddings/word_embedding_2.rb +203 -0
- data/tensor_stream.gemspec +7 -2
- metadata +67 -10
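Beyond the packaging bookkeeping, the headline additions in this range are new ops (top_k, strided_slice, rsqrt, bias_add, less, not_equal), an embedding_lookup implementation under nn/, and two word-embedding samples that exercise them. A minimal sketch of the lookup op in isolation, assuming it also accepts a constant matrix (the samples below feed it a variable; the toy values here are illustrative, not from the gem):

require "tensor_stream"

tf = TensorStream

# Toy 4-word x 3-dimension embedding matrix (illustrative values only).
embeddings = tf.constant([[0.1, 0.2, 0.3],
                          [0.4, 0.5, 0.6],
                          [0.7, 0.8, 0.9],
                          [0.0, 0.0, 0.0]])
word_ids = tf.placeholder(:int32, shape: [nil]) # any batch size

# New in this release: gather rows of the embedding matrix by id.
lookup = tf.nn.embedding_lookup(embeddings, word_ids)

sess = tf.session
p sess.run(lookup, feed_dict: { word_ids => [1, 3] })
# expected: [[0.4, 0.5, 0.6], [0.0, 0.0, 0.0]]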
data/samples/word_embeddings/word_embedding_1.rb
ADDED
@@ -0,0 +1,192 @@
+#
+# A ruby port of https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer by Guillaume Chevalier
+#
+# This is a port so some weird python like conventions may have been left behind
+require "bundler/setup"
+require "tensor_stream"
+require "chakin-rb/chakin"
+# require 'pry-byebug'
+require 'zip'
+
+tf = TensorStream
+
+CHAKIN_INDEX = 17
+NUMBER_OF_DIMENSIONS = 25
+SUBFOLDER_NAME = "glove.twitter.27B"
+
+DATA_FOLDER = "embeddings"
+ZIP_FILE = File.join(DATA_FOLDER, "#{SUBFOLDER_NAME}.zip")
+ZIP_FILE_ALT = "glove" + ZIP_FILE[5..nil] # sometimes it's lowercase only...
+UNZIP_FOLDER = File.join(DATA_FOLDER, SUBFOLDER_NAME)
+
+if SUBFOLDER_NAME[-1] == "d"
+  GLOVE_FILENAME = File.join(UNZIP_FOLDER, "#{SUBFOLDER_NAME}.txt")
+else
+  GLOVE_FILENAME = File.join(UNZIP_FOLDER, "#{SUBFOLDER_NAME}.#{NUMBER_OF_DIMENSIONS}d.txt")
+end
+
+if !File.exist?(ZIP_FILE) && !File.exist?(UNZIP_FOLDER)
+  # GloVe by Stanford is licensed Apache 2.0:
+  # https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
+  # http://nlp.stanford.edu/data/glove.twitter.27B.zip
+  # Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
+  puts "Downloading embeddings to '#{ZIP_FILE}'"
+  Chakin::Vectors.download(number: CHAKIN_INDEX, save_dir: "./#{DATA_FOLDER}")
+else
+  puts "Embeddings already downloaded."
+end
+
+if !File.exists?(UNZIP_FOLDER)
+  if !File.exists?(ZIP_FILE) && !File.exists?(ZIP_FILE_ALT)
+    ZIP_FILE = ZIP_FILE_ALT
+  end
+  FileUtils.mkdir_p(UNZIP_FOLDER)
+  Zip::File.open(ZIP_FILE) do |zipfile|
+    zipfile.each do |file|
+      puts "Extracting embeddings to '#{UNZIP_FOLDER}/#{file.name}'"
+      fpath = File.join(UNZIP_FOLDER, file.name)
+      zipfile.extract(file, fpath) unless File.exist?(fpath)
+    end
+  end
+else
+  puts "Embeddings already extracted."
+end
+
+##
+# Read a GloVe txt file. If `with_indexes=True`, we return a tuple of two dictionnaries
+# `(word_to_index_dict, index_to_embedding_array)`, otherwise we return only a direct
+# `word_to_embedding_dict` dictionnary mapping from a string to a numpy array.
+def load_embedding_from_disks(glove_filename, with_indexes: true)
+  word_to_index_dict = {}
+  index_to_embedding_array = []
+  word_to_embedding_dict = {}
+  representation = nil
+
+  last_index = nil
+  File.open(glove_filename, 'r').each_with_index do |line, i|
+    split = line.split(' ')
+
+    word = split.shift
+
+    representation = split
+    representation.map! { |val| val.to_f }
+
+    if with_indexes
+      word_to_index_dict[word] = i
+      index_to_embedding_array << representation
+    else
+      word_to_embedding_dict[word] = representation
+    end
+    last_index = i
+  end
+
+  _WORD_NOT_FOUND = [0.0] * representation.size # Empty representation for unknown words.
+  if with_indexes
+    _LAST_INDEX = last_index + 1
+    word_to_index_dict = Hash.new(_LAST_INDEX).merge(word_to_index_dict)
+    index_to_embedding_array = index_to_embedding_array + [_WORD_NOT_FOUND]
+    return word_to_index_dict, index_to_embedding_array
+  else
+    word_to_embedding_dict = Hash.new(_WORD_NOT_FOUND)
+    return word_to_embedding_dict
+  end
+end
+
+puts "Loading embedding from disks..."
+word_to_index, index_to_embedding = load_embedding_from_disks(GLOVE_FILENAME, with_indexes: true)
+puts "Embedding loaded from disks."
+
+vocab_size, embedding_dim = index_to_embedding.shape
+puts "Embedding is of shape: #{index_to_embedding.shape}"
+puts "This means (number of words, number of dimensions per word)"
+puts "The first words are words that tend occur more often."
+
+puts "Note: for unknown words, the representation is an empty vector,\n" +
+     "and the index is the last one. The dictionnary has a limit:"
+puts " \"A word\" --> \"Index in embedding\" --> \"Representation\""
+word = "worsdfkljsdf"
+idx = word_to_index[word]
+embd = index_to_embedding[idx].map { |v| v.to_i } # "int" for compact print only.
+puts " #{word} --> #{idx} --> #{embd}"
+word = "the"
+idx = word_to_index[word]
+embd = index_to_embedding[idx] # "int" for compact print only.
+puts " #{word} --> #{idx} --> #{embd}"
+
+words = [
+  "The", "Teh", "A", "It", "Its", "Bacon", "Star", "Clone", "Bonjour", "Intelligence",
+  "À", "A", "Ça", "Ca", "Été", "C'est", "Aujourd'hui", "Aujourd", "'", "hui", "?", "!", ",", ".", "-", "/", "~"
+]
+
+words.each do |word|
+  word_ = word.downcase
+  embedding = index_to_embedding[word_to_index[word_]]
+  norm = Vector::elements(embedding).norm
+  puts (word + ": ").ljust(15) + norm.to_s
+end
+
+puts "Note: here we printed words starting with capital letters, \n" +
+     "however to take their embeddings we need their lowercase version (str.downcase)"
+
+batch_size = nil # Any size is accepted
+
+tf.reset_default_graph
+sess = tf.session
+
+# Define the variable that will hold the embedding:
+tf_embedding = tf.variable(
+  tf.constant(0.0, shape: index_to_embedding.shape),
+  trainable: false,
+  name: "Embedding"
+)
+
+tf_word_ids = tf.placeholder(:int32, shape: [batch_size])
+
+tf_word_representation_layer = tf.nn.embedding_lookup(tf_embedding, tf_word_ids)
+
+tf_embedding_placeholder = tf.placeholder(:float32, shape: index_to_embedding.shape)
+tf_embedding_init = tf_embedding.assign(tf_embedding_placeholder)
+
+sess.run(
+  tf_embedding_init,
+  feed_dict: {
+    tf_embedding_placeholder => index_to_embedding
+  }
+)
+
+puts "Embedding now stored in TensorStream. Can delete ruby array to clear some CPU RAM."
+
+batch_of_words = ["Hello", "World", "!"]
+batch_indexes = batch_of_words.map { |w| word_to_index[w.downcase] }
+
+embedding_from_batch_lookup = sess.run(
+  tf_word_representation_layer,
+  feed_dict: {
+    tf_word_ids => batch_indexes
+  }
+)
+
+puts "Representations for #{batch_of_words}:"
+puts embedding_from_batch_lookup.inspect
+
+prefix = SUBFOLDER_NAME + "." + NUMBER_OF_DIMENSIONS.to_s + "d"
+TF_EMBEDDINGS_FILE_NAME = File.join(DATA_FOLDER, prefix + ".ckpt")
+DICT_WORD_TO_INDEX_FILE_NAME = File.join(DATA_FOLDER, prefix + ".json")
+
+variables_to_save = [tf_embedding]
+embedding_saver = tf::Train::Saver.new(variables_to_save)
+embedding_saver.save(sess, TF_EMBEDDINGS_FILE_NAME)
+puts "TF embeddings saved to '#{TF_EMBEDDINGS_FILE_NAME}'."
+
+sess.close
+
+File.open(DICT_WORD_TO_INDEX_FILE_NAME, 'w') do |f|
+  f.write(word_to_index.to_json)
+end
+puts "word_to_index dict saved to '#{DICT_WORD_TO_INDEX_FILE_NAME}'."
+
+words_B = "like absolutely crazy not hate bag sand rock soap"
+r = words_B.split.map { |w| word_to_index[w.strip()] }
+puts words_B
+puts r.inspect
+puts "done"
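A note on the unknown-word handling in load_embedding_from_disks above: the function appends one all-zeros row for _WORD_NOT_FOUND and then wraps the dict with Ruby's Hash.new(default), so every missed lookup resolves to that extra index. A minimal sketch of the idiom with toy values (not from the sample):

word_to_index = Hash.new(3).merge("the" => 0, "a" => 1, "cat" => 2)
word_to_index["the"]        # => 0, a known word keeps its own index
word_to_index["qwertyuiop"] # => 3, unknown words fall back to the appended all-zeros row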
data/samples/word_embeddings/word_embedding_2.rb
ADDED
@@ -0,0 +1,203 @@
+#
+# A ruby port of https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer by Guillaume Chevalier
+#
+# This is a port so some weird python like conventions may have been left behind
+require "bundler/setup"
+require "tensor_stream"
+require "chakin-rb/chakin"
+# require 'pry-byebug'
+require 'zip'
+
+tf = TensorStream
+
+batch_size = nil # Any size is accepted
+word_representations_dimensions = 25 # Embedding of size (vocab_len, nb_dimensions)
+
+
+DATA_FOLDER = "embeddings"
+SUBFOLDER_NAME = "glove.twitter.27B"
+TF_EMBEDDING_FILE_NAME = "#{SUBFOLDER_NAME}.ckpt"
+SUFFIX = SUBFOLDER_NAME + "." + word_representations_dimensions.to_s
+TF_EMBEDDINGS_FILE_PATH = File.join(DATA_FOLDER, SUFFIX + "d.ckpt")
+DICT_WORD_TO_INDEX_FILE_NAME = File.join(DATA_FOLDER, SUFFIX + "d.json")
+
+# Load a `word_to_index` dict mapping words to their id, with a default value
+# of pointing to the last index when not found, which is the unknown word.
+def load_word_to_index(dict_word_to_index_file_name)
+  word_to_index = JSON.parse(File.read(dict_word_to_index_file_name))
+  _LAST_INDEX = word_to_index.size - 1
+  puts "word_to_index dict restored from '#{dict_word_to_index_file_name}'."
+  word_to_index = Hash.new(_LAST_INDEX).merge(word_to_index)
+  word_to_index
+end
+
+# Define the embedding tf.Variable and load it.
+def load_embedding_tf(sess, word_to_index, tf_embeddings_file_path, nb_dims)
+
+  # 1. Define the variable that will hold the embedding:
+  tf_embedding = TensorStream.variable(
+    TensorStream.constant(0.0, shape: [word_to_index.size-1, nb_dims]),
+    trainable: false,
+    name: "Embedding"
+  )
+
+  # 2. Restore the embedding from disks to TensorFlow, GPU (or CPU if GPU unavailable):
+  variables_to_restore = [tf_embedding]
+  embedding_saver = TensorStream::Train::Saver.new(variables_to_restore)
+  embedding_saver.restore(sess, tf_embeddings_file_path)
+  puts "TF embeddings restored from '#{tf_embeddings_file_path}'."
+
+  tf_embedding
+end
+
+
+# Returns the `cosine_similarity = cos(angle_between_a_and_b_in_space)`
+# for the two word A to all the words B.
+# The first input word must be a 1D Tensors (word_representation).
+# The second input words must be 2D Tensors (batch_size, word_representation).
+# The result is a tf tensor that must be fetched with `sess.run`.
+def cosine_similarity_tensorflow(tf_word_representation_A, tf_words_representation_B)
+  a_normalized = TensorStream.nn.l2_normalize(tf_word_representation_A, axis: -1)
+  b_normalized = TensorStream.nn.l2_normalize(tf_words_representation_B, axis: -1)
+  TensorStream.reduce_sum(
+    TensorStream.multiply(a_normalized, b_normalized),
+    axis: -1
+  )
+end
+
+# In case you didn't do the "%reset":
+tf.reset_default_graph
+sess = tf.session
+
+# Load the embedding matrix in tf
+word_to_index = load_word_to_index(
+  DICT_WORD_TO_INDEX_FILE_NAME)
+tf_embedding = load_embedding_tf(sess,
+  word_to_index,
+  TF_EMBEDDINGS_FILE_PATH,
+  word_representations_dimensions)
+
+
+# Input to the graph where word IDs can be sent in batch. Look at the "shape" args:
+@tf_word_A_id = tf.placeholder(:int32, shape: [1])
+@tf_words_B_ids = tf.placeholder(:int32, shape: [batch_size])
+
+# Conversion of words to a representation
+tf_word_representation_A = tf.nn.embedding_lookup(tf_embedding, @tf_word_A_id)
+tf_words_representation_B = tf.nn.embedding_lookup(tf_embedding, @tf_words_B_ids)
+
+# The graph output are the "cosine_similarities" which we want to fetch in sess.run(...).
+@cosine_similarities = cosine_similarity_tensorflow(tf_word_representation_A, tf_words_representation_B)
+
+print("Model created.")
+
+# Note: there might be a better way to split sentences for GloVe.
+# Please look at the documentation or open an issue to suggest a fix.
+def sentence_to_word_ids(sentence, word_to_index)
+  punctuation = ['.', '!', '?', ',', ':', ';', "'", '"', '(', ')']
+  # Separating punctuation from words:
+  punctuation.each do |punctuation_character|
+    sentence.gsub!(punctuation_character, " #{punctuation_character} ")
+  end
+  # Removing double spaces and lowercasing:
+  sentence = sentence.downcase.squeeze(" ").strip
+
+  # Splitting on every space:
+  split_sentence = sentence.split(" ")
+  ids = split_sentence.map { |w| word_to_index[w.strip] }
+  # Converting to IDs:
+  ids = split_sentence.map { |w| word_to_index[w.strip] }
+  [ids, split_sentence]
+end
+
+# Use the model in sess to predict cosine similarities.
+def predict_cosine_similarities(sess, word_to_index, word_A, words_B)
+  word_A_id, _ = sentence_to_word_ids(word_A, word_to_index)
+  words_B_ids, split_sentence = sentence_to_word_ids(words_B, word_to_index)
+
+  evaluated_cos_similarities = sess.run(
+    @cosine_similarities,
+    feed_dict: {
+      @tf_word_A_id => word_A_id,
+      @tf_words_B_ids => words_B_ids
+    }
+  )
+  [evaluated_cos_similarities, split_sentence]
+end
+
+word_A = "Science"
+words_B = "Hello internet, a vocano erupt like the bitcoin out of the blue and there is an unknownWord00!"
+
+evaluated_cos_similarities, splitted = predict_cosine_similarities(sess, word_to_index, word_A, words_B)
+
+puts "Cosine similarities with \"#{word_A}\":"
+splitted.zip(evaluated_cos_similarities).each do |word, similarity|
+  puts " #{(word+":").ljust(15)}#{similarity}"
+end
+
+tf.reset_default_graph()
+
+
+# Transpose word_to_index dict:
+index_to_word = word_to_index.invert
+
+# New graph
+tf.reset_default_graph()
+sess = tf.session
+
+# Load the embedding matrix in tf
+tf_word_to_index = load_word_to_index(
+  DICT_WORD_TO_INDEX_FILE_NAME)
+
+tf_embedding = load_embedding_tf(sess,
+  tf_word_to_index,
+  TF_EMBEDDINGS_FILE_PATH,
+  word_representations_dimensions)
+
+# An input word
+tf_word_id = tf.placeholder(:int32, shape: [1])
+tf_word_representation = tf.nn.embedding_lookup(tf_embedding, tf_word_id)
+
+# An input
+tf_nb_similar_words_to_get = tf.placeholder(:int32)
+
+# Dot the word to every embedding
+tf_all_cosine_similarities = cosine_similarity_tensorflow(
+  tf_word_representation,
+  tf_embedding)
+
+# Getting the top cosine similarities.
+tf_top_cosine_similarities, tf_top_word_indices = tf.top_k(
+  tf_all_cosine_similarities,
+  tf_nb_similar_words_to_get + 1,
+  sorted: true
+)
+
+# Discard the first word because it's the input word itself:
+tf_top_cosine_similarities = tf_top_cosine_similarities[1..nil]
+tf_top_word_indices = tf_top_word_indices[1..nil]
+
+# Get the top words' representations by fetching
+# tf_top_words_representation = "tf_embedding[tf_top_word_indices]":
+tf_top_words_representation = tf.gather(tf_embedding, tf_top_word_indices)
+
+# Fetch 10 similar words:
+nb_similar_words_to_get = 10
+
+
+word = "king"
+word_id = word_to_index[word]
+
+top_cosine_similarities, top_word_indices, top_words_representation = sess.run(
+  [tf_top_cosine_similarities, tf_top_word_indices, tf_top_words_representation],
+  feed_dict: {
+    tf_word_id => [word_id],
+    tf_nb_similar_words_to_get => nb_similar_words_to_get
+  }
+)
+
+puts "Top similar words to \"#{word}\":\n"
+top_cosine_similarities.zip(top_word_indices).zip(top_words_representation).each do |w, word_repr|
+  cos_sim, word_id = w
+  puts "#{(index_to_word[word_id]+ ":").ljust(15)}#{(cos_sim.to_s + ",").ljust(15)}#{Vector::elements(word_repr).norm}"
+end
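For reference, cosine_similarity_tensorflow in the sample above computes the standard cosine similarity between the word vector a and each row b_i of the batch,

  cos(a, b_i) = (a . b_i) / (||a||_2 * ||b_i||_2)

by l2-normalizing both inputs and reducing their elementwise product over the last axis; values near 1 mean the two words point in nearly the same direction in embedding space.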
data/tensor_stream.gemspec
CHANGED
@@ -17,6 +17,8 @@ Gem::Specification.new do |spec|
   # to allow pushing to a single host or delete this section to allow pushing to any host.
   if spec.respond_to?(:metadata)
     spec.metadata["allowed_push_host"] = "https://rubygems.org"
+    spec.metadata["source_code_uri"] = "https://github.com/jedld/tensor_stream"
+    spec.metadata["changelog_uri"] = "https://github.com/jedld/tensor_stream/blob/master/CHANGELOG.md"
   else
     raise "RubyGems 2.0 or newer is required to protect against " \
       "public gem pushes."
@@ -29,8 +31,8 @@ Gem::Specification.new do |spec|
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
 
-  spec.add_development_dependency "bundler"
-  spec.add_development_dependency "rake", "~>
+  spec.add_development_dependency "bundler"
+  spec.add_development_dependency "rake", "~> 12.3"
   spec.add_development_dependency "rspec", "~> 3.0"
   spec.add_development_dependency "awesome_print"
   spec.add_development_dependency "rubocop"
@@ -38,12 +40,15 @@ Gem::Specification.new do |spec|
     spec.add_development_dependency "pry-byebug"
     spec.add_development_dependency "byepry"
     spec.add_development_dependency "tensor_stream-opencl"
+    spec.add_dependency "jpeg"
   end
   spec.add_development_dependency "colorize"
   spec.add_development_dependency "rspec_junit_formatter"
   spec.add_development_dependency "mnist-learn"
+  spec.add_development_dependency "chakin-rb"
   spec.add_development_dependency "simplecov"
   spec.add_development_dependency "standard"
+  spec.add_development_dependency "rubyzip"
   spec.add_dependency "deep_merge"
   spec.add_dependency "concurrent-ruby"
   spec.add_dependency "chunky_png"
metadata
CHANGED
@@ -1,43 +1,43 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream
 version: !ruby/object:Gem::Version
-  version: 1.0.4
+  version: 1.0.9
 platform: ruby
 authors:
 - Joseph Emmanuel Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2020-12-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '12.3'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '12.3'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -122,6 +122,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: jpeg
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: colorize
   requirement: !ruby/object:Gem::Requirement
@@ -164,6 +178,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: chakin-rb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: simplecov
   requirement: !ruby/object:Gem::Requirement
@@ -192,6 +220,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: rubyzip
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: deep_merge
   requirement: !ruby/object:Gem::Requirement
@@ -275,6 +317,7 @@ files:
 - lib/tensor_stream/evaluator/base_evaluator.rb
 - lib/tensor_stream/evaluator/buffer.rb
 - lib/tensor_stream/evaluator/evaluator.rb
+- lib/tensor_stream/evaluator/evaluator_utils.rb
 - lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb
 - lib/tensor_stream/evaluator/operation_helpers/math_helper.rb
 - lib/tensor_stream/evaluator/operation_helpers/random_gaussian.rb
@@ -284,6 +327,8 @@ files:
 - lib/tensor_stream/evaluator/ruby/math_ops.rb
 - lib/tensor_stream/evaluator/ruby/nn_ops.rb
 - lib/tensor_stream/evaluator/ruby/random_ops.rb
+- lib/tensor_stream/evaluator/ruby/storage_manager.rb
+- lib/tensor_stream/evaluator/ruby/variable_ops.rb
 - lib/tensor_stream/evaluator/ruby_evaluator.rb
 - lib/tensor_stream/exceptions.rb
 - lib/tensor_stream/generated_stub/ops.rb
@@ -304,12 +349,14 @@ files:
 - lib/tensor_stream/helpers/tensor_mixins.rb
 - lib/tensor_stream/images.rb
 - lib/tensor_stream/initializer.rb
+- lib/tensor_stream/math/math_ops.rb
 - lib/tensor_stream/math_gradients.rb
 - lib/tensor_stream/monkey_patches/array.rb
 - lib/tensor_stream/monkey_patches/float.rb
 - lib/tensor_stream/monkey_patches/integer.rb
 - lib/tensor_stream/monkey_patches/op_patch.rb
 - lib/tensor_stream/monkey_patches/patch.rb
+- lib/tensor_stream/nn/embedding_lookup.rb
 - lib/tensor_stream/nn/nn_ops.rb
 - lib/tensor_stream/op_maker.rb
 - lib/tensor_stream/operation.rb
@@ -317,6 +364,7 @@ files:
 - lib/tensor_stream/ops/add.rb
 - lib/tensor_stream/ops/argmax.rb
 - lib/tensor_stream/ops/argmin.rb
+- lib/tensor_stream/ops/bias_add.rb
 - lib/tensor_stream/ops/case.rb
 - lib/tensor_stream/ops/cast.rb
 - lib/tensor_stream/ops/ceil.rb
@@ -330,6 +378,7 @@ files:
 - lib/tensor_stream/ops/floor_div.rb
 - lib/tensor_stream/ops/greater.rb
 - lib/tensor_stream/ops/greater_equal.rb
+- lib/tensor_stream/ops/less.rb
 - lib/tensor_stream/ops/less_equal.rb
 - lib/tensor_stream/ops/log.rb
 - lib/tensor_stream/ops/mat_mul.rb
@@ -338,6 +387,7 @@ files:
 - lib/tensor_stream/ops/mod.rb
 - lib/tensor_stream/ops/mul.rb
 - lib/tensor_stream/ops/negate.rb
+- lib/tensor_stream/ops/not_equal.rb
 - lib/tensor_stream/ops/ones_like.rb
 - lib/tensor_stream/ops/pow.rb
 - lib/tensor_stream/ops/prod.rb
@@ -346,16 +396,19 @@ files:
 - lib/tensor_stream/ops/rank.rb
 - lib/tensor_stream/ops/reshape.rb
 - lib/tensor_stream/ops/round.rb
+- lib/tensor_stream/ops/rsqrt.rb
 - lib/tensor_stream/ops/shape.rb
 - lib/tensor_stream/ops/sigmoid.rb
 - lib/tensor_stream/ops/sign.rb
 - lib/tensor_stream/ops/sin.rb
 - lib/tensor_stream/ops/size.rb
+- lib/tensor_stream/ops/strided_slice.rb
 - lib/tensor_stream/ops/sub.rb
 - lib/tensor_stream/ops/sum.rb
 - lib/tensor_stream/ops/tan.rb
 - lib/tensor_stream/ops/tanh.rb
 - lib/tensor_stream/ops/tile.rb
+- lib/tensor_stream/ops/top_k.rb
 - lib/tensor_stream/ops/zeros.rb
 - lib/tensor_stream/placeholder.rb
 - lib/tensor_stream/profile/report_tool.rb
@@ -378,6 +431,7 @@ files:
 - lib/tensor_stream/utils.rb
 - lib/tensor_stream/utils/data_type_utils.rb
 - lib/tensor_stream/utils/freezer.rb
+- lib/tensor_stream/utils/py_ports.rb
 - lib/tensor_stream/variable.rb
 - lib/tensor_stream/variable_scope.rb
 - lib/tensor_stream/version.rb
@@ -390,12 +444,16 @@ files:
 - samples/others/nearest_neighbor.rb
 - samples/regression/linear_regression.rb
 - samples/regression/logistic_regression.rb
+- samples/word_embeddings/word_embedding_1.rb
+- samples/word_embeddings/word_embedding_2.rb
 - tensor_stream.gemspec
 homepage: http://www.github.com/jedld/tensor_stream
 licenses:
 - MIT
 metadata:
   allowed_push_host: https://rubygems.org
+  source_code_uri: https://github.com/jedld/tensor_stream
+  changelog_uri: https://github.com/jedld/tensor_stream/blob/master/CHANGELOG.md
 post_install_message:
 rdoc_options: []
 require_paths:
@@ -411,8 +469,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-
-rubygems_version: 2.7.7
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
 summary: A Pure ruby tensorflow implementation