tensor_stream 1.0.6 → 1.0.7
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +10 -3
- data/lib/tensor_stream.rb +1 -0
- data/lib/tensor_stream/evaluator/base_evaluator.rb +6 -0
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +60 -0
- data/lib/tensor_stream/evaluator/ruby/array_ops.rb +53 -1
- data/lib/tensor_stream/evaluator/ruby/math_ops.rb +42 -5
- data/lib/tensor_stream/generated_stub/ops.rb +61 -5
- data/lib/tensor_stream/helpers/tensor_mixins.rb +10 -1
- data/lib/tensor_stream/math/math_ops.rb +22 -0
- data/lib/tensor_stream/math_gradients.rb +15 -1
- data/lib/tensor_stream/nn/embedding_lookup.rb +114 -0
- data/lib/tensor_stream/nn/nn_ops.rb +3 -0
- data/lib/tensor_stream/op_maker.rb +15 -3
- data/lib/tensor_stream/ops.rb +12 -0
- data/lib/tensor_stream/ops/rsqrt.rb +11 -0
- data/lib/tensor_stream/ops/strided_slice.rb +24 -0
- data/lib/tensor_stream/ops/sum.rb +4 -2
- data/lib/tensor_stream/ops/top_k.rb +23 -0
- data/lib/tensor_stream/session.rb +3 -0
- data/lib/tensor_stream/tensor_shape.rb +32 -1
- data/lib/tensor_stream/train/saver.rb +2 -2
- data/lib/tensor_stream/utils.rb +8 -0
- data/lib/tensor_stream/utils/py_ports.rb +11 -0
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/word_embeddings/word_embedding_1.rb +192 -0
- data/samples/word_embeddings/word_embedding_2.rb +203 -0
- data/tensor_stream.gemspec +3 -0
- metadata +40 -4
- data/samples/neural_networks/lstm.rb +0 -22
data/lib/tensor_stream/nn/embedding_lookup.rb ADDED
@@ -0,0 +1,114 @@
+require 'tensor_stream/utils/py_ports'
+##
+# ruby port of https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/python/ops/embedding_ops.py
+#
+module TensorStream
+  module EmbeddingLookup
+    include TensorStream::PyPorts
+
+    ##
+    # Looks up `ids` in a list of embedding tensors.
+    def embedding_lookup(params, ids, partition_strategy: "mod", name: nil, validate_indices: true, max_norm: nil)
+      _embedding_lookup_and_transform(params, ids, partition_strategy: partition_strategy, name: name, max_norm: max_norm, transform_fn: nil)
+    end
+
+    ##
+    # Helper function for embedding_lookup and _compute_sampled_logits.
+    def _embedding_lookup_and_transform(params, ids, partition_strategy: "mod", name: nil, max_norm: nil, transform_fn: nil)
+      raise TensorStream::ValueError, "Need at least one param" if params.nil?
+
+      params = [params] unless params.is_a?(Array)
+
+      TensorStream.name_scope(name, "embedding_lookup", values: params + [ids]) do |name|
+        np = params.size
+        ids = TensorStream.convert_to_tensor(ids, name: "ids")
+        if (np == 1) && (transform_fn.nil? || (ids.shape.size == 1))
+          result = nil
+          TensorStream.colocate_with(params[0]) do
+            result = _clip(TensorStream.gather(params[0], ids, name: name), ids, max_norm)
+            result = transform_fn.call(result) if transform_fn
+          end
+
+          return TensorStream.identity(result)
+        else
+          flat_ids = TensorStream.reshape(ids, [-1])
+          original_indices = TensorStream.range(TensorStream.size(flat_ids))
+
+          p_assignments = nil
+          new_ids = nil
+
+          if partition_strategy == "mod"
+            p_assignments = flat_ids % np
+            new_ids = floor_div(flat_ids, np)
+          elsif partition_strategy == "div"
+            raise "not yet supported!"
+          else
+            raise TensorStream::ValueError, "Unrecognized partition strategy: " + partition_strategy
+          end
+
+          p_assignments = TensorStream.cast(p_assignments, :int32)
+          gather_ids = TensorStream.dynamic_partition(new_ids, p_assignments, np)
+          pindices = TensorStream.dynamic_partition(original_indices, p_assignments, np)
+          partitioned_result = []
+          (0...np).each do |p|
+            pids = gather_ids[p]
+            result = nil
+            TensorStream.colocate_with(params[p]) do
+              result = TensorStream.gather(params[p], pids)
+              if transform_fn
+                # If transform_fn is provided, the clip_by_norm precedes
+                # the transform and hence must be co-located. See below
+                # for the counterpart if transform_fn is not provided.
+                result = transform_fn.call(_clip(result, pids, max_norm))
+              end
+            end
+            partitioned_result << result
+          end
+          ret = TensorStream.dynamic_stitch(pindices, partitioned_result, name: name)
+
+          if transform_fn.nil?
+            element_shape_s = params[0].shape[1..-1]
+            params[1..-1].each { |p| element_shape_s = element_shape_s.merge_with(p.shape[1..-1]) }
+          else
+            element_shape_s = ret.shape[1..-1]
+          end
+
+          # Compute the dynamic element shape.
+          element_shape_d = if element_shape_s.fully_defined?
+            element_shape_s
+          elsif transform_fn.nil?
+            # It's important that we compute params[0].shape on the right device
+            # to avoid data motion.
+            TensorStream.colocate_with(params[0]) do
+              params_shape = TensorStream.shape(params[0])
+              params_shape[1..-1]
+            end
+          else
+            TensorStream.shape(ret)[1..-1]
+          end
+          ret = TensorStream.reshape(ret, TensorStream.concat([TensorStream.shape(ids), element_shape_d], 0))
+          ret = _clip(ret, ids, max_norm) unless transform_fn
+          ret
+        end
+      end
+    end
+
+    def _clip(params, ids, max_norm)
+      return params if max_norm.nil?
+
+      ids_rank, ids_static = _rank(ids)
+      params_rank, params_static = _rank(params)
+
+      TensorStream.clip_by_norm(params, max_norm, axes: ids_static && params_static ? (ids_rank...params_rank).to_a : TensorStream.range(ids_rank, params_rank))
+    end
+
+    def _rank(x)
+      rank = TensorStream.convert_to_tensor(x).shape.ndims
+      if rank
+        [rank, false]
+      else
+        [TensorStream.rank(x), false]
+      end
+    end
+  end
+end
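For context, embedding_lookup gathers rows of an embedding matrix by index. A minimal usage sketch with hypothetical values (it assumes the method is reachable via TensorStream::NN, which the next diff wires up):

  require "tensor_stream"

  tf = TensorStream
  embeddings = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]) # 3 words, 2 dims each
  ids = tf.constant([2, 0])
  lookup = tf.nn.embedding_lookup(embeddings, ids)
  tf.session.run(lookup) # expected: [[5.0, 6.0], [1.0, 2.0]]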
data/lib/tensor_stream/nn/nn_ops.rb CHANGED
@@ -1,7 +1,10 @@
+require 'tensor_stream/nn/embedding_lookup'
 module TensorStream
   # High level machine learning functions
   class NN
     extend TensorStream::OpHelper
+    extend TensorStream::EmbeddingLookup
+    extend TensorStream::Maths::MathFunctions

     class << self
       def softmax(logits, axis: nil, name: nil)
data/lib/tensor_stream/op_maker.rb CHANGED
@@ -2,7 +2,7 @@ class TensorStream::OpMaker
   attr_reader :operation, :description, :parameters,
     :options, :gradient, :check_types,
     :supports_broadcast, :data_type_coercion,
-    :aliases, :custom, :infer_type_proc, :exclude,
+    :aliases, :custom, :custom_post, :infer_type_proc, :exclude,
     :data_type_block

   def initialize(op)
@@ -16,6 +16,7 @@ class TensorStream::OpMaker
     @description = []
     @aliases = []
     @custom = []
+    @custom_post = []
     @infer_type_proc = lambda { |tensor|
       next nil if tensor.inputs[0].nil?
       next tensor.inputs[0].shape.shape if tensor.inputs.size == 1
@@ -32,6 +33,10 @@ class TensorStream::OpMaker
     @custom << custom_code
   end

+  def add_custom_post(custom_code)
+    @custom_post << custom_code
+  end
+
   def self.scan
     op_files = Dir[File.join(File.dirname(__FILE__), "ops", "*.rb")]
     op_files.each { |file|
@@ -111,7 +116,14 @@ class TensorStream::OpMaker
     custom.each do |c|
       body << c
     end
-
+    if custom_post.empty?
+      body << "_op(:#{operation}, #{(expand_params(false) + options_call).join(', ')})"
+    else
+      body << "result = _op(:#{operation}, #{(expand_params(false) + options_call).join(', ')})"
+    end
+    custom_post.each do |c|
+      body << c
+    end
     body.map { |line| " #{line}"}.join("\n")
   end

@@ -184,7 +196,7 @@ class TensorStream::OpMaker
   end

   def options_call
-    @options.map { |k, v|
+    @options.reject { |k, v| v.dig(:options, :exclude) }.map { |k, v|
       if v.dig(:options, :alias)
         "#{v.dig(:options, :alias)}: #{k}"
       else
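To illustrate the generator change: when an op declares add_custom_post, the emitted stub captures the op result in a local and appends the post lines after it. A sketch of what this would plausibly generate for the top_k op defined later in this diff (hypothetical output, inferred from the generator code above; the exact stub lives in generated_stub/ops.rb):

  # sketch of a generated stub
  def top_k(input, k = 1, sorted: true, name: nil)
    result = _op(:top_k, input, k, sorted: sorted, name: name)
    [result[0], result[1]]
  end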
data/lib/tensor_stream/ops.rb CHANGED
@@ -195,6 +195,15 @@ module TensorStream
     end
   end

+  ##
+  # Partitions data into num_partitions tensors using indices from partitions
+  def dynamic_partition(data, partitions, num_partitions, name: nil)
+    result = _op(:dynamic_partition, data, partitions, num_partitions: num_partitions, name: nil)
+    num_partitions.times.map do |index|
+      result[index]
+    end
+  end
+
   def split(value, num_or_size_splits, axis: 0, num: nil, name: "split")
     value = convert_to_tensor(value)
     num_or_size_splits = convert_to_tensor(num_or_size_splits)
@@ -524,6 +533,9 @@ module TensorStream
     _op(:squeeze, value, axis: axis, name: nil)
   end

+  def clip_by_norm(tensor, clip_norm, axes: nil, name: nil)
+  end
+
   ##
   # Computes the difference between two lists of numbers or strings.
   # Given a list x and a list y, this operation returns a list out that represents all values
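The new dynamic_partition wrapper returns a Ruby array of sub-tensors, so it can be destructured directly. A usage sketch with hypothetical values:

  tf = TensorStream
  data = tf.constant([10, 20, 30, 40, 50])
  partitions = tf.constant([0, 1, 0, 1, 0])
  part0, part1 = tf.dynamic_partition(data, partitions, 2)
  sess = tf.session
  sess.run(part0) # expected: [10, 30, 50]
  sess.run(part1) # expected: [20, 40]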
data/lib/tensor_stream/ops/rsqrt.rb ADDED
@@ -0,0 +1,11 @@
+TensorStream::OpMaker.define_operation :rsqrt do |op|
+  op.what_it_does "Computes reciprocal of square root of x element-wise."
+
+  op.parameter :input_a, "tensor X", validate: 'FLOATING_POINT_TYPES'
+  op.option :name, "Optional name", :nil
+
+  op.define_gradient do |grad, node, params|
+    # Returns -0.5 * grad * conj(y)^3.
+    i_op(:rsqrt_grad, node, grad)
+  end
+end
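rsqrt(x) computes 1 / sqrt(x). A quick sketch with hypothetical values (assuming the generated stub is exposed on the TensorStream module like the other ops):

  tf = TensorStream
  tf.session.run(tf.rsqrt(tf.constant([4.0, 16.0]))) # expected: [0.5, 0.25]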
data/lib/tensor_stream/ops/strided_slice.rb ADDED
@@ -0,0 +1,24 @@
+TensorStream::OpMaker.define_operation :strided_slice do |op|
+  op.what_it_does "Extracts a strided slice of a tensor "
+  op.what_it_does "this op extracts a slice of size `(end-begin)/stride`
+from the given `input_` tensor. Starting at the location specified by `begin`
+the slice continues by adding `stride` to the index until all dimensions are
+not less than `end`.
+Note that a stride can be negative, which causes a reverse slice."
+
+  op.parameter :input, "A tensor"
+  op.parameter :_begin, "start index"
+  op.parameter :_end, "end index"
+  op.parameter :strides, "stride", :nil
+  op.option :name, "Optional name", :nil
+
+  op.define_gradient do |grad, node, params|
+    input, b_index, e_index, strides = params
+    x = ts.shape(input, out_type: node.inputs[0].data_type)
+
+    _op(:strided_slice_grad, x, b_index, e_index, strides, grad)
+  end
+
+  op.define_shape do |tensor|
+  end
+end
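A sketch of the slice semantics described above, with hypothetical values (assuming the generated stub is exposed on the TensorStream module):

  tf = TensorStream
  t = tf.constant([0, 1, 2, 3, 4, 5])
  s = tf.strided_slice(t, [1], [5], [2])
  tf.session.run(s) # expected: [1, 3] -- every 2nd element from index 1 up to (but not including) 5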
data/lib/tensor_stream/ops/sum.rb CHANGED
@@ -7,14 +7,16 @@ TensorStream::OpMaker.define_operation :sum do |op|
   op.what_it_does "If axis has no entries, all dimensions are reduced, and a tensor with a single element is returned."

   op.parameter :input_a, "tensor X"
-  op.parameter :
+  op.parameter :axis_p, "tensor X", :nil, validate: 'INTEGER_TYPES'

+  op.option :axis, "axis", :nil, exclude: true
   op.option :name, "Optional name", :nil
   op.option :keepdims, "If true, retains reduced dimensions with length 1.", :false

   op.add_custom "input_a = TensorStream.convert_to_tensor(input_a)"
   op.add_custom "return input_a if input_a.shape.scalar?"
-  op.add_custom "
+  op.add_custom "axis_p = axis_p || axis"
+  op.add_custom "axis_p = cast_axis(input_a, axis_p)"

   op.define_gradient do |grad, node, params|
     x, y = params
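The net effect is that the reduction axis can now be given either positionally or as a keyword (the keyword is excluded from the generated _op call and merged into the positional parameter). A sketch with hypothetical values, assuming the stub keeps the op's name:

  tf = TensorStream
  m = tf.constant([[1.0, 2.0], [3.0, 4.0]])
  sess = tf.session
  sess.run(tf.sum(m))          # expected: 10.0 (all dimensions reduced)
  sess.run(tf.sum(m, 0))       # expected: [4.0, 6.0] (positional axis)
  sess.run(tf.sum(m, axis: 0)) # expected: [4.0, 6.0] (keyword axis, new in this release)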
data/lib/tensor_stream/ops/top_k.rb ADDED
@@ -0,0 +1,23 @@
+TensorStream::OpMaker.define_operation :top_k do |op|
+  op.what_it_does "Finds values and indices of the `k` largest entries for the last dimension."
+
+  op.parameter :input, "1-D or higher `Tensor` with last dimension at least `k`."
+  op.parameter :k, "0-D `int32` `Tensor`. Number of top elements to look for along the last dimension (along each row for matrices)", 1
+  op.option :sorted, "If true the resulting `k` elements will be sorted by the values in descending order.", "true"
+  op.option :name, "Optional name", :nil
+
+  op.add_custom_post "[result[0], result[1]]"
+
+  op.define_shape do |tensor|
+    next nil unless tensor.inputs[0].shape.known?
+
+    input_shape = tensor.inputs[0].shape.shape.dup
+    k = tensor.options[:k]
+    input_shape[-1] = k
+    input_shape
+  end
+
+  op.define_gradient do |grad, node, params|
+    # TODO
+  end
+end
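Because of the add_custom_post above, the op returns a values/indices pair that can be destructured. A usage sketch with hypothetical values:

  tf = TensorStream
  values, indices = tf.top_k(tf.constant([1.0, 5.0, 3.0]), 2)
  sess = tf.session
  sess.run(values)  # expected: [5.0, 3.0]
  sess.run(indices) # expected: [1, 2]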
data/lib/tensor_stream/tensor_shape.rb CHANGED
@@ -18,7 +18,8 @@ module TensorStream
   end

   def [](index)
-    @shape[index]
+    new_shape = @shape[index]
+    TensorShape.new(@shape[index])
   end

   def ndims
@@ -42,6 +43,36 @@ module TensorStream
     known?
   end

+  def merge_with(other)
+    assert_compatible_with(other)
+
+    if @shape.nil?
+      TensorShape.new(other)
+    else
+      TensorShape.new(@shape)
+    end
+  end
+
+  def compatible_with?(other)
+    other = as_dimension(other)
+
+    shape.nil? || other.nil? || shape == other
+  end
+
+  def as_dimension(value)
+    value.is_a?(TensorShape) ? value.shape : value
+  end
+
+  def value
+    shape
+  end
+
+  ##
+  # Raises an exception if `other` is not compatible with this shape.
+  def assert_compatible_with(other)
+    raise TensorStream::ValueError, "Dimensions #{self} and #{other} are not compatible" unless compatible_with?(other)
+  end
+
   def self.infer_shape(shape_a, shape_b)
     return nil if shape_a.nil? || shape_b.nil?
     return shape_a if shape_b.empty?
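A sketch of the new shape-merging helpers, with hypothetical values (it assumes TensorShape can be built directly from an array, as the diff itself does):

  s = TensorStream::TensorShape.new([2, 3])
  s.compatible_with?([2, 3])                          # => true
  s.merge_with(TensorStream::TensorShape.new([2, 3])) # => a TensorShape wrapping [2, 3]
  s.merge_with([4, 4])                                # raises TensorStream::ValueError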
data/lib/tensor_stream/train/saver.rb CHANGED
@@ -7,9 +7,9 @@ module TensorStream
   class Saver
     include TensorStream::OpHelper

-    def initialize
+    def initialize(var_list = nil)
       graph = TensorStream::Graph.get_default_graph
-      vars = graph.get_collection(GraphKeys::GLOBAL_VARIABLES)
+      vars = var_list || graph.get_collection(GraphKeys::GLOBAL_VARIABLES)

       @filename = graph["ts_filename"] || TensorStream.placeholder(:string, name: "ts_filename", shape: [])

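The new var_list argument lets a Saver checkpoint only selected variables instead of everything in GLOBAL_VARIABLES. The word_embedding_1.rb sample at the end of this diff uses it like this (excerpted; variable names from that sample):

  variables_to_save = [tf_embedding]
  embedding_saver = TensorStream::Train::Saver.new(variables_to_save)
  embedding_saver.save(sess, TF_EMBEDDINGS_FILE_NAME)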
data/lib/tensor_stream/utils.rb CHANGED
@@ -219,6 +219,10 @@ module TensorStream
     TensorStream::Trainer
   end

+  def math
+    TensorStream::Maths
+  end
+
   def image
     TensorStream::Images
   end
@@ -248,6 +252,10 @@ module TensorStream
       return TensorStream.expand_dims(value[0], 0)
     end

+    if value.is_a?(TensorShape)
+      value = value.shape
+    end
+
     check_if_dense(value)
     i_cons(value, dtype: dtype || Tensor.detect_type(value), name: name)
   end
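With the TensorShape branch above, a shape object can be fed straight to convert_to_tensor and is unwrapped to its underlying array. A sketch with hypothetical values:

  shape = TensorStream::TensorShape.new([2, 3])
  t = TensorStream.convert_to_tensor(shape) # unwraps to the plain array [2, 3]
  TensorStream.session.run(t)               # expected: [2, 3]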
data/samples/word_embeddings/word_embedding_1.rb ADDED
@@ -0,0 +1,192 @@
+#
+# A ruby port of https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer by Guillaume Chevalier
+#
+# This is a port, so some weird Python-like conventions may have been left behind
+require "bundler/setup"
+require "tensor_stream"
+require "chakin-rb/chakin"
+# require 'pry-byebug'
+require 'zip'
+
+tf = TensorStream
+
+CHAKIN_INDEX = 17
+NUMBER_OF_DIMENSIONS = 25
+SUBFOLDER_NAME = "glove.twitter.27B"
+
+DATA_FOLDER = "embeddings"
+ZIP_FILE = File.join(DATA_FOLDER, "#{SUBFOLDER_NAME}.zip")
+ZIP_FILE_ALT = "glove" + ZIP_FILE[5..nil] # sometimes it's lowercase only...
+UNZIP_FOLDER = File.join(DATA_FOLDER, SUBFOLDER_NAME)
+
+if SUBFOLDER_NAME[-1] == "d"
+  GLOVE_FILENAME = File.join(UNZIP_FOLDER, "#{SUBFOLDER_NAME}.txt")
+else
+  GLOVE_FILENAME = File.join(UNZIP_FOLDER, "#{SUBFOLDER_NAME}.#{NUMBER_OF_DIMENSIONS}d.txt")
+end
+
+if !File.exist?(ZIP_FILE) && !File.exist?(UNZIP_FOLDER)
+  # GloVe by Stanford is licensed Apache 2.0:
+  # https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
+  # http://nlp.stanford.edu/data/glove.twitter.27B.zip
+  # Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
+  puts "Downloading embeddings to '#{ZIP_FILE}'"
+  Chakin::Vectors.download(number: CHAKIN_INDEX, save_dir: "./#{DATA_FOLDER}")
+else
+  puts "Embeddings already downloaded."
+end
+
+if !File.exists?(UNZIP_FOLDER)
+  if !File.exists?(ZIP_FILE) && !File.exists?(ZIP_FILE_ALT)
+    ZIP_FILE = ZIP_FILE_ALT
+  end
+  FileUtils.mkdir_p(UNZIP_FOLDER)
+  Zip::File.open(ZIP_FILE) do |zipfile|
+    zipfile.each do |file|
+      puts "Extracting embeddings to '#{UNZIP_FOLDER}/#{file.name}'"
+      fpath = File.join(UNZIP_FOLDER, file.name)
+      zipfile.extract(file, fpath) unless File.exist?(fpath)
+    end
+  end
+else
+  puts "Embeddings already extracted."
+end
+
+##
+# Read a GloVe txt file. If `with_indexes=True`, we return a tuple of two dictionaries
+# `(word_to_index_dict, index_to_embedding_array)`, otherwise we return only a direct
+# `word_to_embedding_dict` dictionary mapping from a string to a numpy array.
+def load_embedding_from_disks(glove_filename, with_indexes: true)
+  word_to_index_dict = {}
+  index_to_embedding_array = []
+  word_to_embedding_dict = {}
+  representation = nil
+
+  last_index = nil
+  File.open(glove_filename, 'r').each_with_index do |line, i|
+    split = line.split(' ')
+
+    word = split.shift
+
+    representation = split
+    representation.map! { |val| val.to_f }
+
+    if with_indexes
+      word_to_index_dict[word] = i
+      index_to_embedding_array << representation
+    else
+      word_to_embedding_dict[word] = representation
+    end
+    last_index = i
+  end
+
+  _WORD_NOT_FOUND = [0.0] * representation.size # Empty representation for unknown words.
+  if with_indexes
+    _LAST_INDEX = last_index + 1
+    word_to_index_dict = Hash.new(_LAST_INDEX).merge(word_to_index_dict)
+    index_to_embedding_array = index_to_embedding_array + [_WORD_NOT_FOUND]
+    return word_to_index_dict, index_to_embedding_array
+  else
+    word_to_embedding_dict = Hash.new(_WORD_NOT_FOUND)
+    return word_to_embedding_dict
+  end
+end
+
+puts "Loading embedding from disks..."
+word_to_index, index_to_embedding = load_embedding_from_disks(GLOVE_FILENAME, with_indexes: true)
+puts "Embedding loaded from disks."
+
+vocab_size, embedding_dim = index_to_embedding.shape
+puts "Embedding is of shape: #{index_to_embedding.shape}"
+puts "This means (number of words, number of dimensions per word)"
+puts "The first words are words that tend to occur more often."
+
+puts "Note: for unknown words, the representation is an empty vector,\n" +
+     "and the index is the last one. The dictionary has a limit:"
+puts "  \"A word\" --> \"Index in embedding\" --> \"Representation\""
+word = "worsdfkljsdf"
+idx = word_to_index[word]
+embd = index_to_embedding[idx].map { |v| v.to_i } # "int" for compact print only.
+puts "  #{word} --> #{idx} --> #{embd}"
+word = "the"
+idx = word_to_index[word]
+embd = index_to_embedding[idx] # "int" for compact print only.
+puts "  #{word} --> #{idx} --> #{embd}"
+
+words = [
+  "The", "Teh", "A", "It", "Its", "Bacon", "Star", "Clone", "Bonjour", "Intelligence",
+  "À", "A", "Ça", "Ca", "Été", "C'est", "Aujourd'hui", "Aujourd", "'", "hui", "?", "!", ",", ".", "-", "/", "~"
+]
+
+words.each do |word|
+  word_ = word.downcase
+  embedding = index_to_embedding[word_to_index[word_]]
+  norm = Vector::elements(embedding).norm
+  puts (word + ": ").ljust(15) + norm.to_s
+end
+
+puts "Note: here we printed words starting with capital letters, \n" +
+     "however to take their embeddings we need their lowercase version (str.downcase)"
+
+batch_size = nil # Any size is accepted
+
+tf.reset_default_graph
+sess = tf.session
+
+# Define the variable that will hold the embedding:
+tf_embedding = tf.variable(
+  tf.constant(0.0, shape: index_to_embedding.shape),
+  trainable: false,
+  name: "Embedding"
+)
+
+tf_word_ids = tf.placeholder(:int32, shape: [batch_size])
+
+tf_word_representation_layer = tf.nn.embedding_lookup(tf_embedding, tf_word_ids)
+
+tf_embedding_placeholder = tf.placeholder(:float32, shape: index_to_embedding.shape)
+tf_embedding_init = tf_embedding.assign(tf_embedding_placeholder)
+
+sess.run(
+  tf_embedding_init,
+  feed_dict: {
+    tf_embedding_placeholder => index_to_embedding
+  }
+)
+
+puts "Embedding now stored in TensorStream. Can delete ruby array to clear some CPU RAM."
+
+batch_of_words = ["Hello", "World", "!"]
+batch_indexes = batch_of_words.map { |w| word_to_index[w.downcase] }
+
+embedding_from_batch_lookup = sess.run(
+  tf_word_representation_layer,
+  feed_dict: {
+    tf_word_ids => batch_indexes
+  }
+)
+
+puts "Representations for #{batch_of_words}:"
+puts embedding_from_batch_lookup.inspect
+
+prefix = SUBFOLDER_NAME + "." + NUMBER_OF_DIMENSIONS.to_s + "d"
+TF_EMBEDDINGS_FILE_NAME = File.join(DATA_FOLDER, prefix + ".ckpt")
+DICT_WORD_TO_INDEX_FILE_NAME = File.join(DATA_FOLDER, prefix + ".json")
+
+variables_to_save = [tf_embedding]
+embedding_saver = tf::Train::Saver.new(variables_to_save)
+embedding_saver.save(sess, TF_EMBEDDINGS_FILE_NAME)
+puts "TF embeddings saved to '#{TF_EMBEDDINGS_FILE_NAME}'."
+
+sess.close
+
+File.open(DICT_WORD_TO_INDEX_FILE_NAME, 'w') do |f|
+  f.write(word_to_index.to_json)
+end
+puts "word_to_index dict saved to '#{DICT_WORD_TO_INDEX_FILE_NAME}'."
+
+words_B = "like absolutely crazy not hate bag sand rock soap"
+r = words_B.split.map { |w| word_to_index[w.strip()] }
+puts words_B
+puts r.inspect
+puts "done"