tensor_stream 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +10 -3
- data/lib/tensor_stream.rb +1 -0
- data/lib/tensor_stream/evaluator/base_evaluator.rb +6 -0
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +60 -0
- data/lib/tensor_stream/evaluator/ruby/array_ops.rb +53 -1
- data/lib/tensor_stream/evaluator/ruby/math_ops.rb +42 -5
- data/lib/tensor_stream/generated_stub/ops.rb +61 -5
- data/lib/tensor_stream/helpers/tensor_mixins.rb +10 -1
- data/lib/tensor_stream/math/math_ops.rb +22 -0
- data/lib/tensor_stream/math_gradients.rb +15 -1
- data/lib/tensor_stream/nn/embedding_lookup.rb +114 -0
- data/lib/tensor_stream/nn/nn_ops.rb +3 -0
- data/lib/tensor_stream/op_maker.rb +15 -3
- data/lib/tensor_stream/ops.rb +12 -0
- data/lib/tensor_stream/ops/rsqrt.rb +11 -0
- data/lib/tensor_stream/ops/strided_slice.rb +24 -0
- data/lib/tensor_stream/ops/sum.rb +4 -2
- data/lib/tensor_stream/ops/top_k.rb +23 -0
- data/lib/tensor_stream/session.rb +3 -0
- data/lib/tensor_stream/tensor_shape.rb +32 -1
- data/lib/tensor_stream/train/saver.rb +2 -2
- data/lib/tensor_stream/utils.rb +8 -0
- data/lib/tensor_stream/utils/py_ports.rb +11 -0
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/word_embeddings/word_embedding_1.rb +192 -0
- data/samples/word_embeddings/word_embedding_2.rb +203 -0
- data/tensor_stream.gemspec +3 -0
- metadata +40 -4
- data/samples/neural_networks/lstm.rb +0 -22
data/lib/tensor_stream/nn/embedding_lookup.rb
ADDED
@@ -0,0 +1,114 @@
+require 'tensor_stream/utils/py_ports'
+##
+# ruby port of https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/python/ops/embedding_ops.py
+#
+module TensorStream
+  module EmbeddingLookup
+    include TensorStream::PyPorts
+
+    ##
+    # Looks up `ids` in a list of embedding tensors.
+    def embedding_lookup(params, ids, partition_strategy: "mod", name: nil, validate_indices: true, max_norm: nil)
+      _embedding_lookup_and_transform(params, ids, partition_strategy: partition_strategy, name: name, max_norm: max_norm, transform_fn: nil)
+    end
+
+    ##
+    # Helper function for embedding_lookup and _compute_sampled_logits.
+    def _embedding_lookup_and_transform(params, ids, partition_strategy: "mod", name: nil, max_norm: nil, transform_fn: nil)
+      raise TensorStream::ValueError, "Need at least one param" if params.nil?
+
+      params = [params] unless params.is_a?(Array)
+
+      TensorStream.name_scope(name, "embedding_lookup", values: params + [ids]) do |name|
+        np = params.size
+        ids = TensorStream.convert_to_tensor(ids, name: "ids")
+        if (np == 1) && (transform_fn.nil? || (ids.shape.size == 1))
+          result = nil
+          TensorStream.colocate_with(params[0]) do
+            result = _clip(TensorStream.gather(params[0], ids, name: name), ids, max_norm)
+            result = transform_fn.call(result) if transform_fn
+          end
+
+          return TensorStream.identity(result)
+        else
+          flat_ids = TensorStream.reshape(ids, [-1])
+          original_indices = TensorStream.range(TensorStream.size(flat_ids))
+
+          p_assignments = nil
+          new_ids = nil
+
+          if partition_strategy == "mod"
+            p_assignments = flat_ids % np
+            new_ids = floor_div(flat_ids, np)
+          elsif partition_strategy == "div"
+            raise "not yet supported!"
+          else
+            raise TensorStream::ValueError, "Unrecognized partition strategy: " + partition_strategy
+          end
+
+          p_assignments = TensorStream.cast(p_assignments, :int32)
+          gather_ids = TensorStream.dynamic_partition(new_ids, p_assignments, np)
+          pindices = TensorStream.dynamic_partition(original_indices, p_assignments, np)
+          partitioned_result = []
+          (0...np).each do |p|
+            pids = gather_ids[p]
+            result = nil
+            TensorStream.colocate_with(params[p]) do
+              result = TensorStream.gather(params[p], pids)
+              if transform_fn
+                # If transform_fn is provided, the clip_by_norm precedes
+                # the transform and hence must be co-located. See below
+                # for the counterpart if transform_fn is not proveded.
+                result = transform_fn.call(_clip(result, pids, max_norm))
+              end
+            end
+            partitioned_result << result
+          end
+          ret = TensorStream.dynamic_stitch(pindices, partitioned_result, name: name)
+
+          if transform_fn.nil?
+            element_shape_s = params[0].shape[1..-1]
+            params[1..-1].each { |p| element_shape_s = element_shape_s.merge_with(p.shape[1..-1]) }
+          else
+            element_shape_s = ret.shape[1..-1]
+          end
+
+          # Compute the dynamic element shape.
+          element_shape_d = if element_shape_s.fully_defined?
+            element_shape_s
+          elsif transform_fn.nil?
+            # It's important that we compute params[0].shape on the right device
+            # to avoid data motion.
+            TensorStream.colocate_with(params[0]) do
+              params_shape = TensorStream.shape(params[0])
+              params_shape[1..-1]
+            end
+          else
+            TensorStream.shape(ret)[1..-1]
+          end
+          ret = TensorStream.reshape(ret, TensorStream.concat([TensorStream.shape(ids), element_shape_d], 0))
+          ret = _clip(ret, ids, max_norm) unless transform_fn
+          ret
+        end
+      end
+    end
+
+    def _clip(params, ids, max_norm)
+      return params if max_norm.nil?
+
+      ids_rank, ids_static = _rank(ids)
+      params_rank, params_static = _rank(params)
+
+      TensorStream.clip_by_norm(params, max_norm, axes: ids_static && params_static ? (ids_rank...params_rank).to_a : TensorStream.range(ids_rank, params_rank))
+    end
+
+    def _rank(x)
+      rank = TensorStream.convert_to_tensor(x).shape.ndims
+      if rank
+        [rank, false]
+      else
+        [TensorStream.rank(x), false]
+      end
+    end
+  end
+end
data/lib/tensor_stream/nn/nn_ops.rb
CHANGED
@@ -1,7 +1,10 @@
+require 'tensor_stream/nn/embedding_lookup'
 module TensorStream
   # High level machine learning functions
   class NN
     extend TensorStream::OpHelper
+    extend TensorStream::EmbeddingLookup
+    extend TensorStream::Maths::MathFunctions
 
     class << self
       def softmax(logits, axis: nil, name: nil)
data/lib/tensor_stream/op_maker.rb
CHANGED
@@ -2,7 +2,7 @@ class TensorStream::OpMaker
   attr_reader :operation, :description, :parameters,
     :options, :gradient, :check_types,
     :supports_broadcast, :data_type_coercion,
-    :aliases, :custom, :infer_type_proc, :exclude,
+    :aliases, :custom, :custom_post, :infer_type_proc, :exclude,
     :data_type_block
 
   def initialize(op)
@@ -16,6 +16,7 @@ class TensorStream::OpMaker
     @description = []
     @aliases = []
     @custom = []
+    @custom_post = []
     @infer_type_proc = lambda { |tensor|
       next nil if tensor.inputs[0].nil?
       next tensor.inputs[0].shape.shape if tensor.inputs.size == 1
@@ -32,6 +33,10 @@ class TensorStream::OpMaker
     @custom << custom_code
   end
 
+  def add_custom_post(custom_code)
+    @custom_post << custom_code
+  end
+
   def self.scan
     op_files = Dir[File.join(File.dirname(__FILE__), "ops", "*.rb")]
     op_files.each { |file|
@@ -111,7 +116,14 @@ class TensorStream::OpMaker
    custom.each do |c|
      body << c
    end
-    body << "_op(:#{operation}, #{(expand_params(false) + options_call).join(', ')})"
+    if custom_post.empty?
+      body << "_op(:#{operation}, #{(expand_params(false) + options_call).join(', ')})"
+    else
+      body << "result = _op(:#{operation}, #{(expand_params(false) + options_call).join(', ')})"
+    end
+    custom_post.each do |c|
+      body << c
+    end
    body.map { |line| " #{line}"}.join("\n")
   end
 
@@ -184,7 +196,7 @@ class TensorStream::OpMaker
   end
 
   def options_call
-    @options.map { |k, v|
+    @options.reject { |k, v| v.dig(:options, :exclude) }.map { |k, v|
      if v.dig(:options, :alias)
        "#{v.dig(:options, :alias)}: #{k}"
      else
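With the `custom_post` hook, the stub generator either emits the `_op(...)` call directly (no post block) or assigns it to `result` and appends the post lines. As a rough, hand-written approximation (not copied from generated_stub/ops.rb), the stub produced for the `top_k` definition later in this diff would look something like:

    # Hypothetical shape of the generated stub when add_custom_post is used.
    def top_k(input, k = 1, sorted: true, name: nil)
      result = _op(:top_k, input, k, sorted: sorted, name: name)
      [result[0], result[1]]
    end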
data/lib/tensor_stream/ops.rb
CHANGED
@@ -195,6 +195,15 @@ module TensorStream
     end
   end
 
+  ##
+  # Partitions data into num_partitions tensors using indices from partitions
+  def dynamic_partition(data, partitions, num_partitions, name: nil)
+    result = _op(:dynamic_partition, data, partitions, num_partitions: num_partitions, name: nil)
+    num_partitions.times.map do |index|
+      result[index]
+    end
+  end
+
   def split(value, num_or_size_splits, axis: 0, num: nil, name: "split")
     value = convert_to_tensor(value)
     num_or_size_splits = convert_to_tensor(num_or_size_splits)
@@ -524,6 +533,9 @@
     _op(:squeeze, value, axis: axis, name: nil)
   end
 
+  def clip_by_norm(tensor, clip_norm, axes: nil, name: nil)
+  end
+
   ##
   # Computes the difference between two lists of numbers or strings.
   # Given a list x and a list y, this operation returns a list out that represents all values
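`dynamic_partition` follows the TensorFlow op of the same name: each element of `data` is routed to one of `num_partitions` output tensors according to the matching entry in `partitions`, and the wrapper returns them as a Ruby array. A hedged sketch with made-up values, assuming TensorFlow-compatible semantics:

    ts = TensorStream
    sess = ts.session

    data = ts.constant([10, 20, 30, 40, 50])
    partitions = ts.constant([0, 1, 0, 1, 0])

    p0, p1 = ts.dynamic_partition(data, partitions, 2)
    sess.run(p0) # expected: [10, 30, 50]
    sess.run(p1) # expected: [20, 40]

Note that the `clip_by_norm` method added in the second hunk is only an empty placeholder here.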
data/lib/tensor_stream/ops/rsqrt.rb
ADDED
@@ -0,0 +1,11 @@
+TensorStream::OpMaker.define_operation :rsqrt do |op|
+  op.what_it_does "Computes reciprocal of square root of x element-wise."
+
+  op.parameter :input_a, "tensor X", validate: 'FLOATING_POINT_TYPES'
+  op.option :name, "Optional name", :nil
+
+  op.define_gradient do |grad, node, params|
+    # Returns -0.5 * grad * conj(y)^3.
+    i_op(:rsqrt_grad, node, grad)
+  end
+end
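`rsqrt` computes 1 / sqrt(x) element-wise, and its gradient is delegated to an internal `:rsqrt_grad` op (-0.5 * grad * y^3 with y = rsqrt(x), per the comment). A quick illustrative check, assuming the usual generated `TensorStream.rsqrt` stub:

    ts = TensorStream
    sess = ts.session

    x = ts.constant([4.0, 16.0, 25.0])
    sess.run(ts.rsqrt(x)) # expected: [0.5, 0.25, 0.2]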
data/lib/tensor_stream/ops/strided_slice.rb
ADDED
@@ -0,0 +1,24 @@
+TensorStream::OpMaker.define_operation :strided_slice do |op|
+  op.what_it_does "Extracts a strided slice of a tensor "
+  op.what_it_does "this op extracts a slice of size `(end-begin)/stride`
+    from the given `input_` tensor. Starting at the location specified by `begin`
+    the slice continues by adding `stride` to the index until all dimensions are
+    not less than `end`.
+    Note that a stride can be negative, which causes a reverse slice."
+
+  op.parameter :input, "A tensor"
+  op.parameter :_begin, "start index"
+  op.parameter :_end, "end index"
+  op.parameter :strides, "end index", :nil
+  op.option :name, "Optional name", :nil
+
+  op.define_gradient do |grad, node, params|
+    input, b_index, e_index, strides = params
+    x = ts.shape(input, out_type: node.inputs[0].data_type)
+
+    _op(:strided_slice_grad, x, b_index, e_index, strides, grad)
+  end
+
+  op.define_shape do |tensor|
+  end
+end
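A hedged sketch of calling the op through its generated stub (positional begin/end/strides vectors, one entry per dimension), with the result shown for the usual TensorFlow strided-slice semantics; the values are illustrative:

    ts = TensorStream
    sess = ts.session

    t = ts.constant([[1, 2, 3],
                     [4, 5, 6],
                     [7, 8, 9]])

    # Rows 0 and 1, every second column.
    slice = ts.strided_slice(t, [0, 0], [2, 3], [1, 2])
    sess.run(slice) # expected: [[1, 3], [4, 6]]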
data/lib/tensor_stream/ops/sum.rb
CHANGED
@@ -7,14 +7,16 @@ TensorStream::OpMaker.define_operation :sum do |op|
   op.what_it_does "If axis has no entries, all dimensions are reduced, and a tensor with a single element is returned."
 
   op.parameter :input_a, "tensor X"
-  op.parameter :
+  op.parameter :axis_p, "tensor X", :nil, validate: 'INTEGER_TYPES'
 
+  op.option :axis, "axis", :nil, exclude: true
   op.option :name, "Optional name", :nil
   op.option :keepdims, "If true, retains reduced dimensions with length 1.", :false
 
   op.add_custom "input_a = TensorStream.convert_to_tensor(input_a)"
   op.add_custom "return input_a if input_a.shape.scalar?"
-  op.add_custom "
+  op.add_custom "axis_p = axis_p || axis"
+  op.add_custom "axis_p = cast_axis(input_a, axis_p)"
 
   op.define_gradient do |grad, node, params|
     x, y = params
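The practical effect is that the reduction axis can now be passed either positionally (`axis_p`) or via the `axis:` keyword; the keyword option is excluded from the generated `_op` call and folded into the positional input by the custom lines above. Illustrative only, assuming the generated `sum` stub:

    ts = TensorStream
    sess = ts.session

    m = ts.constant([[1, 1, 1], [2, 2, 2]])

    sess.run(ts.sum(m))          # all axes reduced -> 9
    sess.run(ts.sum(m, 0))       # positional axis  -> [3, 3, 3]
    sess.run(ts.sum(m, axis: 1)) # keyword axis     -> [3, 6]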
data/lib/tensor_stream/ops/top_k.rb
ADDED
@@ -0,0 +1,23 @@
+TensorStream::OpMaker.define_operation :top_k do |op|
+  op.what_it_does "Finds values and indices of the `k` largest entries for the last dimension."
+
+  op.parameter :input, "1-D or higher `Tensor` with last dimension at least `k`."
+  op.parameter :k, "0-D `int32` `Tensor`. Number of top elements to look for along the last dimension (along each row for matrices)", 1
+  op.option :sorted, "If true the resulting `k` elements will be sorted by the values in descending order.", "true"
+  op.option :name, "Optional name", :nil
+
+  op.add_custom_post "[result[0], result[1]]"
+
+  op.define_shape do |tensor|
+    next nil unless tensor.inputs[0].shape.known?
+
+    input_shape = tensor.inputs[0].shape.shape.dup
+    k = tensor.options[:k]
+    input_shape[-1] = k
+    input_shape
+  end
+
+  op.define_gradient do |grad, node, params|
+    #TODO
+  end
+end
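Because of `add_custom_post`, the `top_k` stub returns a two-element array (values, indices) instead of a single op. A rough usage sketch with made-up numbers:

    ts = TensorStream
    sess = ts.session

    scores = ts.constant([1.0, 4.0, 3.0, 2.0])
    values, indices = ts.top_k(scores, 2)

    sess.run(values)  # expected: [4.0, 3.0]
    sess.run(indices) # expected: [1, 2]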
data/lib/tensor_stream/tensor_shape.rb
CHANGED
@@ -18,7 +18,8 @@ module TensorStream
     end
 
     def [](index)
-      @shape[index]
+      new_shape = @shape[index]
+      TensorShape.new(@shape[index])
     end
 
     def ndims
@@ -42,6 +43,36 @@
       known?
     end
 
+    def merge_with(other)
+      assert_compatible_with(other)
+
+      if @shape.nil?
+        TensorShape.new(other)
+      else
+        TensorShape.new(@shape)
+      end
+    end
+
+    def compatible_with?(other)
+      other = as_dimension(other)
+
+      shape.nil? || other.nil? || shape == other
+    end
+
+    def as_dimension(value)
+      value.is_a?(TensorShape) ? value.shape : value
+    end
+
+    def value
+      shape
+    end
+
+    ##
+    # Raises an exception if `other` is not compatible with this shape.
+    def assert_compatible_with(other)
+      raise TensorStream::ValueError, "Dimensions #{self} and #{other} are not compatible" unless compatible_with?(other)
+    end
+
     def self.infer_shape(shape_a, shape_b)
       return nil if shape_a.nil? || shape_b.nil?
       return shape_a if shape_b.empty?
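These additions give `TensorShape` a minimal version of TensorFlow's shape-compatibility protocol, which `_embedding_lookup_and_transform` above relies on when merging per-partition element shapes. A small sketch of the intent, assuming the constructor takes the dimension array just as the hunk itself does (`TensorShape.new(@shape)`):

    a = TensorStream::TensorShape.new([nil, 25])
    b = TensorStream::TensorShape.new([nil, 25])

    a.compatible_with?(b) # => true; a nil (unknown) shape is compatible with anything
    a.merge_with(b)       # => a TensorShape; raises ValueError when shapes are incompatible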
data/lib/tensor_stream/train/saver.rb
CHANGED
@@ -7,9 +7,9 @@ module TensorStream
     class Saver
       include TensorStream::OpHelper
 
-      def initialize
+      def initialize(var_list = nil)
        graph = TensorStream::Graph.get_default_graph
-        vars = graph.get_collection(GraphKeys::GLOBAL_VARIABLES)
+        vars = var_list || graph.get_collection(GraphKeys::GLOBAL_VARIABLES)
 
        @filename = graph["ts_filename"] || TensorStream.placeholder(:string, name: "ts_filename", shape: [])
 
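`Saver#initialize` now accepts an explicit variable list, falling back to `GLOBAL_VARIABLES` when none is given. The word-embedding sample added later in this diff uses it to checkpoint only the (non-trainable) embedding variable:

    # From the sample below: save just the embedding variable.
    variables_to_save = [tf_embedding]
    embedding_saver = tf::Train::Saver.new(variables_to_save)
    embedding_saver.save(sess, TF_EMBEDDINGS_FILE_NAME)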
data/lib/tensor_stream/utils.rb
CHANGED
@@ -219,6 +219,10 @@ module TensorStream
     TensorStream::Trainer
   end
 
+  def math
+    TensorStream::Maths
+  end
+
   def image
     TensorStream::Images
   end
@@ -248,6 +252,10 @@
       return TensorStream.expand_dims(value[0], 0)
     end
 
+    if value.is_a?(TensorShape)
+      value = value.shape
+    end
+
     check_if_dense(value)
     i_cons(value, dtype: dtype || Tensor.detect_type(value), name: name)
   end
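Two small conveniences: `TensorStream.math` exposes the `TensorStream::Maths` namespace, and `convert_to_tensor` now unwraps a `TensorShape` into its underlying array (useful because `TensorShape#[]` above now returns shape objects rather than plain arrays). Illustrative:

    ts = TensorStream

    ts.math # => TensorStream::Maths

    shape = TensorStream::TensorShape.new([2, 3])
    ts.convert_to_tensor(shape) # treated as the plain value [2, 3]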
data/samples/word_embeddings/word_embedding_1.rb
ADDED
@@ -0,0 +1,192 @@
+#
+# A ruby port of https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer by Guillaume Chevalier
+#
+# This is a port so some weird python like conventions may have been left behind
+require "bundler/setup"
+require "tensor_stream"
+require "chakin-rb/chakin"
+# require 'pry-byebug'
+require 'zip'
+
+tf = TensorStream
+
+CHAKIN_INDEX = 17
+NUMBER_OF_DIMENSIONS = 25
+SUBFOLDER_NAME = "glove.twitter.27B"
+
+DATA_FOLDER = "embeddings"
+ZIP_FILE = File.join(DATA_FOLDER, "#{SUBFOLDER_NAME}.zip")
+ZIP_FILE_ALT = "glove" + ZIP_FILE[5..nil] # sometimes it's lowercase only...
+UNZIP_FOLDER = File.join(DATA_FOLDER, SUBFOLDER_NAME)
+
+if SUBFOLDER_NAME[-1] == "d"
+  GLOVE_FILENAME = File.join(UNZIP_FOLDER, "#{SUBFOLDER_NAME}.txt")
+else
+  GLOVE_FILENAME = File.join(UNZIP_FOLDER, "#{SUBFOLDER_NAME}.#{NUMBER_OF_DIMENSIONS}d.txt")
+end
+
+if !File.exist?(ZIP_FILE) && !File.exist?(UNZIP_FOLDER)
+  # GloVe by Stanford is licensed Apache 2.0:
+  # https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
+  # http://nlp.stanford.edu/data/glove.twitter.27B.zip
+  # Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
+  puts "Downloading embeddings to '#{ZIP_FILE}'"
+  Chakin::Vectors.download(number: CHAKIN_INDEX, save_dir: "./#{DATA_FOLDER}")
+else
+  puts "Embeddings already downloaded."
+end
+
+if !File.exists?(UNZIP_FOLDER)
+  if !File.exists?(ZIP_FILE) && !File.exists?(ZIP_FILE_ALT)
+    ZIP_FILE = ZIP_FILE_ALT
+  end
+  FileUtils.mkdir_p(UNZIP_FOLDER)
+  Zip::File.open(ZIP_FILE) do |zipfile|
+    zipfile.each do |file|
+      puts "Extracting embeddings to '#{UNZIP_FOLDER}/#{file.name}'"
+      fpath = File.join(UNZIP_FOLDER, file.name)
+      zipfile.extract(file, fpath) unless File.exist?(fpath)
+    end
+  end
+else
+  puts "Embeddings already extracted."
+end
+
+##
+# Read a GloVe txt file. If `with_indexes=True`, we return a tuple of two dictionnaries
+# `(word_to_index_dict, index_to_embedding_array)`, otherwise we return only a direct
+# `word_to_embedding_dict` dictionnary mapping from a string to a numpy array.
+def load_embedding_from_disks(glove_filename, with_indexes: true)
+  word_to_index_dict = {}
+  index_to_embedding_array = []
+  word_to_embedding_dict = {}
+  representation = nil
+
+  last_index = nil
+  File.open(glove_filename, 'r').each_with_index do |line, i|
+    split = line.split(' ')
+
+    word = split.shift
+
+    representation = split
+    representation.map! { |val| val.to_f }
+
+    if with_indexes
+      word_to_index_dict[word] = i
+      index_to_embedding_array << representation
+    else
+      word_to_embedding_dict[word] = representation
+    end
+    last_index = i
+  end
+
+  _WORD_NOT_FOUND = [0.0] * representation.size # Empty representation for unknown words.
+  if with_indexes
+    _LAST_INDEX = last_index + 1
+    word_to_index_dict = Hash.new(_LAST_INDEX).merge(word_to_index_dict)
+    index_to_embedding_array = index_to_embedding_array + [_WORD_NOT_FOUND]
+    return word_to_index_dict, index_to_embedding_array
+  else
+    word_to_embedding_dict = Hash.new(_WORD_NOT_FOUND)
+    return word_to_embedding_dict
+  end
+end
+
+puts "Loading embedding from disks..."
+word_to_index, index_to_embedding = load_embedding_from_disks(GLOVE_FILENAME, with_indexes: true)
+puts "Embedding loaded from disks."
+
+vocab_size, embedding_dim = index_to_embedding.shape
+puts "Embedding is of shape: #{index_to_embedding.shape}"
+puts "This means (number of words, number of dimensions per word)"
+puts "The first words are words that tend occur more often."
+
+puts "Note: for unknown words, the representation is an empty vector,\n" +
+  "and the index is the last one. The dictionnary has a limit:"
+puts " \"A word\" --> \"Index in embedding\" --> \"Representation\""
+word = "worsdfkljsdf"
+idx = word_to_index[word]
+embd = index_to_embedding[idx].map { |v| v.to_i } # "int" for compact print only.
+puts " #{word} --> #{idx} --> #{embd}"
+word = "the"
+idx = word_to_index[word]
+embd = index_to_embedding[idx] # "int" for compact print only.
+puts " #{word} --> #{idx} --> #{embd}"
+
+words = [
+  "The", "Teh", "A", "It", "Its", "Bacon", "Star", "Clone", "Bonjour", "Intelligence",
+  "À", "A", "Ça", "Ca", "Été", "C'est", "Aujourd'hui", "Aujourd", "'", "hui", "?", "!", ",", ".", "-", "/", "~"
+]
+
+words.each do |word|
+  word_ = word.downcase
+  embedding = index_to_embedding[word_to_index[word_]]
+  norm = Vector::elements(embedding).norm
+  puts (word + ": ").ljust(15) + norm.to_s
+end
+
+puts "Note: here we printed words starting with capital letters, \n" +
+  "however to take their embeddings we need their lowercase version (str.downcase)"
+
+batch_size = nil # Any size is accepted
+
+tf.reset_default_graph
+sess = tf.session
+
+# Define the variable that will hold the embedding:
+tf_embedding = tf.variable(
+  tf.constant(0.0, shape: index_to_embedding.shape),
+  trainable: false,
+  name: "Embedding"
+)
+
+tf_word_ids = tf.placeholder(:int32, shape: [batch_size])
+
+tf_word_representation_layer = tf.nn.embedding_lookup(tf_embedding, tf_word_ids)
+
+tf_embedding_placeholder = tf.placeholder(:float32, shape: index_to_embedding.shape)
+tf_embedding_init = tf_embedding.assign(tf_embedding_placeholder)
+
+sess.run(
+  tf_embedding_init,
+  feed_dict: {
+    tf_embedding_placeholder => index_to_embedding
+  }
+)
+
+puts "Embedding now stored in TensorStream. Can delete ruby array to clear some CPU RAM."
+
+batch_of_words = ["Hello", "World", "!"]
+batch_indexes = batch_of_words.map { |w| word_to_index[w.downcase] }
+
+embedding_from_batch_lookup = sess.run(
+  tf_word_representation_layer,
+  feed_dict: {
+    tf_word_ids => batch_indexes
+  }
+)
+
+puts "Representations for #{batch_of_words}:"
+puts embedding_from_batch_lookup.inspect
+
+prefix = SUBFOLDER_NAME + "." + NUMBER_OF_DIMENSIONS.to_s + "d"
+TF_EMBEDDINGS_FILE_NAME = File.join(DATA_FOLDER, prefix + ".ckpt")
+DICT_WORD_TO_INDEX_FILE_NAME = File.join(DATA_FOLDER, prefix + ".json")
+
+variables_to_save = [tf_embedding]
+embedding_saver = tf::Train::Saver.new(variables_to_save)
+embedding_saver.save(sess, TF_EMBEDDINGS_FILE_NAME)
+puts "TF embeddings saved to '#{TF_EMBEDDINGS_FILE_NAME}'."
+
+sess.close
+
+File.open(DICT_WORD_TO_INDEX_FILE_NAME, 'w') do |f|
+  f.write(word_to_index.to_json)
+end
+puts "word_to_index dict saved to '#{DICT_WORD_TO_INDEX_FILE_NAME}'."
+
+words_B = "like absolutely crazy not hate bag sand rock soap"
+r = words_B.split.map { |w| word_to_index[w.strip()] }
+puts words_B
+puts r.inspect
+puts "done"