tensor_stream 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +5 -5
  2. data/CHANGELOG.md +13 -0
  3. data/README.md +34 -0
  4. data/lib/tensor_stream.rb +7 -3
  5. data/lib/tensor_stream/control_flow.rb +1 -2
  6. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +44 -3
  7. data/lib/tensor_stream/evaluator/operation_helpers/math_helper.rb +9 -0
  8. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +70 -36
  9. data/lib/tensor_stream/graph.rb +15 -7
  10. data/lib/tensor_stream/graph_serializers/graphml.rb +183 -35
  11. data/lib/tensor_stream/graph_serializers/pbtext.rb +81 -14
  12. data/lib/tensor_stream/graph_serializers/serializer.rb +13 -0
  13. data/lib/tensor_stream/helpers/string_helper.rb +12 -0
  14. data/lib/tensor_stream/math_gradients.rb +203 -161
  15. data/lib/tensor_stream/operation.rb +30 -16
  16. data/lib/tensor_stream/ops.rb +29 -19
  17. data/lib/tensor_stream/placeholder.rb +2 -3
  18. data/lib/tensor_stream/session.rb +7 -13
  19. data/lib/tensor_stream/tensor.rb +22 -5
  20. data/lib/tensor_stream/tensor_shape.rb +2 -0
  21. data/lib/tensor_stream/trainer.rb +6 -1
  22. data/lib/tensor_stream/variable.rb +4 -3
  23. data/lib/tensor_stream/version.rb +1 -1
  24. data/samples/gradient_sample.graphml +1255 -0
  25. data/samples/linear_regression.rb +1 -1
  26. data/samples/logistic_regression.rb +9 -2
  27. data/tensor_stream.gemspec +1 -1
  28. data/test_samples/error.graphml +120 -0
  29. data/test_samples/gradient_sample.graphml +1255 -0
  30. data/{samples → test_samples}/iris.rb +0 -0
  31. data/{samples → test_samples}/raw_neural_net_sample.rb +0 -0
  32. data/{samples → test_samples}/test.py +2 -0
  33. data/test_samples/test2.py +41 -0
  34. metadata +41 -47
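Before the per-file diffs below: 0.2.0 reorganizes graph export around a small Serializer base class (see data/lib/tensor_stream/graph_serializers/serializer.rb further down), with Graphml and Pbtext as subclasses that implement get_string. A minimal usage sketch, assuming the TensorFlow-style constant/matmul helpers that the bundled samples use; the file names here are illustrative only:

    require 'tensor_stream'

    tf = TensorStream
    a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
    b = tf.constant([[2.0], [1.0]])
    f = tf.matmul(a, b)

    # write a yEd-flavored GraphML view of the expression graph rooted at f
    TensorStream::Graphml.new.serialize('f_graph.graphml', f)

    # or get the protobuf-text form of the graph as a string
    puts TensorStream::Pbtext.new.get_string(f)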
data/lib/tensor_stream/graph_serializers/graphml.rb
@@ -1,91 +1,239 @@
  module TensorStream
- class Graphml
+ class Graphml < Serializer
  def initialize
  end

- def serialize(session, tensor, filename)
+ def get_string(tensor, session = nil)
+ tensor = TensorStream.convert_to_tensor(tensor) unless tensor.is_a?(Tensor)
  @session = session
- @last_session_context = session.last_session_context
+ @name = tensor.name
+ @last_session_context = session ? session.last_session_context : {}
+ groups = {}

  arr_buf = []
  arr_buf << '<?xml version="1.0" encoding="UTF-8"?>'
- arr_buf << '<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ arr_buf << '<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:y="http://www.yworks.com/xml/graphml"
  xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">'
  arr_buf << '<key id="d0" for="node" attr.name="label" attr.type="string"/>'
  arr_buf << '<key id="d1" for="node" attr.name="formula" attr.type="string"/>'
  arr_buf << '<key id="d2" for="node" attr.name="color" attr.type="string"/>'
  arr_buf << '<key id="d3" for="node" attr.name="value" attr.type="string"/>'
+ arr_buf << '<key attr.name="description" attr.type="string" for="edge" id="d12"/>'
+ arr_buf << '<key for="edge" id="d13" yfiles.type="edgegraphics"/>'
+ arr_buf << '<key for="node" id="d9" yfiles.type="nodegraphics"/>'
  arr_buf << "<graph id=\"g_#{_gml_string(tensor.name)}\" edgedefault=\"directed\">"
  arr_buf << "<node id=\"out\">"
  arr_buf << "<data key=\"d0\">out</data>"
  arr_buf << "<data key=\"d2\">red</data>"
+ arr_buf << "<data key=\"d9\">"
+ arr_buf << "<y:ShapeNode>"
+ arr_buf << " <y:Fill color=\"#FF0000\" transparent=\"false\"/>"
+ arr_buf << " <y:NodeLabel alignment=\"center\">out</y:NodeLabel>"
+ arr_buf << "</y:ShapeNode>"
+ arr_buf << "</data>"
  arr_buf << "</node>"
- to_graph_ml(tensor, arr_buf)
- arr_buf << "<edge source=\"#{_gml_string(tensor.name)}\" target=\"out\"/>"
+
+ to_graph_ml(tensor, arr_buf, {}, groups)
+ #dump groups
+ groups.each do |k, g|
+ arr_buf << create_group(k, k, g)
+ end
+
+ output_edge(tensor, "out", arr_buf)
  arr_buf << "</graph>"
  arr_buf << "</graphml>"
- File.write(filename, arr_buf.join("\n"))
+ arr_buf.flatten.join("\n")
  end

  private

+ def add_to_group(groups, name, arr_buf)
+ name_parts = name.split('/')
+ return false if name_parts.size < 2
+
+ prefix = name_parts.shift
+
+ ptr = find_or_create_group(prefix, groups)
+
+ Kernel.loop do
+ next_group = ptr[:group]
+ ptr = find_or_create_group(prefix, next_group)
+ break if name_parts.size < 2
+ prefix = name_parts.shift
+ end
+
+ ptr[:buf] << arr_buf
+ true
+ end
+
+ def find_or_create_group(prefix, groups)
+ if !groups[prefix]
+ groups[prefix] = { buf: [], group: {} }
+ end
+
+ return groups[prefix]
+ end
+
+ def create_group(id, title, group)
+ arr_buf = []
+ arr_buf << "<node id=\"#{id}\" yfiles.foldertype=\"group\">"
+ arr_buf << '<data key="d9">'
+ arr_buf << '<y:ProxyAutoBoundsNode>'
+ arr_buf << '<y:Realizers active="0">'
+ arr_buf << '<y:GroupNode>'
+ arr_buf << '<y:Fill color="#CAECFF84" transparent="false"/>'
+ arr_buf << '<y:BorderStyle color="#666699" type="dotted" width="1.0"/>'
+ arr_buf << '<y:NodeLabel alignment="right" autoSizePolicy="node_width" backgroundColor="#99CCFF" borderDistance="0.0" fontFamily="Dialog" fontSize="15" fontStyle="plain" hasLineColor="false" height="21.4609375" horizontalTextPosition="center" iconTextGap="4" modelName="internal" modelPosition="t" textColor="#000000" verticalTextPosition="bottom" visible="true" width="67.18603515625" x="-8.593017578125" y="0.0">'+ title + '</y:NodeLabel>'
+ arr_buf << '<y:Shape type="roundrectangle"/>'
+ arr_buf << '</y:GroupNode>'
+ arr_buf << '</y:Realizers>'
+ arr_buf << '</y:ProxyAutoBoundsNode>'
+ arr_buf << '</data>'
+ arr_buf << '<graph edgedefault="directed" id="n105:">'
+ arr_buf << group[:buf]
+ group[:group].each do |k, g|
+ arr_buf << create_group(k, k, g)
+ end
+ arr_buf << '</graph>'
+ arr_buf << '</node>'
+ arr_buf
+ end
+
  def _val(tensor)
- JSON.pretty_generate(@last_session_context[tensor.name])
+ # JSON.pretty_generate(@last_session_context[tensor.name])
+ @last_session_context[tensor.name]
  end

- def to_graph_ml(tensor, arr_buf = [], added = {}, _id = 0)
+ def to_graph_ml(tensor, arr_buf = [], added = {}, groups = {}, _id = 0)
  puts tensor.name
+ return unless tensor.is_a?(Operation)
+
  added[tensor.name] = true
- arr_buf << "<node id=\"#{_gml_string(tensor.name)}\">"
- arr_buf << "<data key=\"d0\">#{tensor.operation}</data>"
- arr_buf << "<data key=\"d1\">#{tensor.to_math(true, 1)}</data>"
- arr_buf << "<data key=\"d2\">blue</data>"
+ node_buf = []
+ node_buf << "<node id=\"#{_gml_string(tensor.name)}\">"
+ node_buf << "<data key=\"d0\">#{tensor.operation}</data>"
+ node_buf << "<data key=\"d1\">#{tensor.to_math(true, 1)}</data>"
+ node_buf << "<data key=\"d2\">blue</data>"
+
  if @last_session_context[tensor.name]
  arr_buf << "<data key=\"d3\">#{_val(tensor)}</data>"
  end
- arr_buf << "</node>"
+ node_buf << "<data key=\"d9\">"
+ node_buf << "<y:ShapeNode>"
+ if tensor.internal?
+ node_buf << " <y:Fill color=\"#FFFF99\" transparent=\"false\"/>"
+ else
+ node_buf << " <y:Fill color=\"#99CC00\" transparent=\"false\"/>"
+ end
+ node_buf << " <y:NodeLabel alignment=\"center\">#{tensor.operation}</y:NodeLabel>"
+ node_buf << "</y:ShapeNode>"
+ node_buf << "</data>"
+ node_buf << "</node>"
+
+ if !add_to_group(groups, tensor.name, node_buf)
+ add_to_group(groups, "program/#{tensor.name}", node_buf)
+ end

  tensor.items.each do |item|
  next unless item
- next if _added[item.name]
+ next if added[item.name]
+
+ next to_graph_ml(item, arr_buf, added, groups) if item.is_a?(Operation)

- next to_graph_ml(item, arr_buf, added) if item.is_a?(Operation)
  added[item.name] = true
+ item_buf = []
  if item.is_a?(Variable)
- arr_buf << "<node id=\"#{_gml_string(item.name)}\">"
- arr_buf << "<data key=\"d0\">#{item.name}</data>"
- arr_buf << "<data key=\"d2\">green</data>"
+ item_buf << "<node id=\"#{_gml_string(item.name)}\">"
+ item_buf << "<data key=\"d0\">#{item.name}</data>"
+ item_buf << "<data key=\"d2\">green</data>"
  if @last_session_context[item.name]
- arr_buf << "<data key=\"d3\">#{_val(tensor)}</data>"
+ item_buf << "<data key=\"d3\">#{_val(tensor)}</data>"
  end
- arr_buf << "</node>"
+ item_buf << "<data key=\"d9\">"
+ item_buf << "<y:ShapeNode>"
+ item_buf << " <y:Fill color=\"#33CCCC\" transparent=\"false\"/>"
+ item_buf << " <y:NodeLabel alignment=\"center\">#{item.name}</y:NodeLabel>"
+ item_buf << "</y:ShapeNode>"
+ item_buf << "</data>"
+ item_buf << "</node>"
  elsif item.is_a?(Placeholder)
- arr_buf << "<node id=\"#{_gml_string(item.name)}\">"
- arr_buf << "<data key=\"d0\">#{item.name}</data>"
- arr_buf << "<data key=\"d2\">yellow</data>"
+ item_buf << "<node id=\"#{_gml_string(item.name)}\">"
+ item_buf << "<data key=\"d9\">"
+ item_buf << "<y:ShapeNode>"
+ item_buf << " <y:Fill color=\"#FFCC00\" transparent=\"false\"/>"
+ item_buf << " <y:NodeLabel alignment=\"center\">#{item.name}</y:NodeLabel>"
+ item_buf << "</y:ShapeNode>"
+ item_buf << "</data>"
  if @last_session_context[item.name]
- arr_buf << "<data key=\"d3\">#{_val(tensor)}</data>"
+ item_buf << "<data key=\"d3\">#{_val(tensor)}</data>"
  end
- arr_buf << "</node>"
- else
- arr_buf << "<node id=\"#{_gml_string(item.name)}\">"
- arr_buf << "<data key=\"d0\">#{item.name}</data>"
- arr_buf << "<data key=\"d2\">black</data>"
- if @last_session_context[item.name]
- arr_buf << "<data key=\"d3\">#{_val(tensor)}</data>"
+ item_buf << "</node>"
+ elsif item.is_a?(Tensor)
+ item_buf << "<node id=\"#{_gml_string(item.name)}\">"
+ item_buf << "<data key=\"d0\">#{item.name}</data>"
+ item_buf << "<data key=\"d2\">black</data>"
+ item_buf << "<data key=\"d9\">"
+ item_buf << "<y:ShapeNode>"
+
+ if item.internal?
+ item_buf << " <y:Fill color=\"#C0C0C0\" transparent=\"false\"/>"
+ else
+ item_buf << " <y:Fill color=\"#FFFFFF\" transparent=\"false\"/>"
+ end
+
+
+ item_buf << " <y:NodeLabel alignment=\"center\">#{item.name}</y:NodeLabel>"
+
+ item_buf << "</y:ShapeNode>"
+ item_buf << "</data>"
+ item_buf << "</node>"
+ end
+
+ if !add_to_group(groups, item.name, item_buf)
+ if item.is_a?(Variable)
+ add_to_group(groups, "variable/#{item.name}", item_buf)
+ else
+ add_to_group(groups, "program/#{item.name}", item_buf)
  end
- arr_buf << "</node>"
  end
  end

- tensor.items.each do |item|
+ tensor.items.each_with_index do |item, index|
  next unless item
- arr_buf << "<edge source=\"#{_gml_string(item.name)}\" target=\"#{_gml_string(tensor.name)}\"/>"
+ output_edge(item, tensor, arr_buf, index)
  end
  end

  def _gml_string(str)
  str.gsub('/','-')
  end
+
+ def output_edge(item, tensor, arr_buf, index = 0)
+ target_name = tensor.is_a?(Tensor) ? tensor.name : tensor
+ arr_buf << "<edge source=\"#{_gml_string(item.name)}\" target=\"#{_gml_string(target_name)}\">"
+ arr_buf << "<data key=\"d13\">"
+
+ arr_buf << "<y:PolyLineEdge>"
+ arr_buf << "<y:EdgeLabel >"
+ if !@last_session_context.empty?
+ arr_buf << "<![CDATA[ #{_val(item)} ]]>"
+ else
+ if item.shape.shape.nil?
+ arr_buf << "<![CDATA[ #{item.data_type.to_s} ? ]]>"
+ else
+ arr_buf << "<![CDATA[ #{item.data_type.to_s} #{item.shape.shape.empty? ? 'scalar' : item.shape.shape.to_json} ]]>"
+ end
+ end
+ arr_buf << "</y:EdgeLabel >"
+ arr_buf << "<y:Arrows source=\"none\" target=\"standard\"/>"
+ if index == 0
+ arr_buf << "<y:LineStyle color=\"#FF0000\" type=\"line\" width=\"1.0\"/>"
+ else
+ arr_buf << "<y:LineStyle color=\"#0000FF\" type=\"line\" width=\"1.0\"/>"
+ end
+ arr_buf << "</y:PolyLineEdge>"
+ arr_buf << "</data>"
+ arr_buf << "</edge>"
+ end
  end
  end
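The serializer above can also annotate the exported graph with evaluated values: get_string takes an optional session, and output_edge labels edges from last_session_context when one is supplied, falling back to dtype/shape labels otherwise. A hedged sketch, assuming the session helper used by the bundled samples; names are illustrative:

    sess = TensorStream.session
    sess.run(f)                                           # populate last_session_context
    xml = TensorStream::Graphml.new.get_string(f, sess)
    File.write('f_graph_annotated.graphml', xml)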
data/lib/tensor_stream/graph_serializers/pbtext.rb
@@ -1,54 +1,121 @@
  module TensorStream
- class Pbtext
- def initialize
- end
-
- def serialize(session, filename, tensor)
- end
+ class Pbtext < TensorStream::Serializer
+ include TensorStream::StringHelper
+ include TensorStream::OpHelper

- def get_string(graph)
+ def get_string(tensor_or_graph, session = nil)
+ graph = tensor_or_graph.is_a?(Tensor) ? tensor_or_graph.graph : tensor_or_graph
  @lines = []
  graph.nodes.each do |k, node|
  @lines << "node {"
  @lines << " name: #{node.name.to_json}"
  if node.is_a?(TensorStream::Operation)
- @lines << " op: #{node.operation.to_json}"
+ @lines << " op: #{camelize(node.operation.to_s).to_json}"
  node.items.each do |input|
  next unless input
  @lines << " input: #{input.name.to_json}"
  end
  # type
- pb_attr('T', sym_to_protobuf_type(node.data_type))
+ pb_attr('T', "dtype: #{sym_to_protobuf_type(node.data_type)}")
+ process_options(node)
  elsif node.is_a?(TensorStream::Tensor) && node.is_const
  @lines << " op: \"Const\""
  # type
- pb_attr('T', sym_to_protobuf_type(node.data_type))
+ pb_attr('T', "dtype: #{sym_to_protobuf_type(node.data_type)}")
  pb_attr('value', tensor_value(node))
+ elsif node.is_a?(TensorStream::Variable)
+ @lines << " op: \"VariableV2\""
+ pb_attr('T', "dtype: #{sym_to_protobuf_type(node.data_type)}")
+ pb_attr('shape', shape_buf(node, 'shape'))
+ process_options(node)
  end
  @lines << "}"
  end
- @lines.join("\n")
+ @lines << "versions {"
+ @lines << " producer: 26"
+ @lines << "}"
+ @lines.flatten.join("\n")
  end

  private

+ def process_options(node)
+ node.options.each do |k, v|
+ next if %w[name].include?(k.to_s)
+ @lines << " attr {"
+ @lines << " key: \"#{k}\""
+ @lines << " value {"
+ @lines << " }"
+ @lines << " }"
+ end
+ end
+
+ def pack_arr_float(float_arr)
+ float_arr.flatten.pack('f*').bytes.map { |b| b.chr =~ /[^[:print:]]/ ? "\\#{sprintf("%o", b).rjust(3, '0')}" : b.chr }.join
+ end
+
+ def pack_arr_int(int_arr)
+ int_arr.flatten.pack('l*').bytes.map { |b| b.chr =~ /[^[:print:]]/ ? "\\#{sprintf("%o", b).rjust(3, '0')}" : b.chr }.join
+ end
+
+ def shape_buf(tensor, shape_type = 'tensor_shape')
+ arr = []
+ arr << " #{shape_type} {"
+ tensor.shape.shape.each do |dim|
+ arr << " dim {"
+ arr << " size: #{dim}"
+ arr << " }"
+ end if tensor.shape.shape
+ arr << " }"
+ arr
+ end
  def tensor_value(tensor)
  arr = []
  arr << "tensor {"
  arr << " dtype: #{sym_to_protobuf_type(tensor.data_type)}"
- arr << " float_val: #{tensor.value}"
+
+ arr += shape_buf(tensor)
+
+ if tensor.rank > 0
+ if TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type)
+ packed = pack_arr_float(tensor.value)
+ arr << " tensor_content: \"#{packed}\""
+ elsif TensorStream::Ops::INTEGER_TYPES.include?(tensor.data_type)
+ packed = pack_arr_int(tensor.value)
+ arr << " tensor_content: \"#{packed}\""
+ elsif tensor.data_type == :string
+ tensor.value.each do |v|
+ arr << " string_val: #{v.to_json}"
+ end
+ else
+ arr << " tensor_content: #{tensor.value.flatten}"
+ end
+ else
+ val_type = if TensorStream::Ops::INTEGER_TYPES.include?(tensor.data_type)
+ "int_val"
+ elsif TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type)
+ "float_val"
+ elsif tensor.data_type == :string
+ "string_val"
+ else
+ "val"
+ end
+ arr << " #{val_type}: #{tensor.value.to_json}"
+ end
  arr << "}"
  arr
  end

  def sym_to_protobuf_type(type)
  case type
- when :int32
+ when :int32, :int
  "DT_INT32"
  when :float, :float32
  "DT_FLOAT"
+ when :string
+ "DT_STRING"
  else
- "DT_UNKNOWN"
+ "UKNOWN"
  end
  end

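For reference, the tensor_content packing added in pack_arr_float/pack_arr_int above boils down to: pack the flattened values to binary, then escape each non-printable byte as a backslash plus a 3-digit octal sequence. A standalone Ruby illustration of the same idea (not library code):

    values = [1.0, 2.5]
    packed = values.pack('f*').bytes.map do |b|
      # escape non-printable bytes as \NNN octal, keep printable bytes as-is
      b.chr =~ /[^[:print:]]/ ? "\\#{format('%o', b).rjust(3, '0')}" : b.chr
    end.join
    puts packed   # => "\000\000\200?\000\000 @" for [1.0, 2.5] on a little-endian machine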
data/lib/tensor_stream/graph_serializers/serializer.rb
@@ -0,0 +1,13 @@
+ module TensorStream
+ class Serializer
+ def initialize
+ end
+
+ def serialize(filename, tensor, session = nil)
+ File.write(filename, get_string(tensor, session))
+ end
+
+ def get_string(tensor, session = nil)
+ end
+ end
+ end
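The base class is intentionally thin: serialize just writes whatever get_string produces, so a custom exporter only needs to override get_string. A rough sketch of a hypothetical subclass (illustrative only, not part of the gem):

    module TensorStream
      class EdgeList < Serializer
        # emit one "source -> target" line per input edge in the graph
        def get_string(tensor, session = nil)
          graph = tensor.is_a?(Tensor) ? tensor.graph : tensor
          graph.nodes.flat_map do |_name, node|
            next [] unless node.is_a?(Operation)
            node.items.compact.map { |input| "#{input.name} -> #{node.name}" }
          end.join("\n")
        end
      end
    end

    TensorStream::EdgeList.new.serialize('edges.txt', f)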
data/lib/tensor_stream/helpers/string_helper.rb
@@ -0,0 +1,12 @@
+ module TensorStream
+ module StringHelper
+ def camelize(string, uppercase_first_letter = true)
+ string = if uppercase_first_letter
+ string.sub(/^[a-z\d]*/) { $&.capitalize }
+ else
+ string.sub(/^(?:(?=\b|[A-Z_])|\w)/) { $&.downcase }
+ end
+ string.gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }.gsub('/', '::')
+ end
+ end
+ end
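camelize is what Pbtext now uses to map op symbols to TensorFlow-style op names. Expected behaviour of the helper above (plain Ruby, easy to verify in irb):

    include TensorStream::StringHelper

    camelize('reduce_sum')          # => "ReduceSum"
    camelize('matmul')              # => "Matmul"
    camelize('reduce_sum', false)   # => "reduceSum"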
data/lib/tensor_stream/math_gradients.rb
@@ -3,201 +3,243 @@ module TensorStream
  class MathGradients
  extend TensorStream::OpHelper

+ def self.tf
+ TensorStream
+ end
+
  def self.derivative(tensor, wrt_dx, options = {})
- gradient_program_name = "_grad_#{tensor.name}_#{wrt_dx.name}"
-
- return options[:graph].get_node(gradient_program_name) if options[:graph] && options[:graph].node_added?(gradient_program_name)
+ return i_op(:ones_like, tensor) if tensor.equal?(wrt_dx)
+ return i_op(:zeros_like, tensor) unless wrt_dx.consumers.include?(tensor.name)

- constant_options = { dtype: options[:dtype] }
- constant_options_1 = { dtype: options[:dtype] || tensor.data_type }
+ nodes_to_compute = wrt_dx.consumers.select do |t|
+ node = tensor.graph.nodes[t]
+ node.consumers.include?(tensor.name) || node.equal?(tensor)
+ end.compact + [wrt_dx.name]

- return i_op(:ones_like, wrt_dx, constant_options_1) if tensor.equal?(wrt_dx)
- return i_cons(0, constant_options) if options[:stop_gradients] && _include?(options[:stop_gradients], tensor)
+ grad = i_op(:ones_like, wrt_dx)

- if tensor.is_a?(Operation)
- grad = derivative(tensor.items[0], wrt_dx, options) if tensor.items[0]
- grad2 = derivative(tensor.items[1], wrt_dx, options) if tensor.items[1]
+ result = _propagate(grad, tensor, wrt_dx, nodes_to_compute, options[:stop_gradients] || [])
+ i_op(:truncate, result, tf.shape(wrt_dx))
+ end

- case tensor.operation
- when :zeros_like
- i_cons(0, constant_options)
- when :log1p
- grad * _op(:reciprocal, i_cons(1, constant_options_1) + tensor.items[0])
- when :max
- x_mask = i_op(:where, i_op(:ones_like, tensor.items[0]), i_op(:zeros_like, tensor.items[1]), pred: tensor.items[0] > tensor.items[1])
- y_mask = i_op(:where, i_op(:zeros_like, tensor.items[0]), i_op(:ones_like, tensor.items[1]), pred: tensor.items[0] < tensor.items[1])
- x_mask * grad + y_mask * grad2
- when :where
- x_mask = i_op(:where, i_op(:ones_like, tensor.items[0]), i_op(:zeros_like, tensor.items[1]), pred: tensor.options[:pred])
- y_mask = i_op(:where, i_op(:zeros_like, tensor.items[0]), i_op(:ones_like, tensor.items[1]), pred: tensor.options[:pred])
- x_mask * grad + y_mask * grad2
- when :cond
- i_op(:cond, grad, grad2, pred: tensor.options[:pred])
- when :identity, :print, :pad
- grad
- when :negate
- i_cons(-1, constant_options_1) * grad
- when :abs
- grad * i_op(:sign, _ds(tensor.items[0]))
- when :square
- i_cons(2, constant_options_1) * _ds(tensor.items[0]) * grad
- when :exp
- i_op(:exp, tensor.items[0]) * grad
- when :log
- (i_cons(1, constant_options_1) / _ds(tensor.items[0])) * grad
- when :tanh
- i_op(:mul, (i_cons(1, constant_options_1) - (i_op(:tanh, _ds(tensor.items[0]))**2)), grad, name: 'grad_tanh')
- when :tan
- (i_cons(1, constant_options_1) / (i_op(:cos, _ds(tensor.items[0]))**2)) * grad
- when :sin
- i_op(:mul, i_op(:cos, tensor.items[0]), grad, name: 'grad_sin')
- when :sqrt
- i_cons(1, constant_options_1) / (i_cons(2, constant_options_1) * i_op(:sqrt, _ds(tensor.items[0]))) * grad
- when :cos
- -i_op(:sin, tensor.items[0]) * grad
- when :add
- # rx = _op(:shape, tensor.items[0])
- # ry = _op(:shape, tensor.items[1])
+ def self._propagate(grad, tensor, stop_tensor, nodes_to_compute, stop_gradients = [])
+ return grad * i_op(:ones_like, stop_tensor) if stop_tensor.equal?(tensor)
+ return i_op(:zeros_like, stop_tensor) if stop_gradients && _include?(stop_gradients, tensor)
+ return i_op(:zeros_like, stop_tensor) unless tensor.is_a?(Operation)
+
+ computed_op = if _op_supports_broadcast?(tensor)
+ _compute_derivative(tensor, _broadcast_transform(tensor, grad)[1])
+ else
+ _compute_derivative(tensor, grad)
+ end
+
+ if computed_op.is_a?(Array)
+ partials = []
+ computed_op.each_with_index do |op_grad, index|
+ next if op_grad.nil?
+
+ if nodes_to_compute.include?(tensor.items[index].name)
+ partials << _propagate(op_grad, tensor.items[index], stop_tensor, nodes_to_compute, stop_gradients)
+ end
+ end

- # ones_a = _op(:ones_like, tensor.items[0])
- # ones_b = _op(:ones_like, tensor.items[1])
- # inputs = _broadcast_transform(grad * ones_a, grad2 * ones_b)
- # sx, sy = _broadcast_gradient_args(rx, ry)
+ partials.reduce(:+)
+ else
+ return tf.zeros_like(stop_tensor) if computed_op.nil?
+ _propagate(computed_op, tensor.items[0], stop_tensor, nodes_to_compute, stop_gradients)
+ end
+ end

- # keep_dims_x = _op(:rank, inputs[0]) == _op(:rank, tensor.items[0])
- # keep_dims_y = _op(:rank, inputs[1]) == _op(:rank, tensor.items[1])
+ def self._compute_derivative(node, grad)
+ node.graph.name_scope("#{node.name}_grad") do
+ x = node.items[0] if node.items[0]
+ y = node.items[1] if node.items[1]

- # add_x = _op(:reduce_sum, inputs[0], nil, axis: sy, keepdims: keep_dims_x)
- # add_y = _op(:reduce_sum, inputs[1], nil, axis: sx, keepdims: keep_dims_y)
- # _filtered_sum(add_x, add_y, wrt_dx)
- _grad_with_broadcast(tensor, wrt_dx, ->(a, b) { i_op(:add, a, b, name: 'grad_add') }, options)
- when :sub
- _grad_with_broadcast(tensor, wrt_dx, ->(a, b) { i_op(:sub, a, b, name: 'grad_sub') }, options)
- when :pow
- gx = _ds(tensor.items[1]) * (_ds(tensor.items[0])**(_ds(tensor.items[1]) - 1)) * grad
+ case node.operation
+ when :add
+ return [grad, grad] if _shapes_fully_specified_and_equal(x, y)

- log_x = i_op(:where, i_op(:log, tensor.items[0], nil, name: 'log_pow_grad'), i_op(:zeros_like, tensor.items[0]), pred: tensor.items[0] > 0)
- gy = _ds(tensor.items[0])**_ds(tensor.items[1]) * log_x * grad2
+ sx = tf.shape(x, name: 'add/shape_x')
+ sy = tf.shape(y, name: 'add/shape_y')
+ rx, ry = _broadcast_gradient_args(sx, sy)
+ keep_dims_x = tf.rank(x) == tf.rank(grad)
+ keep_dims_y = tf.rank(y) == tf.rank(grad)

- gx + gy
- when :div
- # apply the quotient rule
- gx = i_op(:div, grad, _ds(tensor.items[1]))
- gy = grad2 * i_op(:div, i_op(:div, -_ds(tensor.items[0]), _ds(tensor.items[1])), _ds(tensor.items[1]))
+ [tf.reduce_sum(grad, rx, name: 'add/reduce_sum_x', keepdims: keep_dims_x),
+ tf.reduce_sum(grad, ry, name: 'add/reduce_sum_y', keepdims: keep_dims_y)]
+ when :sub
+ return [grad, -grad] if _shapes_fully_specified_and_equal(x, y)

- _reduce_when_necessary(gx + gy, wrt_dx)
+ sx = tf.shape(x, name: 'sub/shape_x')
+ sy = tf.shape(y, name: 'sub/shape_y')
+ rx, ry = _broadcast_gradient_args(sx, sy)
+ [tf.reduce_sum(grad, rx), -tf.reduce_sum(grad, ry)]
  when :mul
- # apply the product rule
- rx = _op(:shape, tensor.items[0])
- ry = _op(:shape, tensor.items[1])
- sx, sy = _broadcast_gradient_args(rx, ry)
- inputs = _broadcast_transform(tensor.items[0], tensor.items[1])
- keep_dims_x = _op(:rank, inputs[0]) == _op(:rank, tensor.items[0])
- keep_dims_y = _op(:rank, inputs[1]) == _op(:rank, tensor.items[1])
-
- _filtered_sum(_op(:reduce_sum, grad * _ds(inputs[1]), nil, axis: sy, keepdims: keep_dims_x),
- _op(:reduce_sum, _ds(inputs[0]) * grad2, nil, axis: sx, keepdims: keep_dims_y), wrt_dx)
- when :reduce_mean
- input_size = i_op(:reduce_prod, i_op(:shape, tensor.items[0]))
- output_size = i_op(:reduce_prod, i_op(:shape, tensor))
- factor = input_size / output_size
-
- (grad / i_op(:cast, factor, data_type: grad.dtype))
- when :reduce_sum
- grad
- when :reciprocal
- -grad * (i_cons(1, constant_options_1) / _ds(tensor.items[0])**2)
- when :stop_gradient
- return i_cons(0, constant_options)
- when :matmul
- derivative_a = derivative(tensor.items[0], wrt_dx)
- derivative_b = derivative(tensor.items[1], wrt_dx)
-
- s0 = i_op(:shape, tensor.items[0])
- s1 = i_op(:shape, tensor.items[1])
+ sx = tf.shape(x)
+ sy = tf.shape(y)
+ rx, ry = _broadcast_gradient_args(sx, sy)

- identity_0 = i_op(:ones, [s0[0], s1[1]], nil, data_type: tensor.items[0].data_type)
- identity_1 = i_op(:ones, [s0[0], s1[1]], nil, data_type: tensor.items[1].data_type)
-
- matmul_da = i_op(:matmul, identity_0, tensor.items[1], transpose_b: true,
- pad_zeros: true,
- name: 'matrix_dx')
- matmul_db = i_op(:matmul, tensor.items[0], identity_1, transpose_a: true,
- pad_zeros: true,
- name: 'matrix_dy')
- # matmul_db = _op(:transpose, matmul_db, nil).first
-
- # begin_a = _op(:zeros, _op(:rank, matmul_db), nil, data_type: :int32, name: 'begin_a')
- # matmul_b_shape = _op(:shape, matmul_db)
- # end_a = [matmul_b_shape[0], 1]
+ [ tf.reduce_sum(tf.mul(grad, y), rx),
+ tf.reduce_sum(tf.mul(x, grad), ry)]
+ when :div
+ sx = i_op(:shape, x)
+ sy = i_op(:shape, y)
+ rx, ry = _broadcast_gradient_args(sx, sy)

- matmul_da = i_op(:cond, matmul_da[0], matmul_da, pred: _op(:rank, derivative_a) > 0)
+ [tf.reduce_sum(tf.div(grad, y), rx),
+ tf.reduce_sum(grad * tf.div(tf.div(-x, y), y),
+ ry)]
+ when :matmul
+ t_a = node.options[:transpose_a]
+ t_b = node.options[:transpose_b]
+
+ s0 = tf.shape(x)
+ s1 = tf.shape(y)
+
+ identity_0 = tf.ones([ s0[0], s1[1] ], dtype: x.data_type, name: 'matmul/identity0')
+ identity_1 = tf.ones([ s0[0], s1[1] ], dtype: y.data_type, name: 'matmul/identity1')
+
+ grad_a, grad_b = nil
+ if !t_a && !t_b
+ grad_a = tf.matmul(identity_0, y, transpose_b: true)
+ grad_b = tf.matmul(x, identity_1, transpose_a: true)
+ elsif !ta && tb
+ grad_a = tf.matmul(identity_0, y)
+ grad_b = tf.matmul(identity_1, x, transpose_a: true)
+ elsif t_a && !t_b
+ grad_a = tf.matmul(y, identity_0, transpose_b: true)
+ grad_b = tf.matmul(x, identity_1)
+ elsif t_a && t_b
+ grad_a = tf.matmul(y, identity_0, transpose_a: true, transpose_b: true)
+ grad_b = tf.matmul(identity_1, x, transpose_a: true, transpose_b: true)
+ end
+
+ grad_a = i_op(:mul, grad, grad_a, name: 'matmul/grad_a_norm_mul_da')
+ grad_b = i_op(:mul, grad, grad_b, name: 'matmul/grad_b_norm_mul_db')
+
+ [grad_a, grad_b]
+ when :sin
+ grad * tf.cos(x)
+ when :tanh
+ grad * i_op(:tanh_grad, x)
+ when :pow
+ z = node
+ sx = tf.shape(x)
+ sy = tf.shape(y)
+ rx, ry = _broadcast_gradient_args(sx, sy)
+ gx = tf.reshape(
+ tf.reduce_sum(grad * y * tf.pow(x, y - 1), rx), sx)

- # matmul_da = _op(:cond, matmul_da[0], matmul_da, pred: _op(:rank, derivative_a) > 0)
- norm_a = i_op(:mul, derivative_a, matmul_da, name: 'grad_a_norm_mul_da')
- norm_b = i_op(:mul, derivative_b, matmul_db, name: 'grad_b_norm_mul_db')
+ log_x = tf.where(x > 0, tf.log(x), tf.zeros_like(x))
+ gy = tf.reshape(tf.reduce_sum(grad * z * log_x, ry), sy)

- # norm_a = i_op(:cond, norm_a[0], norm_a, pred: i_op(:rank, matmul_da) > i_op(:rank, derivative_a))
- # norm_b = i_op(:cond, norm_b[0], norm_b, pred: i_op(:rank, matmul_db) > i_op(:rank, derivative_b))
- _filtered_sum(norm_a, norm_b, wrt_dx)
+ [gx, gy]
+ when :abs
+ grad * tf.sign(x)
+ when :log
+ grad * tf.reciprocal(x)
+ when :tanh
+ i_op(:tanh_grad, x) * grad
+ when :cos
+ -grad * tf.sin(x)
+ when :max
+ x_mask = tf.where(x > y, tf.ones_like(x), tf.zeros_like(y))
+ y_mask = tf.where(x < y, tf.zeros_like(x), tf.ones_like(y))
+ [x_mask * grad, y_mask * grad]
+ when :tan
+ secx = tf.reciprocal(tf.cos(x))
+ secx2 = tf.square(secx)
+ grad * secx2
+ when :negate
+ -grad
+ when :exp
+ grad * node
+ when :identity
+ grad
+ when :sum
+ _sum_grad(x, y, grad)
+ when :reciprocal
+ -grad * (tf.constant(1, dtype: x.dtype) / x**2)
+ when :sqrt
+ tf.constant(1, dtype: x.dtype) / (tf.constant(2, dtype: x.dtype) * tf.sqrt(x)) * grad
+ when :stop_gradient
+ tf.zeros_like(grad)
+ when :square
+ y = tf.constant(2.0, dtype: x.dtype)
+ tf.multiply(grad, tf.multiply(x, y))
+ when :where
+ x_mask = i_op(:where, i_op(:ones_like, x), i_op(:zeros_like, y), pred: node.options[:pred])
+ y_mask = i_op(:where, i_op(:zeros_like, x), i_op(:ones_like, y), pred: node.options[:pred])
+ [x_mask * grad, y_mask * grad]
+ when :cond
+ x_cond = i_op(:cond, i_op(:ones_like, x), i_op(:zeros_like, y), pred: node.options[:pred])
+ y_cond = i_op(:cond, i_op(:zeros_like, x), i_op(:ones_like, x), pred: node.options[:pred])
+ [x_cond * grad, y_cond * grad]
+ when :mean
+ sum_grad = _sum_grad(x, y, grad)
+ input_shape = tf.shape(x)
+ output_shape = tf.shape(node)
+ factor = _safe_shape_div(tf.reduce_prod(input_shape), tf.reduce_prod(output_shape))
+ tf.div(sum_grad, tf.cast(factor, sum_grad.data_type))
+ when :log1p
+ grad * tf.reciprocal(i_cons(1, data_type: grad.data_type) + x)
+ when :sigmoid
+ i_op(:sigmoid_grad, x, grad)
+ when :zeros_like
+ # non differentiable
+ nil
  else
- raise "no derivative implementation found for op #{tensor.operation}"
+ raise "no derivative op for #{node.operation}"
  end
- elsif tensor.is_a?(TensorStream::Variable)
- i_cons(0, constant_options)
- elsif tensor.is_a?(TensorStream::Placeholder)
- i_cons(0, constant_options)
- else
- i_cons(0, constant_options)
- end.tap do |ops|
- options[:graph].add_node!(gradient_program_name, ops) if options[:graph]
  end
  end

- def self._ds(tensor)
- return tensor unless tensor.is_a?(Operation)
+ def self._broadcast_gradient_args(input_a, input_b)
+ [_op(:broadcast_gradient_args, input_b, input_a), _op(:broadcast_gradient_args, input_a, input_b)]
+ end

- case tensor.operation
- when :reduce_sum
- tensor.items[0]
- else
- tensor
- end
+ def self._broadcast_transform(input_a, input_b)
+ _op(:broadcast_transform, input_a, input_b)
  end

- def self._grad_with_broadcast(tensor, wrt_dx, func, options)
- grad = derivative(tensor.items[0], wrt_dx, options)
- grad2 = derivative(tensor.items[1], wrt_dx, options)
- elements1 = i_op(:reduce_prod, i_op(:shape, tensor.items[0]), data_type: :float32)
- elements2 = i_op(:reduce_prod, i_op(:shape, tensor.items[1]), data_type: :float32)
- multiplier = elements1 / elements2
- _reduce_when_necessary(func.call(grad, grad2 * multiplier), wrt_dx)
+ def self._safe_shape_div(x, y)
+ x / tf.maximum(y, 1)
  end

- def self._include?(arr, obj)
- arr.each { |a| return true if a.equal?(obj) }
+ def self._sum_grad(x, y, grad)
+ tf.ones_like(x) * grad
+ end
+
+ def self._op_supports_broadcast?(node)
+ return true if %i[add sub div mul pow].include?(node.operation)
  false
  end

- def self._reduce_when_necessary(tensor, wrt_dx)
- rank = _op(:rank, tensor)
- dx_rank = _op(:rank, wrt_dx)
- reduced = _op(:reduce_sum, tensor, nil, axis: 0)
- _op(:cond, ->{ reduced }, tensor, pred: rank > dx_rank)
+ def self._min_or_max_grad(op, grad)
+ y = op
+ indicators = tf.cast(tf.equal(y, op.items[0]), grad.data_type)
+ num_selected = tf.reduce_sum(indicators, op.items[1])
+ _safe_shape_div(indicators, num_selected) * grad
  end

- def self._broadcast_gradient_args(input_a, input_b)
- [_op(:broadcast_gradient_args, input_a, input_b), _op(:broadcast_gradient_args, input_b, input_a)]
+ def self._include?(arr, obj)
+ arr.each { |a| return true if a.equal?(obj) }
+ false
  end

- def self._broadcast_transform(input_a, input_b)
- _op(:broadcast_transform, input_a, input_b)
+ def self._shapes_fully_specified_and_equal(x, y)
+ return false if !_shape_full_specified(x) || !_shape_full_specified(y)
+ return false if x.shape.shape != y.shape.shape
+
+ true
  end

- # filter out zero arrays
- def self._filtered_sum(input_a, input_b, wrt_dx)
- zero_vect = _op(:zeros_like, wrt_dx)
- (i_op(:cond, input_a, zero_vect, pred: i_op(:reduce_sum, input_a) != 0) + i_op(:cond, input_b, zero_vect, pred: i_op(:reduce_sum, input_b) != 0))
+ def self._shape_full_specified(tensor)
+ return false if tensor.shape.nil?
+ return false if tensor.shape.shape.nil?
+
+ tensor.shape.shape.each { |s| return false if s.nil? }
+ true
  end
  end
- end
+ end
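The gradient rewrite above replaces the old per-op derivative recursion (and its _grad_with_broadcast/_filtered_sum helpers) with a reverse-mode pass: derivative seeds a ones_like gradient, and _propagate walks consumer links back toward wrt_dx, asking _compute_derivative for each op's local gradient. A hedged end-to-end sketch using the entry point shown in the diff; constants, operators, and the exact call pattern are illustrative and not taken from the gem's tests:

    tf = TensorStream

    x = tf.constant(3.0)
    y = x * x + x * 2.0          # dy/dx = 2x + 2

    grad = TensorStream::MathGradients.derivative(y, x)

    sess = tf.session
    puts sess.run(grad)          # expect 8.0 at x = 3.0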