RubyGems - datadog - Versions diffs - 2.6.0 → 2.7.0 - Mend

datadog 2.6.0 → 2.7.0

Files changed (20) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +20 -1
data/ext/libdatadog_api/crashtracker.c +6 -4
data/ext/libdatadog_extconf_helpers.rb +1 -1
data/lib/datadog/core/configuration/settings.rb +4 -4
data/lib/datadog/di/code_tracker.rb +30 -3
data/lib/datadog/di/component.rb +108 -0
data/lib/datadog/di/configuration/settings.rb +69 -44
data/lib/datadog/di/contrib/active_record.rb +11 -0
data/lib/datadog/di/error.rb +17 -0
data/lib/datadog/di/instrumenter.rb +27 -11
data/lib/datadog/di/probe.rb +23 -1
data/lib/datadog/di/probe_manager.rb +246 -0
data/lib/datadog/di/probe_notification_builder.rb +4 -12
data/lib/datadog/di/probe_notifier_worker.rb +68 -41
data/lib/datadog/di/serializer.rb +143 -95
data/lib/datadog/di/transport.rb +22 -9
data/lib/datadog/di.rb +49 -1
data/lib/datadog/version.rb +1 -1
metadata +10 -7

data/lib/datadog/di/probe_manager.rb ADDED Viewed

@@ -0,0 +1,246 @@
+# frozen_string_literal: true
+# rubocop:disable Lint/AssignmentInCondition
+require 'monitor'
+module Datadog
+  module DI
+    # Stores probes received from remote config (that we can parse, in other
+    # words, whose type/attributes we support), requests needed instrumentation
+    # for the probes via Instrumenter, and stores pending probes (those which
+    # haven't yet been instrumented successfully due to their targets not
+    # existing) and failed probes (where we are certain the target will not
+    # ever be loaded, or otherwise become valid).
+    #
+    # @api private
+    class ProbeManager
+      def initialize(settings, instrumenter, probe_notification_builder,
+        probe_notifier_worker, logger, telemetry: nil)
+        @settings = settings
+        @instrumenter = instrumenter
+        @probe_notification_builder = probe_notification_builder
+        @probe_notifier_worker = probe_notifier_worker
+        @logger = logger
+        @telemetry = telemetry
+        @installed_probes = {}
+        @pending_probes = {}
+        @failed_probes = {}
+        @lock = Monitor.new
+        @definition_trace_point = TracePoint.trace(:end) do |tp|
+          install_pending_method_probes(tp.self)
+        rescue => exc
+          raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
+          logger.warn("Unhandled exception in definition trace point: #{exc.class}: #{exc}")
+          telemetry&.report(exc, description: "Unhandled exception in definition trace point")
+          # TODO test this path
+        end
+      end
+      attr_reader :logger
+      attr_reader :telemetry
+      # TODO test that close is called during component teardown and
+      # the trace point is cleared
+      def close
+        definition_trace_point.disable
+        clear_hooks
+      end
+      def clear_hooks
+        @lock.synchronize do
+          @pending_probes.clear
+          @installed_probes.each do |probe_id, probe|
+            instrumenter.unhook(probe)
+          end
+          @installed_probes.clear
+        end
+      end
+      attr_reader :settings
+      attr_reader :instrumenter
+      attr_reader :probe_notification_builder
+      attr_reader :probe_notifier_worker
+      def installed_probes
+        @lock.synchronize do
+          @installed_probes
+        end
+      end
+      def pending_probes
+        @lock.synchronize do
+          @pending_probes
+        end
+      end
+      # Probes that failed to instrument for reasons other than the target is
+      # not yet loaded are added to this collection, so that we do not try
+      # to instrument them every time remote configuration is processed.
+      def failed_probes
+        @lock.synchronize do
+          @failed_probes
+        end
+      end
+      # Requests to install the specified probe.
+      #
+      # If the target of the probe does not exist, assume the relevant
+      # code is not loaded yet (rather than that it will never be loaded),
+      # and store the probe in a pending probe list. When classes are
+      # defined, or files loaded, the probe will be checked against the
+      # newly defined classes/loaded files, and will be installed if it
+      # matches.
+      def add_probe(probe)
+        @lock.synchronize do
+          # Probe failed to install previously, do not try to install it again.
+          if msg = @failed_probes[probe.id]
+            # TODO test this path
+            raise Error::ProbePreviouslyFailed, msg
+          end
+          begin
+            instrumenter.hook(probe, &method(:probe_executed_callback))
+            @installed_probes[probe.id] = probe
+            payload = probe_notification_builder.build_installed(probe)
+            probe_notifier_worker.add_status(payload)
+            # The probe would only be in the pending probes list if it was
+            # previously attempted to be installed and the target was not loaded.
+            # Always remove from pending list here because it makes the
+            # API smaller and shouldn't cause any actual problems.
+            @pending_probes.delete(probe.id)
+            true
+          rescue Error::DITargetNotDefined
+            @pending_probes[probe.id] = probe
+            false
+          end
+        rescue => exc
+          # In "propagate all exceptions" mode we will try to instrument again.
+          raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
+          logger.warn("Error processing probe configuration: #{exc.class}: #{exc}")
+          telemetry&.report(exc, description: "Error processing probe configuration")
+          # TODO report probe as failed to agent since we won't attempt to
+          # install it again.
+          # TODO add top stack frame to message
+          @failed_probes[probe.id] = "#{exc.class}: #{exc}"
+          raise
+        end
+      end
+      # Removes probes with ids other than in the specified list.
+      #
+      # This method is meant to be invoked from remote config processor.
+      # Remote config contains the list of currently defined probes; any
+      # probes not in that list have been removed by user and should be
+      # de-instrumented from the application.
+      def remove_other_probes(probe_ids)
+        @lock.synchronize do
+          @pending_probes.values.each do |probe|
+            unless probe_ids.include?(probe.id)
+              @pending_probes.delete(probe.id)
+            end
+          end
+          @installed_probes.values.each do |probe|
+            unless probe_ids.include?(probe.id)
+              begin
+                instrumenter.unhook(probe)
+                # Only remove the probe from installed list if it was
+                # successfully de-instrumented. Active probes do incur overhead
+                # for the running application, and if the error is ephemeral
+                # we want to try removing the probe again at the next opportunity.
+                #
+                # TODO give up after some time?
+                @installed_probes.delete(probe.id)
+              rescue => exc
+                raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
+                # Silence all exceptions?
+                # TODO should we propagate here and rescue upstream?
+                logger.warn("Error removing probe #{probe.id}: #{exc.class}: #{exc}")
+                telemetry&.report(exc, description: "Error removing probe #{probe.id}")
+              end
+            end
+          end
+        end
+      end
+      # Installs pending method probes, if any, for the specified class.
+      #
+      # This method is meant to be called from the "end" trace point,
+      # which is invoked for each class definition.
+      private def install_pending_method_probes(cls)
+        @lock.synchronize do
+          # TODO search more efficiently than linearly
+          @pending_probes.each do |probe_id, probe|
+            if probe.method?
+              # TODO move this stringification elsewhere
+              if probe.type_name == cls.name
+                begin
+                  # TODO is it OK to hook from trace point handler?
+                  # TODO the class is now defined, but can hooking still fail?
+                  instrumenter.hook(probe, &method(:probe_executed_callback))
+                  @pending_probes.delete(probe.id)
+                  break
+                rescue Error::DITargetNotDefined
+                  # This should not happen... try installing again later?
+                rescue => exc
+                  raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
+                  logger.warn("Error installing probe after class is defined: #{exc.class}: #{exc}")
+                  telemetry&.report(exc, description: "Error installing probe after class is defined")
+                end
+              end
+            end
+          end
+        end
+      end
+      # Installs pending line probes, if any, for the file of the specified
+      # absolute path.
+      #
+      # This method is meant to be called from the script_compiled trace
+      # point, which is invoked for each required or loaded file
+      # (and also for eval'd code, but those invocations are filtered out).
+      def install_pending_line_probes(path)
+        @lock.synchronize do
+          @pending_probes.values.each do |probe|
+            if probe.line?
+              if probe.file_matches?(path)
+                add_probe(probe)
+              end
+            end
+          end
+        end
+      end
+      # Entry point invoked from the instrumentation when the specfied probe
+      # is invoked (that is, either its target method is invoked, or
+      # execution reached its target file/line).
+      #
+      # This method is responsible for queueing probe status to be sent to the
+      # backend (once per the probe's lifetime) and a snapshot corresponding
+      # to the current invocation.
+      def probe_executed_callback(probe:, **opts)
+        unless probe.emitting_notified?
+          payload = probe_notification_builder.build_emitting(probe)
+          probe_notifier_worker.add_status(payload)
+          probe.emitting_notified = true
+        end
+        payload = probe_notification_builder.build_executed(probe, **opts)
+        probe_notifier_worker.add_snapshot(payload)
+      end
+      # Class/module definition trace point (:end type).
+      # Used to install hooks when the target classes/modules aren't yet
+      # defined when the hook request is received.
+      attr_reader :definition_trace_point
+    end
+  end
+end
+# rubocop:enable Lint/AssignmentInCondition

data/lib/datadog/di/probe_notification_builder.rb CHANGED Viewed

@@ -46,11 +46,13 @@ module Datadog
         # this should be all frames for enriched probes and no frames for
         # non-enriched probes?
         build_snapshot(probe, rv: rv, snapshot: snapshot,
+          # Actual path of the instrumented file.
+          path: trace_point&.path,
           duration: duration, caller_locations: caller_locations, args: args, kwargs: kwargs,
           serialized_entry_args: serialized_entry_args)
       end
-      def build_snapshot(probe, rv: nil, snapshot: nil,
+      def build_snapshot(probe, rv: nil, snapshot: nil, path: nil,
         duration: nil, caller_locations: nil, args: nil, kwargs: nil,
         serialized_entry_args: nil)
         # TODO also verify that non-capturing probe does not pass
@@ -85,18 +87,8 @@ module Datadog
         end
         location = if probe.line?
-          actual_file = if probe.file
-            # Normally caller_locations should always be filled for a line probe
-            # but in the test suite we don't always provide all arguments.
-            actual_file_basename = File.basename(probe.file)
-            caller_locations&.detect do |loc|
-              # TODO record actual path that probe was installed into,
-              # perform exact match here against that path.
-              File.basename(loc.path) == actual_file_basename
-            end&.path || probe.file
-          end
           {
-            file: actual_file,
+            file: path,
             lines: [probe.line_no],
           }
         elsif probe.method?

data/lib/datadog/di/probe_notifier_worker.rb CHANGED Viewed

@@ -23,12 +23,9 @@ module Datadog
     #
     # @api private
     class ProbeNotifierWorker
-      # Minimum interval between submissions.
-      # TODO make this into an internal setting and increase default to 2 or 3.
-      MIN_SEND_INTERVAL = 1
-      def initialize(settings, transport, logger)
+      def initialize(settings, transport, logger, telemetry: nil)
         @settings = settings
+        @telemetry = telemetry
         @status_queue = []
         @snapshot_queue = []
         @transport = transport
@@ -39,10 +36,12 @@ module Datadog
         @sleep_remaining = nil
         @wake_scheduled = false
         @thread = nil
+        @flush = 0
       end
       attr_reader :settings
       attr_reader :logger
+      attr_reader :telemetry
       def start
         return if @thread
@@ -53,33 +52,38 @@ module Datadog
             # and then quit?
             break if @stop_requested
-            sleep_remaining = @lock.synchronize do
-              if sleep_remaining && sleep_remaining > 0
-                # Recalculate how much sleep time is remaining, then sleep that long.
-                set_sleep_remaining
-              else
-                0
+            # If a flush was requested, send immediately and do not
+            # wait for the cooldown period.
+            if @lock.synchronize { @flush } == 0
+              sleep_remaining = @lock.synchronize do
+                if sleep_remaining && sleep_remaining > 0
+                  # Recalculate how much sleep time is remaining, then sleep that long.
+                  set_sleep_remaining
+                else
+                  0
+                end
               end
-            end
-            if sleep_remaining > 0
-              # Do not need to update @wake_scheduled here because
-              # wake-up is already scheduled for the earliest possible time.
-              wake.wait(sleep_remaining)
-              next
+              if sleep_remaining > 0
+                # Do not need to update @wake_scheduled here because
+                # wake-up is already scheduled for the earliest possible time.
+                wake.wait(sleep_remaining)
+                next
+              end
             end
             begin
               more = maybe_send
             rescue => exc
-              raise if settings.dynamic_instrumentation.propagate_all_exceptions
+              raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
               logger.warn("Error in probe notifier worker: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
+              telemetry&.report(exc, description: "Error in probe notifier worker")
             end
             @lock.synchronize do
               @wake_scheduled = more
             end
-            wake.wait(more ? MIN_SEND_INTERVAL : nil)
+            wake.wait(more ? min_send_interval : nil)
           end
         end
       end
@@ -106,26 +110,40 @@ module Datadog
       # therefore, it should only be called when there is no parallel
       # activity (in another thread) that causes more notifications
       # to be generated.
+      #
+      # This method is used by the test suite to wait until notifications have
+      # been sent out, and could be used for graceful stopping of the
+      # worker thread.
       def flush
-        loop do
-          if @thread.nil? || !@thread.alive?
-            return
-          end
+        @lock.synchronize do
+          @flush += 1
+        end
+        begin
+          loop do
+            if @thread.nil? || !@thread.alive?
+              return
+            end
-          io_in_progress, queues_empty = @lock.synchronize do
-            [io_in_progress?, status_queue.empty? && snapshot_queue.empty?]
-          end
+            io_in_progress, queues_empty = @lock.synchronize do
+              [io_in_progress?, status_queue.empty? && snapshot_queue.empty?]
+            end
-          if io_in_progress
-            # If we just call Thread.pass we could be in a busy loop -
-            # add a sleep.
-            sleep 0.25
-            next
-          elsif queues_empty
-            break
-          else
-            sleep 0.25
-            next
+            if io_in_progress
+              # If we just call Thread.pass we could be in a busy loop -
+              # add a sleep.
+              sleep 0.25
+              next
+            elsif queues_empty
+              break
+            else
+              wake.signal
+              sleep 0.25
+              next
+            end
+          end
+        ensure
+          @lock.synchronize do
+            @flush -= 1
           end
         end
       end
@@ -136,6 +154,11 @@ module Datadog
       attr_reader :wake
       attr_reader :thread
+      # Minimum send interval from the internal DI settings; a helper to keep line length reasonable in the rest of the file.
+      def min_send_interval
+        settings.dynamic_instrumentation.internal.min_send_interval
+      end
       # This method should be called while @lock is held.
       def io_in_progress?
         @io_in_progress
@@ -181,14 +204,14 @@ module Datadog
         end
         # Determine how much longer the worker thread should sleep
-        # so as not to send in less than MIN_SEND_INTERVAL since the last send.
+        # so as not to send in less than min send interval since the last send.
         # Important: this method must be called when @lock is held.
         #
         # Returns the time remaining to sleep.
         def set_sleep_remaining
           now = Core::Utils::Time.get_time
           @sleep_remaining = if last_sent
-            [last_sent + MIN_SEND_INTERVAL - now, 0].max
+            [last_sent + min_send_interval - now, 0].max
           else
             0
           end
@@ -218,16 +241,20 @@ module Datadog
                 @last_sent = time
               end
             rescue => exc
-              raise if settings.dynamic_instrumentation.propagate_all_exceptions
+              raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
               logger.warn("failed to send #{event_name}: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
+              # Should we report this error to telemetry? Most likely failure
+              # to send is due to a network issue, and trying to send a
+              # telemetry message would also fail.
             end
           end
           batch.any? # steep:ignore
-        rescue ThreadError
+        rescue ThreadError => exc
           # Normally the queue should only be consumed in this method,
           # however if anyone consumes it elsewhere we don't want to block
           # while consuming it here. Rescue ThreadError and return.
-          logger.warn("unexpected #{event_name} queue underflow - consumed elsewhere?")
+          logger.warn("Unexpected #{event_name} queue underflow - consumed elsewhere?")
+          telemetry&.report(exc, description: "Unexpected #{event_name} queue underflow")
         ensure
           @lock.synchronize do
             @io_in_progress = false