RubyGems - datadog - Versions diffs - 2.6.0 → 2.7.1 - Mend

datadog 2.6.0 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +29 -1
data/ext/datadog_profiling_native_extension/extconf.rb +3 -0
data/ext/datadog_profiling_native_extension/private_vm_api_access.c +2 -8
data/ext/datadog_profiling_native_extension/profiling.c +6 -0
data/ext/datadog_profiling_native_extension/ruby_helpers.c +14 -4
data/ext/datadog_profiling_native_extension/ruby_helpers.h +4 -0
data/ext/libdatadog_api/crashtracker.c +6 -4
data/ext/libdatadog_extconf_helpers.rb +1 -1
data/lib/datadog/core/configuration/settings.rb +4 -4
data/lib/datadog/di/code_tracker.rb +30 -3
data/lib/datadog/di/component.rb +108 -0
data/lib/datadog/di/configuration/settings.rb +69 -44
data/lib/datadog/di/contrib/active_record.rb +11 -0
data/lib/datadog/di/error.rb +17 -0
data/lib/datadog/di/instrumenter.rb +27 -11
data/lib/datadog/di/probe.rb +23 -1
data/lib/datadog/di/probe_manager.rb +246 -0
data/lib/datadog/di/probe_notification_builder.rb +4 -12
data/lib/datadog/di/probe_notifier_worker.rb +68 -41
data/lib/datadog/di/serializer.rb +143 -95
data/lib/datadog/di/transport.rb +23 -9
data/lib/datadog/di.rb +49 -1
data/lib/datadog/tracing/tracer.rb +1 -1
data/lib/datadog/version.rb +2 -2
metadata +10 -7

data/lib/datadog/di/instrumenter.rb CHANGED Viewed

@@ -54,10 +54,11 @@ module Datadog
     #
     # @api private
     class Instrumenter
-      def initialize(settings, serializer, logger, code_tracker: nil)
+      def initialize(settings, serializer, logger, code_tracker: nil, telemetry: nil)
         @settings = settings
         @serializer = serializer
         @logger = logger
+        @telemetry = telemetry
         @code_tracker = code_tracker
         @lock = Mutex.new
@@ -66,6 +67,7 @@ module Datadog
       attr_reader :settings
       attr_reader :serializer
       attr_reader :logger
+      attr_reader :telemetry
       attr_reader :code_tracker
       # This is a substitute for Thread::Backtrace::Location
@@ -172,12 +174,12 @@ module Datadog
         # we use mock objects and the methods may be mocked with
         # individual invocations, yielding different return values on
         # different calls to the same method.
-        permit_untargeted_trace_points = settings.dynamic_instrumentation.untargeted_trace_points
+        permit_untargeted_trace_points = settings.dynamic_instrumentation.internal.untargeted_trace_points
         iseq = nil
         if code_tracker
-          iseq = code_tracker.iseqs_for_path_suffix(probe.file).first # steep:ignore
-          unless iseq
+          ret = code_tracker.iseqs_for_path_suffix(probe.file) # steep:ignore
+          unless ret
             if permit_untargeted_trace_points
               # Continue without targeting the trace point.
               # This is going to cause a serious performance penalty for
@@ -204,6 +206,10 @@ module Datadog
           raise Error::DITargetNotDefined, "File not in code tracker registry: #{probe.file}"
         end
+        if ret
+          actual_path, iseq = ret
+        end
         # If trace point is not targeted, we only need one trace point per file.
         # Creating a trace point for each probe does work but the performance
         # penalty will be taken for each trace point defined in the file.
@@ -217,18 +223,26 @@ module Datadog
         # this optimization just yet and create a trace point for each probe.
         tp = TracePoint.new(:line) do |tp|
-          # If trace point is not targeted, we must verify that the invocation
-          # is the file & line that we want, because untargeted trace points
-          # are invoked for *each* line of Ruby executed.
-          if iseq || tp.lineno == probe.line_no && probe.file_matches?(tp.path)
-            if rate_limiter.nil? || rate_limiter.allow?
-              # & is to stop steep complaints, block is always present here.
-              block&.call(probe: probe, trace_point: tp, caller_locations: caller_locations)
+          begin
+            # If trace point is not targeted, we must verify that the invocation
+            # is the file & line that we want, because untargeted trace points
+            # are invoked for *each* line of Ruby executed.
+            if iseq || tp.lineno == probe.line_no && probe.file_matches?(tp.path)
+              if rate_limiter.nil? || rate_limiter.allow?
+                # & is to stop steep complaints, block is always present here.
+                block&.call(probe: probe, trace_point: tp, caller_locations: caller_locations)
+              end
             end
+          rescue => exc
+            raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
+            logger.warn("Unhandled exception in line trace point: #{exc.class}: #{exc}")
+            telemetry&.report(exc, description: "Unhandled exception in line trace point")
+            # TODO test this path
           end
         rescue => exc
           raise if settings.dynamic_instrumentation.propagate_all_exceptions
           logger.warn("Unhandled exception in line trace point: #{exc.class}: #{exc}")
+          telemetry&.report(exc, description: "Unhandled exception in line trace point")
           # TODO test this path
         end
@@ -244,6 +258,8 @@ module Datadog
           end
           probe.instrumentation_trace_point = tp
+          # actual_path could be nil if we don't use targeted trace points.
+          probe.instrumented_path = actual_path
           if iseq
             tp.enable(target: iseq, target_line: line_no)

data/lib/datadog/di/probe.rb CHANGED Viewed

@@ -47,6 +47,10 @@ module Datadog
           raise ArgumentError, "Probe contains both line number and method name: #{id}"
         end
+        if line_no && !file
+          raise ArgumentError, "Probe contains line number but not file: #{id}"
+        end
         if type_name && !method_name || method_name && !type_name
           raise ArgumentError, "Partial method probe definition: #{id}"
         end
@@ -71,6 +75,8 @@ module Datadog
         @rate_limit = rate_limit || (@capture_snapshot ? 1 : 5000)
         @rate_limiter = Datadog::Core::TokenBucket.new(@rate_limit)
+        @emitting_notified = false
       end
       attr_reader :id
@@ -101,7 +107,10 @@ module Datadog
       # method or for stack traversal purposes?), therefore we do not check
       # for file name/path presence here and just consider the line number.
       def line?
-        !line_no.nil?
+        # Constructor checks that file is given if line number is given,
+        # but for safety, check again here since we somehow got a probe with
+        # a line number but no file in the wild.
+        !!(file && line_no)
       end
       # Returns whether the probe is a method probe.
@@ -157,6 +166,19 @@ module Datadog
       # Line trace point for line probes. Normally this would be a targeted
       # trace point.
       attr_accessor :instrumentation_trace_point
+      # Actual path to the file instrumented by the probe, for line probes,
+      # when code tracking is available and line trace point is targeted.
+      # For untargeted line trace points instrumented path will be nil.
+      attr_accessor :instrumented_path
+      # Whether the "emitting" status was already queued for this probe.
+      # TODO reads and writes should in theory be locked, however since DI is
+      # only implemented for MRI in practice this should not cause issues.
+      attr_writer :emitting_notified
+      def emitting_notified? # Predicate reader for the flag set by the writer above.
+        !!@emitting_notified # Coerce nil/unset to a strict boolean.
+      end
     end
   end
 end

data/lib/datadog/di/probe_manager.rb ADDED Viewed

@@ -0,0 +1,246 @@
+# frozen_string_literal: true
+# rubocop:disable Lint/AssignmentInCondition
+require 'monitor'
+module Datadog
+  module DI
+    # Stores probes received from remote config (that we can parse, in other
+    # words, whose type/attributes we support), requests needed instrumentation
+    # for the probes via Instrumenter, and stores pending probes (those which
+    # haven't yet been instrumented successfully due to their targets not
+    # existing) and failed probes (where we are certain the target will not
+    # ever be loaded, or otherwise become valid).
+    #
+    # All three probe collections are hashes keyed by probe id and are
+    # guarded by a single Monitor. Monitor is reentrant, which matters
+    # because install_pending_line_probes calls add_probe while already
+    # holding the lock.
+    #
+    # @api private
+    class ProbeManager
+      # Creates the manager and immediately enables an :end trace point
+      # (TracePoint.trace both creates and enables it) which attempts to
+      # install pending method probes each time a class/module definition
+      # is closed.
+      #
+      # @param settings settings object; only
+      #   dynamic_instrumentation.internal.propagate_all_exceptions is read here
+      # @param instrumenter object responding to #hook and #unhook
+      # @param probe_notification_builder builds status/snapshot payloads
+      # @param probe_notifier_worker receives payloads via #add_status and
+      #   #add_snapshot
+      # @param logger object responding to #warn
+      # @param telemetry optional object responding to #report
+      def initialize(settings, instrumenter, probe_notification_builder,
+        probe_notifier_worker, logger, telemetry: nil)
+        @settings = settings
+        @instrumenter = instrumenter
+        @probe_notification_builder = probe_notification_builder
+        @probe_notifier_worker = probe_notifier_worker
+        @logger = logger
+        @telemetry = telemetry
+        @installed_probes = {}
+        @pending_probes = {}
+        @failed_probes = {}
+        @lock = Monitor.new
+        @definition_trace_point = TracePoint.trace(:end) do |tp|
+          # tp.self is the class/module whose definition just ended.
+          install_pending_method_probes(tp.self)
+        rescue => exc
+          raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
+          logger.warn("Unhandled exception in definition trace point: #{exc.class}: #{exc}")
+          telemetry&.report(exc, description: "Unhandled exception in definition trace point")
+          # TODO test this path
+        end
+      end
+      attr_reader :logger
+      attr_reader :telemetry
+      # Disables the definition trace point and unhooks all probes.
+      #
+      # TODO test that close is called during component teardown and
+      # the trace point is cleared
+      def close
+        definition_trace_point.disable
+        clear_hooks
+      end
+      # Forgets all pending probes and unhooks every installed probe.
+      # Failed probes are left untouched.
+      def clear_hooks
+        @lock.synchronize do
+          @pending_probes.clear
+          @installed_probes.each_value do |probe|
+            instrumenter.unhook(probe)
+          end
+          @installed_probes.clear
+        end
+      end
+      attr_reader :settings
+      attr_reader :instrumenter
+      attr_reader :probe_notification_builder
+      attr_reader :probe_notifier_worker
+      # Probes that are currently instrumented, keyed by probe id.
+      # Note that the internal hash itself is returned, not a copy.
+      def installed_probes
+        @lock.synchronize do
+          @installed_probes
+        end
+      end
+      # Probes whose target was not defined when installation was attempted,
+      # keyed by probe id. The internal hash itself is returned, not a copy.
+      def pending_probes
+        @lock.synchronize do
+          @pending_probes
+        end
+      end
+      # Probes that failed to instrument for reasons other than the target is
+      # not yet loaded are added to this collection, so that we do not try
+      # to instrument them every time remote configuration is processed.
+      # Maps probe id to the "ExceptionClass: message" string of the failure.
+      def failed_probes
+        @lock.synchronize do
+          @failed_probes
+        end
+      end
+      # Requests to install the specified probe.
+      #
+      # If the target of the probe does not exist, assume the relevant
+      # code is not loaded yet (rather than that it will never be loaded),
+      # and store the probe in a pending probe list. When classes are
+      # defined, or files loaded, the probe will be checked against the
+      # newly defined classes/loaded files, and will be installed if it
+      # matches.
+      #
+      # @return [Boolean] true if the probe was instrumented, false if it was
+      #   stored as pending because its target is not defined
+      # @raise [Error::ProbePreviouslyFailed] if this probe id already failed
+      # @raise [StandardError] any other instrumentation error is re-raised
+      #   (after being recorded in failed probes, unless "propagate all
+      #   exceptions" mode is on)
+      def add_probe(probe)
+        @lock.synchronize do
+          # Probe failed to install previously, do not try to install it again.
+          if msg = @failed_probes[probe.id]
+            # TODO test this path
+            raise Error::ProbePreviouslyFailed, msg
+          end
+          begin
+            instrumenter.hook(probe, &method(:probe_executed_callback))
+            @installed_probes[probe.id] = probe
+            payload = probe_notification_builder.build_installed(probe)
+            probe_notifier_worker.add_status(payload)
+            # The probe would only be in the pending probes list if it was
+            # previously attempted to be installed and the target was not loaded.
+            # Always remove from pending list here because it makes the
+            # API smaller and shouldn't cause any actual problems.
+            @pending_probes.delete(probe.id)
+            true
+          rescue Error::DITargetNotDefined
+            @pending_probes[probe.id] = probe
+            false
+          end
+        rescue => exc
+          # In "propagate all exceptions" mode we will try to instrument again.
+          raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
+          logger.warn("Error processing probe configuration: #{exc.class}: #{exc}")
+          telemetry&.report(exc, description: "Error processing probe configuration")
+          # TODO report probe as failed to agent since we won't attempt to
+          # install it again.
+          # TODO add top stack frame to message
+          @failed_probes[probe.id] = "#{exc.class}: #{exc}"
+          raise
+        end
+      end
+      # Removes probes with ids other than in the specified list.
+      #
+      # This method is meant to be invoked from remote config processor.
+      # Remote config contains the list of currently defined probes; any
+      # probes not in that list have been removed by user and should be
+      # de-instrumented from the application.
+      #
+      # @param probe_ids collection of probe ids to keep; must respond
+      #   to #include?
+      def remove_other_probes(probe_ids)
+        @lock.synchronize do
+          # Iterate over snapshots (#values) because entries are deleted
+          # from the hashes inside the loops.
+          @pending_probes.values.each do |probe|
+            unless probe_ids.include?(probe.id)
+              @pending_probes.delete(probe.id)
+            end
+          end
+          @installed_probes.values.each do |probe|
+            unless probe_ids.include?(probe.id)
+              begin
+                instrumenter.unhook(probe)
+                # Only remove the probe from installed list if it was
+                # successfully de-instrumented. Active probes do incur overhead
+                # for the running application, and if the error is ephemeral
+                # we want to try removing the probe again at the next opportunity.
+                #
+                # TODO give up after some time?
+                @installed_probes.delete(probe.id)
+              rescue => exc
+                raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
+                # Silence all exceptions?
+                # TODO should we propagate here and rescue upstream?
+                logger.warn("Error removing probe #{probe.id}: #{exc.class}: #{exc}")
+                telemetry&.report(exc, description: "Error removing probe #{probe.id}")
+              end
+            end
+          end
+        end
+      end
+      # Installs pending method probes, if any, for the specified class.
+      #
+      # This method is meant to be called from the "end" trace point,
+      # which is invoked for each class definition.
+      #
+      # Every pending method probe whose type name equals the class name is
+      # hooked, recorded as installed and removed from the pending list.
+      #
+      # NOTE(review): unlike add_probe, no "installed" status is queued
+      # here - confirm whether that is intended.
+      private def install_pending_method_probes(cls)
+        @lock.synchronize do
+          # Iterate over a snapshot because successfully hooked probes are
+          # removed from @pending_probes inside the loop; this also lets
+          # several probes targeting the same class be installed in one pass.
+          # TODO search more efficiently than linearly
+          @pending_probes.values.each do |probe|
+            next unless probe.method?
+            # TODO move this stringification elsewhere
+            next unless probe.type_name == cls.name
+            begin
+              # TODO is it OK to hook from trace point handler?
+              # TODO the class is now defined, but can hooking still fail?
+              instrumenter.hook(probe, &method(:probe_executed_callback))
+              # Track the probe as installed so that remove_other_probes
+              # and clear_hooks are able to unhook it later.
+              @installed_probes[probe.id] = probe
+              @pending_probes.delete(probe.id)
+            rescue Error::DITargetNotDefined
+              # This should not happen... try installing again later?
+            rescue => exc
+              raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
+              logger.warn("Error installing probe after class is defined: #{exc.class}: #{exc}")
+              telemetry&.report(exc, description: "Error installing probe after class is defined")
+            end
+          end
+        end
+      end
+      # Installs pending line probes, if any, for the file of the specified
+      # absolute path.
+      #
+      # This method is meant to be called from the script_compiled trace
+      # point, which is invoked for each required or loaded file
+      # (and also for eval'd code, but those invocations are filtered out).
+      #
+      # Installation goes through add_probe, so its exceptions propagate
+      # to the caller of this method.
+      def install_pending_line_probes(path)
+        @lock.synchronize do
+          # Snapshot via #values: add_probe mutates @pending_probes.
+          @pending_probes.values.each do |probe|
+            if probe.line?
+              if probe.file_matches?(path)
+                add_probe(probe)
+              end
+            end
+          end
+        end
+      end
+      # Entry point invoked from the instrumentation when the specified probe
+      # is invoked (that is, either its target method is invoked, or
+      # execution reached its target file/line).
+      #
+      # This method is responsible for queueing probe status to be sent to the
+      # backend (once per the probe's lifetime) and a snapshot corresponding
+      # to the current invocation.
+      #
+      # @param probe the probe that was executed
+      # @param opts remaining keyword arguments, forwarded unchanged to
+      #   probe_notification_builder.build_executed
+      def probe_executed_callback(probe:, **opts)
+        unless probe.emitting_notified?
+          payload = probe_notification_builder.build_emitting(probe)
+          probe_notifier_worker.add_status(payload)
+          probe.emitting_notified = true
+        end
+        payload = probe_notification_builder.build_executed(probe, **opts)
+        probe_notifier_worker.add_snapshot(payload)
+      end
+      # Class/module definition trace point (:end type).
+      # Used to install hooks when the target classes/modules aren't yet
+      # defined when the hook request is received.
+      attr_reader :definition_trace_point
+    end
+  end
+end
+# rubocop:enable Lint/AssignmentInCondition

data/lib/datadog/di/probe_notification_builder.rb CHANGED Viewed

@@ -46,11 +46,13 @@ module Datadog
         # this should be all frames for enriched probes and no frames for
         # non-enriched probes?
         build_snapshot(probe, rv: rv, snapshot: snapshot,
+          # Actual path of the instrumented file.
+          path: trace_point&.path,
           duration: duration, caller_locations: caller_locations, args: args, kwargs: kwargs,
           serialized_entry_args: serialized_entry_args)
       end
-      def build_snapshot(probe, rv: nil, snapshot: nil,
+      def build_snapshot(probe, rv: nil, snapshot: nil, path: nil,
         duration: nil, caller_locations: nil, args: nil, kwargs: nil,
         serialized_entry_args: nil)
         # TODO also verify that non-capturing probe does not pass
@@ -85,18 +87,8 @@ module Datadog
         end
         location = if probe.line?
-          actual_file = if probe.file
-            # Normally caller_locations should always be filled for a line probe
-            # but in the test suite we don't always provide all arguments.
-            actual_file_basename = File.basename(probe.file)
-            caller_locations&.detect do |loc|
-              # TODO record actual path that probe was installed into,
-              # perform exact match here against that path.
-              File.basename(loc.path) == actual_file_basename
-            end&.path || probe.file
-          end
           {
-            file: actual_file,
+            file: path,
             lines: [probe.line_no],
           }
         elsif probe.method?

data/lib/datadog/di/probe_notifier_worker.rb CHANGED Viewed

@@ -23,12 +23,9 @@ module Datadog
     #
     # @api private
     class ProbeNotifierWorker
-      # Minimum interval between submissions.
-      # TODO make this into an internal setting and increase default to 2 or 3.
-      MIN_SEND_INTERVAL = 1
-      def initialize(settings, transport, logger)
+      def initialize(settings, transport, logger, telemetry: nil)
         @settings = settings
+        @telemetry = telemetry
         @status_queue = []
         @snapshot_queue = []
         @transport = transport
@@ -39,10 +36,12 @@ module Datadog
         @sleep_remaining = nil
         @wake_scheduled = false
         @thread = nil
+        @flush = 0
       end
       attr_reader :settings
       attr_reader :logger
+      attr_reader :telemetry
       def start
         return if @thread
@@ -53,33 +52,38 @@ module Datadog
             # and then quit?
             break if @stop_requested
-            sleep_remaining = @lock.synchronize do
-              if sleep_remaining && sleep_remaining > 0
-                # Recalculate how much sleep time is remaining, then sleep that long.
-                set_sleep_remaining
-              else
-                0
+            # If a flush was requested, send immediately and do not
+            # wait for the cooldown period.
+            if @lock.synchronize { @flush } == 0
+              sleep_remaining = @lock.synchronize do
+                if sleep_remaining && sleep_remaining > 0
+                  # Recalculate how much sleep time is remaining, then sleep that long.
+                  set_sleep_remaining
+                else
+                  0
+                end
               end
-            end
-            if sleep_remaining > 0
-              # Do not need to update @wake_scheduled here because
-              # wake-up is already scheduled for the earliest possible time.
-              wake.wait(sleep_remaining)
-              next
+              if sleep_remaining > 0
+                # Do not need to update @wake_scheduled here because
+                # wake-up is already scheduled for the earliest possible time.
+                wake.wait(sleep_remaining)
+                next
+              end
             end
             begin
               more = maybe_send
             rescue => exc
-              raise if settings.dynamic_instrumentation.propagate_all_exceptions
+              raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
               logger.warn("Error in probe notifier worker: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
+              telemetry&.report(exc, description: "Error in probe notifier worker")
             end
             @lock.synchronize do
               @wake_scheduled = more
             end
-            wake.wait(more ? MIN_SEND_INTERVAL : nil)
+            wake.wait(more ? min_send_interval : nil)
           end
         end
       end
@@ -106,26 +110,40 @@ module Datadog
       # therefore, it should only be called when there is no parallel
       # activity (in another thread) that causes more notifications
       # to be generated.
+      #
+      # This method is used by the test suite to wait until notifications have
+      # been sent out, and could be used for graceful stopping of the
+      # worker thread.
       def flush
-        loop do
-          if @thread.nil? || !@thread.alive?
-            return
-          end
+        @lock.synchronize do
+          @flush += 1
+        end
+        begin
+          loop do
+            if @thread.nil? || !@thread.alive?
+              return
+            end
-          io_in_progress, queues_empty = @lock.synchronize do
-            [io_in_progress?, status_queue.empty? && snapshot_queue.empty?]
-          end
+            io_in_progress, queues_empty = @lock.synchronize do
+              [io_in_progress?, status_queue.empty? && snapshot_queue.empty?]
+            end
-          if io_in_progress
-            # If we just call Thread.pass we could be in a busy loop -
-            # add a sleep.
-            sleep 0.25
-            next
-          elsif queues_empty
-            break
-          else
-            sleep 0.25
-            next
+            if io_in_progress
+              # If we just call Thread.pass we could be in a busy loop -
+              # add a sleep.
+              sleep 0.25
+              next
+            elsif queues_empty
+              break
+            else
+              wake.signal
+              sleep 0.25
+              next
+            end
+          end
+        ensure
+          @lock.synchronize do
+            @flush -= 1
           end
         end
       end
@@ -136,6 +154,11 @@ module Datadog
       attr_reader :wake
       attr_reader :thread
+      # Convenience method to keep line length reasonable in the rest of the file.
+      def min_send_interval
+        settings.dynamic_instrumentation.internal.min_send_interval
+      end
       # This method should be called while @lock is held.
       def io_in_progress?
         @io_in_progress
@@ -181,14 +204,14 @@ module Datadog
         end
         # Determine how much longer the worker thread should sleep
-        # so as not to send in less than MIN_SEND_INTERVAL since the last send.
+        # so as not to send in less than min send interval since the last send.
         # Important: this method must be called when @lock is held.
         #
         # Returns the time remaining to sleep.
         def set_sleep_remaining
           now = Core::Utils::Time.get_time
           @sleep_remaining = if last_sent
-            [last_sent + MIN_SEND_INTERVAL - now, 0].max
+            [last_sent + min_send_interval - now, 0].max
           else
             0
           end
@@ -218,16 +241,20 @@ module Datadog
                 @last_sent = time
               end
             rescue => exc
-              raise if settings.dynamic_instrumentation.propagate_all_exceptions
+              raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
               logger.warn("failed to send #{event_name}: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
+              # Should we report this error to telemetry? Most likely failure
+              # to send is due to a network issue, and trying to send a
+              # telemetry message would also fail.
             end
           end
           batch.any? # steep:ignore
-        rescue ThreadError
+        rescue ThreadError => exc
           # Normally the queue should only be consumed in this method,
           # however if anyone consumes it elsewhere we don't want to block
           # while consuming it here. Rescue ThreadError and return.
-          logger.warn("unexpected #{event_name} queue underflow - consumed elsewhere?")
+          logger.warn("Unexpected #{event_name} queue underflow - consumed elsewhere?")
+          telemetry&.report(exc, description: "Unexpected #{event_name} queue underflow")
         ensure
           @lock.synchronize do
             @io_in_progress = false