ddtrace 0.52.0 → 0.54.2

Files changed (108)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +174 -11
  3. data/ddtrace.gemspec +6 -3
  4. data/docs/DevelopmentGuide.md +1 -6
  5. data/docs/GettingStarted.md +109 -18
  6. data/docs/ProfilingDevelopment.md +2 -2
  7. data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +86 -0
  8. data/ext/ddtrace_profiling_native_extension/clock_id.h +4 -0
  9. data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +52 -0
  10. data/ext/ddtrace_profiling_native_extension/clock_id_noop.c +14 -0
  11. data/ext/ddtrace_profiling_native_extension/extconf.rb +177 -8
  12. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +35 -0
  13. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +3 -0
  14. data/ext/ddtrace_profiling_native_extension/profiling.c +6 -1
  15. data/lib/datadog/ci/contrib/cucumber/formatter.rb +1 -0
  16. data/lib/datadog/ci/contrib/rspec/example.rb +1 -0
  17. data/lib/datadog/ci/contrib/rspec/integration.rb +2 -2
  18. data/lib/datadog/ci/ext/environment.rb +64 -22
  19. data/lib/datadog/ci/ext/test.rb +1 -0
  20. data/lib/datadog/ci/test.rb +5 -1
  21. data/lib/datadog/contrib.rb +2 -0
  22. data/lib/datadog/core/environment/vm_cache.rb +46 -0
  23. data/lib/ddtrace/buffer.rb +28 -16
  24. data/lib/ddtrace/configuration/agent_settings_resolver.rb +131 -53
  25. data/lib/ddtrace/configuration/components.rb +1 -1
  26. data/lib/ddtrace/configuration/settings.rb +13 -3
  27. data/lib/ddtrace/context.rb +10 -2
  28. data/lib/ddtrace/contrib/action_cable/instrumentation.rb +46 -0
  29. data/lib/ddtrace/contrib/action_cable/patcher.rb +1 -0
  30. data/lib/ddtrace/contrib/action_mailer/configuration/settings.rb +32 -0
  31. data/lib/ddtrace/contrib/action_mailer/event.rb +50 -0
  32. data/lib/ddtrace/contrib/action_mailer/events/deliver.rb +54 -0
  33. data/lib/ddtrace/contrib/action_mailer/events/process.rb +41 -0
  34. data/lib/ddtrace/contrib/action_mailer/events.rb +31 -0
  35. data/lib/ddtrace/contrib/action_mailer/ext.rb +32 -0
  36. data/lib/ddtrace/contrib/action_mailer/integration.rb +45 -0
  37. data/lib/ddtrace/contrib/action_mailer/patcher.rb +27 -0
  38. data/lib/ddtrace/contrib/active_job/configuration/settings.rb +33 -0
  39. data/lib/ddtrace/contrib/active_job/event.rb +54 -0
  40. data/lib/ddtrace/contrib/active_job/events/discard.rb +46 -0
  41. data/lib/ddtrace/contrib/active_job/events/enqueue.rb +45 -0
  42. data/lib/ddtrace/contrib/active_job/events/enqueue_at.rb +45 -0
  43. data/lib/ddtrace/contrib/active_job/events/enqueue_retry.rb +47 -0
  44. data/lib/ddtrace/contrib/active_job/events/perform.rb +45 -0
  45. data/lib/ddtrace/contrib/active_job/events/retry_stopped.rb +46 -0
  46. data/lib/ddtrace/contrib/active_job/events.rb +39 -0
  47. data/lib/ddtrace/contrib/active_job/ext.rb +32 -0
  48. data/lib/ddtrace/contrib/active_job/integration.rb +46 -0
  49. data/lib/ddtrace/contrib/active_job/log_injection.rb +21 -0
  50. data/lib/ddtrace/contrib/active_job/patcher.rb +33 -0
  51. data/lib/ddtrace/contrib/auto_instrument.rb +0 -1
  52. data/lib/ddtrace/contrib/delayed_job/plugin.rb +2 -2
  53. data/lib/ddtrace/contrib/mongodb/instrumentation.rb +1 -1
  54. data/lib/ddtrace/contrib/mongodb/integration.rb +5 -0
  55. data/lib/ddtrace/contrib/rails/auto_instrument_railtie.rb +0 -1
  56. data/lib/ddtrace/contrib/rails/configuration/settings.rb +7 -0
  57. data/lib/ddtrace/contrib/rails/framework.rb +24 -1
  58. data/lib/ddtrace/contrib/rails/patcher.rb +19 -10
  59. data/lib/ddtrace/contrib/redis/instrumentation.rb +90 -0
  60. data/lib/ddtrace/contrib/redis/patcher.rb +2 -84
  61. data/lib/ddtrace/contrib/registerable.rb +0 -1
  62. data/lib/ddtrace/contrib/resque/integration.rb +1 -5
  63. data/lib/ddtrace/contrib/sidekiq/ext.rb +3 -0
  64. data/lib/ddtrace/contrib/sidekiq/integration.rb +10 -0
  65. data/lib/ddtrace/contrib/sidekiq/patcher.rb +26 -0
  66. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat.rb +30 -0
  67. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch.rb +30 -0
  68. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push.rb +29 -0
  69. data/lib/ddtrace/contrib/sinatra/env.rb +2 -1
  70. data/lib/ddtrace/contrib/sinatra/tracer.rb +15 -2
  71. data/lib/ddtrace/ext/git.rb +12 -0
  72. data/lib/ddtrace/ext/priority.rb +6 -4
  73. data/lib/ddtrace/ext/profiling.rb +8 -11
  74. data/lib/ddtrace/ext/runtime.rb +3 -0
  75. data/lib/ddtrace/ext/transport.rb +11 -0
  76. data/lib/ddtrace/metrics.rb +2 -2
  77. data/lib/ddtrace/profiling/collectors/stack.rb +112 -72
  78. data/lib/ddtrace/profiling/encoding/profile.rb +10 -2
  79. data/lib/ddtrace/profiling/events/stack.rb +13 -13
  80. data/lib/ddtrace/profiling/native_extension.rb +23 -1
  81. data/lib/ddtrace/profiling/pprof/builder.rb +8 -2
  82. data/lib/ddtrace/profiling/pprof/converter.rb +22 -9
  83. data/lib/ddtrace/profiling/pprof/stack_sample.rb +32 -9
  84. data/lib/ddtrace/profiling/pprof/template.rb +2 -2
  85. data/lib/ddtrace/profiling/scheduler.rb +20 -4
  86. data/lib/ddtrace/profiling/tasks/setup.rb +21 -13
  87. data/lib/ddtrace/profiling/trace_identifiers/ddtrace.rb +10 -9
  88. data/lib/ddtrace/profiling/trace_identifiers/helper.rb +5 -5
  89. data/lib/ddtrace/profiling/transport/http/api/endpoint.rb +8 -15
  90. data/lib/ddtrace/profiling/transport/http.rb +8 -17
  91. data/lib/ddtrace/profiling.rb +0 -2
  92. data/lib/ddtrace/runtime/metrics.rb +14 -0
  93. data/lib/ddtrace/sampler.rb +18 -8
  94. data/lib/ddtrace/sampling/rule_sampler.rb +13 -1
  95. data/lib/ddtrace/span.rb +7 -19
  96. data/lib/ddtrace/tracer.rb +1 -1
  97. data/lib/ddtrace/transport/http/adapters/net.rb +13 -3
  98. data/lib/ddtrace/transport/http/adapters/test.rb +4 -2
  99. data/lib/ddtrace/transport/http/adapters/unix_socket.rb +23 -12
  100. data/lib/ddtrace/transport/http/builder.rb +13 -6
  101. data/lib/ddtrace/transport/http.rb +5 -11
  102. data/lib/ddtrace/utils/time.rb +11 -6
  103. data/lib/ddtrace/version.rb +2 -2
  104. data/lib/ddtrace/workers/{loop.rb → interval_loop.rb} +0 -16
  105. data/lib/ddtrace/workers/polling.rb +1 -1
  106. metadata +40 -10
  107. data/lib/ddtrace/profiling/ext/cpu.rb +0 -67
  108. data/lib/ddtrace/profiling/ext/cthread.rb +0 -156
@@ -4,13 +4,15 @@ module Datadog
  # Priority is a hint given to the backend so that it knows which traces to reject or keep.
  # In a distributed context, it should be set before any context propagation (fork, RPC calls) to be effective.
  module Priority
- # Use this to explicitely inform the backend that a trace should be rejected and not stored.
+ # Use this to explicitly inform the backend that a trace MUST be rejected and not stored.
+ # This includes rules and rate limits configured by the user through the {RuleSampler}.
  USER_REJECT = -1
- # Used by the builtin sampler to inform the backend that a trace should be rejected and not stored.
+ # Used by the {PrioritySampler} to inform the backend that a trace should be rejected and not stored.
  AUTO_REJECT = 0
- # Used by the builtin sampler to inform the backend that a trace should be kept and stored.
+ # Used by the {PrioritySampler} to inform the backend that a trace should be kept and stored.
  AUTO_KEEP = 1
- # Use this to explicitely inform the backend that a trace should be kept and stored.
+ # Use this to explicitly inform the backend that a trace MUST be kept and stored.
+ # This includes rules and rate limits configured by the user through the {RuleSampler}.
  USER_KEEP = 2
  end
  end
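
The wording change above matters for manual trace retention: the `USER_*` values now explicitly override user-configured rules and rate limits, not just the automatic sampler. A minimal sketch of forcing a keep decision with the pre-1.0 API (assumes an active trace; `Context#sampling_priority=` is the accessor these constants are meant for):

```ruby
require 'ddtrace'

Datadog.tracer.trace('checkout.process') do |span|
  # Force this trace to be kept, overriding automatic sampling decisions
  # (and, per the updated docs above, user-configured rules and rate limits too)
  span.context.sampling_priority = Datadog::Ext::Priority::USER_KEEP
end
```
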
@@ -6,11 +6,12 @@ module Datadog
  ENV_UPLOAD_TIMEOUT = 'DD_PROFILING_UPLOAD_TIMEOUT'.freeze
  ENV_MAX_FRAMES = 'DD_PROFILING_MAX_FRAMES'.freeze
  ENV_AGENTLESS = 'DD_PROFILING_AGENTLESS'.freeze
+ ENV_ENDPOINT_COLLECTION_ENABLED = 'DD_PROFILING_ENDPOINT_COLLECTION_ENABLED'.freeze

  module Pprof
+ LABEL_KEY_LOCAL_ROOT_SPAN_ID = 'local root span id'.freeze
  LABEL_KEY_SPAN_ID = 'span id'.freeze
  LABEL_KEY_THREAD_ID = 'thread id'.freeze
- LABEL_KEY_TRACE_ID = 'trace id'.freeze
  LABEL_KEY_TRACE_ENDPOINT = 'trace endpoint'.freeze
  SAMPLE_VALUE_NO_VALUE = 0
  VALUE_TYPE_CPU = 'cpu-time'.freeze
@@ -22,13 +23,9 @@ module Datadog
  module HTTP
  URI_TEMPLATE_DD_API = 'https://intake.profile.%s/'.freeze

- FORM_FIELD_DATA = 'data[0]'.freeze
- FORM_FIELD_FORMAT = 'format'.freeze
- FORM_FIELD_FORMAT_PPROF = 'pprof'.freeze
- FORM_FIELD_RECORDING_END = 'recording-end'.freeze
- FORM_FIELD_RECORDING_START = 'recording-start'.freeze
- FORM_FIELD_RUNTIME = 'runtime'.freeze
- FORM_FIELD_RUNTIME_ID = 'runtime-id'.freeze
+ FORM_FIELD_RECORDING_START = 'start'.freeze
+ FORM_FIELD_RECORDING_END = 'end'.freeze
+ FORM_FIELD_FAMILY = 'family'.freeze
  FORM_FIELD_TAG_ENV = 'env'.freeze
  FORM_FIELD_TAG_HOST = 'host'.freeze
  FORM_FIELD_TAG_LANGUAGE = 'language'.freeze
@@ -42,13 +39,13 @@ module Datadog
  FORM_FIELD_TAG_SERVICE = 'service'.freeze
  FORM_FIELD_TAG_VERSION = 'version'.freeze
  FORM_FIELD_TAGS = 'tags'.freeze
- FORM_FIELD_TYPES = 'types[0]'.freeze
- FORM_FIELD_TYPES_AUTO = 'auto'.freeze
+ FORM_FIELD_INTAKE_VERSION = 'version'.freeze

  HEADER_CONTENT_TYPE = 'Content-Type'.freeze
  HEADER_CONTENT_TYPE_OCTET_STREAM = 'application/octet-stream'.freeze

- PPROF_DEFAULT_FILENAME = 'profile.pb.gz'.freeze
+ FORM_FIELD_PPROF_DATA = 'data[rubyprofile.pprof]'.freeze
+ PPROF_DEFAULT_FILENAME = 'rubyprofile.pprof.gz'.freeze
  end
  end
  end
@@ -6,6 +6,7 @@ module Datadog
  module Runtime
  TAG_ID = 'runtime-id'.freeze
  TAG_LANG = 'language'.freeze
+ TAG_PID = 'system.pid'.freeze

  # Metrics
  module Metrics
@@ -14,6 +15,8 @@ module Datadog
  METRIC_CLASS_COUNT = 'runtime.ruby.class_count'.freeze
  METRIC_GC_PREFIX = 'runtime.ruby.gc'.freeze
  METRIC_THREAD_COUNT = 'runtime.ruby.thread_count'.freeze
+ METRIC_GLOBAL_CONSTANT_STATE = 'runtime.ruby.global_constant_state'.freeze
+ METRIC_GLOBAL_METHOD_STATE = 'runtime.ruby.global_method_state'.freeze

  TAG_SERVICE = 'service'.freeze
  end
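
The two new runtime metrics report MRI's global cache invalidation counters. This diff doesn't show the new `data/lib/datadog/core/environment/vm_cache.rb` helper, but on the Ruby versions that expose them these counters presumably come from `RubyVM.stat`; a sketch of reading them directly:

```ruby
# Sketch, assuming the metrics are sourced from RubyVM.stat. Both keys exist
# on MRI 2.x; :global_method_state was removed in Ruby 3.0 when the method
# cache became per-class.
stat = RubyVM.stat
stat[:global_constant_state] # bumped whenever a constant is (re)defined
stat[:global_method_state]   # bumped whenever the global method cache is invalidated
```
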
@@ -3,6 +3,7 @@ module Datadog
  module Ext
  module Transport
  module HTTP
+ ADAPTER = :net_http # DEV: Rename to simply `:http`, as Net::HTTP is an implementation detail.
  DEFAULT_HOST = '127.0.0.1'.freeze
  DEFAULT_PORT = 8126
  DEFAULT_TIMEOUT_SECONDS = 1
@@ -16,6 +17,16 @@ module Datadog
  HEADER_META_LANG_INTERPRETER = 'Datadog-Meta-Lang-Interpreter'.freeze
  HEADER_META_TRACER_VERSION = 'Datadog-Meta-Tracer-Version'.freeze
  end
+
+ module Test
+ ADAPTER = :test
+ end
+
+ module UnixSocket
+ ADAPTER = :unix
+ DEFAULT_PATH = '/var/run/datadog/apm.socket'.freeze
+ DEFAULT_TIMEOUT_SECONDS = 1
+ end
  end
  end
  end
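
With `UnixSocket::ADAPTER` and a default socket path now defined, the tracer can reach the agent over a Unix domain socket. A sketch using the 0.x `transport_options` API (the path shown is just the `DEFAULT_PATH` constant above):

```ruby
Datadog.configure do |c|
  c.tracer.transport_options = proc do |t|
    # :unix matches Ext::Transport::UnixSocket::ADAPTER defined above
    t.adapter :unix, '/var/run/datadog/apm.socket'
  end
end
```
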
@@ -31,7 +31,7 @@ module Datadog
  !version.nil? && version >= Gem::Version.new('3.3.0') &&
  # dogstatsd-ruby >= 5.0 & < 5.2.0 has known issues with process forks
  # and do not support the single thread mode we use to avoid this problem.
- !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.2'))
+ !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.3'))
  end

  def enabled?
@@ -274,7 +274,7 @@ module Datadog
  IGNORED_STATSD_ONLY_ONCE.run do
  Datadog.logger.warn(
  'Ignoring user-supplied statsd instance as currently-installed version of dogstatsd-ruby is incompatible. ' \
- "To fix this, ensure that you have `gem 'dogstatsd-ruby', '~> 5.2'` on your Gemfile or gems.rb file."
+ "To fix this, ensure that you have `gem 'dogstatsd-ruby', '~> 5.3'` on your Gemfile or gems.rb file."
  )
  end
  end
@@ -1,4 +1,6 @@
  # typed: true
+
+ require 'ddtrace/profiling/native_extension'
  require 'ddtrace/profiling/backtrace_location'
  require 'ddtrace/profiling/events/stack'
  require 'ddtrace/utils/only_once'
@@ -18,6 +20,13 @@ module Datadog
  DEFAULT_MAX_TIME_USAGE_PCT = 2.0
  MIN_INTERVAL = 0.01
  THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
+ THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
+ SYNTHETIC_STACK_IN_NATIVE_CODE = [BacktraceLocation.new('', 0, 'In native code').freeze].freeze
+
+ # This default was picked based on the current sampling performance and on expected concurrency on an average
+ # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
+ # optimizes for coverage (less chance to miss what a given thread is doing).
+ DEFAULT_MAX_THREADS_SAMPLED = 16

  attr_reader \
  :recorder,
@@ -25,7 +34,8 @@ module Datadog
  :trace_identifiers_helper,
  :ignore_thread,
  :max_time_usage_pct,
- :thread_api
+ :thread_api,
+ :cpu_time_provider

  def initialize(
  recorder,
@@ -33,7 +43,9 @@ module Datadog
  trace_identifiers_helper:, # Usually an instance of Datadog::Profiling::TraceIdentifiers::Helper
  ignore_thread: nil,
  max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
+ max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
  thread_api: Thread,
+ cpu_time_provider: Datadog::Profiling::NativeExtension,
  fork_policy: Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
  interval: MIN_INTERVAL,
  enabled: true
@@ -43,7 +55,10 @@ module Datadog
  @trace_identifiers_helper = trace_identifiers_helper
  @ignore_thread = ignore_thread
  @max_time_usage_pct = max_time_usage_pct
+ @max_threads_sampled = max_threads_sampled
  @thread_api = thread_api
+ # Only set the provider if it's able to work in the current Ruby/OS combo
+ @cpu_time_provider = cpu_time_provider unless cpu_time_provider.cpu_time_ns_for(thread_api.current).nil?

  # Workers::Async::Thread settings
  self.fork_policy = fork_policy
@@ -54,16 +69,17 @@ module Datadog
  # Workers::Polling settings
  self.enabled = enabled

- @warn_about_missing_cpu_time_instrumentation_only_once = Datadog::Utils::OnlyOnce.new
-
  # Cache this proc, since it's pretty expensive to keep recreating it
  @build_backtrace_location = method(:build_backtrace_location).to_proc
  # Cache this buffer, since it's pretty expensive to keep accessing it
  @stack_sample_event_recorder = recorder[Events::StackSample]
+ # See below for details on why this is needed
+ @needs_process_waiter_workaround =
+ Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
+ Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
  end

  def start
- @last_wall_time = Datadog::Utils::Time.get_time
  reset_cpu_time_tracking
  perform
  end
@@ -72,10 +88,6 @@ module Datadog
  collect_and_wait
  end

- def loop_back_off?
- false
- end
-
  def collect_and_wait
  run_time = Datadog::Utils::Time.measure do
  collect_events
@@ -87,24 +99,14 @@ module Datadog

  def collect_events
  events = []
-
- # Compute wall time interval
- current_wall_time = Datadog::Utils::Time.get_time
- last_wall_time = if instance_variable_defined?(:@last_wall_time)
- @last_wall_time
- else
- current_wall_time
- end
-
- wall_time_interval_ns = ((current_wall_time - last_wall_time).round(9) * 1e9).to_i
- @last_wall_time = current_wall_time
+ current_wall_time_ns = get_current_wall_time_timestamp_ns

  # Collect backtraces from each thread
- thread_api.list.each do |thread|
+ threads_to_sample.each do |thread|
  next unless thread.alive?
  next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)

- event = collect_thread_event(thread, wall_time_interval_ns)
+ event = collect_thread_event(thread, current_wall_time_ns)
  events << event unless event.nil?
  end

@@ -114,10 +116,30 @@ module Datadog
  events
  end

- def collect_thread_event(thread, wall_time_interval_ns)
+ def collect_thread_event(thread, current_wall_time_ns)
  locations = thread.backtrace_locations
  return if locations.nil?

+ # Having empty locations means that the thread is alive, but we don't know what it's doing:
+ #
+ # 1. It can be starting up
+ # ```
+ # > Thread.new { sleep }.backtrace
+ # => [] # <-- note the thread hasn't actually started running sleep yet, we got there first
+ # ```
+ # 2. It can be running native code
+ # ```
+ # > t = Process.detach(fork { sleep })
+ # => #<Process::Waiter:0x00007ffe7285f7a0 run>
+ # > t.backtrace
+ # => [] # <-- this can happen even minutes later, e.g. it's not a race as in 1.
+ # ```
+ # This effect has been observed in threads created by the Iodine web server and the ffi gem
+ #
+ # To give customers visibility into these threads, we replace the empty stack with one containing a
+ # synthetic placeholder frame, so that these threads are properly represented in the UX.
+ locations = SYNTHETIC_STACK_IN_NATIVE_CODE if locations.empty?
+
  # Get actual stack size then trim the stack
  stack_size = locations.length
  locations = locations[0..(max_frames - 1)]
@@ -125,45 +147,33 @@ module Datadog
  # Convert backtrace locations into structs
  locations = convert_backtrace_locations(locations)

- thread_id = thread.respond_to?(:pthread_thread_id) ? thread.pthread_thread_id : thread.object_id
- trace_id, span_id, trace_resource_container = trace_identifiers_helper.trace_identifiers_for(thread)
+ thread_id = thread.object_id
+ root_span_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
  cpu_time = get_cpu_time_interval!(thread)
+ wall_time_interval_ns =
+ get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)

  Events::StackSample.new(
  nil,
  locations,
  stack_size,
  thread_id,
- trace_id,
+ root_span_id,
  span_id,
- trace_resource_container,
+ trace_resource,
  cpu_time,
  wall_time_interval_ns
  )
  end

  def get_cpu_time_interval!(thread)
- # Return if we can't get the current CPU time
- unless thread.respond_to?(:cpu_time_instrumentation_installed?) && thread.cpu_time_instrumentation_installed?
- warn_about_missing_cpu_time_instrumentation(thread)
- return
- end
+ return unless cpu_time_provider

- current_cpu_time_ns = thread.cpu_time(:nanosecond)
+ current_cpu_time_ns = cpu_time_provider.cpu_time_ns_for(thread)

- # NOTE: This can still be nil even when all of the checks above passed because of a race: there's a bit of
- # initialization that needs to be done by the thread itself, and it's possible for us to try to sample
- # *before* the thread had time to finish the initialization
  return unless current_cpu_time_ns

- last_cpu_time_ns = (thread.thread_variable_get(THREAD_LAST_CPU_TIME_KEY) || current_cpu_time_ns)
- interval = current_cpu_time_ns - last_cpu_time_ns
-
- # Update CPU time for thread
- thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
-
- # Return interval
- interval
+ get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
  end

  def compute_wait_time(used_time)
@@ -209,38 +219,11 @@ module Datadog

  private

- def warn_about_missing_cpu_time_instrumentation(thread)
- @warn_about_missing_cpu_time_instrumentation_only_once.run do
- # Is the profiler thread instrumented? If it is, then we know instrumentation is available, but seems to be
- # missing on this thread we just found.
- #
- # As far as we know, it can be missing due to one of the following:
- #
- # a) The thread was started before we installed our instrumentation.
- # In this case, the fix is to make sure ddtrace gets loaded before any other parts of the application.
- #
- # b) The thread was started using the Ruby native APIs (e.g. from a C extension such as ffi).
- # Known cases right now that trigger this are the ethon/typhoeus gems.
- # We currently have no solution for this case; these threads will always be missing our CPU instrumentation.
- #
- # c) The thread was started with `Thread.start`/`Thread.fork` and hasn't yet enabled the instrumentation.
- # When threads are started using these APIs, there's a small time window during which the thread has started
- # but our code to apply the instrumentation hasn't run yet; in these cases it's just a matter of allowing
- # it to run and our instrumentation to be applied.
- #
- if thread_api.current.respond_to?(:cpu_time) && thread_api.current.cpu_time
- Datadog.logger.debug(
- "Thread ('#{thread}') is missing profiling instrumentation; other threads should be unaffected"
- )
- end
- end
- end
-
  # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
- # clean up any leftover per-thread cpu time counters, so that the first sample after starting doesn't end up with:
+ # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
  #
  # a) negative time: At least on my test docker container, and on the reliability environment, after the process
- # forks, the clock reference changes and (old cpu time - new cpu time) can be < 0
+ # forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
  #
  # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
  # restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
@@ -248,9 +231,66 @@ module Datadog
  # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
  def reset_cpu_time_tracking
  thread_api.list.each do |thread|
+ # See below for details on why this is needed
+ next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
  thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
+ thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
  end
  end
+
+ def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
+ # Process::Waiter crash workaround:
+ #
+ # This is a workaround for a Ruby VM segfault (usually something like
+ # "[BUG] Segmentation fault at 0x0000000000000008") in the affected Ruby versions.
+ # See https://bugs.ruby-lang.org/issues/17807 for details.
+ #
+ # In those Ruby versions, there's a very special subclass of `Thread` called `Process::Waiter` that causes VM
+ # crashes whenever something tries to read its instance or thread variables. This subclass of thread only
+ # shows up when the `Process.detach` API gets used.
+ # In the specs you'll find crash regression tests that include a way of reproducing it.
+ #
+ # As a workaround for now, we just skip it for the affected Rubies
+ return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
+ last_value = thread.thread_variable_get(key) || current_value
+ thread.thread_variable_set(key, current_value)
+
+ current_value - last_value
+ end
+
+ # Whenever there are more than max_threads_sampled threads active, we only sample a subset of them.
+ # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
+ # a big burst of work all at once (sample everything), and instead do a little work each time
+ # (sample a bit by bit).
+ #
+ # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
+ # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
+ # them more often, if they are slower, we take them less often -- which again means that over a longer period
+ # we should take roughly the same samples.
+ #
+ # One downside of this approach is that if there really are many threads, the resulting wall clock times
+ # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
+ # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
+ # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
+ # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
+ # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
+ # threads).
+ #
+ def threads_to_sample
+ all_threads = thread_api.list
+
+ if all_threads.size > @max_threads_sampled
+ all_threads.sample(@max_threads_sampled)
+ else
+ all_threads
+ end
+ end
+
+ def get_current_wall_time_timestamp_ns
+ Datadog::Utils::Time.get_time(:nanosecond)
+ end
  end
  end
  end
@@ -24,12 +24,20 @@ module Datadog
  flush.event_groups.each { |event_group| template.add_events!(event_group.event_class, event_group.events) }

  Datadog.logger.debug do
+ max_events = Datadog.configuration.profiling.advanced.max_events
+ events_sampled =
+ if flush.event_count == max_events
+ 'max events limit hit, events were sampled [profile will be biased], '
+ else
+ ''
+ end
+
  "Encoding profile covering #{flush.start.iso8601} to #{flush.finish.iso8601}, " \
- "events: #{flush.event_count} (#{template.debug_statistics})"
+ "events: #{flush.event_count} (#{events_sampled}#{template.debug_statistics})"
  end

  # Build the profile and encode it
- template.to_pprof
+ template.to_pprof(start: flush.start, finish: flush.finish)
  end
  end
  end
@@ -11,34 +11,34 @@ module Datadog
  :frames,
  :total_frame_count,
  :thread_id,
- :trace_id,
+ :root_span_id,
  :span_id,
- :trace_resource_container
+ :trace_resource

  def initialize(
  timestamp,
  frames,
  total_frame_count,
  thread_id,
- trace_id,
+ root_span_id,
  span_id,
- trace_resource_container
+ trace_resource
  )
  super(timestamp)

  @frames = frames
  @total_frame_count = total_frame_count
  @thread_id = thread_id
- @trace_id = trace_id
+ @root_span_id = root_span_id
  @span_id = span_id
- @trace_resource_container = trace_resource_container
+ @trace_resource = trace_resource

  @hash = [
  thread_id,
- trace_id,
+ root_span_id,
  span_id,
- # trace_resource_container is deliberately not included -- events that share the same (trace_id, span_id)
- # pair should also have the same trace_resource_container
+ # trace_resource is deliberately not included -- events that share the same (root_span_id, span_id) refer
+ # to the same trace
  frames.collect(&:hash),
  total_frame_count
  ].hash
@@ -56,9 +56,9 @@ module Datadog
  frames,
  total_frame_count,
  thread_id,
- trace_id,
+ root_span_id,
  span_id,
- trace_resource_container,
+ trace_resource,
  cpu_time_interval_ns,
  wall_time_interval_ns
  )
@@ -67,9 +67,9 @@ module Datadog
  frames,
  total_frame_count,
  thread_id,
- trace_id,
+ root_span_id,
  span_id,
- trace_resource_container
+ trace_resource
  )

  @cpu_time_interval_ns = cpu_time_interval_ns
@@ -2,7 +2,8 @@
  module Datadog
  module Profiling
  # This module contains classes and methods which are implemented using native code in the
- # ext/ddtrace_profiling_native_extension folder
+ # ext/ddtrace_profiling_native_extension folder, as well as some Ruby-level utilities that don't make sense to
+ # write using C
  module NativeExtension
  private_class_method def self.working?
  native_working?
@@ -13,6 +14,27 @@ module Datadog
  false
  end
  end
+
+ unless singleton_class.method_defined?(:clock_id_for)
+ def self.clock_id_for(_)
+ nil
+ end
+ end
+
+ def self.cpu_time_ns_for(thread)
+ clock_id =
+ begin
+ clock_id_for(thread)
+ rescue Errno::ESRCH
+ nil
+ end
+
+ begin
+ ::Process.clock_gettime(clock_id, :nanosecond) if clock_id
+ rescue Errno::EINVAL
+ nil
+ end
+ end
  end
  end
  end
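
The result is a nil-safe interface: `cpu_time_ns_for` returns nil whenever the native extension is missing or the thread's clock id can't be resolved, which is exactly what the stack collector's constructor checks before enabling CPU-time profiling. A usage sketch built from the names in this diff:

```ruby
cpu_ns = Datadog::Profiling::NativeExtension.cpu_time_ns_for(Thread.current)

if cpu_ns
  puts "Current thread has used #{cpu_ns / 1_000_000} ms of CPU time"
else
  puts 'CPU time unavailable on this Ruby/OS combo; profiling falls back to wall time'
end
```
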
@@ -4,6 +4,7 @@
  require 'ddtrace/profiling/flush'
  require 'ddtrace/profiling/pprof/message_set'
  require 'ddtrace/profiling/pprof/string_table'
+ require 'ddtrace/utils/time'

  module Datadog
  module Profiling
@@ -47,14 +48,19 @@ module Datadog
  Perftools::Profiles::Profile.encode(profile).force_encoding(DEFAULT_ENCODING)
  end

- def build_profile
+ def build_profile(start:, finish:)
+ start_ns = Datadog::Utils::Time.as_utc_epoch_ns(start)
+ finish_ns = Datadog::Utils::Time.as_utc_epoch_ns(finish)
+
  Perftools::Profiles::Profile.new(
  sample_type: @sample_types.messages,
  sample: @samples,
  mapping: @mappings.messages,
  location: @locations.values,
  function: @functions.messages,
- string_table: @string_table.strings
+ string_table: @string_table.strings,
+ time_nanos: start_ns,
+ duration_nanos: finish_ns - start_ns,
  )
  end

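`time_nanos` and `duration_nanos` are standard pprof `Profile` fields, so encoded profiles now carry their own time range. The `as_utc_epoch_ns` helper lives in the updated `data/lib/ddtrace/utils/time.rb`, which this diff doesn't show; a plausible sketch of what such a conversion needs to do:

```ruby
# Sketch only -- the real implementation is in ddtrace/utils/time.rb.
# Going through Rational avoids the precision loss that Time#to_f would
# introduce at nanosecond granularity.
def as_utc_epoch_ns(time)
  (time.to_r * 1_000_000_000).to_i
end
```
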
@@ -25,20 +25,19 @@ module Datadog
  # [key, EventGroup]
  event_groups = {}

+ # Aggregate each event into a group
+ # with identical properties, but different values.
  events.each do |event|
  key = yield(event)
- values = build_sample_values(event)
+ values = build_event_values(event)

  unless key.nil?
  if event_groups.key?(key)
- # Update values for group
- group_values = event_groups[key].values
- group_values.each_with_index do |group_value, i|
- group_values[i] = group_value + values[i]
- end
+ # Update existing group from event
+ update_group(event_groups[key], event, values)
  else
  # Add new group
- event_groups[key] = EventGroup.new(event, values)
+ event_groups[key] = new_group(event, values)
  end
  end
  end
@@ -57,7 +56,7 @@ module Datadog
  index
  end

- def build_sample_values(stack_sample)
+ def build_event_values(event)
  # Build a value array that matches the length of the sample types
  # Populate all values with "no value" by default
  Array.new(@sample_type_mappings.length, Datadog::Ext::Profiling::Pprof::SAMPLE_VALUE_NO_VALUE)
@@ -69,7 +68,7 @@ module Datadog

  # Represents a grouped event
  # 'sample' is an example event object from the group.
- # 'values' is the the summation of the group's sample values
+ # 'values' is the summation of the group's sample values
  EventGroup = Struct.new(:sample, :values)

  # Error when the mapping of a sample type to value index is unknown
@@ -84,6 +83,20 @@ module Datadog
  "Mapping for sample value type '#{type}' to index is unknown."
  end
  end
+
+ protected
+
+ def new_group(event, values)
+ EventGroup.new(event, values)
+ end
+
+ def update_group(event_group, event, values)
+ # Update values for group
+ group_values = event_group.values
+ group_values.each_with_index do |group_value, i|
+ group_values[i] = group_value + values[i]
+ end
+ end
  end
  end
  end
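
The new `new_group`/`update_group` protected methods turn group creation and accumulation into extension points, so subclasses can do more than sum values. A hypothetical subclass sketch (the name and behavior are illustrative, not from this diff):

```ruby
# Hypothetical: besides summing values (done by super), remember the most
# recent event seen for each group as its representative sample.
class LatestEventConverter < Datadog::Profiling::Pprof::Converter
  protected

  def update_group(event_group, event, values)
    super
    event_group.sample = event # EventGroup is a Struct, so the setter exists
  end
end
```
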