ddtrace 0.52.0 → 0.54.2

Files changed (108)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +174 -11
  3. data/ddtrace.gemspec +6 -3
  4. data/docs/DevelopmentGuide.md +1 -6
  5. data/docs/GettingStarted.md +109 -18
  6. data/docs/ProfilingDevelopment.md +2 -2
  7. data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +86 -0
  8. data/ext/ddtrace_profiling_native_extension/clock_id.h +4 -0
  9. data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +52 -0
  10. data/ext/ddtrace_profiling_native_extension/clock_id_noop.c +14 -0
  11. data/ext/ddtrace_profiling_native_extension/extconf.rb +177 -8
  12. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +35 -0
  13. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +3 -0
  14. data/ext/ddtrace_profiling_native_extension/profiling.c +6 -1
  15. data/lib/datadog/ci/contrib/cucumber/formatter.rb +1 -0
  16. data/lib/datadog/ci/contrib/rspec/example.rb +1 -0
  17. data/lib/datadog/ci/contrib/rspec/integration.rb +2 -2
  18. data/lib/datadog/ci/ext/environment.rb +64 -22
  19. data/lib/datadog/ci/ext/test.rb +1 -0
  20. data/lib/datadog/ci/test.rb +5 -1
  21. data/lib/datadog/contrib.rb +2 -0
  22. data/lib/datadog/core/environment/vm_cache.rb +46 -0
  23. data/lib/ddtrace/buffer.rb +28 -16
  24. data/lib/ddtrace/configuration/agent_settings_resolver.rb +131 -53
  25. data/lib/ddtrace/configuration/components.rb +1 -1
  26. data/lib/ddtrace/configuration/settings.rb +13 -3
  27. data/lib/ddtrace/context.rb +10 -2
  28. data/lib/ddtrace/contrib/action_cable/instrumentation.rb +46 -0
  29. data/lib/ddtrace/contrib/action_cable/patcher.rb +1 -0
  30. data/lib/ddtrace/contrib/action_mailer/configuration/settings.rb +32 -0
  31. data/lib/ddtrace/contrib/action_mailer/event.rb +50 -0
  32. data/lib/ddtrace/contrib/action_mailer/events/deliver.rb +54 -0
  33. data/lib/ddtrace/contrib/action_mailer/events/process.rb +41 -0
  34. data/lib/ddtrace/contrib/action_mailer/events.rb +31 -0
  35. data/lib/ddtrace/contrib/action_mailer/ext.rb +32 -0
  36. data/lib/ddtrace/contrib/action_mailer/integration.rb +45 -0
  37. data/lib/ddtrace/contrib/action_mailer/patcher.rb +27 -0
  38. data/lib/ddtrace/contrib/active_job/configuration/settings.rb +33 -0
  39. data/lib/ddtrace/contrib/active_job/event.rb +54 -0
  40. data/lib/ddtrace/contrib/active_job/events/discard.rb +46 -0
  41. data/lib/ddtrace/contrib/active_job/events/enqueue.rb +45 -0
  42. data/lib/ddtrace/contrib/active_job/events/enqueue_at.rb +45 -0
  43. data/lib/ddtrace/contrib/active_job/events/enqueue_retry.rb +47 -0
  44. data/lib/ddtrace/contrib/active_job/events/perform.rb +45 -0
  45. data/lib/ddtrace/contrib/active_job/events/retry_stopped.rb +46 -0
  46. data/lib/ddtrace/contrib/active_job/events.rb +39 -0
  47. data/lib/ddtrace/contrib/active_job/ext.rb +32 -0
  48. data/lib/ddtrace/contrib/active_job/integration.rb +46 -0
  49. data/lib/ddtrace/contrib/active_job/log_injection.rb +21 -0
  50. data/lib/ddtrace/contrib/active_job/patcher.rb +33 -0
  51. data/lib/ddtrace/contrib/auto_instrument.rb +0 -1
  52. data/lib/ddtrace/contrib/delayed_job/plugin.rb +2 -2
  53. data/lib/ddtrace/contrib/mongodb/instrumentation.rb +1 -1
  54. data/lib/ddtrace/contrib/mongodb/integration.rb +5 -0
  55. data/lib/ddtrace/contrib/rails/auto_instrument_railtie.rb +0 -1
  56. data/lib/ddtrace/contrib/rails/configuration/settings.rb +7 -0
  57. data/lib/ddtrace/contrib/rails/framework.rb +24 -1
  58. data/lib/ddtrace/contrib/rails/patcher.rb +19 -10
  59. data/lib/ddtrace/contrib/redis/instrumentation.rb +90 -0
  60. data/lib/ddtrace/contrib/redis/patcher.rb +2 -84
  61. data/lib/ddtrace/contrib/registerable.rb +0 -1
  62. data/lib/ddtrace/contrib/resque/integration.rb +1 -5
  63. data/lib/ddtrace/contrib/sidekiq/ext.rb +3 -0
  64. data/lib/ddtrace/contrib/sidekiq/integration.rb +10 -0
  65. data/lib/ddtrace/contrib/sidekiq/patcher.rb +26 -0
  66. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat.rb +30 -0
  67. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch.rb +30 -0
  68. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push.rb +29 -0
  69. data/lib/ddtrace/contrib/sinatra/env.rb +2 -1
  70. data/lib/ddtrace/contrib/sinatra/tracer.rb +15 -2
  71. data/lib/ddtrace/ext/git.rb +12 -0
  72. data/lib/ddtrace/ext/priority.rb +6 -4
  73. data/lib/ddtrace/ext/profiling.rb +8 -11
  74. data/lib/ddtrace/ext/runtime.rb +3 -0
  75. data/lib/ddtrace/ext/transport.rb +11 -0
  76. data/lib/ddtrace/metrics.rb +2 -2
  77. data/lib/ddtrace/profiling/collectors/stack.rb +112 -72
  78. data/lib/ddtrace/profiling/encoding/profile.rb +10 -2
  79. data/lib/ddtrace/profiling/events/stack.rb +13 -13
  80. data/lib/ddtrace/profiling/native_extension.rb +23 -1
  81. data/lib/ddtrace/profiling/pprof/builder.rb +8 -2
  82. data/lib/ddtrace/profiling/pprof/converter.rb +22 -9
  83. data/lib/ddtrace/profiling/pprof/stack_sample.rb +32 -9
  84. data/lib/ddtrace/profiling/pprof/template.rb +2 -2
  85. data/lib/ddtrace/profiling/scheduler.rb +20 -4
  86. data/lib/ddtrace/profiling/tasks/setup.rb +21 -13
  87. data/lib/ddtrace/profiling/trace_identifiers/ddtrace.rb +10 -9
  88. data/lib/ddtrace/profiling/trace_identifiers/helper.rb +5 -5
  89. data/lib/ddtrace/profiling/transport/http/api/endpoint.rb +8 -15
  90. data/lib/ddtrace/profiling/transport/http.rb +8 -17
  91. data/lib/ddtrace/profiling.rb +0 -2
  92. data/lib/ddtrace/runtime/metrics.rb +14 -0
  93. data/lib/ddtrace/sampler.rb +18 -8
  94. data/lib/ddtrace/sampling/rule_sampler.rb +13 -1
  95. data/lib/ddtrace/span.rb +7 -19
  96. data/lib/ddtrace/tracer.rb +1 -1
  97. data/lib/ddtrace/transport/http/adapters/net.rb +13 -3
  98. data/lib/ddtrace/transport/http/adapters/test.rb +4 -2
  99. data/lib/ddtrace/transport/http/adapters/unix_socket.rb +23 -12
  100. data/lib/ddtrace/transport/http/builder.rb +13 -6
  101. data/lib/ddtrace/transport/http.rb +5 -11
  102. data/lib/ddtrace/utils/time.rb +11 -6
  103. data/lib/ddtrace/version.rb +2 -2
  104. data/lib/ddtrace/workers/{loop.rb → interval_loop.rb} +0 -16
  105. data/lib/ddtrace/workers/polling.rb +1 -1
  106. metadata +40 -10
  107. data/lib/ddtrace/profiling/ext/cpu.rb +0 -67
  108. data/lib/ddtrace/profiling/ext/cthread.rb +0 -156
@@ -4,13 +4,15 @@ module Datadog
  # Priority is a hint given to the backend so that it knows which traces to reject or keep.
  # In a distributed context, it should be set before any context propagation (fork, RPC calls) to be effective.
  module Priority
- # Use this to explicitely inform the backend that a trace should be rejected and not stored.
+ # Use this to explicitly inform the backend that a trace MUST be rejected and not stored.
+ # This includes rules and rate limits configured by the user through the {RuleSampler}.
  USER_REJECT = -1
- # Used by the builtin sampler to inform the backend that a trace should be rejected and not stored.
+ # Used by the {PrioritySampler} to inform the backend that a trace should be rejected and not stored.
  AUTO_REJECT = 0
- # Used by the builtin sampler to inform the backend that a trace should be kept and stored.
+ # Used by the {PrioritySampler} to inform the backend that a trace should be kept and stored.
  AUTO_KEEP = 1
- # Use this to explicitely inform the backend that a trace should be kept and stored.
+ # Use this to explicitly inform the backend that a trace MUST be kept and stored.
+ # This includes rules and rate limits configured by the user through the {RuleSampler}.
  USER_KEEP = 2
  end
  end
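
The wording change above matters for manual trace retention: the `USER_*` values now explicitly override user-configured rules and rate limits, not just the automatic sampler. A minimal sketch of forcing a keep decision with the pre-1.0 API (assumes an active trace; `Context#sampling_priority=` is the accessor these constants are meant for):

```ruby
require 'ddtrace'

Datadog.tracer.trace('checkout.process') do |span|
  # Force this trace to be kept, overriding automatic sampling decisions
  # (and, per the updated docs above, user-configured rules and rate limits too)
  span.context.sampling_priority = Datadog::Ext::Priority::USER_KEEP
end
```
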
@@ -6,11 +6,12 @@ module Datadog
  ENV_UPLOAD_TIMEOUT = 'DD_PROFILING_UPLOAD_TIMEOUT'.freeze
  ENV_MAX_FRAMES = 'DD_PROFILING_MAX_FRAMES'.freeze
  ENV_AGENTLESS = 'DD_PROFILING_AGENTLESS'.freeze
+ ENV_ENDPOINT_COLLECTION_ENABLED = 'DD_PROFILING_ENDPOINT_COLLECTION_ENABLED'.freeze

  module Pprof
+ LABEL_KEY_LOCAL_ROOT_SPAN_ID = 'local root span id'.freeze
  LABEL_KEY_SPAN_ID = 'span id'.freeze
  LABEL_KEY_THREAD_ID = 'thread id'.freeze
- LABEL_KEY_TRACE_ID = 'trace id'.freeze
  LABEL_KEY_TRACE_ENDPOINT = 'trace endpoint'.freeze
  SAMPLE_VALUE_NO_VALUE = 0
  VALUE_TYPE_CPU = 'cpu-time'.freeze
@@ -22,13 +23,9 @@ module Datadog
  module HTTP
  URI_TEMPLATE_DD_API = 'https://intake.profile.%s/'.freeze

- FORM_FIELD_DATA = 'data[0]'.freeze
- FORM_FIELD_FORMAT = 'format'.freeze
- FORM_FIELD_FORMAT_PPROF = 'pprof'.freeze
- FORM_FIELD_RECORDING_END = 'recording-end'.freeze
- FORM_FIELD_RECORDING_START = 'recording-start'.freeze
- FORM_FIELD_RUNTIME = 'runtime'.freeze
- FORM_FIELD_RUNTIME_ID = 'runtime-id'.freeze
+ FORM_FIELD_RECORDING_START = 'start'.freeze
+ FORM_FIELD_RECORDING_END = 'end'.freeze
+ FORM_FIELD_FAMILY = 'family'.freeze
  FORM_FIELD_TAG_ENV = 'env'.freeze
  FORM_FIELD_TAG_HOST = 'host'.freeze
  FORM_FIELD_TAG_LANGUAGE = 'language'.freeze
@@ -42,13 +39,13 @@ module Datadog
  FORM_FIELD_TAG_SERVICE = 'service'.freeze
  FORM_FIELD_TAG_VERSION = 'version'.freeze
  FORM_FIELD_TAGS = 'tags'.freeze
- FORM_FIELD_TYPES = 'types[0]'.freeze
- FORM_FIELD_TYPES_AUTO = 'auto'.freeze
+ FORM_FIELD_INTAKE_VERSION = 'version'.freeze

  HEADER_CONTENT_TYPE = 'Content-Type'.freeze
  HEADER_CONTENT_TYPE_OCTET_STREAM = 'application/octet-stream'.freeze

- PPROF_DEFAULT_FILENAME = 'profile.pb.gz'.freeze
+ FORM_FIELD_PPROF_DATA = 'data[rubyprofile.pprof]'.freeze
+ PPROF_DEFAULT_FILENAME = 'rubyprofile.pprof.gz'.freeze
  end
  end
  end
@@ -6,6 +6,7 @@ module Datadog
  module Runtime
  TAG_ID = 'runtime-id'.freeze
  TAG_LANG = 'language'.freeze
+ TAG_PID = 'system.pid'.freeze

  # Metrics
  module Metrics
@@ -14,6 +15,8 @@ module Datadog
  METRIC_CLASS_COUNT = 'runtime.ruby.class_count'.freeze
  METRIC_GC_PREFIX = 'runtime.ruby.gc'.freeze
  METRIC_THREAD_COUNT = 'runtime.ruby.thread_count'.freeze
+ METRIC_GLOBAL_CONSTANT_STATE = 'runtime.ruby.global_constant_state'.freeze
+ METRIC_GLOBAL_METHOD_STATE = 'runtime.ruby.global_method_state'.freeze

  TAG_SERVICE = 'service'.freeze
  end
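
The two new runtime metrics report MRI's global cache invalidation counters. This diff doesn't show the new `data/lib/datadog/core/environment/vm_cache.rb` helper, but on the Ruby versions that expose them these counters presumably come from `RubyVM.stat`; a sketch of reading them directly:

```ruby
# Sketch, assuming the metrics are sourced from RubyVM.stat. Both keys exist
# on MRI 2.x; :global_method_state was removed in Ruby 3.0 when the method
# cache became per-class.
stat = RubyVM.stat
stat[:global_constant_state] # bumped whenever a constant is (re)defined
stat[:global_method_state]   # bumped whenever the global method cache is invalidated
```
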
@@ -3,6 +3,7 @@ module Datadog
  module Ext
  module Transport
  module HTTP
+ ADAPTER = :net_http # DEV: Rename to simply `:http`, as Net::HTTP is an implementation detail.
  DEFAULT_HOST = '127.0.0.1'.freeze
  DEFAULT_PORT = 8126
  DEFAULT_TIMEOUT_SECONDS = 1
@@ -16,6 +17,16 @@ module Datadog
  HEADER_META_LANG_INTERPRETER = 'Datadog-Meta-Lang-Interpreter'.freeze
  HEADER_META_TRACER_VERSION = 'Datadog-Meta-Tracer-Version'.freeze
  end
+
+ module Test
+ ADAPTER = :test
+ end
+
+ module UnixSocket
+ ADAPTER = :unix
+ DEFAULT_PATH = '/var/run/datadog/apm.socket'.freeze
+ DEFAULT_TIMEOUT_SECONDS = 1
+ end
  end
  end
  end
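
With `UnixSocket::ADAPTER` and a default socket path now defined, the tracer can reach the agent over a Unix domain socket. A sketch using the 0.x `transport_options` API (the path shown is just the `DEFAULT_PATH` constant above):

```ruby
Datadog.configure do |c|
  c.tracer.transport_options = proc do |t|
    # :unix matches Ext::Transport::UnixSocket::ADAPTER defined above
    t.adapter :unix, '/var/run/datadog/apm.socket'
  end
end
```
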
@@ -31,7 +31,7 @@ module Datadog
  !version.nil? && version >= Gem::Version.new('3.3.0') &&
  # dogstatsd-ruby >= 5.0 & < 5.2.0 has known issues with process forks
  # and do not support the single thread mode we use to avoid this problem.
- !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.2'))
+ !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.3'))
  end

  def enabled?
@@ -274,7 +274,7 @@ module Datadog
  IGNORED_STATSD_ONLY_ONCE.run do
  Datadog.logger.warn(
  'Ignoring user-supplied statsd instance as currently-installed version of dogstatsd-ruby is incompatible. ' \
- "To fix this, ensure that you have `gem 'dogstatsd-ruby', '~> 5.2'` on your Gemfile or gems.rb file."
+ "To fix this, ensure that you have `gem 'dogstatsd-ruby', '~> 5.3'` on your Gemfile or gems.rb file."
  )
  end
  end
@@ -1,4 +1,6 @@
  # typed: true
+
+ require 'ddtrace/profiling/native_extension'
  require 'ddtrace/profiling/backtrace_location'
  require 'ddtrace/profiling/events/stack'
  require 'ddtrace/utils/only_once'
@@ -18,6 +20,13 @@ module Datadog
  DEFAULT_MAX_TIME_USAGE_PCT = 2.0
  MIN_INTERVAL = 0.01
  THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
+ THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
+ SYNTHETIC_STACK_IN_NATIVE_CODE = [BacktraceLocation.new('', 0, 'In native code').freeze].freeze
+
+ # This default was picked based on the current sampling performance and on expected concurrency on an average
+ # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
+ # optimizes for coverage (less chance to miss what a given thread is doing).
+ DEFAULT_MAX_THREADS_SAMPLED = 16

  attr_reader \
  :recorder,
@@ -25,7 +34,8 @@ module Datadog
  :trace_identifiers_helper,
  :ignore_thread,
  :max_time_usage_pct,
- :thread_api
+ :thread_api,
+ :cpu_time_provider

  def initialize(
  recorder,
@@ -33,7 +43,9 @@ module Datadog
  trace_identifiers_helper:, # Usually an instance of Datadog::Profiling::TraceIdentifiers::Helper
  ignore_thread: nil,
  max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
+ max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
  thread_api: Thread,
+ cpu_time_provider: Datadog::Profiling::NativeExtension,
  fork_policy: Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
  interval: MIN_INTERVAL,
  enabled: true
@@ -43,7 +55,10 @@ module Datadog
  @trace_identifiers_helper = trace_identifiers_helper
  @ignore_thread = ignore_thread
  @max_time_usage_pct = max_time_usage_pct
+ @max_threads_sampled = max_threads_sampled
  @thread_api = thread_api
+ # Only set the provider if it's able to work in the current Ruby/OS combo
+ @cpu_time_provider = cpu_time_provider unless cpu_time_provider.cpu_time_ns_for(thread_api.current).nil?

  # Workers::Async::Thread settings
  self.fork_policy = fork_policy
@@ -54,16 +69,17 @@ module Datadog
  # Workers::Polling settings
  self.enabled = enabled

- @warn_about_missing_cpu_time_instrumentation_only_once = Datadog::Utils::OnlyOnce.new
-
  # Cache this proc, since it's pretty expensive to keep recreating it
  @build_backtrace_location = method(:build_backtrace_location).to_proc
  # Cache this buffer, since it's pretty expensive to keep accessing it
  @stack_sample_event_recorder = recorder[Events::StackSample]
+ # See below for details on why this is needed
+ @needs_process_waiter_workaround =
+ Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
+ Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
  end

  def start
- @last_wall_time = Datadog::Utils::Time.get_time
  reset_cpu_time_tracking
  perform
  end
@@ -72,10 +88,6 @@ module Datadog
  collect_and_wait
  end

- def loop_back_off?
- false
- end
-
  def collect_and_wait
  run_time = Datadog::Utils::Time.measure do
  collect_events
@@ -87,24 +99,14 @@ module Datadog

  def collect_events
  events = []
-
- # Compute wall time interval
- current_wall_time = Datadog::Utils::Time.get_time
- last_wall_time = if instance_variable_defined?(:@last_wall_time)
- @last_wall_time
- else
- current_wall_time
- end
-
- wall_time_interval_ns = ((current_wall_time - last_wall_time).round(9) * 1e9).to_i
- @last_wall_time = current_wall_time
+ current_wall_time_ns = get_current_wall_time_timestamp_ns

  # Collect backtraces from each thread
- thread_api.list.each do |thread|
+ threads_to_sample.each do |thread|
  next unless thread.alive?
  next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)

- event = collect_thread_event(thread, wall_time_interval_ns)
+ event = collect_thread_event(thread, current_wall_time_ns)
  events << event unless event.nil?
  end

@@ -114,10 +116,30 @@ module Datadog
  events
  end

- def collect_thread_event(thread, wall_time_interval_ns)
+ def collect_thread_event(thread, current_wall_time_ns)
  locations = thread.backtrace_locations
  return if locations.nil?

+ # Having empty locations means that the thread is alive, but we don't know what it's doing:
+ #
+ # 1. It can be starting up
+ # ```
+ # > Thread.new { sleep }.backtrace
+ # => [] # <-- note the thread hasn't actually started running sleep yet, we got there first
+ # ```
+ # 2. It can be running native code
+ # ```
+ # > t = Process.detach(fork { sleep })
+ # => #<Process::Waiter:0x00007ffe7285f7a0 run>
+ # > t.backtrace
+ # => [] # <-- this can happen even minutes later, e.g. it's not a race as in 1.
+ # ```
+ # This effect has been observed in threads created by the Iodine web server and the ffi gem
+ #
+ # To give customers visibility into these threads, we replace the empty stack with one containing a
+ # synthetic placeholder frame, so that these threads are properly represented in the UX.
+ locations = SYNTHETIC_STACK_IN_NATIVE_CODE if locations.empty?
+
  # Get actual stack size then trim the stack
  stack_size = locations.length
  locations = locations[0..(max_frames - 1)]
@@ -125,45 +147,33 @@ module Datadog
  # Convert backtrace locations into structs
  locations = convert_backtrace_locations(locations)

- thread_id = thread.respond_to?(:pthread_thread_id) ? thread.pthread_thread_id : thread.object_id
- trace_id, span_id, trace_resource_container = trace_identifiers_helper.trace_identifiers_for(thread)
+ thread_id = thread.object_id
+ root_span_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
  cpu_time = get_cpu_time_interval!(thread)
+ wall_time_interval_ns =
+ get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)

  Events::StackSample.new(
  nil,
  locations,
  stack_size,
  thread_id,
- trace_id,
+ root_span_id,
  span_id,
- trace_resource_container,
+ trace_resource,
  cpu_time,
  wall_time_interval_ns
  )
  end

  def get_cpu_time_interval!(thread)
- # Return if we can't get the current CPU time
- unless thread.respond_to?(:cpu_time_instrumentation_installed?) && thread.cpu_time_instrumentation_installed?
- warn_about_missing_cpu_time_instrumentation(thread)
- return
- end
+ return unless cpu_time_provider

- current_cpu_time_ns = thread.cpu_time(:nanosecond)
+ current_cpu_time_ns = cpu_time_provider.cpu_time_ns_for(thread)

- # NOTE: This can still be nil even when all of the checks above passed because of a race: there's a bit of
- # initialization that needs to be done by the thread itself, and it's possible for us to try to sample
- # *before* the thread had time to finish the initialization
  return unless current_cpu_time_ns

- last_cpu_time_ns = (thread.thread_variable_get(THREAD_LAST_CPU_TIME_KEY) || current_cpu_time_ns)
- interval = current_cpu_time_ns - last_cpu_time_ns
-
- # Update CPU time for thread
- thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
-
- # Return interval
- interval
+ get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
  end

  def compute_wait_time(used_time)
@@ -209,38 +219,11 @@ module Datadog

  private

- def warn_about_missing_cpu_time_instrumentation(thread)
- @warn_about_missing_cpu_time_instrumentation_only_once.run do
- # Is the profiler thread instrumented? If it is, then we know instrumentation is available, but seems to be
- # missing on this thread we just found.
- #
- # As far as we know, it can be missing due to one of the following:
- #
- # a) The thread was started before we installed our instrumentation.
- # In this case, the fix is to make sure ddtrace gets loaded before any other parts of the application.
- #
- # b) The thread was started using the Ruby native APIs (e.g. from a C extension such as ffi).
- # Known cases right now that trigger this are the ethon/typhoeus gems.
- # We currently have no solution for this case; these threads will always be missing our CPU instrumentation.
- #
- # c) The thread was started with `Thread.start`/`Thread.fork` and hasn't yet enabled the instrumentation.
- # When threads are started using these APIs, there's a small time window during which the thread has started
- # but our code to apply the instrumentation hasn't run yet; in these cases it's just a matter of allowing
- # it to run and our instrumentation to be applied.
- #
- if thread_api.current.respond_to?(:cpu_time) && thread_api.current.cpu_time
- Datadog.logger.debug(
- "Thread ('#{thread}') is missing profiling instrumentation; other threads should be unaffected"
- )
- end
- end
- end
-
  # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
- # clean up any leftover per-thread cpu time counters, so that the first sample after starting doesn't end up with:
+ # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
  #
  # a) negative time: At least on my test docker container, and on the reliability environment, after the process
- # forks, the clock reference changes and (old cpu time - new cpu time) can be < 0
+ # forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
  #
  # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
  # restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
@@ -248,9 +231,66 @@ module Datadog
  # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
  def reset_cpu_time_tracking
  thread_api.list.each do |thread|
+ # See below for details on why this is needed
+ next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
  thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
+ thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
  end
  end
+
+ def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
+ # Process::Waiter crash workaround:
+ #
+ # This is a workaround for a Ruby VM segfault (usually something like
+ # "[BUG] Segmentation fault at 0x0000000000000008") in the affected Ruby versions.
+ # See https://bugs.ruby-lang.org/issues/17807 for details.
+ #
+ # In those Ruby versions, there's a very special subclass of `Thread` called `Process::Waiter` that causes VM
+ # crashes whenever something tries to read its instance or thread variables. This subclass of thread only
+ # shows up when the `Process.detach` API gets used.
+ # In the specs you'll find crash regression tests that include a way of reproducing it.
+ #
+ # As a workaround for now, we just skip it for the affected Rubies
+ return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
+ last_value = thread.thread_variable_get(key) || current_value
+ thread.thread_variable_set(key, current_value)
+
+ current_value - last_value
+ end
+
+ # Whenever there are more than max_threads_sampled threads active, we only sample a subset of them.
+ # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
+ # a big burst of work all at once (sample everything), and instead do a little work each time
+ # (sample a bit by bit).
+ #
+ # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
+ # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
+ # them more often, if they are slower, we take them less often -- which again means that over a longer period
+ # we should take roughly the same samples.
+ #
+ # One downside of this approach is that if there really are many threads, the resulting wall clock times
+ # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
+ # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
+ # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
+ # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
+ # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
+ # threads).
+ #
+ def threads_to_sample
+ all_threads = thread_api.list
+
+ if all_threads.size > @max_threads_sampled
+ all_threads.sample(@max_threads_sampled)
+ else
+ all_threads
+ end
+ end
+
+ def get_current_wall_time_timestamp_ns
+ Datadog::Utils::Time.get_time(:nanosecond)
+ end
  end
  end
  end
@@ -24,12 +24,20 @@ module Datadog
  flush.event_groups.each { |event_group| template.add_events!(event_group.event_class, event_group.events) }

  Datadog.logger.debug do
+ max_events = Datadog.configuration.profiling.advanced.max_events
+ events_sampled =
+ if flush.event_count == max_events
+ 'max events limit hit, events were sampled [profile will be biased], '
+ else
+ ''
+ end
+
  "Encoding profile covering #{flush.start.iso8601} to #{flush.finish.iso8601}, " \
- "events: #{flush.event_count} (#{template.debug_statistics})"
+ "events: #{flush.event_count} (#{events_sampled}#{template.debug_statistics})"
  end

  # Build the profile and encode it
- template.to_pprof
+ template.to_pprof(start: flush.start, finish: flush.finish)
  end
  end
  end
@@ -11,34 +11,34 @@ module Datadog
  :frames,
  :total_frame_count,
  :thread_id,
- :trace_id,
+ :root_span_id,
  :span_id,
- :trace_resource_container
+ :trace_resource

  def initialize(
  timestamp,
  frames,
  total_frame_count,
  thread_id,
- trace_id,
+ root_span_id,
  span_id,
- trace_resource_container
+ trace_resource
  )
  super(timestamp)

  @frames = frames
  @total_frame_count = total_frame_count
  @thread_id = thread_id
- @trace_id = trace_id
+ @root_span_id = root_span_id
  @span_id = span_id
- @trace_resource_container = trace_resource_container
+ @trace_resource = trace_resource

  @hash = [
  thread_id,
- trace_id,
+ root_span_id,
  span_id,
- # trace_resource_container is deliberately not included -- events that share the same (trace_id, span_id)
- # pair should also have the same trace_resource_container
+ # trace_resource is deliberately not included -- events that share the same (root_span_id, span_id) refer
+ # to the same trace
  frames.collect(&:hash),
  total_frame_count
  ].hash
@@ -56,9 +56,9 @@ module Datadog
  frames,
  total_frame_count,
  thread_id,
- trace_id,
+ root_span_id,
  span_id,
- trace_resource_container,
+ trace_resource,
  cpu_time_interval_ns,
  wall_time_interval_ns
  )
@@ -67,9 +67,9 @@ module Datadog
  frames,
  total_frame_count,
  thread_id,
- trace_id,
+ root_span_id,
  span_id,
- trace_resource_container
+ trace_resource
  )

  @cpu_time_interval_ns = cpu_time_interval_ns
@@ -2,7 +2,8 @@
  module Datadog
  module Profiling
  # This module contains classes and methods which are implemented using native code in the
- # ext/ddtrace_profiling_native_extension folder
+ # ext/ddtrace_profiling_native_extension folder, as well as some Ruby-level utilities that don't make sense to
+ # write using C
  module NativeExtension
  private_class_method def self.working?
  native_working?
@@ -13,6 +14,27 @@ module Datadog
  false
  end
  end
+
+ unless singleton_class.method_defined?(:clock_id_for)
+ def self.clock_id_for(_)
+ nil
+ end
+ end
+
+ def self.cpu_time_ns_for(thread)
+ clock_id =
+ begin
+ clock_id_for(thread)
+ rescue Errno::ESRCH
+ nil
+ end
+
+ begin
+ ::Process.clock_gettime(clock_id, :nanosecond) if clock_id
+ rescue Errno::EINVAL
+ nil
+ end
+ end
  end
  end
  end
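
The result is a nil-safe interface: `cpu_time_ns_for` returns nil whenever the native extension is missing or the thread's clock id can't be resolved, which is exactly what the stack collector's constructor checks before enabling CPU-time profiling. A usage sketch built from the names in this diff:

```ruby
cpu_ns = Datadog::Profiling::NativeExtension.cpu_time_ns_for(Thread.current)

if cpu_ns
  puts "Current thread has used #{cpu_ns / 1_000_000} ms of CPU time"
else
  puts 'CPU time unavailable on this Ruby/OS combo; profiling falls back to wall time'
end
```
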
@@ -4,6 +4,7 @@
  require 'ddtrace/profiling/flush'
  require 'ddtrace/profiling/pprof/message_set'
  require 'ddtrace/profiling/pprof/string_table'
+ require 'ddtrace/utils/time'

  module Datadog
  module Profiling
@@ -47,14 +48,19 @@ module Datadog
  Perftools::Profiles::Profile.encode(profile).force_encoding(DEFAULT_ENCODING)
  end

- def build_profile
+ def build_profile(start:, finish:)
+ start_ns = Datadog::Utils::Time.as_utc_epoch_ns(start)
+ finish_ns = Datadog::Utils::Time.as_utc_epoch_ns(finish)
+
  Perftools::Profiles::Profile.new(
  sample_type: @sample_types.messages,
  sample: @samples,
  mapping: @mappings.messages,
  location: @locations.values,
  function: @functions.messages,
- string_table: @string_table.strings
+ string_table: @string_table.strings,
+ time_nanos: start_ns,
+ duration_nanos: finish_ns - start_ns,
  )
  end

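`time_nanos` and `duration_nanos` are standard pprof `Profile` fields, so encoded profiles now carry their own time range. The `as_utc_epoch_ns` helper lives in the updated `data/lib/ddtrace/utils/time.rb`, which this diff doesn't show; a plausible sketch of what such a conversion needs to do:

```ruby
# Sketch only -- the real implementation is in ddtrace/utils/time.rb.
# Going through Rational avoids the precision loss that Time#to_f would
# introduce at nanosecond granularity.
def as_utc_epoch_ns(time)
  (time.to_r * 1_000_000_000).to_i
end
```
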
@@ -25,20 +25,19 @@ module Datadog
  # [key, EventGroup]
  event_groups = {}

+ # Aggregate each event into a group
+ # with identical properties, but different values.
  events.each do |event|
  key = yield(event)
- values = build_sample_values(event)
+ values = build_event_values(event)

  unless key.nil?
  if event_groups.key?(key)
- # Update values for group
- group_values = event_groups[key].values
- group_values.each_with_index do |group_value, i|
- group_values[i] = group_value + values[i]
- end
+ # Update existing group from event
+ update_group(event_groups[key], event, values)
  else
  # Add new group
- event_groups[key] = EventGroup.new(event, values)
+ event_groups[key] = new_group(event, values)
  end
  end
  end
@@ -57,7 +56,7 @@ module Datadog
  index
  end

- def build_sample_values(stack_sample)
+ def build_event_values(event)
  # Build a value array that matches the length of the sample types
  # Populate all values with "no value" by default
  Array.new(@sample_type_mappings.length, Datadog::Ext::Profiling::Pprof::SAMPLE_VALUE_NO_VALUE)
@@ -69,7 +68,7 @@ module Datadog

  # Represents a grouped event
  # 'sample' is an example event object from the group.
- # 'values' is the the summation of the group's sample values
+ # 'values' is the summation of the group's sample values
  EventGroup = Struct.new(:sample, :values)

  # Error when the mapping of a sample type to value index is unknown
@@ -84,6 +83,20 @@ module Datadog
  "Mapping for sample value type '#{type}' to index is unknown."
  end
  end
+
+ protected
+
+ def new_group(event, values)
+ EventGroup.new(event, values)
+ end
+
+ def update_group(event_group, event, values)
+ # Update values for group
+ group_values = event_group.values
+ group_values.each_with_index do |group_value, i|
+ group_values[i] = group_value + values[i]
+ end
+ end
  end
  end
  end
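
The new `new_group`/`update_group` protected methods turn group creation and accumulation into extension points, so subclasses can do more than sum values. A hypothetical subclass sketch (the name and behavior are illustrative, not from this diff):

```ruby
# Hypothetical: besides summing values (done by super), remember the most
# recent event seen for each group as its representative sample.
class LatestEventConverter < Datadog::Profiling::Pprof::Converter
  protected

  def update_group(event_group, event, values)
    super
    event_group.sample = event # EventGroup is a Struct, so the setter exists
  end
end
```
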