ddtrace 0.52.0 → 0.54.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +174 -11
- data/ddtrace.gemspec +6 -3
- data/docs/DevelopmentGuide.md +1 -6
- data/docs/GettingStarted.md +109 -18
- data/docs/ProfilingDevelopment.md +2 -2
- data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +86 -0
- data/ext/ddtrace_profiling_native_extension/clock_id.h +4 -0
- data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +52 -0
- data/ext/ddtrace_profiling_native_extension/clock_id_noop.c +14 -0
- data/ext/ddtrace_profiling_native_extension/extconf.rb +177 -8
- data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +35 -0
- data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +3 -0
- data/ext/ddtrace_profiling_native_extension/profiling.c +6 -1
- data/lib/datadog/ci/contrib/cucumber/formatter.rb +1 -0
- data/lib/datadog/ci/contrib/rspec/example.rb +1 -0
- data/lib/datadog/ci/contrib/rspec/integration.rb +2 -2
- data/lib/datadog/ci/ext/environment.rb +64 -22
- data/lib/datadog/ci/ext/test.rb +1 -0
- data/lib/datadog/ci/test.rb +5 -1
- data/lib/datadog/contrib.rb +2 -0
- data/lib/datadog/core/environment/vm_cache.rb +46 -0
- data/lib/ddtrace/buffer.rb +28 -16
- data/lib/ddtrace/configuration/agent_settings_resolver.rb +131 -53
- data/lib/ddtrace/configuration/components.rb +1 -1
- data/lib/ddtrace/configuration/settings.rb +13 -3
- data/lib/ddtrace/context.rb +10 -2
- data/lib/ddtrace/contrib/action_cable/instrumentation.rb +46 -0
- data/lib/ddtrace/contrib/action_cable/patcher.rb +1 -0
- data/lib/ddtrace/contrib/action_mailer/configuration/settings.rb +32 -0
- data/lib/ddtrace/contrib/action_mailer/event.rb +50 -0
- data/lib/ddtrace/contrib/action_mailer/events/deliver.rb +54 -0
- data/lib/ddtrace/contrib/action_mailer/events/process.rb +41 -0
- data/lib/ddtrace/contrib/action_mailer/events.rb +31 -0
- data/lib/ddtrace/contrib/action_mailer/ext.rb +32 -0
- data/lib/ddtrace/contrib/action_mailer/integration.rb +45 -0
- data/lib/ddtrace/contrib/action_mailer/patcher.rb +27 -0
- data/lib/ddtrace/contrib/active_job/configuration/settings.rb +33 -0
- data/lib/ddtrace/contrib/active_job/event.rb +54 -0
- data/lib/ddtrace/contrib/active_job/events/discard.rb +46 -0
- data/lib/ddtrace/contrib/active_job/events/enqueue.rb +45 -0
- data/lib/ddtrace/contrib/active_job/events/enqueue_at.rb +45 -0
- data/lib/ddtrace/contrib/active_job/events/enqueue_retry.rb +47 -0
- data/lib/ddtrace/contrib/active_job/events/perform.rb +45 -0
- data/lib/ddtrace/contrib/active_job/events/retry_stopped.rb +46 -0
- data/lib/ddtrace/contrib/active_job/events.rb +39 -0
- data/lib/ddtrace/contrib/active_job/ext.rb +32 -0
- data/lib/ddtrace/contrib/active_job/integration.rb +46 -0
- data/lib/ddtrace/contrib/active_job/log_injection.rb +21 -0
- data/lib/ddtrace/contrib/active_job/patcher.rb +33 -0
- data/lib/ddtrace/contrib/auto_instrument.rb +0 -1
- data/lib/ddtrace/contrib/delayed_job/plugin.rb +2 -2
- data/lib/ddtrace/contrib/mongodb/instrumentation.rb +1 -1
- data/lib/ddtrace/contrib/mongodb/integration.rb +5 -0
- data/lib/ddtrace/contrib/rails/auto_instrument_railtie.rb +0 -1
- data/lib/ddtrace/contrib/rails/configuration/settings.rb +7 -0
- data/lib/ddtrace/contrib/rails/framework.rb +24 -1
- data/lib/ddtrace/contrib/rails/patcher.rb +19 -10
- data/lib/ddtrace/contrib/redis/instrumentation.rb +90 -0
- data/lib/ddtrace/contrib/redis/patcher.rb +2 -84
- data/lib/ddtrace/contrib/registerable.rb +0 -1
- data/lib/ddtrace/contrib/resque/integration.rb +1 -5
- data/lib/ddtrace/contrib/sidekiq/ext.rb +3 -0
- data/lib/ddtrace/contrib/sidekiq/integration.rb +10 -0
- data/lib/ddtrace/contrib/sidekiq/patcher.rb +26 -0
- data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat.rb +30 -0
- data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch.rb +30 -0
- data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push.rb +29 -0
- data/lib/ddtrace/contrib/sinatra/env.rb +2 -1
- data/lib/ddtrace/contrib/sinatra/tracer.rb +15 -2
- data/lib/ddtrace/ext/git.rb +12 -0
- data/lib/ddtrace/ext/priority.rb +6 -4
- data/lib/ddtrace/ext/profiling.rb +8 -11
- data/lib/ddtrace/ext/runtime.rb +3 -0
- data/lib/ddtrace/ext/transport.rb +11 -0
- data/lib/ddtrace/metrics.rb +2 -2
- data/lib/ddtrace/profiling/collectors/stack.rb +112 -72
- data/lib/ddtrace/profiling/encoding/profile.rb +10 -2
- data/lib/ddtrace/profiling/events/stack.rb +13 -13
- data/lib/ddtrace/profiling/native_extension.rb +23 -1
- data/lib/ddtrace/profiling/pprof/builder.rb +8 -2
- data/lib/ddtrace/profiling/pprof/converter.rb +22 -9
- data/lib/ddtrace/profiling/pprof/stack_sample.rb +32 -9
- data/lib/ddtrace/profiling/pprof/template.rb +2 -2
- data/lib/ddtrace/profiling/scheduler.rb +20 -4
- data/lib/ddtrace/profiling/tasks/setup.rb +21 -13
- data/lib/ddtrace/profiling/trace_identifiers/ddtrace.rb +10 -9
- data/lib/ddtrace/profiling/trace_identifiers/helper.rb +5 -5
- data/lib/ddtrace/profiling/transport/http/api/endpoint.rb +8 -15
- data/lib/ddtrace/profiling/transport/http.rb +8 -17
- data/lib/ddtrace/profiling.rb +0 -2
- data/lib/ddtrace/runtime/metrics.rb +14 -0
- data/lib/ddtrace/sampler.rb +18 -8
- data/lib/ddtrace/sampling/rule_sampler.rb +13 -1
- data/lib/ddtrace/span.rb +7 -19
- data/lib/ddtrace/tracer.rb +1 -1
- data/lib/ddtrace/transport/http/adapters/net.rb +13 -3
- data/lib/ddtrace/transport/http/adapters/test.rb +4 -2
- data/lib/ddtrace/transport/http/adapters/unix_socket.rb +23 -12
- data/lib/ddtrace/transport/http/builder.rb +13 -6
- data/lib/ddtrace/transport/http.rb +5 -11
- data/lib/ddtrace/utils/time.rb +11 -6
- data/lib/ddtrace/version.rb +2 -2
- data/lib/ddtrace/workers/{loop.rb → interval_loop.rb} +0 -16
- data/lib/ddtrace/workers/polling.rb +1 -1
- metadata +40 -10
- data/lib/ddtrace/profiling/ext/cpu.rb +0 -67
- data/lib/ddtrace/profiling/ext/cthread.rb +0 -156
data/lib/ddtrace/ext/priority.rb
CHANGED
```diff
@@ -4,13 +4,15 @@ module Datadog
     # Priority is a hint given to the backend so that it knows which traces to reject or kept.
     # In a distributed context, it should be set before any context propagation (fork, RPC calls) to be effective.
     module Priority
-      # Use this to
+      # Use this to explicitly inform the backend that a trace MUST be rejected and not stored.
+      # This includes rules and rate limits configured by the user through the {RuleSampler}.
       USER_REJECT = -1
-      # Used by the
+      # Used by the {PrioritySampler} to inform the backend that a trace should be rejected and not stored.
       AUTO_REJECT = 0
-      # Used by the
+      # Used by the {PrioritySampler} to inform the backend that a trace should be kept and stored.
       AUTO_KEEP = 1
-      # Use this to
+      # Use this to explicitly inform the backend that a trace MUST be kept and stored.
+      # This includes rules and rate limits configured by the user through the {RuleSampler}.
       USER_KEEP = 2
     end
   end
```
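For context on how these priorities are used from application code, here is a minimal sketch assuming the 0.x tracer API (where a span exposes its `Datadog::Context`); it is illustrative and not part of the diff:

```ruby
require 'ddtrace'

Datadog.tracer.trace('checkout.process') do |span|
  # Force-keep this trace, overriding sampling rules and rate limits
  # (illustration of USER_KEEP; adjust to your own tracer setup).
  span.context.sampling_priority = Datadog::Ext::Priority::USER_KEEP
end
```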
data/lib/ddtrace/ext/profiling.rb
CHANGED
```diff
@@ -6,11 +6,12 @@ module Datadog
       ENV_UPLOAD_TIMEOUT = 'DD_PROFILING_UPLOAD_TIMEOUT'.freeze
       ENV_MAX_FRAMES = 'DD_PROFILING_MAX_FRAMES'.freeze
       ENV_AGENTLESS = 'DD_PROFILING_AGENTLESS'.freeze
+      ENV_ENDPOINT_COLLECTION_ENABLED = 'DD_PROFILING_ENDPOINT_COLLECTION_ENABLED'.freeze

       module Pprof
+        LABEL_KEY_LOCAL_ROOT_SPAN_ID = 'local root span id'.freeze
         LABEL_KEY_SPAN_ID = 'span id'.freeze
         LABEL_KEY_THREAD_ID = 'thread id'.freeze
-        LABEL_KEY_TRACE_ID = 'trace id'.freeze
         LABEL_KEY_TRACE_ENDPOINT = 'trace endpoint'.freeze
         SAMPLE_VALUE_NO_VALUE = 0
         VALUE_TYPE_CPU = 'cpu-time'.freeze
@@ -22,13 +23,9 @@ module Datadog
         module HTTP
           URI_TEMPLATE_DD_API = 'https://intake.profile.%s/'.freeze

-
-
-
-          FORM_FIELD_RECORDING_END = 'recording-end'.freeze
-          FORM_FIELD_RECORDING_START = 'recording-start'.freeze
-          FORM_FIELD_RUNTIME = 'runtime'.freeze
-          FORM_FIELD_RUNTIME_ID = 'runtime-id'.freeze
+          FORM_FIELD_RECORDING_START = 'start'.freeze
+          FORM_FIELD_RECORDING_END = 'end'.freeze
+          FORM_FIELD_FAMILY = 'family'.freeze
           FORM_FIELD_TAG_ENV = 'env'.freeze
           FORM_FIELD_TAG_HOST = 'host'.freeze
           FORM_FIELD_TAG_LANGUAGE = 'language'.freeze
@@ -42,13 +39,13 @@ module Datadog
           FORM_FIELD_TAG_SERVICE = 'service'.freeze
           FORM_FIELD_TAG_VERSION = 'version'.freeze
           FORM_FIELD_TAGS = 'tags'.freeze
-
-          FORM_FIELD_TYPES_AUTO = 'auto'.freeze
+          FORM_FIELD_INTAKE_VERSION = 'version'.freeze

           HEADER_CONTENT_TYPE = 'Content-Type'.freeze
           HEADER_CONTENT_TYPE_OCTET_STREAM = 'application/octet-stream'.freeze

-
+          FORM_FIELD_PPROF_DATA = 'data[rubyprofile.pprof]'.freeze
+          PPROF_DEFAULT_FILENAME = 'rubyprofile.pprof.gz'.freeze
         end
       end
     end
```
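`ENV_ENDPOINT_COLLECTION_ENABLED` names a new on/off environment variable for endpoint collection. A hypothetical sketch of reading such a flag into a boolean (an assumption about how the settings layer consumes it, not code from the gem):

```ruby
# Hypothetical: treat anything other than an explicit 'false' as enabled.
endpoint_collection_enabled =
  ENV.fetch('DD_PROFILING_ENDPOINT_COLLECTION_ENABLED', 'true').strip.downcase != 'false'
```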
data/lib/ddtrace/ext/runtime.rb
CHANGED
```diff
@@ -6,6 +6,7 @@ module Datadog
     module Runtime
       TAG_ID = 'runtime-id'.freeze
       TAG_LANG = 'language'.freeze
+      TAG_PID = 'system.pid'.freeze

       # Metrics
       module Metrics
@@ -14,6 +15,8 @@ module Datadog
         METRIC_CLASS_COUNT = 'runtime.ruby.class_count'.freeze
         METRIC_GC_PREFIX = 'runtime.ruby.gc'.freeze
         METRIC_THREAD_COUNT = 'runtime.ruby.thread_count'.freeze
+        METRIC_GLOBAL_CONSTANT_STATE = 'runtime.ruby.global_constant_state'.freeze
+        METRIC_GLOBAL_METHOD_STATE = 'runtime.ruby.global_method_state'.freeze

         TAG_SERVICE = 'service'.freeze
       end
```
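The two new metric names mirror MRI's global cache counters (presumably reported via the new `data/lib/datadog/core/environment/vm_cache.rb` listed above). A minimal sketch of where such numbers come from, assuming an MRI version where `RubyVM.stat` still exposes these keys (newer Rubies drop one or both):

```ruby
stat = RubyVM.stat
stat[:global_constant_state] # bumped when constants change, invalidating constant caches
stat[:global_method_state]   # bumped when method definitions change, invalidating method caches
```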
data/lib/ddtrace/ext/transport.rb
CHANGED
```diff
@@ -3,6 +3,7 @@ module Datadog
   module Ext
     module Transport
       module HTTP
+        ADAPTER = :net_http # DEV: Rename to simply `:http`, as Net::HTTP is an implementation detail.
         DEFAULT_HOST = '127.0.0.1'.freeze
         DEFAULT_PORT = 8126
         DEFAULT_TIMEOUT_SECONDS = 1
@@ -16,6 +17,16 @@ module Datadog
         HEADER_META_LANG_INTERPRETER = 'Datadog-Meta-Lang-Interpreter'.freeze
         HEADER_META_TRACER_VERSION = 'Datadog-Meta-Tracer-Version'.freeze
       end
+
+      module Test
+        ADAPTER = :test
+      end
+
+      module UnixSocket
+        ADAPTER = :unix
+        DEFAULT_PATH = '/var/run/datadog/apm.socket'.freeze
+        DEFAULT_TIMEOUT_SECONDS = 1
+      end
     end
   end
 end
```
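The new `Test` and `UnixSocket` modules give each transport adapter a symbolic name plus defaults. A hypothetical illustration of picking an adapter from these constants (not the gem's actual resolver logic, which lives in `agent_settings_resolver.rb`):

```ruby
require 'ddtrace'

adapter =
  if File.exist?(Datadog::Ext::Transport::UnixSocket::DEFAULT_PATH)
    Datadog::Ext::Transport::UnixSocket::ADAPTER # => :unix
  else
    Datadog::Ext::Transport::HTTP::ADAPTER       # => :net_http
  end
```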
data/lib/ddtrace/metrics.rb
CHANGED
```diff
@@ -31,7 +31,7 @@ module Datadog
       !version.nil? && version >= Gem::Version.new('3.3.0') &&
         # dogstatsd-ruby >= 5.0 & < 5.2.0 has known issues with process forks
         # and do not support the single thread mode we use to avoid this problem.
-        !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.
+        !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.3'))
     end

     def enabled?
@@ -274,7 +274,7 @@ module Datadog
       IGNORED_STATSD_ONLY_ONCE.run do
         Datadog.logger.warn(
           'Ignoring user-supplied statsd instance as currently-installed version of dogstastd-ruby is incompatible. ' \
-          "To fix this, ensure that you have `gem 'dogstatsd-ruby', '~> 5.
+          "To fix this, ensure that you have `gem 'dogstatsd-ruby', '~> 5.3'` on your Gemfile or gems.rb file."
         )
       end
     end
```
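The hunks above tighten the supported dogstatsd-ruby range to 3.3.0 or newer, excluding 5.0 up to (but not including) 5.3. Restated as a standalone predicate (same logic as the hunk, shown for readability):

```ruby
require 'rubygems'

def dogstatsd_version_supported?(version_string)
  version = Gem::Version.new(version_string)
  version >= Gem::Version.new('3.3.0') &&
    # 5.0 up to (but not including) 5.3 has fork issues and lacks single-thread mode.
    !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.3'))
end

dogstatsd_version_supported?('5.2.1') # => false
dogstatsd_version_supported?('5.3.0') # => true
```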
data/lib/ddtrace/profiling/collectors/stack.rb
CHANGED
```diff
@@ -1,4 +1,6 @@
 # typed: true
+
+require 'ddtrace/profiling/native_extension'
 require 'ddtrace/profiling/backtrace_location'
 require 'ddtrace/profiling/events/stack'
 require 'ddtrace/utils/only_once'
@@ -18,6 +20,13 @@ module Datadog
         DEFAULT_MAX_TIME_USAGE_PCT = 2.0
         MIN_INTERVAL = 0.01
         THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
+        THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
+        SYNTHETIC_STACK_IN_NATIVE_CODE = [BacktraceLocation.new('', 0, 'In native code').freeze].freeze
+
+        # This default was picked based on the current sampling performance and on expected concurrency on an average
+        # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
+        # optimizes for coverage (less chance to miss what a given thread is doing).
+        DEFAULT_MAX_THREADS_SAMPLED = 16

         attr_reader \
           :recorder,
@@ -25,7 +34,8 @@ module Datadog
           :trace_identifiers_helper,
           :ignore_thread,
           :max_time_usage_pct,
-          :thread_api
+          :thread_api,
+          :cpu_time_provider

         def initialize(
           recorder,
@@ -33,7 +43,9 @@ module Datadog
           trace_identifiers_helper:, # Usually an instance of Datadog::Profiling::TraceIdentifiers::Helper
           ignore_thread: nil,
           max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
+          max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
           thread_api: Thread,
+          cpu_time_provider: Datadog::Profiling::NativeExtension,
           fork_policy: Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
           interval: MIN_INTERVAL,
           enabled: true
@@ -43,7 +55,10 @@ module Datadog
           @trace_identifiers_helper = trace_identifiers_helper
           @ignore_thread = ignore_thread
           @max_time_usage_pct = max_time_usage_pct
+          @max_threads_sampled = max_threads_sampled
           @thread_api = thread_api
+          # Only set the provider if it's able to work in the current Ruby/OS combo
+          @cpu_time_provider = cpu_time_provider unless cpu_time_provider.cpu_time_ns_for(thread_api.current).nil?

           # Workers::Async::Thread settings
           self.fork_policy = fork_policy
@@ -54,16 +69,17 @@ module Datadog
           # Workers::Polling settings
           self.enabled = enabled

-          @warn_about_missing_cpu_time_instrumentation_only_once = Datadog::Utils::OnlyOnce.new
-
           # Cache this proc, since it's pretty expensive to keep recreating it
           @build_backtrace_location = method(:build_backtrace_location).to_proc
           # Cache this buffer, since it's pretty expensive to keep accessing it
           @stack_sample_event_recorder = recorder[Events::StackSample]
+          # See below for details on why this is needed
+          @needs_process_waiter_workaround =
+            Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
+            Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
         end

         def start
-          @last_wall_time = Datadog::Utils::Time.get_time
           reset_cpu_time_tracking
           perform
         end
@@ -72,10 +88,6 @@ module Datadog
           collect_and_wait
         end

-        def loop_back_off?
-          false
-        end
-
         def collect_and_wait
           run_time = Datadog::Utils::Time.measure do
             collect_events
@@ -87,24 +99,14 @@ module Datadog

         def collect_events
           events = []
-
-          # Compute wall time interval
-          current_wall_time = Datadog::Utils::Time.get_time
-          last_wall_time = if instance_variable_defined?(:@last_wall_time)
-                             @last_wall_time
-                           else
-                             current_wall_time
-                           end
-
-          wall_time_interval_ns = ((current_wall_time - last_wall_time).round(9) * 1e9).to_i
-          @last_wall_time = current_wall_time
+          current_wall_time_ns = get_current_wall_time_timestamp_ns

           # Collect backtraces from each thread
-
+          threads_to_sample.each do |thread|
             next unless thread.alive?
             next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)

-            event = collect_thread_event(thread,
+            event = collect_thread_event(thread, current_wall_time_ns)
             events << event unless event.nil?
           end

@@ -114,10 +116,30 @@ module Datadog
           events
         end

-        def collect_thread_event(thread,
+        def collect_thread_event(thread, current_wall_time_ns)
           locations = thread.backtrace_locations
           return if locations.nil?

+          # Having empty locations means that the thread is alive, but we don't know what it's doing:
+          #
+          # 1. It can be starting up
+          #    ```
+          #    > Thread.new { sleep }.backtrace
+          #    => [] # <-- note the thread hasn't actually started running sleep yet, we got there first
+          #    ```
+          # 2. It can be running native code
+          #    ```
+          #    > t = Process.detach(fork { sleep })
+          #    => #<Process::Waiter:0x00007ffe7285f7a0 run>
+          #    > t.backtrace
+          #    => [] # <-- this can happen even minutes later, e.g. it's not a race as in 1.
+          #    ```
+          #    This effect has been observed in threads created by the Iodine web server and the ffi gem
+          #
+          # To give customers visibility into these threads, we replace the empty stack with one containing a
+          # synthetic placeholder frame, so that these threads are properly represented in the UX.
+          locations = SYNTHETIC_STACK_IN_NATIVE_CODE if locations.empty?
+
           # Get actual stack size then trim the stack
           stack_size = locations.length
           locations = locations[0..(max_frames - 1)]
@@ -125,45 +147,33 @@ module Datadog
           # Convert backtrace locations into structs
           locations = convert_backtrace_locations(locations)

-          thread_id = thread.
-
+          thread_id = thread.object_id
+          root_span_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
           cpu_time = get_cpu_time_interval!(thread)
+          wall_time_interval_ns =
+            get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)

           Events::StackSample.new(
             nil,
             locations,
             stack_size,
             thread_id,
-
+            root_span_id,
             span_id,
-
+            trace_resource,
             cpu_time,
             wall_time_interval_ns
           )
         end

         def get_cpu_time_interval!(thread)
-
-          unless thread.respond_to?(:cpu_time_instrumentation_installed?) && thread.cpu_time_instrumentation_installed?
-            warn_about_missing_cpu_time_instrumentation(thread)
-            return
-          end
+          return unless cpu_time_provider

-          current_cpu_time_ns =
+          current_cpu_time_ns = cpu_time_provider.cpu_time_ns_for(thread)

-          # NOTE: This can still be nil even when all of the checks above passed because of a race: there's a bit of
-          # initialization that needs to be done by the thread itself, and it's possible for us to try to sample
-          # *before* the thread had time to finish the initialization
           return unless current_cpu_time_ns

-
-          interval = current_cpu_time_ns - last_cpu_time_ns
-
-          # Update CPU time for thread
-          thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
-
-          # Return interval
-          interval
+          get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
         end

         def compute_wait_time(used_time)
@@ -209,38 +219,11 @@ module Datadog

         private

-        def warn_about_missing_cpu_time_instrumentation(thread)
-          @warn_about_missing_cpu_time_instrumentation_only_once.run do
-            # Is the profiler thread instrumented? If it is, then we know instrumentation is available, but seems to be
-            # missing on this thread we just found.
-            #
-            # As far as we know, it can be missing due to one the following:
-            #
-            # a) The thread was started before we installed our instrumentation.
-            #    In this case, the fix is to make sure ddtrace gets loaded before any other parts of the application.
-            #
-            # b) The thread was started using the Ruby native APIs (e.g. from a C extension such as ffi).
-            #    Known cases right now that trigger this are the ethon/typhoeus gems.
-            #    We currently have no solution for this case; these threads will always be missing our CPU instrumentation.
-            #
-            # c) The thread was started with `Thread.start`/`Thread.fork` and hasn't yet enabled the instrumentation.
-            #    When threads are started using these APIs, there's a small time window during which the thread has started
-            #    but our code to apply the instrumentation hasn't run yet; in these cases it's just a matter of allowing
-            #    it to run and our instrumentation to be applied.
-            #
-            if thread_api.current.respond_to?(:cpu_time) && thread_api.current.cpu_time
-              Datadog.logger.debug(
-                "Thread ('#{thread}') is missing profiling instrumentation; other threads should be unaffected"
-              )
-            end
-          end
-        end
-
         # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
-        # clean up any leftover per-thread
+        # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
         #
         # a) negative time: At least on my test docker container, and on the reliability environment, after the process
-        #    forks, the
+        #    forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
         #
         # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
         #    restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
@@ -248,9 +231,66 @@ module Datadog
         # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
         def reset_cpu_time_tracking
           thread_api.list.each do |thread|
+            # See below for details on why this is needed
+            next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
             thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
+            thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
           end
         end
+
+        def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
+          # Process::Waiter crash workaround:
+          #
+          # This is a workaround for a Ruby VM segfault (usually something like
+          # "[BUG] Segmentation fault at 0x0000000000000008") in the affected Ruby versions.
+          # See https://bugs.ruby-lang.org/issues/17807 for details.
+          #
+          # In those Ruby versions, there's a very special subclass of `Thread` called `Process::Waiter` that causes VM
+          # crashes whenever something tries to read its instance or thread variables. This subclass of thread only
+          # shows up when the `Process.detach` API gets used.
+          # In the specs you'll find crash regression tests that include a way of reproducing it.
+          #
+          # As workaround for now we just skip it for the affected Rubies
+          return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
+          last_value = thread.thread_variable_get(key) || current_value
+          thread.thread_variable_set(key, current_value)
+
+          current_value - last_value
+        end
+
+        # Whenever there are more than max_threads_sampled active, we only sample a subset of them.
+        # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
+        # a big burst of work all at once (sample everything), and instead do a little work each time
+        # (sample a bit by bit).
+        #
+        # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
+        # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
+        # them more often, if they are slower, we take them less often -- which again means that over a longer period
+        # we should take sample roughly the same samples.
+        #
+        # One downside of this approach is that if there really are many threads, the resulting wall clock times
+        # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
+        # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
+        # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
+        # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
+        # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
+        # threads).
+        #
+        def threads_to_sample
+          all_threads = thread_api.list
+
+          if all_threads.size > @max_threads_sampled
+            all_threads.sample(@max_threads_sampled)
+          else
+            all_threads
+          end
+        end
+
+        def get_current_wall_time_timestamp_ns
+          Datadog::Utils::Time.get_time(:nanosecond)
+        end
       end
     end
   end
```
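The new `get_elapsed_since_last_sample_and_set_value` helper keeps the last-seen counter in a thread variable and returns the delta, so the first sample after a start or fork is attributed 0 rather than a negative or very large interval. A standalone illustration of that bookkeeping (illustrative names, not library code):

```ruby
LAST_WALL_CLOCK_KEY = :example_last_wall_clock # illustrative key, not the gem's

def elapsed_since_last_sample_ns(thread, current_ns)
  last_ns = thread.thread_variable_get(LAST_WALL_CLOCK_KEY) || current_ns
  thread.thread_variable_set(LAST_WALL_CLOCK_KEY, current_ns)
  current_ns - last_ns
end

thread = Thread.new { sleep }
now_ns = Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond)

elapsed_since_last_sample_ns(thread, now_ns)              # => 0 on the first sample
elapsed_since_last_sample_ns(thread, now_ns + 10_000_000) # => 10_000_000 on the next one
```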
data/lib/ddtrace/profiling/encoding/profile.rb
CHANGED
```diff
@@ -24,12 +24,20 @@ module Datadog
           flush.event_groups.each { |event_group| template.add_events!(event_group.event_class, event_group.events) }

           Datadog.logger.debug do
+            max_events = Datadog.configuration.profiling.advanced.max_events
+            events_sampled =
+              if flush.event_count == max_events
+                'max events limit hit, events were sampled [profile will be biased], '
+              else
+                ''
+              end
+
             "Encoding profile covering #{flush.start.iso8601} to #{flush.finish.iso8601}, " \
-            "events: #{flush.event_count} (#{template.debug_statistics})"
+            "events: #{flush.event_count} (#{events_sampled}#{template.debug_statistics})"
           end

           # Build the profile and encode it
-          template.to_pprof
+          template.to_pprof(start: flush.start, finish: flush.finish)
         end
       end
     end
```
data/lib/ddtrace/profiling/events/stack.rb
CHANGED
```diff
@@ -11,34 +11,34 @@ module Datadog
          :frames,
          :total_frame_count,
          :thread_id,
-         :
+         :root_span_id,
          :span_id,
-         :
+         :trace_resource

        def initialize(
          timestamp,
          frames,
          total_frame_count,
          thread_id,
-
+          root_span_id,
          span_id,
-
+          trace_resource
        )
          super(timestamp)

          @frames = frames
          @total_frame_count = total_frame_count
          @thread_id = thread_id
-         @
+         @root_span_id = root_span_id
          @span_id = span_id
-         @
+         @trace_resource = trace_resource

          @hash = [
            thread_id,
-
+           root_span_id,
            span_id,
-           #
-           #
+           # trace_resource is deliberately not included -- events that share the same (root_span_id, span_id) refer
+           # to the same trace
            frames.collect(&:hash),
            total_frame_count
          ].hash
@@ -56,9 +56,9 @@ module Datadog
          frames,
          total_frame_count,
          thread_id,
-
+          root_span_id,
          span_id,
-
+          trace_resource,
          cpu_time_interval_ns,
          wall_time_interval_ns
        )
@@ -67,9 +67,9 @@ module Datadog
          frames,
          total_frame_count,
          thread_id,
-
+          root_span_id,
          span_id,
-
+          trace_resource
        )

        @cpu_time_interval_ns = cpu_time_interval_ns
```
data/lib/ddtrace/profiling/native_extension.rb
CHANGED
```diff
@@ -2,7 +2,8 @@
 module Datadog
   module Profiling
     # This module contains classes and methods which are implemented using native code in the
-    # ext/ddtrace_profiling_native_extension folder
+    # ext/ddtrace_profiling_native_extension folder, as well as some Ruby-level utilities that don't make sense to
+    # write using C
     module NativeExtension
       private_class_method def self.working?
         native_working?
@@ -13,6 +14,27 @@ module Datadog
           false
         end
       end
+
+      unless singleton_class.method_defined?(:clock_id_for)
+        def self.clock_id_for(_)
+          nil
+        end
+      end
+
+      def self.cpu_time_ns_for(thread)
+        clock_id =
+          begin
+            clock_id_for(thread)
+          rescue Errno::ESRCH
+            nil
+          end
+
+        begin
+          ::Process.clock_gettime(clock_id, :nanosecond) if clock_id
+        rescue Errno::EINVAL
+          nil
+        end
+      end
     end
   end
 end
```
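`cpu_time_ns_for` pairs a per-thread clock id from the native extension with `Process.clock_gettime`. For the calling thread the same reading can be approximated in plain Ruby (a minimal sketch; sampling *other* threads is exactly what the native `clock_id_for` is needed for):

```ruby
# CPU time consumed by the current thread, in nanoseconds (available on Linux/macOS).
Process.clock_gettime(Process::CLOCK_THREAD_CPUTIME_ID, :nanosecond)
```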
data/lib/ddtrace/profiling/pprof/builder.rb
CHANGED
```diff
@@ -4,6 +4,7 @@
 require 'ddtrace/profiling/flush'
 require 'ddtrace/profiling/pprof/message_set'
 require 'ddtrace/profiling/pprof/string_table'
+require 'ddtrace/utils/time'

 module Datadog
   module Profiling
@@ -47,14 +48,19 @@ module Datadog
         Perftools::Profiles::Profile.encode(profile).force_encoding(DEFAULT_ENCODING)
       end

-      def build_profile
+      def build_profile(start:, finish:)
+        start_ns = Datadog::Utils::Time.as_utc_epoch_ns(start)
+        finish_ns = Datadog::Utils::Time.as_utc_epoch_ns(finish)
+
         Perftools::Profiles::Profile.new(
           sample_type: @sample_types.messages,
           sample: @samples,
           mapping: @mappings.messages,
           location: @locations.values,
           function: @functions.messages,
-          string_table: @string_table.strings
+          string_table: @string_table.strings,
+          time_nanos: start_ns,
+          duration_nanos: finish_ns - start_ns,
         )
       end

```
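`time_nanos` and `duration_nanos` are standard pprof `Profile` fields. A hypothetical equivalent of `Datadog::Utils::Time.as_utc_epoch_ns` (an assumption about that helper's behaviour, not the gem's source):

```ruby
# Hypothetical helper: convert a Time into integer nanoseconds since the Unix epoch.
def as_utc_epoch_ns(time)
  # Time#to_r keeps sub-second precision; scale the epoch offset to nanoseconds.
  (time.to_r * 1_000_000_000).to_i
end

start  = Time.utc(2021, 9, 1, 12, 0, 0)
finish = start + 60

time_nanos     = as_utc_epoch_ns(start)
duration_nanos = as_utc_epoch_ns(finish) - as_utc_epoch_ns(start) # => 60_000_000_000
```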
data/lib/ddtrace/profiling/pprof/converter.rb
CHANGED
```diff
@@ -25,20 +25,19 @@ module Datadog
         # [key, EventGroup]
         event_groups = {}

+        # Aggregate each event into a group
+        # with identical properties, but different values.
         events.each do |event|
           key = yield(event)
-          values =
+          values = build_event_values(event)

           unless key.nil?
             if event_groups.key?(key)
-              # Update
-
-              group_values.each_with_index do |group_value, i|
-                group_values[i] = group_value + values[i]
-              end
+              # Update existing group from event
+              update_group(event_groups[key], event, values)
             else
               # Add new group
-              event_groups[key] =
+              event_groups[key] = new_group(event, values)
             end
           end
         end
@@ -57,7 +56,7 @@ module Datadog
         index
       end

-      def
+      def build_event_values(event)
         # Build a value array that matches the length of the sample types
         # Populate all values with "no value" by default
         Array.new(@sample_type_mappings.length, Datadog::Ext::Profiling::Pprof::SAMPLE_VALUE_NO_VALUE)
@@ -69,7 +68,7 @@ module Datadog

       # Represents a grouped event
       # 'sample' is an example event object from the group.
-      # 'values' is the
+      # 'values' is the summation of the group's sample values
       EventGroup = Struct.new(:sample, :values)

       # Error when the mapping of a sample type to value index is unknown
@@ -84,6 +83,20 @@ module Datadog
           "Mapping for sample value type '#{type}' to index is unknown."
         end
       end
+
+      protected
+
+      def new_group(event, values)
+        EventGroup.new(event, values)
+      end
+
+      def update_group(event_group, event, values)
+        # Update values for group
+        group_values = event_group.values
+        group_values.each_with_index do |group_value, i|
+          group_values[i] = group_value + values[i]
+        end
+      end
     end
   end
 end
```
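The refactor splits value construction (`build_event_values`) and group maintenance (`new_group`, `update_group`) into overridable methods. A standalone illustration of the aggregation they implement (not library code):

```ruby
EventGroup = Struct.new(:sample, :values)

events = [
  { key: :render, values: [1_000, 1] },
  { key: :render, values: [2_000, 1] },
  { key: :db,     values: [500,   1] },
]

groups = {}
events.each do |event|
  if (group = groups[event[:key]])
    # Update existing group: sum each value position-wise.
    event[:values].each_with_index { |value, i| group.values[i] += value }
  else
    # Add a new group, seeded with this event's values.
    groups[event[:key]] = EventGroup.new(event, event[:values].dup)
  end
end

groups[:render].values # => [3000, 2]
```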