ddtrace 0.52.0 → 0.54.2
This diff compares the content of publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +174 -11
- data/ddtrace.gemspec +6 -3
- data/docs/DevelopmentGuide.md +1 -6
- data/docs/GettingStarted.md +109 -18
- data/docs/ProfilingDevelopment.md +2 -2
- data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +86 -0
- data/ext/ddtrace_profiling_native_extension/clock_id.h +4 -0
- data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +52 -0
- data/ext/ddtrace_profiling_native_extension/clock_id_noop.c +14 -0
- data/ext/ddtrace_profiling_native_extension/extconf.rb +177 -8
- data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +35 -0
- data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +3 -0
- data/ext/ddtrace_profiling_native_extension/profiling.c +6 -1
- data/lib/datadog/ci/contrib/cucumber/formatter.rb +1 -0
- data/lib/datadog/ci/contrib/rspec/example.rb +1 -0
- data/lib/datadog/ci/contrib/rspec/integration.rb +2 -2
- data/lib/datadog/ci/ext/environment.rb +64 -22
- data/lib/datadog/ci/ext/test.rb +1 -0
- data/lib/datadog/ci/test.rb +5 -1
- data/lib/datadog/contrib.rb +2 -0
- data/lib/datadog/core/environment/vm_cache.rb +46 -0
- data/lib/ddtrace/buffer.rb +28 -16
- data/lib/ddtrace/configuration/agent_settings_resolver.rb +131 -53
- data/lib/ddtrace/configuration/components.rb +1 -1
- data/lib/ddtrace/configuration/settings.rb +13 -3
- data/lib/ddtrace/context.rb +10 -2
- data/lib/ddtrace/contrib/action_cable/instrumentation.rb +46 -0
- data/lib/ddtrace/contrib/action_cable/patcher.rb +1 -0
- data/lib/ddtrace/contrib/action_mailer/configuration/settings.rb +32 -0
- data/lib/ddtrace/contrib/action_mailer/event.rb +50 -0
- data/lib/ddtrace/contrib/action_mailer/events/deliver.rb +54 -0
- data/lib/ddtrace/contrib/action_mailer/events/process.rb +41 -0
- data/lib/ddtrace/contrib/action_mailer/events.rb +31 -0
- data/lib/ddtrace/contrib/action_mailer/ext.rb +32 -0
- data/lib/ddtrace/contrib/action_mailer/integration.rb +45 -0
- data/lib/ddtrace/contrib/action_mailer/patcher.rb +27 -0
- data/lib/ddtrace/contrib/active_job/configuration/settings.rb +33 -0
- data/lib/ddtrace/contrib/active_job/event.rb +54 -0
- data/lib/ddtrace/contrib/active_job/events/discard.rb +46 -0
- data/lib/ddtrace/contrib/active_job/events/enqueue.rb +45 -0
- data/lib/ddtrace/contrib/active_job/events/enqueue_at.rb +45 -0
- data/lib/ddtrace/contrib/active_job/events/enqueue_retry.rb +47 -0
- data/lib/ddtrace/contrib/active_job/events/perform.rb +45 -0
- data/lib/ddtrace/contrib/active_job/events/retry_stopped.rb +46 -0
- data/lib/ddtrace/contrib/active_job/events.rb +39 -0
- data/lib/ddtrace/contrib/active_job/ext.rb +32 -0
- data/lib/ddtrace/contrib/active_job/integration.rb +46 -0
- data/lib/ddtrace/contrib/active_job/log_injection.rb +21 -0
- data/lib/ddtrace/contrib/active_job/patcher.rb +33 -0
- data/lib/ddtrace/contrib/auto_instrument.rb +0 -1
- data/lib/ddtrace/contrib/delayed_job/plugin.rb +2 -2
- data/lib/ddtrace/contrib/mongodb/instrumentation.rb +1 -1
- data/lib/ddtrace/contrib/mongodb/integration.rb +5 -0
- data/lib/ddtrace/contrib/rails/auto_instrument_railtie.rb +0 -1
- data/lib/ddtrace/contrib/rails/configuration/settings.rb +7 -0
- data/lib/ddtrace/contrib/rails/framework.rb +24 -1
- data/lib/ddtrace/contrib/rails/patcher.rb +19 -10
- data/lib/ddtrace/contrib/redis/instrumentation.rb +90 -0
- data/lib/ddtrace/contrib/redis/patcher.rb +2 -84
- data/lib/ddtrace/contrib/registerable.rb +0 -1
- data/lib/ddtrace/contrib/resque/integration.rb +1 -5
- data/lib/ddtrace/contrib/sidekiq/ext.rb +3 -0
- data/lib/ddtrace/contrib/sidekiq/integration.rb +10 -0
- data/lib/ddtrace/contrib/sidekiq/patcher.rb +26 -0
- data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat.rb +30 -0
- data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch.rb +30 -0
- data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push.rb +29 -0
- data/lib/ddtrace/contrib/sinatra/env.rb +2 -1
- data/lib/ddtrace/contrib/sinatra/tracer.rb +15 -2
- data/lib/ddtrace/ext/git.rb +12 -0
- data/lib/ddtrace/ext/priority.rb +6 -4
- data/lib/ddtrace/ext/profiling.rb +8 -11
- data/lib/ddtrace/ext/runtime.rb +3 -0
- data/lib/ddtrace/ext/transport.rb +11 -0
- data/lib/ddtrace/metrics.rb +2 -2
- data/lib/ddtrace/profiling/collectors/stack.rb +112 -72
- data/lib/ddtrace/profiling/encoding/profile.rb +10 -2
- data/lib/ddtrace/profiling/events/stack.rb +13 -13
- data/lib/ddtrace/profiling/native_extension.rb +23 -1
- data/lib/ddtrace/profiling/pprof/builder.rb +8 -2
- data/lib/ddtrace/profiling/pprof/converter.rb +22 -9
- data/lib/ddtrace/profiling/pprof/stack_sample.rb +32 -9
- data/lib/ddtrace/profiling/pprof/template.rb +2 -2
- data/lib/ddtrace/profiling/scheduler.rb +20 -4
- data/lib/ddtrace/profiling/tasks/setup.rb +21 -13
- data/lib/ddtrace/profiling/trace_identifiers/ddtrace.rb +10 -9
- data/lib/ddtrace/profiling/trace_identifiers/helper.rb +5 -5
- data/lib/ddtrace/profiling/transport/http/api/endpoint.rb +8 -15
- data/lib/ddtrace/profiling/transport/http.rb +8 -17
- data/lib/ddtrace/profiling.rb +0 -2
- data/lib/ddtrace/runtime/metrics.rb +14 -0
- data/lib/ddtrace/sampler.rb +18 -8
- data/lib/ddtrace/sampling/rule_sampler.rb +13 -1
- data/lib/ddtrace/span.rb +7 -19
- data/lib/ddtrace/tracer.rb +1 -1
- data/lib/ddtrace/transport/http/adapters/net.rb +13 -3
- data/lib/ddtrace/transport/http/adapters/test.rb +4 -2
- data/lib/ddtrace/transport/http/adapters/unix_socket.rb +23 -12
- data/lib/ddtrace/transport/http/builder.rb +13 -6
- data/lib/ddtrace/transport/http.rb +5 -11
- data/lib/ddtrace/utils/time.rb +11 -6
- data/lib/ddtrace/version.rb +2 -2
- data/lib/ddtrace/workers/{loop.rb → interval_loop.rb} +0 -16
- data/lib/ddtrace/workers/polling.rb +1 -1
- metadata +40 -10
- data/lib/ddtrace/profiling/ext/cpu.rb +0 -67
- data/lib/ddtrace/profiling/ext/cthread.rb +0 -156
data/lib/ddtrace/ext/priority.rb
CHANGED

```diff
@@ -4,13 +4,15 @@ module Datadog
     # Priority is a hint given to the backend so that it knows which traces to reject or kept.
     # In a distributed context, it should be set before any context propagation (fork, RPC calls) to be effective.
     module Priority
-      # Use this to
+      # Use this to explicitly inform the backend that a trace MUST be rejected and not stored.
+      # This includes rules and rate limits configured by the user through the {RuleSampler}.
       USER_REJECT = -1
-      # Used by the
+      # Used by the {PrioritySampler} to inform the backend that a trace should be rejected and not stored.
       AUTO_REJECT = 0
-      # Used by the
+      # Used by the {PrioritySampler} to inform the backend that a trace should be kept and stored.
       AUTO_KEEP = 1
-      # Use this to
+      # Use this to explicitly inform the backend that a trace MUST be kept and stored.
+      # This includes rules and rate limits configured by the user through the {RuleSampler}.
       USER_KEEP = 2
     end
   end
```
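The new comments distinguish user decisions (USER_*) from sampler decisions (AUTO_*). As a hedged sketch of how an application forces a decision under the 0.x API (the call site below is illustrative, not part of this diff):

```ruby
require 'ddtrace'

# Force-keep the current trace; USER_REJECT works symmetrically for traces
# that must never be stored.
Datadog.tracer.trace('checkout') do |span|
  span.context.sampling_priority = Datadog::Ext::Priority::USER_KEEP
  # ... business logic ...
end
```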
data/lib/ddtrace/ext/profiling.rb
CHANGED

```diff
@@ -6,11 +6,12 @@ module Datadog
       ENV_UPLOAD_TIMEOUT = 'DD_PROFILING_UPLOAD_TIMEOUT'.freeze
       ENV_MAX_FRAMES = 'DD_PROFILING_MAX_FRAMES'.freeze
       ENV_AGENTLESS = 'DD_PROFILING_AGENTLESS'.freeze
+      ENV_ENDPOINT_COLLECTION_ENABLED = 'DD_PROFILING_ENDPOINT_COLLECTION_ENABLED'.freeze
 
       module Pprof
+        LABEL_KEY_LOCAL_ROOT_SPAN_ID = 'local root span id'.freeze
         LABEL_KEY_SPAN_ID = 'span id'.freeze
         LABEL_KEY_THREAD_ID = 'thread id'.freeze
-        LABEL_KEY_TRACE_ID = 'trace id'.freeze
         LABEL_KEY_TRACE_ENDPOINT = 'trace endpoint'.freeze
         SAMPLE_VALUE_NO_VALUE = 0
         VALUE_TYPE_CPU = 'cpu-time'.freeze
@@ -22,13 +23,9 @@ module Datadog
       module HTTP
         URI_TEMPLATE_DD_API = 'https://intake.profile.%s/'.freeze
 
-
-
-
-        FORM_FIELD_RECORDING_END = 'recording-end'.freeze
-        FORM_FIELD_RECORDING_START = 'recording-start'.freeze
-        FORM_FIELD_RUNTIME = 'runtime'.freeze
-        FORM_FIELD_RUNTIME_ID = 'runtime-id'.freeze
+        FORM_FIELD_RECORDING_START = 'start'.freeze
+        FORM_FIELD_RECORDING_END = 'end'.freeze
+        FORM_FIELD_FAMILY = 'family'.freeze
         FORM_FIELD_TAG_ENV = 'env'.freeze
         FORM_FIELD_TAG_HOST = 'host'.freeze
         FORM_FIELD_TAG_LANGUAGE = 'language'.freeze
@@ -42,13 +39,13 @@ module Datadog
         FORM_FIELD_TAG_SERVICE = 'service'.freeze
         FORM_FIELD_TAG_VERSION = 'version'.freeze
         FORM_FIELD_TAGS = 'tags'.freeze
-
-        FORM_FIELD_TYPES_AUTO = 'auto'.freeze
+        FORM_FIELD_INTAKE_VERSION = 'version'.freeze
 
         HEADER_CONTENT_TYPE = 'Content-Type'.freeze
         HEADER_CONTENT_TYPE_OCTET_STREAM = 'application/octet-stream'.freeze
 
-
+        FORM_FIELD_PPROF_DATA = 'data[rubyprofile.pprof]'.freeze
+        PPROF_DEFAULT_FILENAME = 'rubyprofile.pprof.gz'.freeze
       end
     end
   end
```
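`ENV_ENDPOINT_COLLECTION_ENABLED` is a new on/off switch. A minimal sketch of how such a flag is typically read (the default shown is an assumption; the real plumbing is in `lib/ddtrace/configuration/settings.rb`, also changed in this release):

```ruby
# Hypothetical reader for the new environment flag (default is an assumption).
raw = ENV.fetch('DD_PROFILING_ENDPOINT_COLLECTION_ENABLED', 'true')
endpoint_collection_enabled = %w[true 1].include?(raw.strip.downcase)
```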
data/lib/ddtrace/ext/runtime.rb
CHANGED

```diff
@@ -6,6 +6,7 @@ module Datadog
     module Runtime
       TAG_ID = 'runtime-id'.freeze
       TAG_LANG = 'language'.freeze
+      TAG_PID = 'system.pid'.freeze
 
       # Metrics
       module Metrics
@@ -14,6 +15,8 @@ module Datadog
         METRIC_CLASS_COUNT = 'runtime.ruby.class_count'.freeze
         METRIC_GC_PREFIX = 'runtime.ruby.gc'.freeze
         METRIC_THREAD_COUNT = 'runtime.ruby.thread_count'.freeze
+        METRIC_GLOBAL_CONSTANT_STATE = 'runtime.ruby.global_constant_state'.freeze
+        METRIC_GLOBAL_METHOD_STATE = 'runtime.ruby.global_method_state'.freeze
 
         TAG_SERVICE = 'service'.freeze
       end
```
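The two new runtime metrics track the VM's global caches. On CRuby before 3.0 both counters are available from the public `RubyVM.stat` API, which is presumably what the new `data/lib/datadog/core/environment/vm_cache.rb` file wraps:

```ruby
# CRuby < 3.0: both cache-invalidation counters come from RubyVM.stat.
stat = RubyVM.stat
stat[:global_constant_state] # bumps whenever constants are (re)defined
stat[:global_method_state]   # bumps when the global method cache is invalidated
```

High churn in either counter usually points at code that redefines constants or methods at runtime.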
data/lib/ddtrace/ext/transport.rb
CHANGED

```diff
@@ -3,6 +3,7 @@ module Datadog
   module Ext
     module Transport
       module HTTP
+        ADAPTER = :net_http # DEV: Rename to simply `:http`, as Net::HTTP is an implementation detail.
         DEFAULT_HOST = '127.0.0.1'.freeze
         DEFAULT_PORT = 8126
         DEFAULT_TIMEOUT_SECONDS = 1
@@ -16,6 +17,16 @@ module Datadog
         HEADER_META_LANG_INTERPRETER = 'Datadog-Meta-Lang-Interpreter'.freeze
         HEADER_META_TRACER_VERSION = 'Datadog-Meta-Tracer-Version'.freeze
       end
+
+      module Test
+        ADAPTER = :test
+      end
+
+      module UnixSocket
+        ADAPTER = :unix
+        DEFAULT_PATH = '/var/run/datadog/apm.socket'.freeze
+        DEFAULT_TIMEOUT_SECONDS = 1
+      end
     end
   end
 end
```
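The `ADAPTER` symbols give each transport a stable registry name. A hedged sketch of selecting the Unix socket transport through the 0.x `transport_options` hook (the exact usage is an assumption, not shown in this diff):

```ruby
require 'ddtrace'

Datadog.configure do |c|
  c.tracer.transport_options = proc do |t|
    # :unix is Datadog::Ext::Transport::UnixSocket::ADAPTER
    t.adapter :unix, Datadog::Ext::Transport::UnixSocket::DEFAULT_PATH
  end
end
```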
data/lib/ddtrace/metrics.rb
CHANGED

```diff
@@ -31,7 +31,7 @@ module Datadog
       !version.nil? && version >= Gem::Version.new('3.3.0') &&
         # dogstatsd-ruby >= 5.0 & < 5.2.0 has known issues with process forks
         # and do not support the single thread mode we use to avoid this problem.
-        !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.
+        !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.3'))
     end
 
     def enabled?
@@ -274,7 +274,7 @@ module Datadog
       IGNORED_STATSD_ONLY_ONCE.run do
         Datadog.logger.warn(
           'Ignoring user-supplied statsd instance as currently-installed version of dogstastd-ruby is incompatible. ' \
-          "To fix this, ensure that you have `gem 'dogstatsd-ruby', '~> 5.
+          "To fix this, ensure that you have `gem 'dogstatsd-ruby', '~> 5.3'` on your Gemfile or gems.rb file."
        )
      end
    end
```
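The version gate now excludes dogstatsd-ruby 5.0 up to, but not including, 5.3. Extracted as a standalone predicate, grounded directly in the diff above:

```ruby
def dogstatsd_version_supported?(version_string)
  version = Gem::Version.new(version_string)
  version >= Gem::Version.new('3.3.0') &&
    !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.3'))
end

dogstatsd_version_supported?('5.2.1') # => false (fork-safety issues)
dogstatsd_version_supported?('5.3.0') # => true
```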
data/lib/ddtrace/profiling/collectors/stack.rb
CHANGED

```diff
@@ -1,4 +1,6 @@
 # typed: true
+
+require 'ddtrace/profiling/native_extension'
 require 'ddtrace/profiling/backtrace_location'
 require 'ddtrace/profiling/events/stack'
 require 'ddtrace/utils/only_once'
@@ -18,6 +20,13 @@ module Datadog
         DEFAULT_MAX_TIME_USAGE_PCT = 2.0
         MIN_INTERVAL = 0.01
         THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
+        THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
+        SYNTHETIC_STACK_IN_NATIVE_CODE = [BacktraceLocation.new('', 0, 'In native code').freeze].freeze
+
+        # This default was picked based on the current sampling performance and on expected concurrency on an average
+        # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
+        # optimizes for coverage (less chance to miss what a given thread is doing).
+        DEFAULT_MAX_THREADS_SAMPLED = 16
 
         attr_reader \
           :recorder,
@@ -25,7 +34,8 @@ module Datadog
           :trace_identifiers_helper,
           :ignore_thread,
           :max_time_usage_pct,
-          :thread_api
+          :thread_api,
+          :cpu_time_provider
 
         def initialize(
           recorder,
@@ -33,7 +43,9 @@ module Datadog
           trace_identifiers_helper:, # Usually an instance of Datadog::Profiling::TraceIdentifiers::Helper
           ignore_thread: nil,
           max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
+          max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
           thread_api: Thread,
+          cpu_time_provider: Datadog::Profiling::NativeExtension,
           fork_policy: Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
           interval: MIN_INTERVAL,
           enabled: true
@@ -43,7 +55,10 @@ module Datadog
           @trace_identifiers_helper = trace_identifiers_helper
           @ignore_thread = ignore_thread
           @max_time_usage_pct = max_time_usage_pct
+          @max_threads_sampled = max_threads_sampled
           @thread_api = thread_api
+          # Only set the provider if it's able to work in the current Ruby/OS combo
+          @cpu_time_provider = cpu_time_provider unless cpu_time_provider.cpu_time_ns_for(thread_api.current).nil?
 
           # Workers::Async::Thread settings
           self.fork_policy = fork_policy
@@ -54,16 +69,17 @@ module Datadog
           # Workers::Polling settings
           self.enabled = enabled
 
-          @warn_about_missing_cpu_time_instrumentation_only_once = Datadog::Utils::OnlyOnce.new
-
           # Cache this proc, since it's pretty expensive to keep recreating it
           @build_backtrace_location = method(:build_backtrace_location).to_proc
           # Cache this buffer, since it's pretty expensive to keep accessing it
           @stack_sample_event_recorder = recorder[Events::StackSample]
+          # See below for details on why this is needed
+          @needs_process_waiter_workaround =
+            Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
+            Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
         end
 
         def start
-          @last_wall_time = Datadog::Utils::Time.get_time
           reset_cpu_time_tracking
           perform
         end
@@ -72,10 +88,6 @@ module Datadog
           collect_and_wait
         end
 
-        def loop_back_off?
-          false
-        end
-
         def collect_and_wait
           run_time = Datadog::Utils::Time.measure do
             collect_events
@@ -87,24 +99,14 @@ module Datadog
 
         def collect_events
           events = []
-
-          # Compute wall time interval
-          current_wall_time = Datadog::Utils::Time.get_time
-          last_wall_time = if instance_variable_defined?(:@last_wall_time)
-                             @last_wall_time
-                           else
-                             current_wall_time
-                           end
-
-          wall_time_interval_ns = ((current_wall_time - last_wall_time).round(9) * 1e9).to_i
-          @last_wall_time = current_wall_time
+          current_wall_time_ns = get_current_wall_time_timestamp_ns
 
           # Collect backtraces from each thread
-
+          threads_to_sample.each do |thread|
             next unless thread.alive?
             next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)
 
-            event = collect_thread_event(thread,
+            event = collect_thread_event(thread, current_wall_time_ns)
             events << event unless event.nil?
           end
 
@@ -114,10 +116,30 @@ module Datadog
           events
         end
 
-        def collect_thread_event(thread,
+        def collect_thread_event(thread, current_wall_time_ns)
           locations = thread.backtrace_locations
           return if locations.nil?
 
+          # Having empty locations means that the thread is alive, but we don't know what it's doing:
+          #
+          # 1. It can be starting up
+          #    ```
+          #    > Thread.new { sleep }.backtrace
+          #    => [] # <-- note the thread hasn't actually started running sleep yet, we got there first
+          #    ```
+          # 2. It can be running native code
+          #    ```
+          #    > t = Process.detach(fork { sleep })
+          #    => #<Process::Waiter:0x00007ffe7285f7a0 run>
+          #    > t.backtrace
+          #    => [] # <-- this can happen even minutes later, e.g. it's not a race as in 1.
+          #    ```
+          #    This effect has been observed in threads created by the Iodine web server and the ffi gem
+          #
+          # To give customers visibility into these threads, we replace the empty stack with one containing a
+          # synthetic placeholder frame, so that these threads are properly represented in the UX.
+          locations = SYNTHETIC_STACK_IN_NATIVE_CODE if locations.empty?
+
           # Get actual stack size then trim the stack
           stack_size = locations.length
           locations = locations[0..(max_frames - 1)]
@@ -125,45 +147,33 @@ module Datadog
           # Convert backtrace locations into structs
           locations = convert_backtrace_locations(locations)
 
-          thread_id = thread.
-
+          thread_id = thread.object_id
+          root_span_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
           cpu_time = get_cpu_time_interval!(thread)
+          wall_time_interval_ns =
+            get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)
 
           Events::StackSample.new(
             nil,
             locations,
             stack_size,
             thread_id,
-
+            root_span_id,
             span_id,
-
+            trace_resource,
             cpu_time,
             wall_time_interval_ns
           )
         end
 
         def get_cpu_time_interval!(thread)
-
-          unless thread.respond_to?(:cpu_time_instrumentation_installed?) && thread.cpu_time_instrumentation_installed?
-            warn_about_missing_cpu_time_instrumentation(thread)
-            return
-          end
+          return unless cpu_time_provider
 
-          current_cpu_time_ns =
+          current_cpu_time_ns = cpu_time_provider.cpu_time_ns_for(thread)
 
-          # NOTE: This can still be nil even when all of the checks above passed because of a race: there's a bit of
-          # initialization that needs to be done by the thread itself, and it's possible for us to try to sample
-          # *before* the thread had time to finish the initialization
           return unless current_cpu_time_ns
 
-
-          interval = current_cpu_time_ns - last_cpu_time_ns
-
-          # Update CPU time for thread
-          thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
-
-          # Return interval
-          interval
+          get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
         end
 
         def compute_wait_time(used_time)
```
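`compute_wait_time` (context line above) is what turns `max_time_usage_pct` into a sleep between samples. The formula below illustrates the duty-cycle idea and is an assumption, not copied from ddtrace:

```ruby
# If one sampling pass took `used_time` seconds and the profiler may only
# consume ~2% of wall time, sleep long enough to amortize the work done.
def compute_wait_time(used_time, max_time_usage_pct: 2.0, min_interval: 0.01)
  [used_time * (100.0 / max_time_usage_pct - 1), min_interval].max
end

compute_wait_time(0.001) # => 0.049: 1ms of sampling buys ~49ms of sleep at 2%
```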
data/lib/ddtrace/profiling/collectors/stack.rb (continued)

```diff
@@ -209,38 +219,11 @@ module Datadog
 
         private
 
-        def warn_about_missing_cpu_time_instrumentation(thread)
-          @warn_about_missing_cpu_time_instrumentation_only_once.run do
-            # Is the profiler thread instrumented? If it is, then we know instrumentation is available, but seems to be
-            # missing on this thread we just found.
-            #
-            # As far as we know, it can be missing due to one the following:
-            #
-            # a) The thread was started before we installed our instrumentation.
-            #    In this case, the fix is to make sure ddtrace gets loaded before any other parts of the application.
-            #
-            # b) The thread was started using the Ruby native APIs (e.g. from a C extension such as ffi).
-            #    Known cases right now that trigger this are the ethon/typhoeus gems.
-            #    We currently have no solution for this case; these threads will always be missing our CPU instrumentation.
-            #
-            # c) The thread was started with `Thread.start`/`Thread.fork` and hasn't yet enabled the instrumentation.
-            #    When threads are started using these APIs, there's a small time window during which the thread has started
-            #    but our code to apply the instrumentation hasn't run yet; in these cases it's just a matter of allowing
-            #    it to run and our instrumentation to be applied.
-            #
-            if thread_api.current.respond_to?(:cpu_time) && thread_api.current.cpu_time
-              Datadog.logger.debug(
-                "Thread ('#{thread}') is missing profiling instrumentation; other threads should be unaffected"
-              )
-            end
-          end
-        end
-
         # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
-        # clean up any leftover per-thread
+        # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
         #
         # a) negative time: At least on my test docker container, and on the reliability environment, after the process
-        # forks, the
+        #    forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
         #
         # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
         #    restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
@@ -248,9 +231,66 @@ module Datadog
         # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
         def reset_cpu_time_tracking
           thread_api.list.each do |thread|
+            # See below for details on why this is needed
+            next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
             thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
+            thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
           end
         end
+
+        def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
+          # Process::Waiter crash workaround:
+          #
+          # This is a workaround for a Ruby VM segfault (usually something like
+          # "[BUG] Segmentation fault at 0x0000000000000008") in the affected Ruby versions.
+          # See https://bugs.ruby-lang.org/issues/17807 for details.
+          #
+          # In those Ruby versions, there's a very special subclass of `Thread` called `Process::Waiter` that causes VM
+          # crashes whenever something tries to read its instance or thread variables. This subclass of thread only
+          # shows up when the `Process.detach` API gets used.
+          # In the specs you'll find crash regression tests that include a way of reproducing it.
+          #
+          # As workaround for now we just skip it for the affected Rubies
+          return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
+          last_value = thread.thread_variable_get(key) || current_value
+          thread.thread_variable_set(key, current_value)
+
+          current_value - last_value
+        end
+
+        # Whenever there are more than max_threads_sampled active, we only sample a subset of them.
+        # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
+        # a big burst of work all at once (sample everything), and instead do a little work each time
+        # (sample a bit by bit).
+        #
+        # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
+        # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
+        # them more often, if they are slower, we take them less often -- which again means that over a longer period
+        # we should take sample roughly the same samples.
+        #
+        # One downside of this approach is that if there really are many threads, the resulting wall clock times
+        # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
+        # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
+        # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
+        # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
+        # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
+        # threads).
+        #
+        def threads_to_sample
+          all_threads = thread_api.list
+
+          if all_threads.size > @max_threads_sampled
+            all_threads.sample(@max_threads_sampled)
+          else
+            all_threads
+          end
+        end
+
+        def get_current_wall_time_timestamp_ns
+          Datadog::Utils::Time.get_time(:nanosecond)
+        end
       end
     end
   end
```
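`get_elapsed_since_last_sample_and_set_value` is the whole wall/CPU delta mechanism. A self-contained rerun of its behavior, with illustrative names:

```ruby
KEY = :last_sample_ns

def elapsed_since_last_sample(thread, current_ns)
  last_ns = thread.thread_variable_get(KEY) || current_ns
  thread.thread_variable_set(KEY, current_ns)
  current_ns - last_ns
end

t = Thread.current
elapsed_since_last_sample(t, 1_000_000) # => 0; a first sample attributes nothing
elapsed_since_last_sample(t, 4_500_000) # => 3_500_000 ns since the previous sample
```

`threads_to_sample` relies on `Array#sample(n)` picking n distinct random elements, so over many iterations every live thread still gets sampled.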
data/lib/ddtrace/profiling/encoding/profile.rb
CHANGED

```diff
@@ -24,12 +24,20 @@ module Datadog
           flush.event_groups.each { |event_group| template.add_events!(event_group.event_class, event_group.events) }
 
           Datadog.logger.debug do
+            max_events = Datadog.configuration.profiling.advanced.max_events
+            events_sampled =
+              if flush.event_count == max_events
+                'max events limit hit, events were sampled [profile will be biased], '
+              else
+                ''
+              end
+
             "Encoding profile covering #{flush.start.iso8601} to #{flush.finish.iso8601}, " \
-            "events: #{flush.event_count} (#{template.debug_statistics})"
+            "events: #{flush.event_count} (#{events_sampled}#{template.debug_statistics})"
           end
 
           # Build the profile and encode it
-          template.to_pprof
+          template.to_pprof(start: flush.start, finish: flush.finish)
         end
       end
     end
```
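Note that the added work (a configuration read and a branch) happens inside `Datadog.logger.debug do ... end`. Ruby loggers only call the block when the severity is enabled, so this costs nothing when debug logging is off:

```ruby
require 'logger'

logger = Logger.new($stdout, level: Logger::INFO)
logger.debug { "expensive: #{(1..1_000_000).sum}" } # block never runs at INFO
```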
data/lib/ddtrace/profiling/events/stack.rb
CHANGED

```diff
@@ -11,34 +11,34 @@ module Datadog
           :frames,
           :total_frame_count,
           :thread_id,
-          :
+          :root_span_id,
           :span_id,
-          :
+          :trace_resource
 
         def initialize(
           timestamp,
           frames,
           total_frame_count,
           thread_id,
-
+          root_span_id,
           span_id,
-
+          trace_resource
         )
           super(timestamp)
 
           @frames = frames
           @total_frame_count = total_frame_count
           @thread_id = thread_id
-          @
+          @root_span_id = root_span_id
           @span_id = span_id
-          @
+          @trace_resource = trace_resource
 
           @hash = [
             thread_id,
-
+            root_span_id,
             span_id,
-            #
-            #
+            # trace_resource is deliberately not included -- events that share the same (root_span_id, span_id) refer
+            # to the same trace
             frames.collect(&:hash),
             total_frame_count
           ].hash
@@ -56,9 +56,9 @@ module Datadog
           frames,
           total_frame_count,
           thread_id,
-
+          root_span_id,
           span_id,
-
+          trace_resource,
           cpu_time_interval_ns,
           wall_time_interval_ns
         )
@@ -67,9 +67,9 @@ module Datadog
             frames,
             total_frame_count,
             thread_id,
-
+            root_span_id,
             span_id,
-
+            trace_resource
           )
 
           @cpu_time_interval_ns = cpu_time_interval_ns
```
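The `@hash` array acts as the aggregation key, and `trace_resource` is deliberately excluded from it. A small illustration of why (values made up):

```ruby
# Samples with the same (thread_id, root_span_id, span_id) must collapse into
# one group even if one of them learned the resource name later in the request.
key = [42, 1, 2]
key.hash == [42, 1, 2].hash                       # => true  (same group)
(key + ['GET /users']).hash == (key + [nil]).hash # => false (would split them)
```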
data/lib/ddtrace/profiling/native_extension.rb
CHANGED

```diff
@@ -2,7 +2,8 @@
 module Datadog
   module Profiling
     # This module contains classes and methods which are implemented using native code in the
-    # ext/ddtrace_profiling_native_extension folder
+    # ext/ddtrace_profiling_native_extension folder, as well as some Ruby-level utilities that don't make sense to
+    # write using C
     module NativeExtension
       private_class_method def self.working?
         native_working?
@@ -13,6 +14,27 @@ module Datadog
           false
         end
       end
+
+      unless singleton_class.method_defined?(:clock_id_for)
+        def self.clock_id_for(_)
+          nil
+        end
+      end
+
+      def self.cpu_time_ns_for(thread)
+        clock_id =
+          begin
+            clock_id_for(thread)
+          rescue Errno::ESRCH
+            nil
+          end
+
+        begin
+          ::Process.clock_gettime(clock_id, :nanosecond) if clock_id
+        rescue Errno::EINVAL
+          nil
+        end
+      end
     end
   end
 end
```
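`clock_id_for` comes from the native extension (see `clock_id_from_pthread.c` in the file list); the rescues cover a thread exiting mid-sample (`Errno::ESRCH`) and an unusable clock id (`Errno::EINVAL`). For the current thread only, plain Ruby can take the same reading without the native lookup:

```ruby
# Real Ruby API: CPU nanoseconds consumed by the *current* thread. The native
# extension exists to obtain equivalent clock ids for *other* threads.
Process.clock_gettime(Process::CLOCK_THREAD_CPUTIME_ID, :nanosecond)
```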
data/lib/ddtrace/profiling/pprof/builder.rb
CHANGED

```diff
@@ -4,6 +4,7 @@
 require 'ddtrace/profiling/flush'
 require 'ddtrace/profiling/pprof/message_set'
 require 'ddtrace/profiling/pprof/string_table'
+require 'ddtrace/utils/time'
 
 module Datadog
   module Profiling
@@ -47,14 +48,19 @@ module Datadog
           Perftools::Profiles::Profile.encode(profile).force_encoding(DEFAULT_ENCODING)
         end
 
-        def build_profile
+        def build_profile(start:, finish:)
+          start_ns = Datadog::Utils::Time.as_utc_epoch_ns(start)
+          finish_ns = Datadog::Utils::Time.as_utc_epoch_ns(finish)
+
           Perftools::Profiles::Profile.new(
             sample_type: @sample_types.messages,
             sample: @samples,
             mapping: @mappings.messages,
             location: @locations.values,
             function: @functions.messages,
-            string_table: @string_table.strings
+            string_table: @string_table.strings,
+            time_nanos: start_ns,
+            duration_nanos: finish_ns - start_ns,
           )
         end
 
```
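`time_nanos` and `duration_nanos` are standard fields of the pprof `Profile` message. A plausible shape for the `as_utc_epoch_ns` helper, an assumption here since the real one lives in `data/lib/ddtrace/utils/time.rb` (also changed in this release):

```ruby
def as_utc_epoch_ns(time)
  # Rational math avoids Float rounding at nanosecond resolution.
  (time.to_r * 1_000_000_000).to_i
end

as_utc_epoch_ns(Time.utc(2021, 9, 1)) # => 1630454400000000000
```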
data/lib/ddtrace/profiling/pprof/converter.rb
CHANGED

```diff
@@ -25,20 +25,19 @@ module Datadog
         # [key, EventGroup]
         event_groups = {}
 
+        # Aggregate each event into a group
+        # with identical properties, but different values.
         events.each do |event|
           key = yield(event)
-          values =
+          values = build_event_values(event)
 
           unless key.nil?
             if event_groups.key?(key)
-              # Update
-
-              group_values.each_with_index do |group_value, i|
-                group_values[i] = group_value + values[i]
-              end
+              # Update existing group from event
+              update_group(event_groups[key], event, values)
             else
               # Add new group
-              event_groups[key] =
+              event_groups[key] = new_group(event, values)
             end
           end
         end
@@ -57,7 +56,7 @@ module Datadog
         index
       end
 
-      def
+      def build_event_values(event)
         # Build a value array that matches the length of the sample types
         # Populate all values with "no value" by default
         Array.new(@sample_type_mappings.length, Datadog::Ext::Profiling::Pprof::SAMPLE_VALUE_NO_VALUE)
@@ -69,7 +68,7 @@ module Datadog
 
       # Represents a grouped event
       # 'sample' is an example event object from the group.
-      # 'values' is the
+      # 'values' is the summation of the group's sample values
       EventGroup = Struct.new(:sample, :values)
 
       # Error when the mapping of a sample type to value index is unknown
@@ -84,6 +83,20 @@ module Datadog
          "Mapping for sample value type '#{type}' to index is unknown."
        end
      end
+
+      protected
+
+      def new_group(event, values)
+        EventGroup.new(event, values)
+      end
+
+      def update_group(event_group, event, values)
+        # Update values for group
+        group_values = event_group.values
+        group_values.each_with_index do |group_value, i|
+          group_values[i] = group_value + values[i]
+        end
+      end
     end
   end
 end
```
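The grouping semantics, extracted into a runnable sketch: `new_group` seeds a group with the first event's values and `update_group` sums element-wise into it (the driver method here is illustrative):

```ruby
EventGroup = Struct.new(:sample, :values)

def aggregate!(event_groups, key, event, values)
  if (group = event_groups[key])
    values.each_index { |i| group.values[i] += values[i] } # update_group
  else
    event_groups[key] = EventGroup.new(event, values)      # new_group
  end
end

groups = {}
aggregate!(groups, [:thread_1], :first_event,  [100, 5])
aggregate!(groups, [:thread_1], :second_event, [50, 3])
groups[[:thread_1]].values # => [150, 8]
```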