ddtrace 0.52.0 → 0.53.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +65 -1
- data/ddtrace.gemspec +1 -1
- data/docs/DevelopmentGuide.md +1 -6
- data/docs/GettingStarted.md +66 -16
- data/lib/datadog/ci/contrib/rspec/integration.rb +2 -2
- data/lib/datadog/ci/ext/environment.rb +41 -4
- data/lib/datadog/contrib.rb +2 -0
- data/lib/datadog/core/environment/vm_cache.rb +46 -0
- data/lib/ddtrace/configuration/agent_settings_resolver.rb +107 -40
- data/lib/ddtrace/configuration/components.rb +1 -1
- data/lib/ddtrace/configuration/settings.rb +13 -3
- data/lib/ddtrace/contrib/action_cable/instrumentation.rb +46 -0
- data/lib/ddtrace/contrib/action_cable/patcher.rb +1 -0
- data/lib/ddtrace/contrib/action_mailer/configuration/settings.rb +32 -0
- data/lib/ddtrace/contrib/action_mailer/event.rb +50 -0
- data/lib/ddtrace/contrib/action_mailer/events/deliver.rb +54 -0
- data/lib/ddtrace/contrib/action_mailer/events/process.rb +41 -0
- data/lib/ddtrace/contrib/action_mailer/events.rb +31 -0
- data/lib/ddtrace/contrib/action_mailer/ext.rb +32 -0
- data/lib/ddtrace/contrib/action_mailer/integration.rb +45 -0
- data/lib/ddtrace/contrib/action_mailer/patcher.rb +27 -0
- data/lib/ddtrace/contrib/active_job/configuration/settings.rb +33 -0
- data/lib/ddtrace/contrib/active_job/event.rb +54 -0
- data/lib/ddtrace/contrib/active_job/events/discard.rb +46 -0
- data/lib/ddtrace/contrib/active_job/events/enqueue.rb +45 -0
- data/lib/ddtrace/contrib/active_job/events/enqueue_at.rb +45 -0
- data/lib/ddtrace/contrib/active_job/events/enqueue_retry.rb +47 -0
- data/lib/ddtrace/contrib/active_job/events/perform.rb +45 -0
- data/lib/ddtrace/contrib/active_job/events/retry_stopped.rb +46 -0
- data/lib/ddtrace/contrib/active_job/events.rb +39 -0
- data/lib/ddtrace/contrib/active_job/ext.rb +32 -0
- data/lib/ddtrace/contrib/active_job/integration.rb +46 -0
- data/lib/ddtrace/contrib/active_job/log_injection.rb +21 -0
- data/lib/ddtrace/contrib/active_job/patcher.rb +33 -0
- data/lib/ddtrace/contrib/auto_instrument.rb +0 -1
- data/lib/ddtrace/contrib/rails/auto_instrument_railtie.rb +0 -1
- data/lib/ddtrace/contrib/rails/framework.rb +22 -0
- data/lib/ddtrace/contrib/rails/patcher.rb +19 -10
- data/lib/ddtrace/contrib/registerable.rb +0 -1
- data/lib/ddtrace/contrib/sidekiq/ext.rb +3 -0
- data/lib/ddtrace/contrib/sidekiq/integration.rb +10 -0
- data/lib/ddtrace/contrib/sidekiq/patcher.rb +26 -0
- data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat.rb +30 -0
- data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch.rb +30 -0
- data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push.rb +29 -0
- data/lib/ddtrace/contrib/sinatra/env.rb +2 -1
- data/lib/ddtrace/contrib/sinatra/tracer.rb +15 -2
- data/lib/ddtrace/ext/git.rb +12 -0
- data/lib/ddtrace/ext/profiling.rb +1 -0
- data/lib/ddtrace/ext/runtime.rb +3 -0
- data/lib/ddtrace/ext/transport.rb +11 -0
- data/lib/ddtrace/profiling/collectors/stack.rb +71 -27
- data/lib/ddtrace/profiling/encoding/profile.rb +9 -1
- data/lib/ddtrace/profiling/events/stack.rb +7 -7
- data/lib/ddtrace/profiling/pprof/converter.rb +22 -9
- data/lib/ddtrace/profiling/pprof/stack_sample.rb +28 -2
- data/lib/ddtrace/profiling/tasks/setup.rb +0 -1
- data/lib/ddtrace/profiling/trace_identifiers/ddtrace.rb +1 -1
- data/lib/ddtrace/profiling/trace_identifiers/helper.rb +3 -3
- data/lib/ddtrace/profiling/transport/http.rb +8 -17
- data/lib/ddtrace/runtime/metrics.rb +14 -0
- data/lib/ddtrace/span.rb +7 -19
- data/lib/ddtrace/tracer.rb +1 -1
- data/lib/ddtrace/transport/http/adapters/net.rb +13 -3
- data/lib/ddtrace/transport/http/adapters/test.rb +4 -2
- data/lib/ddtrace/transport/http/adapters/unix_socket.rb +23 -12
- data/lib/ddtrace/transport/http/builder.rb +13 -6
- data/lib/ddtrace/transport/http.rb +5 -11
- data/lib/ddtrace/utils/time.rb +5 -6
- data/lib/ddtrace/version.rb +1 -1
- metadata +27 -2
data/lib/ddtrace/contrib/sidekiq/patcher.rb
CHANGED
@@ -34,8 +34,34 @@ module Datadog
             config.server_middleware do |chain|
               chain.add(Sidekiq::ServerTracer)
             end
+
+            patch_server_internals if Integration.compatible_with_server_internal_tracing?
           end
         end
+
+        def patch_server_internals
+          patch_server_heartbeat
+          patch_server_job_fetch
+          patch_server_scheduled_push
+        end
+
+        def patch_server_heartbeat
+          require 'ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat'
+
+          ::Sidekiq::Launcher.prepend(ServerInternalTracer::Heartbeat)
+        end
+
+        def patch_server_job_fetch
+          require 'ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch'
+
+          ::Sidekiq::Processor.prepend(ServerInternalTracer::JobFetch)
+        end
+
+        def patch_server_scheduled_push
+          require 'ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push'
+
+          ::Sidekiq::Scheduled::Poller.prepend(ServerInternalTracer::ScheduledPush)
+        end
       end
     end
   end
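The patcher above relies on Module#prepend: the tracer modules are inserted ahead of Sidekiq's own `Sidekiq::Launcher`, `Sidekiq::Processor` and `Sidekiq::Scheduled::Poller` in the ancestor chain, so each traced method can wrap the original via `super`. A minimal, self-contained sketch of that pattern (`Worker` and `Tracing` are hypothetical stand-ins, not ddtrace classes):

# Sketch of the Module#prepend instrumentation pattern used by the patcher.
class Worker
  def fetch
    :job
  end
end

module Tracing
  def fetch
    started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    result = super # the original Worker#fetch still runs
    puts "fetch took #{Process.clock_gettime(Process::CLOCK_MONOTONIC) - started}s"
    result
  end
end

Worker.prepend(Tracing)
Worker.new.fetch # prints the timing, returns :job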
data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat.rb
ADDED
@@ -0,0 +1,30 @@
+# typed: true
+
+module Datadog
+  module Contrib
+    module Sidekiq
+      module ServerInternalTracer
+        # Trace when a Sidekiq process has a heartbeat
+        module Heartbeat
+          private
+
+          def ❤ # rubocop:disable Naming/AsciiIdentifiers, Naming/MethodName
+            configuration = Datadog.configuration[:sidekiq]
+
+            configuration[:tracer].trace(Ext::SPAN_HEARTBEAT) do |span|
+              span.service = configuration[:service_name]
+              span.span_type = Datadog::Ext::AppTypes::WORKER
+
+              # Set analytics sample rate
+              if Contrib::Analytics.enabled?(configuration[:analytics_enabled])
+                Contrib::Analytics.set_sample_rate(span, configuration[:analytics_sample_rate])
+              end
+
+              super
+            end
+          end
+        end
+      end
+    end
+  end
+end
data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch.rb
ADDED
@@ -0,0 +1,30 @@
+# typed: true
+
+module Datadog
+  module Contrib
+    module Sidekiq
+      module ServerInternalTracer
+        # Trace when Sidekiq looks for another job to work
+        module JobFetch
+          private
+
+          def fetch
+            configuration = Datadog.configuration[:sidekiq]
+
+            configuration[:tracer].trace(Ext::SPAN_JOB_FETCH) do |span|
+              span.service = configuration[:service_name]
+              span.span_type = Datadog::Ext::AppTypes::WORKER
+
+              # Set analytics sample rate
+              if Contrib::Analytics.enabled?(configuration[:analytics_enabled])
+                Contrib::Analytics.set_sample_rate(span, configuration[:analytics_sample_rate])
+              end
+
+              super
+            end
+          end
+        end
+      end
+    end
+  end
+end
data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push.rb
ADDED
@@ -0,0 +1,29 @@
+# typed: true
+
+module Datadog
+  module Contrib
+    module Sidekiq
+      module ServerInternalTracer
+        # Trace when Sidekiq checks to see if there are scheduled jobs that need to be worked
+        # https://github.com/mperham/sidekiq/wiki/Scheduled-Jobs
+        module ScheduledPush
+          def enqueue
+            configuration = Datadog.configuration[:sidekiq]
+
+            configuration[:tracer].trace(Ext::SPAN_SCHEDULED_PUSH) do |span|
+              span.service = configuration[:service_name]
+              span.span_type = Datadog::Ext::AppTypes::WORKER
+
+              # Set analytics sample rate
+              if Contrib::Analytics.enabled?(configuration[:analytics_enabled])
+                Contrib::Analytics.set_sample_rate(span, configuration[:analytics_sample_rate])
+              end
+
+              super
+            end
+          end
+        end
+      end
+    end
+  end
+end
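These internal spans piggyback on the existing Sidekiq integration configuration: the tracer, `service_name` and analytics settings all come from `Datadog.configuration[:sidekiq]`. A sketch of enabling the integration with the 0.x configuration API; the `service_name` value is illustrative:

require 'ddtrace'

Datadog.configure do |c|
  # Activates the Sidekiq integration; with a compatible Sidekiq version the
  # patcher above also prepends the heartbeat/job-fetch/scheduled-push tracers.
  c.use :sidekiq, service_name: 'my-sidekiq'
end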
data/lib/ddtrace/contrib/sinatra/tracer.rb
CHANGED
@@ -5,7 +5,7 @@ require 'ddtrace/ext/app_types'
 require 'ddtrace/ext/errors'
 require 'ddtrace/ext/http'
 require 'ddtrace/propagation/http_propagator'
-
+require 'ddtrace/utils/only_once'
 require 'ddtrace/contrib/sinatra/ext'
 require 'ddtrace/contrib/sinatra/tracer_middleware'
 require 'ddtrace/contrib/sinatra/env'
@@ -77,6 +77,9 @@ module Datadog
 
       # Method overrides for Sinatra::Base
       module Base
+        MISSING_REQUEST_SPAN_ONLY_ONCE = Datadog::Utils::OnlyOnce.new
+        private_constant :MISSING_REQUEST_SPAN_ONLY_ONCE
+
        def render(engine, data, *)
          tracer = Datadog.configuration[:sinatra][:tracer]
          return super unless tracer.enabled
@@ -121,8 +124,18 @@ module Datadog
           else
             Sinatra::Env.datadog_span(env, self.class)
           end
-          if sinatra_request_span
+          if sinatra_request_span
             sinatra_request_span.resource = span.resource
+          else
+            MISSING_REQUEST_SPAN_ONLY_ONCE.run do
+              Datadog.logger.warn do
+                'Sinatra integration is misconfigured, reported traces will be missing request metadata ' \
+                'such as path and HTTP status code. ' \
+                'Did you forget to add `register Datadog::Contrib::Sinatra::Tracer` to your ' \
+                '`Sinatra::Base` subclass? ' \
+                'See <https://docs.datadoghq.com/tracing/setup_overview/setup/ruby/#sinatra> for more details.'
+              end
+            end
           end
 
           Contrib::Analytics.set_measured(span)
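The new warning fires once (via `Utils::OnlyOnce`) when no request span can be found. The fix it suggests, registering the tracer extension on the application class, looks like this:

require 'sinatra/base'
require 'ddtrace'

class MyApp < Sinatra::Base
  register Datadog::Contrib::Sinatra::Tracer # without this, the warning above fires once

  get '/' do
    'Hello world!'
  end
end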
data/lib/ddtrace/ext/git.rb
CHANGED
@@ -15,6 +15,18 @@ module Datadog
       TAG_COMMIT_COMMITTER_NAME = 'git.commit.committer.name'.freeze
       TAG_COMMIT_MESSAGE = 'git.commit.message'.freeze
       TAG_COMMIT_SHA = 'git.commit.sha'.freeze
+
+      ENV_REPOSITORY_URL = 'DD_GIT_REPOSITORY_URL'.freeze
+      ENV_COMMIT_SHA = 'DD_GIT_COMMIT_SHA'.freeze
+      ENV_BRANCH = 'DD_GIT_BRANCH'.freeze
+      ENV_TAG = 'DD_GIT_TAG'.freeze
+      ENV_COMMIT_MESSAGE = 'DD_GIT_COMMIT_MESSAGE'.freeze
+      ENV_COMMIT_AUTHOR_NAME = 'DD_GIT_COMMIT_AUTHOR_NAME'.freeze
+      ENV_COMMIT_AUTHOR_EMAIL = 'DD_GIT_COMMIT_AUTHOR_EMAIL'.freeze
+      ENV_COMMIT_AUTHOR_DATE = 'DD_GIT_COMMIT_AUTHOR_DATE'.freeze
+      ENV_COMMIT_COMMITTER_NAME = 'DD_GIT_COMMIT_COMMITTER_NAME'.freeze
+      ENV_COMMIT_COMMITTER_EMAIL = 'DD_GIT_COMMIT_COMMITTER_EMAIL'.freeze
+      ENV_COMMIT_COMMITTER_DATE = 'DD_GIT_COMMIT_COMMITTER_DATE'.freeze
     end
   end
 end
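These constants name environment variables rather than span tags, which suggests git metadata for CI visibility can now be supplied via the environment (see the `data/lib/datadog/ci/ext/environment.rb` change in the file list). A hypothetical CI bootstrap sketch; the values are illustrative:

# Export git metadata for test visibility before the test process (and ddtrace) starts.
ENV['DD_GIT_REPOSITORY_URL'] = 'https://github.com/example/app.git'
ENV['DD_GIT_COMMIT_SHA']     = `git rev-parse HEAD`.strip
ENV['DD_GIT_BRANCH']         = `git rev-parse --abbrev-ref HEAD`.strip
ENV['DD_GIT_COMMIT_MESSAGE'] = `git log -1 --format=%s`.strip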
data/lib/ddtrace/ext/profiling.rb
CHANGED
@@ -6,6 +6,7 @@ module Datadog
     ENV_UPLOAD_TIMEOUT = 'DD_PROFILING_UPLOAD_TIMEOUT'.freeze
     ENV_MAX_FRAMES = 'DD_PROFILING_MAX_FRAMES'.freeze
     ENV_AGENTLESS = 'DD_PROFILING_AGENTLESS'.freeze
+    ENV_ENDPOINT_COLLECTION_ENABLED = 'DD_PROFILING_ENDPOINT_COLLECTION_ENABLED'.freeze
 
     module Pprof
       LABEL_KEY_SPAN_ID = 'span id'.freeze
data/lib/ddtrace/ext/runtime.rb
CHANGED
@@ -6,6 +6,7 @@ module Datadog
   module Runtime
     TAG_ID = 'runtime-id'.freeze
     TAG_LANG = 'language'.freeze
+    TAG_PID = 'system.pid'.freeze
 
     # Metrics
     module Metrics
@@ -14,6 +15,8 @@ module Datadog
       METRIC_CLASS_COUNT = 'runtime.ruby.class_count'.freeze
       METRIC_GC_PREFIX = 'runtime.ruby.gc'.freeze
       METRIC_THREAD_COUNT = 'runtime.ruby.thread_count'.freeze
+      METRIC_GLOBAL_CONSTANT_STATE = 'runtime.ruby.global_constant_state'.freeze
+      METRIC_GLOBAL_METHOD_STATE = 'runtime.ruby.global_method_state'.freeze
 
       TAG_SERVICE = 'service'.freeze
     end
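The two new metric names mirror MRI's VM cache-invalidation counters, which `RubyVM.stat` exposes (collected by the new `data/lib/datadog/core/environment/vm_cache.rb` in the file list). A quick look at the underlying counters; the available keys vary by Ruby version, hence the `nil` defaults:

# On MRI, RubyVM.stat exposes the counters behind the two new metrics; they
# bump when constants/methods are redefined (cache invalidation).
stat = RubyVM.stat
puts stat.fetch(:global_constant_state, nil).inspect # => e.g. 1050
puts stat.fetch(:global_method_state, nil).inspect   # => e.g. 176, or nil on newer MRIs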
data/lib/ddtrace/ext/transport.rb
CHANGED
@@ -3,6 +3,7 @@ module Datadog
   module Ext
     module Transport
       module HTTP
+        ADAPTER = :net_http # DEV: Rename to simply `:http`, as Net::HTTP is an implementation detail.
         DEFAULT_HOST = '127.0.0.1'.freeze
         DEFAULT_PORT = 8126
         DEFAULT_TIMEOUT_SECONDS = 1
@@ -16,6 +17,16 @@ module Datadog
         HEADER_META_LANG_INTERPRETER = 'Datadog-Meta-Lang-Interpreter'.freeze
         HEADER_META_TRACER_VERSION = 'Datadog-Meta-Tracer-Version'.freeze
       end
+
+      module Test
+        ADAPTER = :test
+      end
+
+      module UnixSocket
+        ADAPTER = :unix
+        DEFAULT_PATH = '/var/run/datadog/apm.socket'.freeze
+        DEFAULT_TIMEOUT_SECONDS = 1
+      end
     end
   end
 end
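The new `UnixSocket` constants give the transport a default agent socket path. A sketch of routing traces over that socket, assuming the 0.x `transport_options` setting:

require 'ddtrace'

# Route traces to the agent over the default Unix domain socket instead of TCP.
Datadog.configure do |c|
  c.tracer.transport_options = proc do |t|
    t.adapter :unix, '/var/run/datadog/apm.socket'
  end
end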
data/lib/ddtrace/profiling/collectors/stack.rb
CHANGED
@@ -18,6 +18,12 @@ module Datadog
        DEFAULT_MAX_TIME_USAGE_PCT = 2.0
        MIN_INTERVAL = 0.01
        THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
+       THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
+
+       # This default was picked based on the current sampling performance and on expected concurrency on an average
+       # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
+       # optimizes for coverage (less chance to miss what a given thread is doing).
+       DEFAULT_MAX_THREADS_SAMPLED = 16
 
        attr_reader \
          :recorder,
@@ -33,6 +39,7 @@ module Datadog
          trace_identifiers_helper:, # Usually an instance of Datadog::Profiling::TraceIdentifiers::Helper
          ignore_thread: nil,
          max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
+         max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
          thread_api: Thread,
          fork_policy: Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
          interval: MIN_INTERVAL,
@@ -43,6 +50,7 @@ module Datadog
          @trace_identifiers_helper = trace_identifiers_helper
          @ignore_thread = ignore_thread
          @max_time_usage_pct = max_time_usage_pct
+         @max_threads_sampled = max_threads_sampled
          @thread_api = thread_api
 
          # Workers::Async::Thread settings
@@ -60,10 +68,13 @@ module Datadog
          @build_backtrace_location = method(:build_backtrace_location).to_proc
          # Cache this buffer, since it's pretty expensive to keep accessing it
          @stack_sample_event_recorder = recorder[Events::StackSample]
+         # See below for details on why this is needed
+         @needs_process_waiter_workaround =
+           Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
+           Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
        end
 
        def start
-         @last_wall_time = Datadog::Utils::Time.get_time
          reset_cpu_time_tracking
          perform
        end
@@ -87,24 +98,14 @@ module Datadog
 
        def collect_events
          events = []
-
-         # Compute wall time interval
-         current_wall_time = Datadog::Utils::Time.get_time
-         last_wall_time = if instance_variable_defined?(:@last_wall_time)
-                            @last_wall_time
-                          else
-                            current_wall_time
-                          end
-
-         wall_time_interval_ns = ((current_wall_time - last_wall_time).round(9) * 1e9).to_i
-         @last_wall_time = current_wall_time
+         current_wall_time_ns = get_current_wall_time_timestamp_ns
 
          # Collect backtraces from each thread
-         thread_api.list.each do |thread|
+         threads_to_sample.each do |thread|
            next unless thread.alive?
            next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)
 
-           event = collect_thread_event(thread, wall_time_interval_ns)
+           event = collect_thread_event(thread, current_wall_time_ns)
            events << event unless event.nil?
          end
 
@@ -114,7 +115,7 @@ module Datadog
          events
        end
 
-       def collect_thread_event(thread, wall_time_interval_ns)
+       def collect_thread_event(thread, current_wall_time_ns)
          locations = thread.backtrace_locations
          return if locations.nil?
 
@@ -126,8 +127,10 @@ module Datadog
          locations = convert_backtrace_locations(locations)
 
          thread_id = thread.respond_to?(:pthread_thread_id) ? thread.pthread_thread_id : thread.object_id
-         trace_id, span_id, trace_resource_container = trace_identifiers_helper.trace_identifiers_for(thread)
+         trace_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
          cpu_time = get_cpu_time_interval!(thread)
+         wall_time_interval_ns =
+           get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)
 
          Events::StackSample.new(
            nil,
@@ -136,7 +139,7 @@ module Datadog
            thread_id,
            trace_id,
            span_id,
-           trace_resource_container,
+           trace_resource,
            cpu_time,
            wall_time_interval_ns
          )
@@ -156,14 +159,7 @@ module Datadog
          # *before* the thread had time to finish the initialization
          return unless current_cpu_time_ns
 
-         last_cpu_time_ns = (thread.thread_variable_get(THREAD_LAST_CPU_TIME_KEY) || current_cpu_time_ns)
-         interval = current_cpu_time_ns - last_cpu_time_ns
-
-         # Update CPU time for thread
-         thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
-
-         # Return interval
-         interval
+         get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
        end
 
        def compute_wait_time(used_time)
@@ -237,10 +233,10 @@ module Datadog
        end
 
        # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
-       # clean up any leftover per-thread
+       # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
        #
        # a) negative time: At least on my test docker container, and on the reliability environment, after the process
-       # forks, the
+       # forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
        #
        # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
        # restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
@@ -248,9 +244,57 @@ module Datadog
        # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
        def reset_cpu_time_tracking
          thread_api.list.each do |thread|
+           # See below for details on why this is needed
+           next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
            thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
+           thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
+         end
+       end
+
+       def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
+         # See cthread.rb for more details, but this is a workaround for https://bugs.ruby-lang.org/issues/17807 ;
+         # using all thread_variable related methods on these instances also triggers a crash and for now we just
+         # skip it for the affected Rubies
+         return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
+         last_value = thread.thread_variable_get(key) || current_value
+         thread.thread_variable_set(key, current_value)
+
+         current_value - last_value
+       end
+
+       # Whenever there are more than max_threads_sampled active, we only sample a subset of them.
+       # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
+       # a big burst of work all at once (sample everything), and instead do a little work each time
+       # (sample a bit by bit).
+       #
+       # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
+       # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
+       # them more often, if they are slower, we take them less often -- which again means that over a longer period
+       # we should take sample roughly the same samples.
+       #
+       # One downside of this approach is that if there really are many threads, the resulting wall clock times
+       # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
+       # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
+       # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
+       # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
+       # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
+       # threads).
+       #
+       def threads_to_sample
+         all_threads = thread_api.list
+
+         if all_threads.size > @max_threads_sampled
+           all_threads.sample(@max_threads_sampled)
+         else
+           all_threads
          end
        end
+
+       def get_current_wall_time_timestamp_ns
+         Datadog::Utils::Time.get_time(:nanosecond)
+       end
      end
    end
  end
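Two ideas carry this refactor: per-thread "elapsed since last sample" bookkeeping stored in thread variables (now used for both CPU and wall clock time), and random subset sampling capped at `max_threads_sampled`. A standalone sketch of both; the names are hypothetical, not the collector's own:

MAX_THREADS_SAMPLED = 16
LAST_SEEN_KEY = :last_seen_ns # hypothetical thread-variable key

# Returns 0 on a thread's first sample, the elapsed interval afterwards.
def elapsed_and_update(thread, now_ns)
  last = thread.thread_variable_get(LAST_SEEN_KEY) || now_ns
  thread.thread_variable_set(LAST_SEEN_KEY, now_ns)
  now_ns - last
end

threads = Thread.list
# Random subset: over many iterations every thread gets sampled, just not at once.
to_sample = threads.size > MAX_THREADS_SAMPLED ? threads.sample(MAX_THREADS_SAMPLED) : threads
now_ns = Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond)
to_sample.each { |t| puts "#{t.inspect}: +#{elapsed_and_update(t, now_ns)}ns" }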
data/lib/ddtrace/profiling/encoding/profile.rb
CHANGED
@@ -24,8 +24,16 @@ module Datadog
         flush.event_groups.each { |event_group| template.add_events!(event_group.event_class, event_group.events) }
 
         Datadog.logger.debug do
+          max_events = Datadog.configuration.profiling.advanced.max_events
+          events_sampled =
+            if flush.event_count == max_events
+              'max events limit hit, events were sampled [profile will be biased], '
+            else
+              ''
+            end
+
           "Encoding profile covering #{flush.start.iso8601} to #{flush.finish.iso8601}, " \
-          "events: #{flush.event_count} (#{template.debug_statistics})"
+          "events: #{flush.event_count} (#{events_sampled}#{template.debug_statistics})"
         end
 
         # Build the profile and encode it
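The new debug note keys off the profiler's event buffer limit, read from `Datadog.configuration.profiling.advanced.max_events`. Assuming the option is writable like other 0.x settings, raising it would look like this (the value is illustrative):

Datadog.configure do |c|
  # If the per-flush event count hits this ceiling, the profile was sampled/biased.
  c.profiling.advanced.max_events = 32_768
end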
data/lib/ddtrace/profiling/events/stack.rb
CHANGED
@@ -13,7 +13,7 @@ module Datadog
         :thread_id,
         :trace_id,
         :span_id,
-        :trace_resource_container
+        :trace_resource
 
       def initialize(
         timestamp,
@@ -22,7 +22,7 @@ module Datadog
         thread_id,
         trace_id,
         span_id,
-        trace_resource_container
+        trace_resource
       )
         super(timestamp)
 
@@ -31,14 +31,14 @@ module Datadog
         @thread_id = thread_id
         @trace_id = trace_id
         @span_id = span_id
-        @trace_resource_container = trace_resource_container
+        @trace_resource = trace_resource
 
         @hash = [
           thread_id,
           trace_id,
           span_id,
-          #
-          #
+          # trace_resource is deliberately not included -- events that share the same (trace_id, span_id)
+          # trace_resource might not match between pairs, but they refer to the same trace.
          frames.collect(&:hash),
          total_frame_count
        ].hash
@@ -58,7 +58,7 @@ module Datadog
         thread_id,
         trace_id,
         span_id,
-        trace_resource_container,
+        trace_resource,
         cpu_time_interval_ns,
         wall_time_interval_ns
       )
@@ -69,7 +69,7 @@ module Datadog
         thread_id,
         trace_id,
         span_id,
-        trace_resource_container
+        trace_resource
       )
 
       @cpu_time_interval_ns = cpu_time_interval_ns
data/lib/ddtrace/profiling/pprof/converter.rb
CHANGED
@@ -25,20 +25,19 @@ module Datadog
         # [key, EventGroup]
         event_groups = {}
 
+        # Aggregate each event into a group
+        # with identical properties, but different values.
         events.each do |event|
           key = yield(event)
-          values = build_sample_values(event)
+          values = build_event_values(event)
 
           unless key.nil?
             if event_groups.key?(key)
-              # Update values for group
-              group_values = event_groups[key].values
-              group_values.each_with_index do |group_value, i|
-                group_values[i] = group_value + values[i]
-              end
+              # Update existing group from event
+              update_group(event_groups[key], event, values)
             else
               # Add new group
-              event_groups[key] = EventGroup.new(event, values)
+              event_groups[key] = new_group(event, values)
             end
           end
         end
@@ -57,7 +56,7 @@ module Datadog
         index
       end
 
-      def build_sample_values(event)
+      def build_event_values(event)
         # Build a value array that matches the length of the sample types
         # Populate all values with "no value" by default
         Array.new(@sample_type_mappings.length, Datadog::Ext::Profiling::Pprof::SAMPLE_VALUE_NO_VALUE)
@@ -69,7 +68,7 @@ module Datadog
 
       # Represents a grouped event
       # 'sample' is an example event object from the group.
-      # 'values' is the
+      # 'values' is the summation of the group's sample values
       EventGroup = Struct.new(:sample, :values)
 
       # Error when the mapping of a sample type to value index is unknown
@@ -84,6 +83,20 @@ module Datadog
           "Mapping for sample value type '#{type}' to index is unknown."
         end
       end
+
+      protected
+
+      def new_group(event, values)
+        EventGroup.new(event, values)
+      end
+
+      def update_group(event_group, event, values)
+        # Update values for group
+        group_values = event_group.values
+        group_values.each_with_index do |group_value, i|
+          group_values[i] = group_value + values[i]
+        end
+      end
     end
   end
 end
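The refactor extracts a classic template-method pair: `new_group` and `update_group` become `protected` hooks that subclasses (such as the stack sample converter below) can override. A self-contained sketch of the aggregation they implement, summing each group's values elementwise:

# Group events by a caller-supplied key, accumulating values per group.
EventGroup = Struct.new(:sample, :values)

def group_events(events)
  groups = {}
  events.each do |event|
    key = yield(event)
    next if key.nil?

    values = event[:values]
    if (group = groups[key])
      values.each_with_index { |v, i| group.values[i] += v } # update_group
    else
      groups[key] = EventGroup.new(event, values.dup)        # new_group
    end
  end
  groups
end

events = [{ id: 1, values: [1, 10] }, { id: 1, values: [2, 20] }]
p group_events(events) { |e| e[:id] }.values.first.values # => [3, 30]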
data/lib/ddtrace/profiling/pprof/stack_sample.rb
CHANGED
@@ -30,6 +30,7 @@ module Datadog
       def initialize(*_)
         super
 
+        @most_recent_trace_samples = {}
         @processed_unique_stacks = 0
         @processed_with_trace_ids = 0
       end
@@ -40,9 +41,28 @@ module Datadog
       end
 
       def stack_sample_group_key(stack_sample)
+        # We want to make sure we have the most recent sample for any trace.
+        # (This is done here to save an iteration over all samples.)
+        update_most_recent_trace_sample(stack_sample)
+
         stack_sample.hash
       end
 
+      # Track the most recent sample for each trace
+      def update_most_recent_trace_sample(stack_sample)
+        return unless stack_sample.trace_id && stack_sample.trace_resource
+
+        # Update trace resource with most recent value
+        if (most_recent_trace_sample = @most_recent_trace_samples[stack_sample.trace_id])
+          if most_recent_trace_sample.timestamp < stack_sample.timestamp
+            @most_recent_trace_samples[stack_sample.trace_id] = stack_sample
+          end
+        else
+          # Add trace resource
+          @most_recent_trace_samples[stack_sample.trace_id] = stack_sample
+        end
+      end
+
       def build_samples(stack_samples)
         groups = group_events(stack_samples, &method(:stack_sample_group_key))
         groups.collect do |_group_key, group|
@@ -64,7 +84,7 @@ module Datadog
         )
       end
 
-      def build_sample_values(stack_sample)
+      def build_event_values(stack_sample)
         no_value = Datadog::Ext::Profiling::Pprof::SAMPLE_VALUE_NO_VALUE
         values = super(stack_sample)
         values[sample_value_index(:cpu_time_ns)] = stack_sample.cpu_time_interval_ns || no_value
@@ -96,7 +116,13 @@ module Datadog
           str: builder.string_table.fetch(span_id.to_s)
         )
 
-
+        # Use most up-to-date trace resource, if available.
+        # Otherwise, use the trace resource provided.
+        trace_resource = (
+          @most_recent_trace_samples[stack_sample.trace_id] \
+          || stack_sample
+        ).trace_resource
+
         if trace_resource && !trace_resource.empty?
           labels << Perftools::Profiles::Label.new(
             key: builder.string_table.fetch(Datadog::Ext::Profiling::Pprof::LABEL_KEY_TRACE_ENDPOINT),
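The converter now remembers, per `trace_id`, the sample with the newest timestamp, so the freshest `trace_resource` wins when the endpoint label is built. The selection logic in isolation:

# Keep, per trace id, the sample with the newest timestamp.
Sample = Struct.new(:trace_id, :timestamp, :trace_resource)

latest = {}
update = lambda do |sample|
  return unless sample.trace_id && sample.trace_resource

  current = latest[sample.trace_id]
  latest[sample.trace_id] = sample if current.nil? || current.timestamp < sample.timestamp
end

update.call(Sample.new(1, 100, 'GET /old'))
update.call(Sample.new(1, 200, 'GET /new'))
puts latest[1].trace_resource # => "GET /new"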
data/lib/ddtrace/profiling/trace_identifiers/ddtrace.rb
CHANGED
@@ -33,7 +33,7 @@ module Datadog
       def maybe_extract_resource(root_span)
         return unless root_span
 
-        root_span.
+        root_span.resource if root_span.span_type == Datadog::Ext::HTTP::TYPE_INBOUND
       end
     end
   end
data/lib/ddtrace/profiling/trace_identifiers/helper.rb
CHANGED
@@ -20,10 +20,10 @@ module Datadog
       def initialize(
         tracer:,
         # If this is disabled, the helper will strip the optional trace_resource_container even if provided by the api
-
+        endpoint_collection_enabled:,
         supported_apis: DEFAULT_SUPPORTED_APIS.map { |api| api.new(tracer: tracer) }
       )
-        @
+        @endpoint_collection_enabled = endpoint_collection_enabled
         @supported_apis = supported_apis
       end
@@ -34,7 +34,7 @@ module Datadog
         trace_identifiers = api.trace_identifiers_for(thread)
 
         if trace_identifiers
-          return @
+          return @endpoint_collection_enabled ? trace_identifiers : trace_identifiers[0..1]
         end
       end
 
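`endpoint_collection_enabled` controls whether `trace_identifiers_for` returns the third element (the trace resource) at all. Given the `DD_PROFILING_ENDPOINT_COLLECTION_ENABLED` constant added to `ext/profiling.rb` above, and assuming the profiler settings read it at boot, it can presumably be disabled via the environment:

# Turn off endpoint (trace resource) collection before ddtrace loads.
ENV['DD_PROFILING_ENDPOINT_COLLECTION_ENABLED'] = 'false'
require 'ddtrace'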