datadog 2.2.0 → 2.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +51 -2
  3. data/ext/datadog_profiling_loader/extconf.rb +15 -15
  4. data/ext/datadog_profiling_native_extension/clock_id.h +1 -0
  5. data/ext/datadog_profiling_native_extension/clock_id_from_pthread.c +1 -2
  6. data/ext/datadog_profiling_native_extension/clock_id_noop.c +1 -2
  7. data/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +113 -43
  8. data/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c +49 -26
  9. data/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.h +34 -4
  10. data/ext/datadog_profiling_native_extension/collectors_idle_sampling_helper.c +4 -0
  11. data/ext/datadog_profiling_native_extension/collectors_stack.c +49 -37
  12. data/ext/datadog_profiling_native_extension/collectors_stack.h +2 -2
  13. data/ext/datadog_profiling_native_extension/collectors_thread_context.c +81 -19
  14. data/ext/datadog_profiling_native_extension/collectors_thread_context.h +1 -0
  15. data/ext/datadog_profiling_native_extension/datadog_ruby_common.c +110 -0
  16. data/ext/datadog_profiling_native_extension/datadog_ruby_common.h +57 -0
  17. data/ext/datadog_profiling_native_extension/extconf.rb +65 -60
  18. data/ext/datadog_profiling_native_extension/heap_recorder.c +34 -6
  19. data/ext/datadog_profiling_native_extension/heap_recorder.h +3 -1
  20. data/ext/datadog_profiling_native_extension/helpers.h +6 -17
  21. data/ext/datadog_profiling_native_extension/http_transport.c +3 -3
  22. data/ext/datadog_profiling_native_extension/libdatadog_helpers.c +0 -86
  23. data/ext/datadog_profiling_native_extension/libdatadog_helpers.h +2 -23
  24. data/ext/datadog_profiling_native_extension/native_extension_helpers.rb +61 -172
  25. data/ext/datadog_profiling_native_extension/private_vm_api_access.c +64 -138
  26. data/ext/datadog_profiling_native_extension/private_vm_api_access.h +17 -11
  27. data/ext/datadog_profiling_native_extension/profiling.c +0 -2
  28. data/ext/datadog_profiling_native_extension/ruby_helpers.c +0 -33
  29. data/ext/datadog_profiling_native_extension/ruby_helpers.h +1 -26
  30. data/ext/datadog_profiling_native_extension/setup_signal_handler.h +1 -0
  31. data/ext/datadog_profiling_native_extension/stack_recorder.c +14 -2
  32. data/ext/datadog_profiling_native_extension/stack_recorder.h +1 -0
  33. data/ext/datadog_profiling_native_extension/time_helpers.c +0 -15
  34. data/ext/datadog_profiling_native_extension/time_helpers.h +36 -6
  35. data/ext/{datadog_profiling_native_extension → libdatadog_api}/crashtracker.c +19 -6
  36. data/ext/libdatadog_api/datadog_ruby_common.c +110 -0
  37. data/ext/libdatadog_api/datadog_ruby_common.h +57 -0
  38. data/ext/libdatadog_api/extconf.rb +108 -0
  39. data/ext/libdatadog_api/macos_development.md +26 -0
  40. data/ext/libdatadog_extconf_helpers.rb +130 -0
  41. data/lib/datadog/appsec/contrib/graphql/appsec_trace.rb +49 -0
  42. data/lib/datadog/appsec/contrib/graphql/gateway/multiplex.rb +73 -0
  43. data/lib/datadog/appsec/contrib/graphql/gateway/watcher.rb +68 -0
  44. data/lib/datadog/appsec/contrib/graphql/integration.rb +41 -0
  45. data/lib/datadog/appsec/contrib/graphql/patcher.rb +37 -0
  46. data/lib/datadog/appsec/contrib/graphql/reactive/multiplex.rb +59 -0
  47. data/lib/datadog/appsec/contrib/rack/gateway/request.rb +1 -1
  48. data/lib/datadog/appsec/processor/actions.rb +1 -1
  49. data/lib/datadog/appsec/response.rb +15 -1
  50. data/lib/datadog/appsec.rb +1 -0
  51. data/lib/datadog/core/configuration/components.rb +14 -12
  52. data/lib/datadog/core/configuration/settings.rb +54 -7
  53. data/lib/datadog/core/crashtracking/agent_base_url.rb +21 -0
  54. data/lib/datadog/core/crashtracking/component.rb +111 -0
  55. data/lib/datadog/core/crashtracking/tag_builder.rb +39 -0
  56. data/lib/datadog/core/diagnostics/environment_logger.rb +8 -11
  57. data/lib/datadog/core/telemetry/component.rb +49 -2
  58. data/lib/datadog/core/telemetry/emitter.rb +9 -11
  59. data/lib/datadog/core/telemetry/event.rb +32 -1
  60. data/lib/datadog/core/telemetry/ext.rb +1 -0
  61. data/lib/datadog/core/telemetry/http/adapters/net.rb +10 -12
  62. data/lib/datadog/core/telemetry/http/ext.rb +3 -0
  63. data/lib/datadog/core/telemetry/http/transport.rb +38 -9
  64. data/lib/datadog/core/telemetry/logging.rb +35 -0
  65. data/lib/datadog/core/utils/at_fork_monkey_patch.rb +102 -0
  66. data/lib/datadog/kit/appsec/events.rb +2 -4
  67. data/lib/datadog/opentelemetry/sdk/span_processor.rb +10 -0
  68. data/lib/datadog/opentelemetry/sdk/trace/span.rb +23 -0
  69. data/lib/datadog/profiling/collectors/code_provenance.rb +7 -7
  70. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +17 -17
  71. data/lib/datadog/profiling/collectors/idle_sampling_helper.rb +11 -13
  72. data/lib/datadog/profiling/collectors/info.rb +3 -3
  73. data/lib/datadog/profiling/collectors/thread_context.rb +4 -2
  74. data/lib/datadog/profiling/component.rb +69 -91
  75. data/lib/datadog/profiling/exporter.rb +3 -3
  76. data/lib/datadog/profiling/ext/dir_monkey_patches.rb +3 -3
  77. data/lib/datadog/profiling/ext.rb +21 -21
  78. data/lib/datadog/profiling/flush.rb +1 -1
  79. data/lib/datadog/profiling/http_transport.rb +8 -6
  80. data/lib/datadog/profiling/load_native_extension.rb +5 -5
  81. data/lib/datadog/profiling/preload.rb +1 -1
  82. data/lib/datadog/profiling/profiler.rb +5 -8
  83. data/lib/datadog/profiling/scheduler.rb +31 -25
  84. data/lib/datadog/profiling/tag_builder.rb +2 -2
  85. data/lib/datadog/profiling/tasks/exec.rb +5 -5
  86. data/lib/datadog/profiling/tasks/setup.rb +16 -35
  87. data/lib/datadog/profiling.rb +4 -5
  88. data/lib/datadog/tracing/contrib/active_record/events/sql.rb +1 -0
  89. data/lib/datadog/tracing/contrib/ext.rb +14 -0
  90. data/lib/datadog/tracing/contrib/graphql/unified_trace.rb +1 -1
  91. data/lib/datadog/tracing/contrib/graphql/unified_trace_patcher.rb +4 -1
  92. data/lib/datadog/tracing/contrib/lograge/patcher.rb +16 -0
  93. data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +5 -0
  94. data/lib/datadog/tracing/contrib/mysql2/instrumentation.rb +17 -13
  95. data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +5 -0
  96. data/lib/datadog/tracing/contrib/pg/instrumentation.rb +4 -1
  97. data/lib/datadog/tracing/contrib/propagation/sql_comment/ext.rb +28 -0
  98. data/lib/datadog/tracing/contrib/propagation/sql_comment/mode.rb +5 -1
  99. data/lib/datadog/tracing/contrib/propagation/sql_comment.rb +22 -10
  100. data/lib/datadog/tracing/contrib/trilogy/configuration/settings.rb +5 -0
  101. data/lib/datadog/tracing/contrib/trilogy/instrumentation.rb +4 -1
  102. data/lib/datadog/tracing/diagnostics/environment_logger.rb +14 -16
  103. data/lib/datadog/tracing/metadata/errors.rb +9 -1
  104. data/lib/datadog/tracing/metadata/ext.rb +4 -0
  105. data/lib/datadog/tracing/pipeline/span_filter.rb +2 -2
  106. data/lib/datadog/tracing/span.rb +9 -2
  107. data/lib/datadog/tracing/span_event.rb +41 -0
  108. data/lib/datadog/tracing/span_operation.rb +6 -2
  109. data/lib/datadog/tracing/transport/serializable_trace.rb +3 -0
  110. data/lib/datadog/version.rb +1 -1
  111. metadata +28 -10
  112. data/lib/datadog/profiling/crashtracker.rb +0 -91
  113. data/lib/datadog/profiling/ext/forking.rb +0 -98
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 895fe8f9fdd8391d5c6c86e8d39d0a8e241c85bbce42b35c9744e1f94095853f
4
- data.tar.gz: d5a6e88ec35816de59a5d080f66896a4c85656216d2853b46bb1268dfa64df35
3
+ metadata.gz: 5d6610c8ef7e86c023f8a3fca884807bd7e9cf6b84fc6cbdd79b98e8a8762c2e
4
+ data.tar.gz: 236fafc4b8e2c809552d97c8eb025654c0ef4279685c741ad9504b2af8265b94
5
5
  SHA512:
6
- metadata.gz: fef3c78c7835c47507a1f09d87c2ee84ddcc97303f9b9d7c6b4d601381ac62a47133f3450ddbc888cf54c56353ef4417ff337009a1e60d2ae239fb19093d721b
7
- data.tar.gz: 061154162ab97a6e1cdc87f53c18d98fd0eea75b6cb7e71fd1e4e5c4a536ff7722c4e1202258760113ce5a7f0ce3838622e700858103fa97dddc33d0322275cc
6
+ metadata.gz: e633db76f69b5d151629cde5e1b7024a6bea43343aa348ea50857d5e74d049d2b6a833be252c6c1c7aee95976805a795e1607a81d42e8c3246126abd9811777d
7
+ data.tar.gz: fb2bcf3803689d8e499f2266e7d063f1c5b9b7d08bb28b5d27f81bd16e7e373c01c1d5224027d43229a5ca609f66c5179aa521ef272f347d069ca73e3613bdc3
data/CHANGELOG.md CHANGED
@@ -2,6 +2,34 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [2.3.0] - 2024-08-22
6
+
7
+ ### Added
8
+
9
+ * Core: Support agentless telemetry ([#3779][])
10
+ * Tracing: Add support for span events ([#3776][])
11
+ * Tracing: Add tags to enable inferred service dependencies for databases ([#3789][])
12
+ * Tracing: Emit log message and instructions for incompatible Lograge setup ([#3812][], [#3839][])
13
+ * Tracing: Add `append_comment` option to append SQL comment propagation for `mysql`, `pg` and `trilogy` ([#3809][])
14
+ * AppSec: Add threat detection and protection for `graphql` ([#3769][], [#3814][])
15
+
16
+ ### Changed
17
+
18
+ * Core: Enable crashtracking by default ([#3826][])
19
+ * Profiling: Reduce allocation overhead ([#3805][], [#3797][])
20
+ * Profiling: Speed up stack sampling ([#3837][])
21
+ * Profiling: Upgrade to libdatadog 11 ([#3799][])
22
+ * Profiling: Disable allocation counting feature by default ([#3798][])
23
+ * Profiling: Reduce the maximum biased result for allocation samples ([#3793][])
24
+ * Tracing: Reduce noisy integration logs ([#3785][])
25
+
26
+ ### Fixed
27
+
28
+ * Tracing: Fix `require` issue for `graphql` ([#3813][])
29
+ * AppSec: Fix an error when parsing http headers with integer value ([#3790][])
30
+ * AppSec: Fix an error when tracking login failure without `user_id` ([#3841][])
31
+ * Fix a syntax error for Ruby < 2.4 during single step instrumentation ([#3795][])
32
+
5
33
  ## [2.2.0] - 2024-07-11
6
34
 
7
35
  ### Added
@@ -2934,7 +2962,8 @@ Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v0.3.1
2934
2962
  Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
2935
2963
 
2936
2964
 
2937
- [Unreleased]: https://github.com/DataDog/dd-trace-rb/compare/v2.2.0...master
2965
+ [Unreleased]: https://github.com/DataDog/dd-trace-rb/compare/v2.3.0...master
2966
+ [2.3.0]: https://github.com/DataDog/dd-trace-rb/compare/v2.2.0...v2.3.0
2938
2967
  [2.2.0]: https://github.com/DataDog/dd-trace-rb/compare/v2.1.0...v2.2.0
2939
2968
  [2.1.0]: https://github.com/DataDog/dd-trace-rb/compare/v2.0.0...v2.1.0
2940
2969
  [2.0.0]: https://github.com/DataDog/dd-trace-rb/compare/v2.0.0.rc1...v2.0.0
@@ -4331,9 +4360,29 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
4331
4360
  [#3753]: https://github.com/DataDog/dd-trace-rb/issues/3753
4332
4361
  [#3757]: https://github.com/DataDog/dd-trace-rb/issues/3757
4333
4362
  [#3759]: https://github.com/DataDog/dd-trace-rb/issues/3759
4363
+ [#3769]: https://github.com/DataDog/dd-trace-rb/issues/3769
4334
4364
  [#3770]: https://github.com/DataDog/dd-trace-rb/issues/3770
4335
4365
  [#3772]: https://github.com/DataDog/dd-trace-rb/issues/3772
4336
4366
  [#3774]: https://github.com/DataDog/dd-trace-rb/issues/3774
4367
+ [#3776]: https://github.com/DataDog/dd-trace-rb/issues/3776
4368
+ [#3779]: https://github.com/DataDog/dd-trace-rb/issues/3779
4369
+ [#3785]: https://github.com/DataDog/dd-trace-rb/issues/3785
4370
+ [#3789]: https://github.com/DataDog/dd-trace-rb/issues/3789
4371
+ [#3790]: https://github.com/DataDog/dd-trace-rb/issues/3790
4372
+ [#3793]: https://github.com/DataDog/dd-trace-rb/issues/3793
4373
+ [#3795]: https://github.com/DataDog/dd-trace-rb/issues/3795
4374
+ [#3797]: https://github.com/DataDog/dd-trace-rb/issues/3797
4375
+ [#3798]: https://github.com/DataDog/dd-trace-rb/issues/3798
4376
+ [#3799]: https://github.com/DataDog/dd-trace-rb/issues/3799
4377
+ [#3805]: https://github.com/DataDog/dd-trace-rb/issues/3805
4378
+ [#3809]: https://github.com/DataDog/dd-trace-rb/issues/3809
4379
+ [#3812]: https://github.com/DataDog/dd-trace-rb/issues/3812
4380
+ [#3813]: https://github.com/DataDog/dd-trace-rb/issues/3813
4381
+ [#3814]: https://github.com/DataDog/dd-trace-rb/issues/3814
4382
+ [#3826]: https://github.com/DataDog/dd-trace-rb/issues/3826
4383
+ [#3837]: https://github.com/DataDog/dd-trace-rb/issues/3837
4384
+ [#3839]: https://github.com/DataDog/dd-trace-rb/issues/3839
4385
+ [#3841]: https://github.com/DataDog/dd-trace-rb/issues/3841
4337
4386
  [@AdrianLC]: https://github.com/AdrianLC
4338
4387
  [@Azure7111]: https://github.com/Azure7111
4339
4388
  [@BabyGroot]: https://github.com/BabyGroot
@@ -4485,4 +4534,4 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
4485
4534
  [@y-yagi]: https://github.com/y-yagi
4486
4535
  [@yujideveloper]: https://github.com/yujideveloper
4487
4536
  [@yukimurasawa]: https://github.com/yukimurasawa
4488
- [@zachmccormick]: https://github.com/zachmccormick
4537
+ [@zachmccormick]: https://github.com/zachmccormick
data/ext/datadog_profiling_loader/extconf.rb CHANGED
@@ -1,22 +1,22 @@
1
1
  # rubocop:disable Style/StderrPuts
2
2
  # rubocop:disable Style/GlobalVars
3
3
 
4
- if RUBY_ENGINE != 'ruby' || Gem.win_platform?
4
+ if RUBY_ENGINE != "ruby" || Gem.win_platform?
5
5
  $stderr.puts(
6
- 'WARN: Skipping build of Datadog profiling loader. See Datadog profiling native extension note for details.'
6
+ "WARN: Skipping build of Datadog profiling loader. See Datadog profiling native extension note for details."
7
7
  )
8
8
 
9
- File.write('Makefile', 'all install clean: # dummy makefile that does nothing')
9
+ File.write("Makefile", "all install clean: # dummy makefile that does nothing")
10
10
  exit
11
11
  end
12
12
 
13
- require 'mkmf'
13
+ require "mkmf"
14
14
 
15
15
  # mkmf on modern Rubies actually has an append_cflags that does something similar
16
16
  # (see https://github.com/ruby/ruby/pull/5760), but as usual we need a bit more boilerplate to deal with legacy Rubies
17
17
  def add_compiler_flag(flag)
18
18
  if try_cflags(flag)
19
- $CFLAGS << ' ' << flag
19
+ $CFLAGS << " " << flag
20
20
  else
21
21
  $stderr.puts("WARNING: '#{flag}' not accepted by compiler, skipping it")
22
22
  end
@@ -24,26 +24,26 @@ end
24
24
 
25
25
  # Because we can't control what compiler versions our customers use, shipping with -Werror by default is a no-go.
26
26
  # But we can enable it in CI, so that we quickly spot any new warnings that just got introduced.
27
- add_compiler_flag '-Werror' if ENV['DATADOG_GEM_CI'] == 'true'
27
+ add_compiler_flag "-Werror" if ENV["DATADOG_GEM_CI"] == "true"
28
28
 
29
29
  # Older gcc releases may not default to C99 and we need to ask for this. This is also used:
30
30
  # * by upstream Ruby -- search for gnu99 in the codebase
31
31
  # * by msgpack, another datadog gem dependency
32
32
  # (https://github.com/msgpack/msgpack-ruby/blob/18ce08f6d612fe973843c366ac9a0b74c4e50599/ext/msgpack/extconf.rb#L8)
33
- add_compiler_flag '-std=gnu99'
33
+ add_compiler_flag "-std=gnu99"
34
34
 
35
35
  # Gets really noisy when we include the MJIT header, let's omit it (TODO: Use #pragma GCC diagnostic instead?)
36
- add_compiler_flag '-Wno-unused-function'
36
+ add_compiler_flag "-Wno-unused-function"
37
37
 
38
38
  # Allow defining variables at any point in a function
39
- add_compiler_flag '-Wno-declaration-after-statement'
39
+ add_compiler_flag "-Wno-declaration-after-statement"
40
40
 
41
41
  # If we forget to include a Ruby header, the function call may still appear to work, but then
42
42
  # cause a segfault later. Let's ensure that never happens.
43
- add_compiler_flag '-Werror-implicit-function-declaration'
43
+ add_compiler_flag "-Werror-implicit-function-declaration"
44
44
 
45
45
  # Warn on unused parameters to functions. Use `DDTRACE_UNUSED` to mark things as known-to-not-be-used.
46
- add_compiler_flag '-Wunused-parameter'
46
+ add_compiler_flag "-Wunused-parameter"
47
47
 
48
48
  # The native extension is not intended to expose any symbols/functions for other native libraries to use;
49
49
  # the sole exception being `Init_datadog_profiling_loader` which needs to be visible for Ruby to call it when
@@ -51,14 +51,14 @@ add_compiler_flag '-Wunused-parameter'
51
51
  #
52
52
  # By setting this compiler flag, we tell it to assume that everything is private unless explicitly stated.
53
53
  # For more details see https://gcc.gnu.org/wiki/Visibility
54
- add_compiler_flag '-fvisibility=hidden'
54
+ add_compiler_flag "-fvisibility=hidden"
55
55
 
56
56
  # Avoid legacy C definitions
57
- add_compiler_flag '-Wold-style-definition'
57
+ add_compiler_flag "-Wold-style-definition"
58
58
 
59
59
  # Enable all other compiler warnings
60
- add_compiler_flag '-Wall'
61
- add_compiler_flag '-Wextra'
60
+ add_compiler_flag "-Wall"
61
+ add_compiler_flag "-Wextra"
62
62
 
63
63
  # Tag the native extension library with the Ruby version and Ruby platform.
64
64
  # This makes it easier for development (avoids "oops I forgot to rebuild when I switched my Ruby") and ensures that
data/ext/datadog_profiling_native_extension/clock_id.h CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  #include <stdbool.h>
4
4
  #include <time.h>
5
+ #include <ruby.h>
5
6
 
6
7
  // Contains the operating-system specific identifier needed to fetch CPU-time, and a flag to indicate if we failed to fetch it
7
8
  typedef struct thread_cpu_time_id {
data/ext/datadog_profiling_native_extension/clock_id_from_pthread.c CHANGED
@@ -7,11 +7,10 @@
7
7
  #include <pthread.h>
8
8
  #include <time.h>
9
9
  #include <errno.h>
10
- #include <ruby.h>
11
10
 
11
+ #include "clock_id.h"
12
12
  #include "helpers.h"
13
13
  #include "private_vm_api_access.h"
14
- #include "clock_id.h"
15
14
  #include "time_helpers.h"
16
15
 
17
16
  // Validate that our home-cooked pthread_id_for() matches pthread_self() for the current thread
data/ext/datadog_profiling_native_extension/clock_id_noop.c CHANGED
@@ -4,10 +4,9 @@
4
4
  // is not available.
5
5
  #ifndef HAVE_PTHREAD_GETCPUCLOCKID
6
6
 
7
- #include <ruby.h>
8
-
9
7
  #include "clock_id.h"
10
8
  #include "helpers.h"
9
+ #include "datadog_ruby_common.h"
11
10
 
12
11
  void self_test_clock_id(void) { } // Nothing to check
13
12
 
data/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c CHANGED
@@ -20,7 +20,9 @@
20
20
  #define ERR_CLOCK_FAIL "failed to get clock time"
21
21
 
22
22
  // Maximum allowed value for an allocation weight. Attempts to use higher values will result in clamping.
23
- unsigned int MAX_ALLOC_WEIGHT = 65535;
23
+ // See https://docs.google.com/document/d/1lWLB714wlLBBq6T4xZyAc4a5wtWhSmr4-hgiPKeErlA/edit#heading=h.ugp0zxcj5iqh
24
+ // (Datadog-only link) for research backing the choice of this value.
25
+ unsigned int MAX_ALLOC_WEIGHT = 10000;
24
26
 
25
27
  // Used to trigger the execution of Collectors::ThreadState, which implements all of the sampling logic
26
28
  // itself; this class only implements the "when to do it" part.
@@ -96,6 +98,7 @@ struct cpu_and_wall_time_worker_state {
96
98
  bool no_signals_workaround_enabled;
97
99
  bool dynamic_sampling_rate_enabled;
98
100
  bool allocation_profiling_enabled;
101
+ bool allocation_counting_enabled;
99
102
  bool skip_idle_samples_for_testing;
100
103
  VALUE self_instance;
101
104
  VALUE thread_context_collector_instance;
@@ -104,7 +107,6 @@ struct cpu_and_wall_time_worker_state {
104
107
  dynamic_sampling_rate_state cpu_dynamic_sampling_rate;
105
108
  discrete_dynamic_sampler allocation_sampler;
106
109
  VALUE gc_tracepoint; // Used to get gc start/finish information
107
- VALUE object_allocation_tracepoint; // Used to get allocation counts and allocation profiling
108
110
 
109
111
  // These are mutable and used to signal things between the worker thread and other threads
110
112
 
@@ -117,7 +119,7 @@ struct cpu_and_wall_time_worker_state {
117
119
 
118
120
  // Others
119
121
 
120
- // Used to detect/avoid nested sampling, e.g. when the object_allocation_tracepoint gets triggered by a memory allocation
122
+ // Used to detect/avoid nested sampling, e.g. when on_newobj_event gets triggered by a memory allocation
121
123
  // that happens during another sample.
122
124
  bool during_sample;
123
125
 
@@ -181,6 +183,7 @@ static VALUE _native_initialize(
181
183
  VALUE dynamic_sampling_rate_enabled,
182
184
  VALUE dynamic_sampling_rate_overhead_target_percentage,
183
185
  VALUE allocation_profiling_enabled,
186
+ VALUE allocation_counting_enabled,
184
187
  VALUE skip_idle_samples_for_testing
185
188
  );
186
189
  static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
@@ -216,7 +219,7 @@ static void grab_gvl_and_sample(void);
216
219
  static void reset_stats_not_thread_safe(struct cpu_and_wall_time_worker_state *state);
217
220
  static void sleep_for(uint64_t time_ns);
218
221
  static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self);
219
- static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
222
+ static void on_newobj_event(DDTRACE_UNUSED VALUE unused1, DDTRACE_UNUSED void *unused2);
220
223
  static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state);
221
224
  static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self);
222
225
  static VALUE rescued_sample_allocation(VALUE tracepoint_data);
@@ -225,6 +228,20 @@ static VALUE _native_delayed_error(DDTRACE_UNUSED VALUE self, VALUE instance, VA
225
228
  static VALUE _native_hold_signals(DDTRACE_UNUSED VALUE self);
226
229
  static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self);
227
230
 
231
+ // We're using `on_newobj_event` function with `rb_add_event_hook2`, which requires in its public signature a function
232
+ // with signature `rb_event_hook_func_t` which doesn't match `on_newobj_event`.
233
+ //
234
+ // But in practice, because we pass the `RUBY_EVENT_HOOK_FLAG_RAW_ARG` flag to `rb_add_event_hook2`, it casts the
235
+ // expected signature into a `rb_event_hook_raw_arg_func_t`:
236
+ // > typedef void (*rb_event_hook_raw_arg_func_t)(VALUE data, const rb_trace_arg_t *arg); (from vm_trace.c)
237
+ // which does match `on_newobj_event`.
238
+ //
239
+ // So TL;DR we're just doing this here to avoid the warning and to explain the apparent mismatch in function signatures.
240
+ #pragma GCC diagnostic push
241
+ #pragma GCC diagnostic ignored "-Wcast-function-type"
242
+ static const rb_event_hook_func_t on_newobj_event_as_hook = (rb_event_hook_func_t) on_newobj_event;
243
+ #pragma GCC diagnostic pop
244
+
228
245
  // Note on sampler global state safety:
229
246
  //
230
247
  // Both `active_sampler_instance` and `active_sampler_instance_state` are **GLOBAL** state. Be careful when accessing
@@ -278,7 +295,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
278
295
  // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
279
296
  rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
280
297
 
281
- rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 9);
298
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 10);
282
299
  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
283
300
  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
284
301
  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
@@ -316,6 +333,8 @@ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
316
333
  };
317
334
 
318
335
  static VALUE _native_new(VALUE klass) {
336
+ long now = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
337
+
319
338
  struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));
320
339
 
321
340
  // Note: Any exceptions raised from this note until the TypedData_Wrap_Struct call will lead to the state memory
@@ -325,13 +344,13 @@ static VALUE _native_new(VALUE klass) {
325
344
  state->no_signals_workaround_enabled = false;
326
345
  state->dynamic_sampling_rate_enabled = true;
327
346
  state->allocation_profiling_enabled = false;
347
+ state->allocation_counting_enabled = false;
328
348
  state->skip_idle_samples_for_testing = false;
329
349
  state->thread_context_collector_instance = Qnil;
330
350
  state->idle_sampling_helper_instance = Qnil;
331
351
  state->owner_thread = Qnil;
332
352
  dynamic_sampling_rate_init(&state->cpu_dynamic_sampling_rate);
333
353
  state->gc_tracepoint = Qnil;
334
- state->object_allocation_tracepoint = Qnil;
335
354
 
336
355
  atomic_init(&state->should_run, false);
337
356
  state->failure_exception = Qnil;
@@ -340,15 +359,12 @@ static VALUE _native_new(VALUE klass) {
340
359
  state->during_sample = false;
341
360
 
342
361
  reset_stats_not_thread_safe(state);
343
-
344
- long now = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
345
- if (now == 0) {
346
- ruby_xfree(state);
347
- rb_raise(rb_eRuntimeError, ERR_CLOCK_FAIL);
348
- }
349
-
350
362
  discrete_dynamic_sampler_init(&state->allocation_sampler, "allocation", now);
351
363
 
364
+ // Note: As of this writing, no new Ruby objects get created and stored in the state. If that ever changes, remember
365
+ // to keep them on the stack and mark them with RB_GC_GUARD -- otherwise it's possible for a GC to run and
366
+ // since the instance representing the state does not yet exist, such objects will not get marked.
367
+
352
368
  return state->self_instance = TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
353
369
  }
354
370
 
@@ -362,6 +378,7 @@ static VALUE _native_initialize(
362
378
  VALUE dynamic_sampling_rate_enabled,
363
379
  VALUE dynamic_sampling_rate_overhead_target_percentage,
364
380
  VALUE allocation_profiling_enabled,
381
+ VALUE allocation_counting_enabled,
365
382
  VALUE skip_idle_samples_for_testing
366
383
  ) {
367
384
  ENFORCE_BOOLEAN(gc_profiling_enabled);
@@ -369,6 +386,7 @@ static VALUE _native_initialize(
369
386
  ENFORCE_BOOLEAN(dynamic_sampling_rate_enabled);
370
387
  ENFORCE_TYPE(dynamic_sampling_rate_overhead_target_percentage, T_FLOAT);
371
388
  ENFORCE_BOOLEAN(allocation_profiling_enabled);
389
+ ENFORCE_BOOLEAN(allocation_counting_enabled);
372
390
  ENFORCE_BOOLEAN(skip_idle_samples_for_testing)
373
391
 
374
392
  struct cpu_and_wall_time_worker_state *state;
@@ -378,6 +396,7 @@ static VALUE _native_initialize(
378
396
  state->no_signals_workaround_enabled = (no_signals_workaround_enabled == Qtrue);
379
397
  state->dynamic_sampling_rate_enabled = (dynamic_sampling_rate_enabled == Qtrue);
380
398
  state->allocation_profiling_enabled = (allocation_profiling_enabled == Qtrue);
399
+ state->allocation_counting_enabled = (allocation_counting_enabled == Qtrue);
381
400
  state->skip_idle_samples_for_testing = (skip_idle_samples_for_testing == Qtrue);
382
401
 
383
402
  double total_overhead_target_percentage = NUM2DBL(dynamic_sampling_rate_overhead_target_percentage);
@@ -394,7 +413,6 @@ static VALUE _native_initialize(
394
413
  state->thread_context_collector_instance = enforce_thread_context_collector_instance(thread_context_collector_instance);
395
414
  state->idle_sampling_helper_instance = idle_sampling_helper_instance;
396
415
  state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);
397
- state->object_allocation_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_NEWOBJ, on_newobj_event, NULL /* unused */);
398
416
 
399
417
  return Qtrue;
400
418
  }
@@ -409,7 +427,6 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
409
427
  rb_gc_mark(state->failure_exception);
410
428
  rb_gc_mark(state->stop_thread);
411
429
  rb_gc_mark(state->gc_tracepoint);
412
- rb_gc_mark(state->object_allocation_tracepoint);
413
430
  }
414
431
 
415
432
  // Called in a background thread created in CpuAndWallTimeWorker#start
@@ -755,7 +772,14 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
755
772
  // because they may raise exceptions.
756
773
  install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal");
757
774
  if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
758
- if (state->allocation_profiling_enabled) rb_tracepoint_enable(state->object_allocation_tracepoint);
775
+ if (state->allocation_profiling_enabled) {
776
+ rb_add_event_hook2(
777
+ on_newobj_event_as_hook,
778
+ RUBY_INTERNAL_EVENT_NEWOBJ,
779
+ state->self_instance,
780
+ RUBY_EVENT_HOOK_FLAG_SAFE | RUBY_EVENT_HOOK_FLAG_RAW_ARG)
781
+ ;
782
+ }
759
783
 
760
784
  // Flag the profiler as running before we release the GVL, in case anyone's waiting to know about it
761
785
  rb_funcall(instance, rb_intern("signal_running"), 0);
@@ -1036,46 +1060,87 @@ static void sleep_for(uint64_t time_ns) {
1036
1060
  }
1037
1061
 
1038
1062
  static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self) {
1039
- bool are_allocations_being_tracked = active_sampler_instance_state != NULL && active_sampler_instance_state->allocation_profiling_enabled;
1063
+ struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state;
1064
+
1065
+ bool are_allocations_being_tracked = state != NULL && state->allocation_profiling_enabled && state->allocation_counting_enabled;
1040
1066
 
1041
1067
  return are_allocations_being_tracked ? ULL2NUM(allocation_count) : Qnil;
1042
1068
  }
1043
1069
 
1044
- // Implements memory-related profiling events. This function is called by Ruby via the `object_allocation_tracepoint`
1045
- // when the RUBY_INTERNAL_EVENT_NEWOBJ event is triggered.
1046
- static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
1047
- // Update thread-local allocation count
1048
- if (RB_UNLIKELY(allocation_count == UINT64_MAX)) {
1049
- allocation_count = 0;
1050
- } else {
1051
- allocation_count++;
1052
- }
1070
+ #define HANDLE_CLOCK_FAILURE(call) ({ \
1071
+ long _result = (call); \
1072
+ if (_result == 0) { \
1073
+ delayed_error(state, ERR_CLOCK_FAIL); \
1074
+ return; \
1075
+ } \
1076
+ _result; \
1077
+ })
1053
1078
 
1079
+ // Implements memory-related profiling events. This function is called by Ruby via the `rb_add_event_hook2`
1080
+ // when the RUBY_INTERNAL_EVENT_NEWOBJ event is triggered.
1081
+ //
1082
+ // When allocation sampling is enabled, this function gets called for almost all* objects allocated by the Ruby VM.
1083
+ // (*In some weird cases the VM may skip this tracepoint.)
1084
+ //
1085
+ // At a high level, there are two paths through this function:
1086
+ // 1. should_sample == false -> return
1087
+ // 2. should_sample == true -> sample
1088
+ //
1089
+ // On big applications, path 1. is the hottest, since we don't sample every object. So it's quite important for it to
1090
+ // be as fast as possible.
1091
+ //
1092
+ // NOTE: You may be wondering why we don't use any of the arguments to this function. It turns out it's possible to just
1093
+ // call `rb_tracearg_from_tracepoint(anything)` anywhere during this function or its callees to get the data, so that's
1094
+ // why it's not being passed as an argument.
1095
+ static void on_newobj_event(DDTRACE_UNUSED VALUE unused1, DDTRACE_UNUSED void *unused2) {
1054
1096
  struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
1055
1097
 
1056
1098
  // This should not happen in a normal situation because the tracepoint is always enabled after the instance is set
1057
1099
  // and disabled before it is cleared, but just in case...
1058
1100
  if (state == NULL) return;
1059
1101
 
1060
- // In a few cases, we may actually be allocating an object as part of profiler sampling. We don't want to recursively
1102
+ if (RB_UNLIKELY(state->allocation_counting_enabled)) {
1103
+ // Update thread-local allocation count
1104
+ if (RB_UNLIKELY(allocation_count == UINT64_MAX)) {
1105
+ allocation_count = 0;
1106
+ } else {
1107
+ allocation_count++;
1108
+ }
1109
+ }
1110
+
1111
+ // In rare cases, we may actually be allocating an object as part of profiler sampling. We don't want to recursively
1061
1112
  // sample, so we just return early
1062
1113
  if (state->during_sample) {
1063
1114
  state->stats.allocations_during_sample++;
1064
1115
  return;
1065
1116
  }
1066
1117
 
1067
- if (state->dynamic_sampling_rate_enabled) {
1068
- long now = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
1069
- if (now == 0) {
1070
- delayed_error(state, ERR_CLOCK_FAIL);
1071
- return;
1072
- }
1073
- if (!discrete_dynamic_sampler_should_sample(&state->allocation_sampler, now)) {
1074
- state->stats.allocation_skipped++;
1075
- return;
1118
+ // Hot path: Dynamic sampling rate is usually enabled and the sampling decision is usually false
1119
+ if (RB_LIKELY(state->dynamic_sampling_rate_enabled && !discrete_dynamic_sampler_should_sample(&state->allocation_sampler))) {
1120
+ state->stats.allocation_skipped++;
1121
+
1122
+ coarse_instant now = monotonic_coarse_wall_time_now_ns();
1123
+ HANDLE_CLOCK_FAILURE(now.timestamp_ns);
1124
+
1125
+ bool needs_readjust = discrete_dynamic_sampler_skipped_sample(&state->allocation_sampler, now);
1126
+ if (RB_UNLIKELY(needs_readjust)) {
1127
+ // We rarely readjust, so this is a cold path
1128
+ // Also, while above we used the cheaper monotonic_coarse, for this call we want the regular monotonic call,
1129
+ // which is why we end up getting time "again".
1130
+ discrete_dynamic_sampler_readjust(
1131
+ &state->allocation_sampler, HANDLE_CLOCK_FAILURE(monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE))
1132
+ );
1076
1133
  }
1134
+
1135
+ return;
1077
1136
  }
1078
1137
 
1138
+ // From here on, we've decided to go ahead with the sample, which is way less common than skipping it
1139
+
1140
+ discrete_dynamic_sampler_before_sample(
1141
+ &state->allocation_sampler, HANDLE_CLOCK_FAILURE(monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE))
1142
+ );
1143
+
1079
1144
  // @ivoanjo: Strictly speaking, this is not needed because Ruby should not call the same tracepoint while a previous
1080
1145
  // invocation is still pending, (e.g. it wouldn't call `on_newobj_event` while it's already running), but I decided
1081
1146
  // to keep this here for consistency -- every call to the thread context (other than the special gc calls which are
@@ -1083,7 +1148,7 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused)
1083
1148
  state->during_sample = true;
1084
1149
 
1085
1150
  // Rescue against any exceptions that happen during sampling
1086
- safely_call(rescued_sample_allocation, tracepoint_data, state->self_instance);
1151
+ safely_call(rescued_sample_allocation, Qnil, state->self_instance);
1087
1152
 
1088
1153
  if (state->dynamic_sampling_rate_enabled) {
1089
1154
  long now = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
@@ -1108,9 +1173,7 @@ static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state) {
1108
1173
  if (state->gc_tracepoint != Qnil) {
1109
1174
  rb_tracepoint_disable(state->gc_tracepoint);
1110
1175
  }
1111
- if (state->object_allocation_tracepoint != Qnil) {
1112
- rb_tracepoint_disable(state->object_allocation_tracepoint);
1113
- }
1176
+ rb_remove_event_hook_with_data(on_newobj_event_as_hook, state->self_instance);
1114
1177
  }
1115
1178
 
1116
1179
  static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self) {
@@ -1126,13 +1189,14 @@ static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self) {
1126
1189
  }
1127
1190
  }
1128
1191
 
1129
- static VALUE rescued_sample_allocation(VALUE tracepoint_data) {
1192
+ static VALUE rescued_sample_allocation(DDTRACE_UNUSED VALUE unused) {
1130
1193
  struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
1131
1194
 
1132
1195
  // This should not happen in a normal situation because on_newobj_event already checked for this, but just in case...
1133
1196
  if (state == NULL) return Qnil;
1134
1197
 
1135
- rb_trace_arg_t *data = rb_tracearg_from_tracepoint(tracepoint_data);
1198
+ // If we're getting called from inside a tracepoint/event hook, Ruby exposes the data using this function.
1199
+ rb_trace_arg_t *data = rb_tracearg_from_tracepoint(Qnil);
1136
1200
  VALUE new_object = rb_tracearg_object(data);
1137
1201
 
1138
1202
  unsigned long allocations_since_last_sample = state->dynamic_sampling_rate_enabled ?
@@ -1140,9 +1204,15 @@ static VALUE rescued_sample_allocation(VALUE tracepoint_data) {
1140
1204
  discrete_dynamic_sampler_events_since_last_sample(&state->allocation_sampler) :
1141
1205
  // if we aren't, then we're sampling every event
1142
1206
  1;
1143
- // TODO: Signal in the profile that clamping happened?
1207
+
1208
+ // To control bias from sampling, we clamp the maximum weight attributed to a single allocation sample. This avoids
1209
+ // assigning a very large number to a sample, if for instance the dynamic sampling mechanism chose a really big interval.
1144
1210
  unsigned int weight = allocations_since_last_sample > MAX_ALLOC_WEIGHT ? MAX_ALLOC_WEIGHT : (unsigned int) allocations_since_last_sample;
1145
1211
  thread_context_collector_sample_allocation(state->thread_context_collector_instance, weight, new_object);
1212
+ // ...but we still represent the skipped samples in the profile, thus the data will account for all allocations.
1213
+ if (weight < allocations_since_last_sample) {
1214
+ thread_context_collector_sample_skipped_allocation_samples(state->thread_context_collector_instance, allocations_since_last_sample - weight);
1215
+ }
1146
1216
 
1147
1217
  // Return a dummy VALUE because we're called from rb_rescue2 which requires it
1148
1218
  return Qnil;