stackprof 0.2.15 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,15 +7,20 @@
7
7
  **********************************************************************/
8
8
 
9
9
  #include <ruby/ruby.h>
10
+ #include <ruby/version.h>
10
11
  #include <ruby/debug.h>
11
12
  #include <ruby/st.h>
12
13
  #include <ruby/io.h>
13
14
  #include <ruby/intern.h>
15
+ #include <ruby/vm.h>
14
16
  #include <signal.h>
15
17
  #include <sys/time.h>
18
+ #include <time.h>
16
19
  #include <pthread.h>
17
20
 
18
21
  #define BUF_SIZE 2048
22
+ #define MICROSECONDS_IN_SECOND 1000000
23
+ #define NANOSECONDS_IN_SECOND 1000000000
19
24
 
20
25
  #define FAKE_FRAME_GC INT2FIX(0)
21
26
  #define FAKE_FRAME_MARK INT2FIX(1)
@@ -27,8 +32,52 @@ static const char *fake_frame_cstrs[] = {
27
32
  "(sweeping)",
28
33
  };
29
34
 
35
+ static int stackprof_use_postponed_job = 1;
36
+ static int ruby_vm_running = 0;
37
+
30
38
  #define TOTAL_FAKE_FRAMES (sizeof(fake_frame_cstrs) / sizeof(char *))
31
39
 
40
+ #ifdef _POSIX_MONOTONIC_CLOCK
41
+ #define timestamp_t timespec
42
+ typedef struct timestamp_t timestamp_t;
43
+
44
+ static void capture_timestamp(timestamp_t *ts) {
45
+ clock_gettime(CLOCK_MONOTONIC, ts);
46
+ }
47
+
48
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
49
+ int64_t result = MICROSECONDS_IN_SECOND * (end->tv_sec - start->tv_sec);
50
+ if (end->tv_nsec < start->tv_nsec) {
51
+ result -= MICROSECONDS_IN_SECOND;
52
+ result += (NANOSECONDS_IN_SECOND + end->tv_nsec - start->tv_nsec) / 1000;
53
+ } else {
54
+ result += (end->tv_nsec - start->tv_nsec) / 1000;
55
+ }
56
+ return result;
57
+ }
58
+
59
+ static uint64_t timestamp_usec(timestamp_t *ts) {
60
+ return (MICROSECONDS_IN_SECOND * ts->tv_sec) + (ts->tv_nsec / 1000);
61
+ }
62
+ #else
63
+ #define timestamp_t timeval
64
+ typedef struct timestamp_t timestamp_t;
65
+
66
+ static void capture_timestamp(timestamp_t *ts) {
67
+ gettimeofday(ts, NULL);
68
+ }
69
+
70
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
71
+ struct timeval diff;
72
+ timersub(end, start, &diff);
73
+ return (MICROSECONDS_IN_SECOND * diff.tv_sec) + diff.tv_usec;
74
+ }
75
+
76
+ static uint64_t timestamp_usec(timestamp_t *ts) {
77
+ return (MICROSECONDS_IN_SECOND * ts->tv_sec) + ts->tv_usec;
78
+ }
79
+ #endif
80
+
32
81
  typedef struct {
33
82
  size_t total_samples;
34
83
  size_t caller_samples;
@@ -37,7 +86,24 @@ typedef struct {
37
86
  st_table *lines;
38
87
  } frame_data_t;
39
88
 
89
+ typedef struct {
90
+ uint64_t timestamp_usec;
91
+ int64_t delta_usec;
92
+ } sample_time_t;
93
+
94
+ /* We need to ensure that various memory operations are visible across
95
+ * threads. Ruby doesn't offer a portable way to do this sort of detection
96
+ * across all the Ruby versions we support, so we use something that casts a
97
+ * wide net (Clang, along with ICC, defines __GNUC__). */
98
+ #if defined(__GNUC__) && defined(__ATOMIC_SEQ_CST)
99
+ #define STACKPROF_HAVE_ATOMICS 1
100
+ #else
101
+ #define STACKPROF_HAVE_ATOMICS 0
102
+ #endif
103
+
40
104
  static struct {
105
+ /* Access this field with the `STACKPROF_RUNNING` macro, below, since we
106
+ * can't properly express that this field has an atomic type. */
41
107
  int running;
42
108
  int raw;
43
109
  int aggregate;
@@ -46,16 +112,17 @@ static struct {
46
112
  VALUE interval;
47
113
  VALUE out;
48
114
  VALUE metadata;
115
+ int ignore_gc;
49
116
 
50
- VALUE *raw_samples;
117
+ uint64_t *raw_samples;
51
118
  size_t raw_samples_len;
52
119
  size_t raw_samples_capa;
53
120
  size_t raw_sample_index;
54
121
 
55
- struct timeval last_sample_at;
56
- int *raw_timestamp_deltas;
57
- size_t raw_timestamp_deltas_len;
58
- size_t raw_timestamp_deltas_capa;
122
+ struct timestamp_t last_sample_at;
123
+ sample_time_t *raw_sample_times;
124
+ size_t raw_sample_times_len;
125
+ size_t raw_sample_times_capa;
59
126
 
60
127
  size_t overall_signals;
61
128
  size_t overall_samples;
@@ -65,16 +132,29 @@ static struct {
65
132
  size_t unrecorded_gc_sweeping_samples;
66
133
  st_table *frames;
67
134
 
135
+ timestamp_t gc_start_timestamp;
136
+
68
137
  VALUE fake_frame_names[TOTAL_FAKE_FRAMES];
69
138
  VALUE empty_string;
139
+
140
+ int buffer_count;
141
+ sample_time_t buffer_time;
70
142
  VALUE frames_buffer[BUF_SIZE];
71
143
  int lines_buffer[BUF_SIZE];
144
+
145
+ pthread_t target_thread;
72
146
  } _stackprof;
73
147
 
148
+ #if STACKPROF_HAVE_ATOMICS
149
+ #define STACKPROF_RUNNING() __atomic_load_n(&_stackprof.running, __ATOMIC_ACQUIRE)
150
+ #else
151
+ #define STACKPROF_RUNNING() _stackprof.running
152
+ #endif
153
+
74
154
  static VALUE sym_object, sym_wall, sym_cpu, sym_custom, sym_name, sym_file, sym_line;
75
155
  static VALUE sym_samples, sym_total_samples, sym_missed_samples, sym_edges, sym_lines;
76
- static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_metadata, sym_frames, sym_out, sym_aggregate, sym_raw_timestamp_deltas;
77
- static VALUE sym_state, sym_marking, sym_sweeping;
156
+ static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_raw_lines, sym_metadata, sym_frames, sym_ignore_gc, sym_out;
157
+ static VALUE sym_aggregate, sym_raw_sample_timestamps, sym_raw_timestamp_deltas, sym_state, sym_marking, sym_sweeping;
78
158
  static VALUE sym_gc_samples, objtracer;
79
159
  static VALUE gc_hook;
80
160
  static VALUE rb_mStackProf;
@@ -88,9 +168,11 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
88
168
  struct sigaction sa;
89
169
  struct itimerval timer;
90
170
  VALUE opts = Qnil, mode = Qnil, interval = Qnil, metadata = rb_hash_new(), out = Qfalse;
171
+ int ignore_gc = 0;
91
172
  int raw = 0, aggregate = 1;
173
+ VALUE metadata_val;
92
174
 
93
- if (_stackprof.running)
175
+ if (STACKPROF_RUNNING())
94
176
  return Qfalse;
95
177
 
96
178
  rb_scan_args(argc, argv, "0:", &opts);
@@ -99,8 +181,11 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
99
181
  mode = rb_hash_aref(opts, sym_mode);
100
182
  interval = rb_hash_aref(opts, sym_interval);
101
183
  out = rb_hash_aref(opts, sym_out);
184
+ if (RTEST(rb_hash_aref(opts, sym_ignore_gc))) {
185
+ ignore_gc = 1;
186
+ }
102
187
 
103
- VALUE metadata_val = rb_hash_aref(opts, sym_metadata);
188
+ metadata_val = rb_hash_aref(opts, sym_metadata);
104
189
  if (RTEST(metadata_val)) {
105
190
  if (!RB_TYPE_P(metadata_val, T_HASH))
106
191
  rb_raise(rb_eArgError, "metadata should be a hash");
@@ -115,6 +200,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
115
200
  }
116
201
  if (!RTEST(mode)) mode = sym_wall;
117
202
 
203
+ if (!NIL_P(interval) && (NUM2INT(interval) < 1 || NUM2INT(interval) >= MICROSECONDS_IN_SECOND)) {
204
+ rb_raise(rb_eArgError, "interval is a number of microseconds between 1 and 1 million");
205
+ }
206
+
118
207
  if (!_stackprof.frames) {
119
208
  _stackprof.frames = st_init_numtable();
120
209
  _stackprof.overall_signals = 0;
@@ -146,16 +235,24 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
146
235
  rb_raise(rb_eArgError, "unknown profiler mode");
147
236
  }
148
237
 
149
- _stackprof.running = 1;
150
238
  _stackprof.raw = raw;
151
239
  _stackprof.aggregate = aggregate;
152
240
  _stackprof.mode = mode;
153
241
  _stackprof.interval = interval;
242
+ _stackprof.ignore_gc = ignore_gc;
154
243
  _stackprof.metadata = metadata;
155
244
  _stackprof.out = out;
245
+ _stackprof.target_thread = pthread_self();
246
+ /* We need to ensure previous initialization stores are visible across
247
+ * threads. */
248
+ #if STACKPROF_HAVE_ATOMICS
249
+ __atomic_store_n(&_stackprof.running, 1, __ATOMIC_SEQ_CST);
250
+ #else
251
+ _stackprof.running = 1;
252
+ #endif
156
253
 
157
254
  if (raw) {
158
- gettimeofday(&_stackprof.last_sample_at, NULL);
255
+ capture_timestamp(&_stackprof.last_sample_at);
159
256
  }
160
257
 
161
258
  return Qtrue;
@@ -167,9 +264,15 @@ stackprof_stop(VALUE self)
167
264
  struct sigaction sa;
168
265
  struct itimerval timer;
169
266
 
267
+ #if STACKPROF_HAVE_ATOMICS
268
+ int was_running = __atomic_exchange_n(&_stackprof.running, 0, __ATOMIC_SEQ_CST);
269
+ if (!was_running)
270
+ return Qfalse;
271
+ #else
170
272
  if (!_stackprof.running)
171
273
  return Qfalse;
172
274
  _stackprof.running = 0;
275
+ #endif
173
276
 
174
277
  if (_stackprof.mode == sym_object) {
175
278
  rb_tracepoint_disable(objtracer);
@@ -190,13 +293,19 @@ stackprof_stop(VALUE self)
190
293
  return Qtrue;
191
294
  }
192
295
 
296
+ #if SIZEOF_VOIDP == SIZEOF_LONG
297
+ # define PTR2NUM(x) (LONG2NUM((long)(x)))
298
+ #else
299
+ # define PTR2NUM(x) (LL2NUM((LONG_LONG)(x)))
300
+ #endif
301
+
193
302
  static int
194
303
  frame_edges_i(st_data_t key, st_data_t val, st_data_t arg)
195
304
  {
196
305
  VALUE edges = (VALUE)arg;
197
306
 
198
307
  intptr_t weight = (intptr_t)val;
199
- rb_hash_aset(edges, rb_obj_id((VALUE)key), INT2FIX(weight));
308
+ rb_hash_aset(edges, PTR2NUM(key), INT2FIX(weight));
200
309
  return ST_CONTINUE;
201
310
  }
202
311
 
@@ -223,7 +332,7 @@ frame_i(st_data_t key, st_data_t val, st_data_t arg)
223
332
  VALUE name, file, edges, lines;
224
333
  VALUE line;
225
334
 
226
- rb_hash_aset(results, rb_obj_id(frame), details);
335
+ rb_hash_aset(results, PTR2NUM(frame), details);
227
336
 
228
337
  if (FIXNUM_P(frame)) {
229
338
  name = _stackprof.fake_frame_names[FIX2INT(frame)];
@@ -272,7 +381,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
272
381
  {
273
382
  VALUE results, frames;
274
383
 
275
- if (!_stackprof.frames || _stackprof.running)
384
+ if (!_stackprof.frames || STACKPROF_RUNNING())
276
385
  return Qnil;
277
386
 
278
387
  results = rb_hash_new();
@@ -295,16 +404,25 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
295
404
 
296
405
  if (_stackprof.raw && _stackprof.raw_samples_len) {
297
406
  size_t len, n, o;
298
- VALUE raw_timestamp_deltas;
407
+ VALUE raw_sample_timestamps, raw_timestamp_deltas;
299
408
  VALUE raw_samples = rb_ary_new_capa(_stackprof.raw_samples_len);
409
+ VALUE raw_lines = rb_ary_new_capa(_stackprof.raw_samples_len);
300
410
 
301
411
  for (n = 0; n < _stackprof.raw_samples_len; n++) {
302
412
  len = (size_t)_stackprof.raw_samples[n];
303
413
  rb_ary_push(raw_samples, SIZET2NUM(len));
414
+ rb_ary_push(raw_lines, SIZET2NUM(len));
415
+
416
+ for (o = 0, n++; o < len; n++, o++) {
417
+ // Line is in the upper 16 bits
418
+ rb_ary_push(raw_lines, INT2NUM(_stackprof.raw_samples[n] >> 48));
419
+
420
+ VALUE frame = _stackprof.raw_samples[n] & ~((uint64_t)0xFFFF << 48);
421
+ rb_ary_push(raw_samples, PTR2NUM(frame));
422
+ }
304
423
 
305
- for (o = 0, n++; o < len; n++, o++)
306
- rb_ary_push(raw_samples, rb_obj_id(_stackprof.raw_samples[n]));
307
424
  rb_ary_push(raw_samples, SIZET2NUM((size_t)_stackprof.raw_samples[n]));
425
+ rb_ary_push(raw_lines, SIZET2NUM((size_t)_stackprof.raw_samples[n]));
308
426
  }
309
427
 
310
428
  free(_stackprof.raw_samples);
@@ -314,18 +432,22 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
314
432
  _stackprof.raw_sample_index = 0;
315
433
 
316
434
  rb_hash_aset(results, sym_raw, raw_samples);
435
+ rb_hash_aset(results, sym_raw_lines, raw_lines);
317
436
 
318
- raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_timestamp_deltas_len);
437
+ raw_sample_timestamps = rb_ary_new_capa(_stackprof.raw_sample_times_len);
438
+ raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_sample_times_len);
319
439
 
320
- for (n = 0; n < _stackprof.raw_timestamp_deltas_len; n++) {
321
- rb_ary_push(raw_timestamp_deltas, INT2FIX(_stackprof.raw_timestamp_deltas[n]));
440
+ for (n = 0; n < _stackprof.raw_sample_times_len; n++) {
441
+ rb_ary_push(raw_sample_timestamps, ULL2NUM(_stackprof.raw_sample_times[n].timestamp_usec));
442
+ rb_ary_push(raw_timestamp_deltas, LL2NUM(_stackprof.raw_sample_times[n].delta_usec));
322
443
  }
323
444
 
324
- free(_stackprof.raw_timestamp_deltas);
325
- _stackprof.raw_timestamp_deltas = NULL;
326
- _stackprof.raw_timestamp_deltas_len = 0;
327
- _stackprof.raw_timestamp_deltas_capa = 0;
445
+ free(_stackprof.raw_sample_times);
446
+ _stackprof.raw_sample_times = NULL;
447
+ _stackprof.raw_sample_times_len = 0;
448
+ _stackprof.raw_sample_times_capa = 0;
328
449
 
450
+ rb_hash_aset(results, sym_raw_sample_timestamps, raw_sample_timestamps);
329
451
  rb_hash_aset(results, sym_raw_timestamp_deltas, raw_timestamp_deltas);
330
452
 
331
453
  _stackprof.raw = 0;
@@ -363,7 +485,7 @@ stackprof_run(int argc, VALUE *argv, VALUE self)
363
485
  static VALUE
364
486
  stackprof_running_p(VALUE self)
365
487
  {
366
- return _stackprof.running ? Qtrue : Qfalse;
488
+ return STACKPROF_RUNNING() ? Qtrue : Qfalse;
367
489
  }
368
490
 
369
491
  static inline frame_data_t *
@@ -405,14 +527,14 @@ st_numtable_increment(st_table *table, st_data_t key, size_t increment)
405
527
  }
406
528
 
407
529
  void
408
- stackprof_record_sample_for_stack(int num, int timestamp_delta)
530
+ stackprof_record_sample_for_stack(int num, uint64_t sample_timestamp, int64_t timestamp_delta)
409
531
  {
410
532
  int i, n;
411
533
  VALUE prev_frame = Qnil;
412
534
 
413
535
  _stackprof.overall_samples++;
414
536
 
415
- if (_stackprof.raw) {
537
+ if (_stackprof.raw && num > 0) {
416
538
  int found = 0;
417
539
 
418
540
  /* If there's no sample buffer allocated, then allocate one. The buffer
@@ -440,7 +562,12 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
440
562
  * in the frames buffer that came from Ruby. */
441
563
  for (i = num-1, n = 0; i >= 0; i--, n++) {
442
564
  VALUE frame = _stackprof.frames_buffer[i];
443
- if (_stackprof.raw_samples[_stackprof.raw_sample_index + 1 + n] != frame)
565
+ int line = _stackprof.lines_buffer[i];
566
+
567
+ // Encode the line in to the upper 16 bits.
568
+ uint64_t key = ((uint64_t)line << 48) | (uint64_t)frame;
569
+
570
+ if (_stackprof.raw_samples[_stackprof.raw_sample_index + 1 + n] != key)
444
571
  break;
445
572
  }
446
573
  if (i == -1) {
@@ -458,26 +585,34 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
458
585
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)num;
459
586
  for (i = num-1; i >= 0; i--) {
460
587
  VALUE frame = _stackprof.frames_buffer[i];
461
- _stackprof.raw_samples[_stackprof.raw_samples_len++] = frame;
588
+ int line = _stackprof.lines_buffer[i];
589
+
590
+ // Encode the line in to the upper 16 bits.
591
+ uint64_t key = ((uint64_t)line << 48) | (uint64_t)frame;
592
+
593
+ _stackprof.raw_samples[_stackprof.raw_samples_len++] = key;
462
594
  }
463
595
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)1;
464
596
  }
465
597
 
466
598
  /* If there's no timestamp delta buffer, allocate one */
467
- if (!_stackprof.raw_timestamp_deltas) {
468
- _stackprof.raw_timestamp_deltas_capa = 100;
469
- _stackprof.raw_timestamp_deltas = malloc(sizeof(int) * _stackprof.raw_timestamp_deltas_capa);
470
- _stackprof.raw_timestamp_deltas_len = 0;
599
+ if (!_stackprof.raw_sample_times) {
600
+ _stackprof.raw_sample_times_capa = 100;
601
+ _stackprof.raw_sample_times = malloc(sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
602
+ _stackprof.raw_sample_times_len = 0;
471
603
  }
472
604
 
473
605
  /* Double the buffer size if it's too small */
474
- while (_stackprof.raw_timestamp_deltas_capa <= _stackprof.raw_timestamp_deltas_len + 1) {
475
- _stackprof.raw_timestamp_deltas_capa *= 2;
476
- _stackprof.raw_timestamp_deltas = realloc(_stackprof.raw_timestamp_deltas, sizeof(int) * _stackprof.raw_timestamp_deltas_capa);
606
+ while (_stackprof.raw_sample_times_capa <= _stackprof.raw_sample_times_len + 1) {
607
+ _stackprof.raw_sample_times_capa *= 2;
608
+ _stackprof.raw_sample_times = realloc(_stackprof.raw_sample_times, sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
477
609
  }
478
610
 
479
- /* Store the time delta (which is the amount of time between samples) */
480
- _stackprof.raw_timestamp_deltas[_stackprof.raw_timestamp_deltas_len++] = timestamp_delta;
611
+ /* Store the time delta (which is the amount of microseconds between samples). */
612
+ _stackprof.raw_sample_times[_stackprof.raw_sample_times_len++] = (sample_time_t) {
613
+ .timestamp_usec = sample_timestamp,
614
+ .delta_usec = timestamp_delta,
615
+ };
481
616
  }
482
617
 
483
618
  for (i = 0; i < num; i++) {
@@ -510,48 +645,60 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
510
645
  }
511
646
 
512
647
  if (_stackprof.raw) {
513
- gettimeofday(&_stackprof.last_sample_at, NULL);
648
+ capture_timestamp(&_stackprof.last_sample_at);
514
649
  }
515
650
  }
516
651
 
652
+ // buffer the current profile frames
653
+ // This must be async-signal-safe
654
+ // Returns immediately if another set of frames are already in the buffer
517
655
  void
518
- stackprof_record_sample()
656
+ stackprof_buffer_sample(void)
519
657
  {
520
- int timestamp_delta = 0;
658
+ uint64_t start_timestamp = 0;
659
+ int64_t timestamp_delta = 0;
521
660
  int num;
661
+
662
+ if (_stackprof.buffer_count > 0) {
663
+ // Another sample is already pending
664
+ return;
665
+ }
666
+
522
667
  if (_stackprof.raw) {
523
- struct timeval t;
524
- struct timeval diff;
525
- gettimeofday(&t, NULL);
526
- timersub(&t, &_stackprof.last_sample_at, &diff);
527
- timestamp_delta = (1000 * diff.tv_sec) + diff.tv_usec;
668
+ struct timestamp_t t;
669
+ capture_timestamp(&t);
670
+ start_timestamp = timestamp_usec(&t);
671
+ timestamp_delta = delta_usec(&_stackprof.last_sample_at, &t);
528
672
  }
673
+
529
674
  num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);
530
- stackprof_record_sample_for_stack(num, timestamp_delta);
675
+
676
+ _stackprof.buffer_count = num;
677
+ _stackprof.buffer_time.timestamp_usec = start_timestamp;
678
+ _stackprof.buffer_time.delta_usec = timestamp_delta;
531
679
  }
532
680
 
681
+ // Postponed job
533
682
  void
534
- stackprof_record_gc_samples()
683
+ stackprof_record_gc_samples(void)
535
684
  {
536
- int delta_to_first_unrecorded_gc_sample = 0;
537
- int i;
685
+ int64_t delta_to_first_unrecorded_gc_sample = 0;
686
+ uint64_t start_timestamp = 0;
687
+ size_t i;
538
688
  if (_stackprof.raw) {
539
- struct timeval t;
540
- struct timeval diff;
541
- gettimeofday(&t, NULL);
542
- timersub(&t, &_stackprof.last_sample_at, &diff);
689
+ struct timestamp_t t = _stackprof.gc_start_timestamp;
690
+ start_timestamp = timestamp_usec(&t);
543
691
 
544
692
  // We don't know when the GC samples were actually marked, so let's
545
693
  // assume that they were marked at a perfectly regular interval.
546
- delta_to_first_unrecorded_gc_sample = (1000 * diff.tv_sec + diff.tv_usec) - (_stackprof.unrecorded_gc_samples - 1) * NUM2LONG(_stackprof.interval);
694
+ delta_to_first_unrecorded_gc_sample = delta_usec(&_stackprof.last_sample_at, &t) - (_stackprof.unrecorded_gc_samples - 1) * NUM2LONG(_stackprof.interval);
547
695
  if (delta_to_first_unrecorded_gc_sample < 0) {
548
696
  delta_to_first_unrecorded_gc_sample = 0;
549
697
  }
550
698
  }
551
699
 
552
-
553
700
  for (i = 0; i < _stackprof.unrecorded_gc_samples; i++) {
554
- int timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval);
701
+ int64_t timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval);
555
702
 
556
703
  if (_stackprof.unrecorded_gc_marking_samples) {
557
704
  _stackprof.frames_buffer[0] = FAKE_FRAME_MARK;
@@ -560,7 +707,7 @@ stackprof_record_gc_samples()
560
707
  _stackprof.lines_buffer[1] = 0;
561
708
  _stackprof.unrecorded_gc_marking_samples--;
562
709
 
563
- stackprof_record_sample_for_stack(2, timestamp_delta);
710
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
564
711
  } else if (_stackprof.unrecorded_gc_sweeping_samples) {
565
712
  _stackprof.frames_buffer[0] = FAKE_FRAME_SWEEP;
566
713
  _stackprof.lines_buffer[0] = 0;
@@ -569,11 +716,11 @@ stackprof_record_gc_samples()
569
716
 
570
717
  _stackprof.unrecorded_gc_sweeping_samples--;
571
718
 
572
- stackprof_record_sample_for_stack(2, timestamp_delta);
719
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
573
720
  } else {
574
721
  _stackprof.frames_buffer[0] = FAKE_FRAME_GC;
575
722
  _stackprof.lines_buffer[0] = 0;
576
- stackprof_record_sample_for_stack(1, timestamp_delta);
723
+ stackprof_record_sample_for_stack(1, start_timestamp, timestamp_delta);
577
724
  }
578
725
  }
579
726
  _stackprof.during_gc += _stackprof.unrecorded_gc_samples;
@@ -582,46 +729,103 @@ stackprof_record_gc_samples()
582
729
  _stackprof.unrecorded_gc_sweeping_samples = 0;
583
730
  }
584
731
 
732
+ // record the sample previously buffered by stackprof_buffer_sample
733
+ static void
734
+ stackprof_record_buffer(void)
735
+ {
736
+ stackprof_record_sample_for_stack(_stackprof.buffer_count, _stackprof.buffer_time.timestamp_usec, _stackprof.buffer_time.delta_usec);
737
+
738
+ // reset the buffer
739
+ _stackprof.buffer_count = 0;
740
+ }
741
+
742
+ static void
743
+ stackprof_sample_and_record(void)
744
+ {
745
+ stackprof_buffer_sample();
746
+ stackprof_record_buffer();
747
+ }
748
+
585
749
  static void
586
- stackprof_gc_job_handler(void *data)
750
+ stackprof_job_record_gc(void *data)
587
751
  {
588
- static int in_signal_handler = 0;
589
- if (in_signal_handler) return;
590
- if (!_stackprof.running) return;
752
+ if (!STACKPROF_RUNNING()) return;
591
753
 
592
- in_signal_handler++;
593
754
  stackprof_record_gc_samples();
594
- in_signal_handler--;
595
755
  }
596
756
 
597
757
  static void
598
- stackprof_job_handler(void *data)
758
+ stackprof_job_sample_and_record(void *data)
599
759
  {
600
- static int in_signal_handler = 0;
601
- if (in_signal_handler) return;
602
- if (!_stackprof.running) return;
760
+ if (!STACKPROF_RUNNING()) return;
603
761
 
604
- in_signal_handler++;
605
- stackprof_record_sample();
606
- in_signal_handler--;
762
+ stackprof_sample_and_record();
763
+ }
764
+
765
+ static void
766
+ stackprof_job_record_buffer(void *data)
767
+ {
768
+ if (!STACKPROF_RUNNING()) return;
769
+
770
+ stackprof_record_buffer();
607
771
  }
608
772
 
609
773
  static void
610
774
  stackprof_signal_handler(int sig, siginfo_t *sinfo, void *ucontext)
611
775
  {
776
+ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
777
+
612
778
  _stackprof.overall_signals++;
613
- if (rb_during_gc()) {
779
+
780
+ if (!STACKPROF_RUNNING()) return;
781
+
782
+ // There's a possibility that the signal handler is invoked *after* the Ruby
783
+ // VM has been shut down (e.g. after ruby_cleanup(0)). In this case, things
784
+ // that rely on global VM state (e.g. rb_during_gc) will segfault.
785
+ if (!ruby_vm_running) return;
786
+
787
+ if (_stackprof.mode == sym_wall) {
788
+ // In "wall" mode, the SIGALRM signal will arrive at an arbitrary thread.
789
+ // In order to provide more useful results, especially under threaded web
790
+ // servers, we want to forward this signal to the original thread
791
+ // StackProf was started from.
792
+ // According to POSIX.1-2008 TC1 pthread_kill and pthread_self should be
793
+ // async-signal-safe.
794
+ if (pthread_self() != _stackprof.target_thread) {
795
+ pthread_kill(_stackprof.target_thread, sig);
796
+ return;
797
+ }
798
+ } else {
799
+ if (!ruby_native_thread_p()) return;
800
+ }
801
+
802
+ if (pthread_mutex_trylock(&lock)) return;
803
+
804
+ if (!_stackprof.ignore_gc && rb_during_gc()) {
614
805
  VALUE mode = rb_gc_latest_gc_info(sym_state);
615
806
  if (mode == sym_marking) {
616
807
  _stackprof.unrecorded_gc_marking_samples++;
617
808
  } else if (mode == sym_sweeping) {
618
809
  _stackprof.unrecorded_gc_sweeping_samples++;
619
810
  }
811
+ if(!_stackprof.unrecorded_gc_samples) {
812
+ // record start
813
+ capture_timestamp(&_stackprof.gc_start_timestamp);
814
+ }
620
815
  _stackprof.unrecorded_gc_samples++;
621
- rb_postponed_job_register_one(0, stackprof_gc_job_handler, (void*)0);
816
+ rb_postponed_job_register_one(0, stackprof_job_record_gc, (void*)0);
622
817
  } else {
623
- rb_postponed_job_register_one(0, stackprof_job_handler, (void*)0);
818
+ if (stackprof_use_postponed_job) {
819
+ rb_postponed_job_register_one(0, stackprof_job_sample_and_record, (void*)0);
820
+ } else {
821
+ // Buffer a sample immediately, if an existing sample exists this will
822
+ // return immediately
823
+ stackprof_buffer_sample();
824
+ // Enqueue a job to record the sample
825
+ rb_postponed_job_register_one(0, stackprof_job_record_buffer, (void*)0);
826
+ }
624
827
  }
828
+ pthread_mutex_unlock(&lock);
625
829
  }
626
830
 
627
831
  static void
@@ -630,17 +834,17 @@ stackprof_newobj_handler(VALUE tpval, void *data)
630
834
  _stackprof.overall_signals++;
631
835
  if (RTEST(_stackprof.interval) && _stackprof.overall_signals % NUM2LONG(_stackprof.interval))
632
836
  return;
633
- stackprof_job_handler(0);
837
+ stackprof_sample_and_record();
634
838
  }
635
839
 
636
840
  static VALUE
637
841
  stackprof_sample(VALUE self)
638
842
  {
639
- if (!_stackprof.running)
843
+ if (!STACKPROF_RUNNING())
640
844
  return Qfalse;
641
845
 
642
846
  _stackprof.overall_signals++;
643
- stackprof_job_handler(0);
847
+ stackprof_sample_and_record();
644
848
  return Qtrue;
645
849
  }
646
850
 
@@ -663,13 +867,24 @@ stackprof_gc_mark(void *data)
663
867
 
664
868
  if (_stackprof.frames)
665
869
  st_foreach(_stackprof.frames, frame_mark_i, 0);
870
+
871
+ int i;
872
+ for (i = 0; i < _stackprof.buffer_count; i++) {
873
+ rb_gc_mark(_stackprof.frames_buffer[i]);
874
+ }
875
+ }
876
+
877
+ static size_t
878
+ stackprof_memsize(const void *data)
879
+ {
880
+ return sizeof(_stackprof);
666
881
  }
667
882
 
668
883
  static void
669
884
  stackprof_atfork_prepare(void)
670
885
  {
671
886
  struct itimerval timer;
672
- if (_stackprof.running) {
887
+ if (STACKPROF_RUNNING()) {
673
888
  if (_stackprof.mode == sym_wall || _stackprof.mode == sym_cpu) {
674
889
  memset(&timer, 0, sizeof(timer));
675
890
  setitimer(_stackprof.mode == sym_wall ? ITIMER_REAL : ITIMER_PROF, &timer, 0);
@@ -681,7 +896,7 @@ static void
681
896
  stackprof_atfork_parent(void)
682
897
  {
683
898
  struct itimerval timer;
684
- if (_stackprof.running) {
899
+ if (STACKPROF_RUNNING()) {
685
900
  if (_stackprof.mode == sym_wall || _stackprof.mode == sym_cpu) {
686
901
  timer.it_interval.tv_sec = 0;
687
902
  timer.it_interval.tv_usec = NUM2LONG(_stackprof.interval);
@@ -697,10 +912,41 @@ stackprof_atfork_child(void)
697
912
  stackprof_stop(rb_mStackProf);
698
913
  }
699
914
 
915
+ static VALUE
916
+ stackprof_use_postponed_job_l(VALUE self)
917
+ {
918
+ stackprof_use_postponed_job = 1;
919
+ return Qnil;
920
+ }
921
+
922
+ static void
923
+ stackprof_at_exit(ruby_vm_t* vm)
924
+ {
925
+ ruby_vm_running = 0;
926
+ }
927
+
928
+ static const rb_data_type_t stackprof_type = {
929
+ "StackProf",
930
+ {
931
+ stackprof_gc_mark,
932
+ NULL,
933
+ stackprof_memsize,
934
+ }
935
+ };
936
+
700
937
  void
701
938
  Init_stackprof(void)
702
939
  {
703
940
  size_t i;
941
+ /*
942
+ * As of Ruby 3.0, it should be safe to read stack frames at any time, unless YJIT is enabled
943
+ * See https://github.com/ruby/ruby/commit/0e276dc458f94d9d79a0f7c7669bde84abe80f21
944
+ */
945
+ stackprof_use_postponed_job = RUBY_API_VERSION_MAJOR < 3;
946
+
947
+ ruby_vm_running = 1;
948
+ ruby_vm_at_exit(stackprof_at_exit);
949
+
704
950
  #define S(name) sym_##name = ID2SYM(rb_intern(#name));
705
951
  S(object);
706
952
  S(custom);
@@ -719,9 +965,12 @@ Init_stackprof(void)
719
965
  S(mode);
720
966
  S(interval);
721
967
  S(raw);
968
+ S(raw_lines);
969
+ S(raw_sample_timestamps);
722
970
  S(raw_timestamp_deltas);
723
971
  S(out);
724
972
  S(metadata);
973
+ S(ignore_gc);
725
974
  S(frames);
726
975
  S(aggregate);
727
976
  S(state);
@@ -732,17 +981,17 @@ Init_stackprof(void)
732
981
  /* Need to run this to warm the symbol table before we call this during GC */
733
982
  rb_gc_latest_gc_info(sym_state);
734
983
 
735
- gc_hook = Data_Wrap_Struct(rb_cObject, stackprof_gc_mark, NULL, &_stackprof);
736
984
  rb_global_variable(&gc_hook);
985
+ gc_hook = TypedData_Wrap_Struct(rb_cObject, &stackprof_type, &_stackprof);
737
986
 
738
987
  _stackprof.raw_samples = NULL;
739
988
  _stackprof.raw_samples_len = 0;
740
989
  _stackprof.raw_samples_capa = 0;
741
990
  _stackprof.raw_sample_index = 0;
742
991
 
743
- _stackprof.raw_timestamp_deltas = NULL;
744
- _stackprof.raw_timestamp_deltas_len = 0;
745
- _stackprof.raw_timestamp_deltas_capa = 0;
992
+ _stackprof.raw_sample_times = NULL;
993
+ _stackprof.raw_sample_times_len = 0;
994
+ _stackprof.raw_sample_times_capa = 0;
746
995
 
747
996
  _stackprof.empty_string = rb_str_new_cstr("");
748
997
  rb_global_variable(&_stackprof.empty_string);
@@ -759,6 +1008,7 @@ Init_stackprof(void)
759
1008
  rb_define_singleton_method(rb_mStackProf, "stop", stackprof_stop, 0);
760
1009
  rb_define_singleton_method(rb_mStackProf, "results", stackprof_results, -1);
761
1010
  rb_define_singleton_method(rb_mStackProf, "sample", stackprof_sample, 0);
1011
+ rb_define_singleton_method(rb_mStackProf, "use_postponed_job!", stackprof_use_postponed_job_l, 0);
762
1012
 
763
1013
  pthread_atfork(stackprof_atfork_prepare, stackprof_atfork_parent, stackprof_atfork_child);
764
1014
  }