stackprof 0.2.10 → 0.2.25

Changes to the stackprof C extension (stackprof.c) between versions 0.2.10 and 0.2.25.
@@ -7,23 +7,90 @@
7
7
  **********************************************************************/
8
8
 
9
9
  #include <ruby/ruby.h>
10
+ #include <ruby/version.h>
10
11
  #include <ruby/debug.h>
11
12
  #include <ruby/st.h>
12
13
  #include <ruby/io.h>
13
14
  #include <ruby/intern.h>
15
+ #include <ruby/vm.h>
14
16
  #include <signal.h>
15
17
  #include <sys/time.h>
18
+ #include <time.h>
16
19
  #include <pthread.h>
17
20
 
18
21
  #define BUF_SIZE 2048
22
+ #define MICROSECONDS_IN_SECOND 1000000
23
+ #define NANOSECONDS_IN_SECOND 1000000000
24
+
25
+ #define FAKE_FRAME_GC INT2FIX(0)
26
+ #define FAKE_FRAME_MARK INT2FIX(1)
27
+ #define FAKE_FRAME_SWEEP INT2FIX(2)
28
+
29
+ static const char *fake_frame_cstrs[] = {
30
+ "(garbage collection)",
31
+ "(marking)",
32
+ "(sweeping)",
33
+ };
34
+
35
+ static int stackprof_use_postponed_job = 1;
36
+ static int ruby_vm_running = 0;
37
+
38
+ #define TOTAL_FAKE_FRAMES (sizeof(fake_frame_cstrs) / sizeof(char *))
39
+
40
+ #ifdef _POSIX_MONOTONIC_CLOCK
41
+ #define timestamp_t timespec
42
+ typedef struct timestamp_t timestamp_t;
43
+
44
+ static void capture_timestamp(timestamp_t *ts) {
45
+ clock_gettime(CLOCK_MONOTONIC, ts);
46
+ }
47
+
48
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
49
+ int64_t result = MICROSECONDS_IN_SECOND * (end->tv_sec - start->tv_sec);
50
+ if (end->tv_nsec < start->tv_nsec) {
51
+ result -= MICROSECONDS_IN_SECOND;
52
+ result += (NANOSECONDS_IN_SECOND + end->tv_nsec - start->tv_nsec) / 1000;
53
+ } else {
54
+ result += (end->tv_nsec - start->tv_nsec) / 1000;
55
+ }
56
+ return result;
57
+ }
58
+
59
+ static uint64_t timestamp_usec(timestamp_t *ts) {
60
+ return (MICROSECONDS_IN_SECOND * ts->tv_sec) + (ts->tv_nsec / 1000);
61
+ }
62
+ #else
63
+ #define timestamp_t timeval
64
+ typedef struct timestamp_t timestamp_t;
65
+
66
+ static void capture_timestamp(timestamp_t *ts) {
67
+ gettimeofday(ts, NULL);
68
+ }
69
+
70
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
71
+ struct timeval diff;
72
+ timersub(end, start, &diff);
73
+ return (MICROSECONDS_IN_SECOND * diff.tv_sec) + diff.tv_usec;
74
+ }
75
+
76
+ static uint64_t timestamp_usec(timestamp_t *ts) {
77
+ return (MICROSECONDS_IN_SECOND * ts.tv_sec) + diff.tv_usec
78
+ }
79
+ #endif
19
80
 
20
81
  typedef struct {
21
82
  size_t total_samples;
22
83
  size_t caller_samples;
84
+ size_t seen_at_sample_number;
23
85
  st_table *edges;
24
86
  st_table *lines;
25
87
  } frame_data_t;
26
88
 
89
+ typedef struct {
90
+ uint64_t timestamp_usec;
91
+ int64_t delta_usec;
92
+ } sample_time_t;
93
+
27
94
  static struct {
28
95
  int running;
29
96
  int raw;
@@ -32,24 +99,42 @@ static struct {
32
99
  VALUE mode;
33
100
  VALUE interval;
34
101
  VALUE out;
102
+ VALUE metadata;
103
+ int ignore_gc;
35
104
 
36
105
  VALUE *raw_samples;
37
106
  size_t raw_samples_len;
38
107
  size_t raw_samples_capa;
39
108
  size_t raw_sample_index;
40
109
 
110
+ struct timestamp_t last_sample_at;
111
+ sample_time_t *raw_sample_times;
112
+ size_t raw_sample_times_len;
113
+ size_t raw_sample_times_capa;
114
+
41
115
  size_t overall_signals;
42
116
  size_t overall_samples;
43
117
  size_t during_gc;
118
+ size_t unrecorded_gc_samples;
119
+ size_t unrecorded_gc_marking_samples;
120
+ size_t unrecorded_gc_sweeping_samples;
44
121
  st_table *frames;
45
122
 
123
+ VALUE fake_frame_names[TOTAL_FAKE_FRAMES];
124
+ VALUE empty_string;
125
+
126
+ int buffer_count;
127
+ sample_time_t buffer_time;
46
128
  VALUE frames_buffer[BUF_SIZE];
47
129
  int lines_buffer[BUF_SIZE];
130
+
131
+ pthread_t target_thread;
48
132
  } _stackprof;
49
133
 
50
134
  static VALUE sym_object, sym_wall, sym_cpu, sym_custom, sym_name, sym_file, sym_line;
51
135
  static VALUE sym_samples, sym_total_samples, sym_missed_samples, sym_edges, sym_lines;
52
- static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_frames, sym_out, sym_aggregate;
136
+ static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_metadata, sym_frames, sym_ignore_gc, sym_out;
137
+ static VALUE sym_aggregate, sym_raw_sample_timestamps, sym_raw_timestamp_deltas, sym_state, sym_marking, sym_sweeping;
53
138
  static VALUE sym_gc_samples, objtracer;
54
139
  static VALUE gc_hook;
55
140
  static VALUE rb_mStackProf;
@@ -62,8 +147,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
62
147
  {
63
148
  struct sigaction sa;
64
149
  struct itimerval timer;
65
- VALUE opts = Qnil, mode = Qnil, interval = Qnil, out = Qfalse;
150
+ VALUE opts = Qnil, mode = Qnil, interval = Qnil, metadata = rb_hash_new(), out = Qfalse;
151
+ int ignore_gc = 0;
66
152
  int raw = 0, aggregate = 1;
153
+ VALUE metadata_val;
67
154
 
68
155
  if (_stackprof.running)
69
156
  return Qfalse;
@@ -74,6 +161,17 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
74
161
  mode = rb_hash_aref(opts, sym_mode);
75
162
  interval = rb_hash_aref(opts, sym_interval);
76
163
  out = rb_hash_aref(opts, sym_out);
164
+ if (RTEST(rb_hash_aref(opts, sym_ignore_gc))) {
165
+ ignore_gc = 1;
166
+ }
167
+
168
+ metadata_val = rb_hash_aref(opts, sym_metadata);
169
+ if (RTEST(metadata_val)) {
170
+ if (!RB_TYPE_P(metadata_val, T_HASH))
171
+ rb_raise(rb_eArgError, "metadata should be a hash");
172
+
173
+ metadata = metadata_val;
174
+ }
77
175
 
78
176
  if (RTEST(rb_hash_aref(opts, sym_raw)))
79
177
  raw = 1;
@@ -82,6 +180,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
82
180
  }
83
181
  if (!RTEST(mode)) mode = sym_wall;
84
182
 
183
+ if (!NIL_P(interval) && (NUM2INT(interval) < 1 || NUM2INT(interval) >= MICROSECONDS_IN_SECOND)) {
184
+ rb_raise(rb_eArgError, "interval is a number of microseconds between 1 and 1 million");
185
+ }
186
+
85
187
  if (!_stackprof.frames) {
86
188
  _stackprof.frames = st_init_numtable();
87
189
  _stackprof.overall_signals = 0;
@@ -118,7 +220,14 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
118
220
  _stackprof.aggregate = aggregate;
119
221
  _stackprof.mode = mode;
120
222
  _stackprof.interval = interval;
223
+ _stackprof.ignore_gc = ignore_gc;
224
+ _stackprof.metadata = metadata;
121
225
  _stackprof.out = out;
226
+ _stackprof.target_thread = pthread_self();
227
+
228
+ if (raw) {
229
+ capture_timestamp(&_stackprof.last_sample_at);
230
+ }
122
231
 
123
232
  return Qtrue;
124
233
  }
@@ -152,13 +261,19 @@ stackprof_stop(VALUE self)
152
261
  return Qtrue;
153
262
  }
154
263
 
264
+ #if SIZEOF_VOIDP == SIZEOF_LONG
265
+ # define PTR2NUM(x) (LONG2NUM((long)(x)))
266
+ #else
267
+ # define PTR2NUM(x) (LL2NUM((LONG_LONG)(x)))
268
+ #endif
269
+
155
270
  static int
156
271
  frame_edges_i(st_data_t key, st_data_t val, st_data_t arg)
157
272
  {
158
273
  VALUE edges = (VALUE)arg;
159
274
 
160
275
  intptr_t weight = (intptr_t)val;
161
- rb_hash_aset(edges, rb_obj_id((VALUE)key), INT2FIX(weight));
276
+ rb_hash_aset(edges, PTR2NUM(key), INT2FIX(weight));
162
277
  return ST_CONTINUE;
163
278
  }
164
279
 
@@ -185,18 +300,26 @@ frame_i(st_data_t key, st_data_t val, st_data_t arg)
185
300
  VALUE name, file, edges, lines;
186
301
  VALUE line;
187
302
 
188
- rb_hash_aset(results, rb_obj_id(frame), details);
303
+ rb_hash_aset(results, PTR2NUM(frame), details);
189
304
 
190
- name = rb_profile_frame_full_label(frame);
191
- rb_hash_aset(details, sym_name, name);
305
+ if (FIXNUM_P(frame)) {
306
+ name = _stackprof.fake_frame_names[FIX2INT(frame)];
307
+ file = _stackprof.empty_string;
308
+ line = INT2FIX(0);
309
+ } else {
310
+ name = rb_profile_frame_full_label(frame);
192
311
 
193
- file = rb_profile_frame_absolute_path(frame);
194
- if (NIL_P(file))
195
- file = rb_profile_frame_path(frame);
196
- rb_hash_aset(details, sym_file, file);
312
+ file = rb_profile_frame_absolute_path(frame);
313
+ if (NIL_P(file))
314
+ file = rb_profile_frame_path(frame);
315
+ line = rb_profile_frame_first_lineno(frame);
316
+ }
197
317
 
198
- if ((line = rb_profile_frame_first_lineno(frame)) != INT2FIX(0))
318
+ rb_hash_aset(details, sym_name, name);
319
+ rb_hash_aset(details, sym_file, file);
320
+ if (line != INT2FIX(0)) {
199
321
  rb_hash_aset(details, sym_line, line);
322
+ }
200
323
 
201
324
  rb_hash_aset(details, sym_total_samples, SIZET2NUM(frame_data->total_samples));
202
325
  rb_hash_aset(details, sym_samples, SIZET2NUM(frame_data->caller_samples));
@@ -230,12 +353,15 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
230
353
  return Qnil;
231
354
 
232
355
  results = rb_hash_new();
233
- rb_hash_aset(results, sym_version, DBL2NUM(1.1));
356
+ rb_hash_aset(results, sym_version, DBL2NUM(1.2));
234
357
  rb_hash_aset(results, sym_mode, _stackprof.mode);
235
358
  rb_hash_aset(results, sym_interval, _stackprof.interval);
236
359
  rb_hash_aset(results, sym_samples, SIZET2NUM(_stackprof.overall_samples));
237
360
  rb_hash_aset(results, sym_gc_samples, SIZET2NUM(_stackprof.during_gc));
238
361
  rb_hash_aset(results, sym_missed_samples, SIZET2NUM(_stackprof.overall_signals - _stackprof.overall_samples));
362
+ rb_hash_aset(results, sym_metadata, _stackprof.metadata);
363
+
364
+ _stackprof.metadata = Qnil;
239
365
 
240
366
  frames = rb_hash_new();
241
367
  rb_hash_aset(results, sym_frames, frames);
@@ -246,6 +372,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
246
372
 
247
373
  if (_stackprof.raw && _stackprof.raw_samples_len) {
248
374
  size_t len, n, o;
375
+ VALUE raw_sample_timestamps, raw_timestamp_deltas;
249
376
  VALUE raw_samples = rb_ary_new_capa(_stackprof.raw_samples_len);
250
377
 
251
378
  for (n = 0; n < _stackprof.raw_samples_len; n++) {
@@ -253,7 +380,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
253
380
  rb_ary_push(raw_samples, SIZET2NUM(len));
254
381
 
255
382
  for (o = 0, n++; o < len; n++, o++)
256
- rb_ary_push(raw_samples, rb_obj_id(_stackprof.raw_samples[n]));
383
+ rb_ary_push(raw_samples, PTR2NUM(_stackprof.raw_samples[n]));
257
384
  rb_ary_push(raw_samples, SIZET2NUM((size_t)_stackprof.raw_samples[n]));
258
385
  }
259
386
 
@@ -262,9 +389,26 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
262
389
  _stackprof.raw_samples_len = 0;
263
390
  _stackprof.raw_samples_capa = 0;
264
391
  _stackprof.raw_sample_index = 0;
265
- _stackprof.raw = 0;
266
392
 
267
393
  rb_hash_aset(results, sym_raw, raw_samples);
394
+
395
+ raw_sample_timestamps = rb_ary_new_capa(_stackprof.raw_sample_times_len);
396
+ raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_sample_times_len);
397
+
398
+ for (n = 0; n < _stackprof.raw_sample_times_len; n++) {
399
+ rb_ary_push(raw_sample_timestamps, ULL2NUM(_stackprof.raw_sample_times[n].timestamp_usec));
400
+ rb_ary_push(raw_timestamp_deltas, LL2NUM(_stackprof.raw_sample_times[n].delta_usec));
401
+ }
402
+
403
+ free(_stackprof.raw_sample_times);
404
+ _stackprof.raw_sample_times = NULL;
405
+ _stackprof.raw_sample_times_len = 0;
406
+ _stackprof.raw_sample_times_capa = 0;
407
+
408
+ rb_hash_aset(results, sym_raw_sample_timestamps, raw_sample_timestamps);
409
+ rb_hash_aset(results, sym_raw_timestamp_deltas, raw_timestamp_deltas);
410
+
411
+ _stackprof.raw = 0;
268
412
  }
269
413
 
270
414
  if (argc == 1)
@@ -272,11 +416,12 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
272
416
 
273
417
  if (RTEST(_stackprof.out)) {
274
418
  VALUE file;
275
- if (RB_TYPE_P(_stackprof.out, T_STRING)) {
276
- file = rb_file_open_str(_stackprof.out, "w");
277
- } else {
419
+ if (rb_respond_to(_stackprof.out, rb_intern("to_io"))) {
278
420
  file = rb_io_check_io(_stackprof.out);
421
+ } else {
422
+ file = rb_file_open_str(_stackprof.out, "w");
279
423
  }
424
+
280
425
  rb_marshal_dump(results, file);
281
426
  rb_io_flush(file);
282
427
  _stackprof.out = Qnil;
@@ -340,28 +485,39 @@ st_numtable_increment(st_table *table, st_data_t key, size_t increment)
340
485
  }
341
486
 
342
487
  void
343
- stackprof_record_sample()
488
+ stackprof_record_sample_for_stack(int num, uint64_t sample_timestamp, int64_t timestamp_delta)
344
489
  {
345
- int num, i, n;
490
+ int i, n;
346
491
  VALUE prev_frame = Qnil;
347
492
 
348
493
  _stackprof.overall_samples++;
349
- num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);
350
494
 
351
- if (_stackprof.raw) {
495
+ if (_stackprof.raw && num > 0) {
352
496
  int found = 0;
353
497
 
498
+ /* If there's no sample buffer allocated, then allocate one. The buffer
499
+ * format is the number of frames (num), then the list of frames (from
500
+ * `_stackprof.raw_samples`), followed by the number of times this
501
+ * particular stack has been seen in a row. Each "new" stack is added
502
+ * to the end of the buffer, but if the previous stack is the same as
503
+ * the current stack, the counter will be incremented. */
354
504
  if (!_stackprof.raw_samples) {
355
505
  _stackprof.raw_samples_capa = num * 100;
356
506
  _stackprof.raw_samples = malloc(sizeof(VALUE) * _stackprof.raw_samples_capa);
357
507
  }
358
508
 
359
- if (_stackprof.raw_samples_capa <= _stackprof.raw_samples_len + num) {
509
+ /* If we can't fit all the samples in the buffer, double the buffer size. */
510
+ while (_stackprof.raw_samples_capa <= _stackprof.raw_samples_len + (num + 2)) {
360
511
  _stackprof.raw_samples_capa *= 2;
361
512
  _stackprof.raw_samples = realloc(_stackprof.raw_samples, sizeof(VALUE) * _stackprof.raw_samples_capa);
362
513
  }
363
514
 
515
+ /* If we've seen this stack before in the last sample, then increment the "seen" count. */
364
516
  if (_stackprof.raw_samples_len > 0 && _stackprof.raw_samples[_stackprof.raw_sample_index] == (VALUE)num) {
517
+ /* The number of samples could have been the same, but the stack
518
+ * might be different, so we need to check the stack here. Stacks
519
+ * in the raw buffer are stored in the opposite direction of stacks
520
+ * in the frames buffer that came from Ruby. */
365
521
  for (i = num-1, n = 0; i >= 0; i--, n++) {
366
522
  VALUE frame = _stackprof.frames_buffer[i];
367
523
  if (_stackprof.raw_samples[_stackprof.raw_sample_index + 1 + n] != frame)
@@ -373,7 +529,11 @@ stackprof_record_sample()
373
529
  }
374
530
  }
375
531
 
532
+ /* If we haven't seen the stack, then add it to the buffer along with
533
+ * the length of the stack and a 1 for the "seen" count */
376
534
  if (!found) {
535
+ /* Bump the `raw_sample_index` up so that the next iteration can
536
+ * find the previously recorded stack size. */
377
537
  _stackprof.raw_sample_index = _stackprof.raw_samples_len;
378
538
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)num;
379
539
  for (i = num-1; i >= 0; i--) {
@@ -382,6 +542,25 @@ stackprof_record_sample()
382
542
  }
383
543
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)1;
384
544
  }
545
+
546
+ /* If there's no timestamp delta buffer, allocate one */
547
+ if (!_stackprof.raw_sample_times) {
548
+ _stackprof.raw_sample_times_capa = 100;
549
+ _stackprof.raw_sample_times = malloc(sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
550
+ _stackprof.raw_sample_times_len = 0;
551
+ }
552
+
553
+ /* Double the buffer size if it's too small */
554
+ while (_stackprof.raw_sample_times_capa <= _stackprof.raw_sample_times_len + 1) {
555
+ _stackprof.raw_sample_times_capa *= 2;
556
+ _stackprof.raw_sample_times = realloc(_stackprof.raw_sample_times, sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
557
+ }
558
+
559
+ /* Store the time delta (which is the amount of microseconds between samples). */
560
+ _stackprof.raw_sample_times[_stackprof.raw_sample_times_len++] = (sample_time_t) {
561
+ .timestamp_usec = sample_timestamp,
562
+ .delta_usec = timestamp_delta,
563
+ };
385
564
  }
386
565
 
387
566
  for (i = 0; i < num; i++) {
@@ -389,7 +568,10 @@ stackprof_record_sample()
389
568
  VALUE frame = _stackprof.frames_buffer[i];
390
569
  frame_data_t *frame_data = sample_for(frame);
391
570
 
392
- frame_data->total_samples++;
571
+ if (frame_data->seen_at_sample_number != _stackprof.overall_samples) {
572
+ frame_data->total_samples++;
573
+ }
574
+ frame_data->seen_at_sample_number = _stackprof.overall_samples;
393
575
 
394
576
  if (i == 0) {
395
577
  frame_data->caller_samples++;
@@ -400,37 +582,194 @@ stackprof_record_sample()
400
582
  }
401
583
 
402
584
  if (_stackprof.aggregate && line > 0) {
403
- if (!frame_data->lines)
404
- frame_data->lines = st_init_numtable();
405
585
  size_t half = (size_t)1<<(8*SIZEOF_SIZE_T/2);
406
586
  size_t increment = i == 0 ? half + 1 : half;
587
+ if (!frame_data->lines)
588
+ frame_data->lines = st_init_numtable();
407
589
  st_numtable_increment(frame_data->lines, (st_data_t)line, increment);
408
590
  }
409
591
 
410
592
  prev_frame = frame;
411
593
  }
594
+
595
+ if (_stackprof.raw) {
596
+ capture_timestamp(&_stackprof.last_sample_at);
597
+ }
598
+ }
599
+
600
+ // buffer the current profile frames
601
+ // This must be async-signal-safe
602
+ // Returns immediately if another set of frames are already in the buffer
603
+ void
604
+ stackprof_buffer_sample(void)
605
+ {
606
+ uint64_t start_timestamp = 0;
607
+ int64_t timestamp_delta = 0;
608
+ int num;
609
+
610
+ if (_stackprof.buffer_count > 0) {
611
+ // Another sample is already pending
612
+ return;
613
+ }
614
+
615
+ if (_stackprof.raw) {
616
+ struct timestamp_t t;
617
+ capture_timestamp(&t);
618
+ start_timestamp = timestamp_usec(&t);
619
+ timestamp_delta = delta_usec(&_stackprof.last_sample_at, &t);
620
+ }
621
+
622
+ num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);
623
+
624
+ _stackprof.buffer_count = num;
625
+ _stackprof.buffer_time.timestamp_usec = start_timestamp;
626
+ _stackprof.buffer_time.delta_usec = timestamp_delta;
627
+ }
628
+
629
+ void
630
+ stackprof_record_gc_samples(void)
631
+ {
632
+ int64_t delta_to_first_unrecorded_gc_sample = 0;
633
+ uint64_t start_timestamp = 0;
634
+ size_t i;
635
+ if (_stackprof.raw) {
636
+ struct timestamp_t t;
637
+ capture_timestamp(&t);
638
+ start_timestamp = timestamp_usec(&t);
639
+
640
+ // We don't know when the GC samples were actually marked, so let's
641
+ // assume that they were marked at a perfectly regular interval.
642
+ delta_to_first_unrecorded_gc_sample = delta_usec(&_stackprof.last_sample_at, &t) - (_stackprof.unrecorded_gc_samples - 1) * NUM2LONG(_stackprof.interval);
643
+ if (delta_to_first_unrecorded_gc_sample < 0) {
644
+ delta_to_first_unrecorded_gc_sample = 0;
645
+ }
646
+ }
647
+
648
+ for (i = 0; i < _stackprof.unrecorded_gc_samples; i++) {
649
+ int64_t timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval);
650
+
651
+ if (_stackprof.unrecorded_gc_marking_samples) {
652
+ _stackprof.frames_buffer[0] = FAKE_FRAME_MARK;
653
+ _stackprof.lines_buffer[0] = 0;
654
+ _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
655
+ _stackprof.lines_buffer[1] = 0;
656
+ _stackprof.unrecorded_gc_marking_samples--;
657
+
658
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
659
+ } else if (_stackprof.unrecorded_gc_sweeping_samples) {
660
+ _stackprof.frames_buffer[0] = FAKE_FRAME_SWEEP;
661
+ _stackprof.lines_buffer[0] = 0;
662
+ _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
663
+ _stackprof.lines_buffer[1] = 0;
664
+
665
+ _stackprof.unrecorded_gc_sweeping_samples--;
666
+
667
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
668
+ } else {
669
+ _stackprof.frames_buffer[0] = FAKE_FRAME_GC;
670
+ _stackprof.lines_buffer[0] = 0;
671
+ stackprof_record_sample_for_stack(1, start_timestamp, timestamp_delta);
672
+ }
673
+ }
674
+ _stackprof.during_gc += _stackprof.unrecorded_gc_samples;
675
+ _stackprof.unrecorded_gc_samples = 0;
676
+ _stackprof.unrecorded_gc_marking_samples = 0;
677
+ _stackprof.unrecorded_gc_sweeping_samples = 0;
678
+ }
679
+
680
+ // record the sample previously buffered by stackprof_buffer_sample
681
+ static void
682
+ stackprof_record_buffer(void)
683
+ {
684
+ stackprof_record_sample_for_stack(_stackprof.buffer_count, _stackprof.buffer_time.timestamp_usec, _stackprof.buffer_time.delta_usec);
685
+
686
+ // reset the buffer
687
+ _stackprof.buffer_count = 0;
688
+ }
689
+
690
+ static void
691
+ stackprof_sample_and_record(void)
692
+ {
693
+ stackprof_buffer_sample();
694
+ stackprof_record_buffer();
695
+ }
696
+
697
+ static void
698
+ stackprof_job_record_gc(void *data)
699
+ {
700
+ if (!_stackprof.running) return;
701
+
702
+ stackprof_record_gc_samples();
703
+ }
704
+
705
+ static void
706
+ stackprof_job_sample_and_record(void *data)
707
+ {
708
+ if (!_stackprof.running) return;
709
+
710
+ stackprof_sample_and_record();
412
711
  }
413
712
 
414
713
  static void
415
- stackprof_job_handler(void *data)
714
+ stackprof_job_record_buffer(void *data)
416
715
  {
417
- static int in_signal_handler = 0;
418
- if (in_signal_handler) return;
419
716
  if (!_stackprof.running) return;
420
717
 
421
- in_signal_handler++;
422
- stackprof_record_sample();
423
- in_signal_handler--;
718
+ stackprof_record_buffer();
424
719
  }
425
720
 
426
721
  static void
427
722
  stackprof_signal_handler(int sig, siginfo_t *sinfo, void *ucontext)
428
723
  {
724
+ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
725
+
429
726
  _stackprof.overall_signals++;
430
- if (rb_during_gc())
431
- _stackprof.during_gc++, _stackprof.overall_samples++;
432
- else
433
- rb_postponed_job_register_one(0, stackprof_job_handler, 0);
727
+
728
+ if (!_stackprof.running) return;
729
+
730
+ // There's a possibility that the signal handler is invoked *after* the Ruby
731
+ // VM has been shut down (e.g. after ruby_cleanup(0)). In this case, things
732
+ // that rely on global VM state (e.g. rb_during_gc) will segfault.
733
+ if (!ruby_vm_running) return;
734
+
735
+ if (_stackprof.mode == sym_wall) {
736
+ // In "wall" mode, the SIGALRM signal will arrive at an arbitrary thread.
737
+ // In order to provide more useful results, especially under threaded web
738
+ // servers, we want to forward this signal to the original thread
739
+ // StackProf was started from.
740
+ // According to POSIX.1-2008 TC1 pthread_kill and pthread_self should be
741
+ // async-signal-safe.
742
+ if (pthread_self() != _stackprof.target_thread) {
743
+ pthread_kill(_stackprof.target_thread, sig);
744
+ return;
745
+ }
746
+ } else {
747
+ if (!ruby_native_thread_p()) return;
748
+ }
749
+
750
+ if (pthread_mutex_trylock(&lock)) return;
751
+
752
+ if (!_stackprof.ignore_gc && rb_during_gc()) {
753
+ VALUE mode = rb_gc_latest_gc_info(sym_state);
754
+ if (mode == sym_marking) {
755
+ _stackprof.unrecorded_gc_marking_samples++;
756
+ } else if (mode == sym_sweeping) {
757
+ _stackprof.unrecorded_gc_sweeping_samples++;
758
+ }
759
+ _stackprof.unrecorded_gc_samples++;
760
+ rb_postponed_job_register_one(0, stackprof_job_record_gc, (void*)0);
761
+ } else {
762
+ if (stackprof_use_postponed_job) {
763
+ rb_postponed_job_register_one(0, stackprof_job_sample_and_record, (void*)0);
764
+ } else {
765
+ // Buffer a sample immediately, if an existing sample exists this will
766
+ // return immediately
767
+ stackprof_buffer_sample();
768
+ // Enqueue a job to record the sample
769
+ rb_postponed_job_register_one(0, stackprof_job_record_buffer, (void*)0);
770
+ }
771
+ }
772
+ pthread_mutex_unlock(&lock);
434
773
  }
435
774
 
436
775
  static void
@@ -439,7 +778,7 @@ stackprof_newobj_handler(VALUE tpval, void *data)
439
778
  _stackprof.overall_signals++;
440
779
  if (RTEST(_stackprof.interval) && _stackprof.overall_signals % NUM2LONG(_stackprof.interval))
441
780
  return;
442
- stackprof_job_handler(0);
781
+ stackprof_sample_and_record();
443
782
  }
444
783
 
445
784
  static VALUE
@@ -449,7 +788,7 @@ stackprof_sample(VALUE self)
449
788
  return Qfalse;
450
789
 
451
790
  _stackprof.overall_signals++;
452
- stackprof_job_handler(0);
791
+ stackprof_sample_and_record();
453
792
  return Qtrue;
454
793
  }
455
794
 
@@ -464,11 +803,18 @@ frame_mark_i(st_data_t key, st_data_t val, st_data_t arg)
464
803
  static void
465
804
  stackprof_gc_mark(void *data)
466
805
  {
806
+ if (RTEST(_stackprof.metadata))
807
+ rb_gc_mark(_stackprof.metadata);
808
+
467
809
  if (RTEST(_stackprof.out))
468
810
  rb_gc_mark(_stackprof.out);
469
811
 
470
812
  if (_stackprof.frames)
471
813
  st_foreach(_stackprof.frames, frame_mark_i, 0);
814
+
815
+ for (int i = 0; i < _stackprof.buffer_count; i++) {
816
+ rb_gc_mark(_stackprof.frames_buffer[i]);
817
+ }
472
818
  }
473
819
 
474
820
  static void
@@ -503,9 +849,32 @@ stackprof_atfork_child(void)
503
849
  stackprof_stop(rb_mStackProf);
504
850
  }
505
851
 
852
+ static VALUE
853
+ stackprof_use_postponed_job_l(VALUE self)
854
+ {
855
+ stackprof_use_postponed_job = 1;
856
+ return Qnil;
857
+ }
858
+
859
+ static void
860
+ stackprof_at_exit(ruby_vm_t* vm)
861
+ {
862
+ ruby_vm_running = 0;
863
+ }
864
+
506
865
  void
507
866
  Init_stackprof(void)
508
867
  {
868
+ size_t i;
869
+ /*
870
+ * As of Ruby 3.0, it should be safe to read stack frames at any time, unless YJIT is enabled
871
+ * See https://github.com/ruby/ruby/commit/0e276dc458f94d9d79a0f7c7669bde84abe80f21
872
+ */
873
+ stackprof_use_postponed_job = RUBY_API_VERSION_MAJOR < 3;
874
+
875
+ ruby_vm_running = 1;
876
+ ruby_vm_at_exit(stackprof_at_exit);
877
+
509
878
  #define S(name) sym_##name = ID2SYM(rb_intern(#name));
510
879
  S(object);
511
880
  S(custom);
@@ -524,14 +893,41 @@ Init_stackprof(void)
524
893
  S(mode);
525
894
  S(interval);
526
895
  S(raw);
896
+ S(raw_sample_timestamps);
897
+ S(raw_timestamp_deltas);
527
898
  S(out);
899
+ S(metadata);
900
+ S(ignore_gc);
528
901
  S(frames);
529
902
  S(aggregate);
903
+ S(state);
904
+ S(marking);
905
+ S(sweeping);
530
906
  #undef S
531
907
 
908
+ /* Need to run this to warm the symbol table before we call this during GC */
909
+ rb_gc_latest_gc_info(sym_state);
910
+
532
911
  gc_hook = Data_Wrap_Struct(rb_cObject, stackprof_gc_mark, NULL, &_stackprof);
533
912
  rb_global_variable(&gc_hook);
534
913
 
914
+ _stackprof.raw_samples = NULL;
915
+ _stackprof.raw_samples_len = 0;
916
+ _stackprof.raw_samples_capa = 0;
917
+ _stackprof.raw_sample_index = 0;
918
+
919
+ _stackprof.raw_sample_times = NULL;
920
+ _stackprof.raw_sample_times_len = 0;
921
+ _stackprof.raw_sample_times_capa = 0;
922
+
923
+ _stackprof.empty_string = rb_str_new_cstr("");
924
+ rb_global_variable(&_stackprof.empty_string);
925
+
926
+ for (i = 0; i < TOTAL_FAKE_FRAMES; i++) {
927
+ _stackprof.fake_frame_names[i] = rb_str_new_cstr(fake_frame_cstrs[i]);
928
+ rb_global_variable(&_stackprof.fake_frame_names[i]);
929
+ }
930
+
535
931
  rb_mStackProf = rb_define_module("StackProf");
536
932
  rb_define_singleton_method(rb_mStackProf, "running?", stackprof_running_p, 0);
537
933
  rb_define_singleton_method(rb_mStackProf, "run", stackprof_run, -1);
@@ -539,6 +935,7 @@ Init_stackprof(void)
539
935
  rb_define_singleton_method(rb_mStackProf, "stop", stackprof_stop, 0);
540
936
  rb_define_singleton_method(rb_mStackProf, "results", stackprof_results, -1);
541
937
  rb_define_singleton_method(rb_mStackProf, "sample", stackprof_sample, 0);
938
+ rb_define_singleton_method(rb_mStackProf, "use_postponed_job!", stackprof_use_postponed_job_l, 0);
542
939
 
543
940
  pthread_atfork(stackprof_atfork_prepare, stackprof_atfork_parent, stackprof_atfork_child);
544
941
  }