stackprof 0.2.10 → 0.2.25

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
@@ -7,23 +7,90 @@
  **********************************************************************/

 #include <ruby/ruby.h>
+#include <ruby/version.h>
 #include <ruby/debug.h>
 #include <ruby/st.h>
 #include <ruby/io.h>
 #include <ruby/intern.h>
+#include <ruby/vm.h>
 #include <signal.h>
 #include <sys/time.h>
+#include <time.h>
 #include <pthread.h>

 #define BUF_SIZE 2048
+#define MICROSECONDS_IN_SECOND 1000000
+#define NANOSECONDS_IN_SECOND 1000000000
+
+#define FAKE_FRAME_GC INT2FIX(0)
+#define FAKE_FRAME_MARK INT2FIX(1)
+#define FAKE_FRAME_SWEEP INT2FIX(2)
+
+static const char *fake_frame_cstrs[] = {
+    "(garbage collection)",
+    "(marking)",
+    "(sweeping)",
+};
+
+static int stackprof_use_postponed_job = 1;
+static int ruby_vm_running = 0;
+
+#define TOTAL_FAKE_FRAMES (sizeof(fake_frame_cstrs) / sizeof(char *))
+
+#ifdef _POSIX_MONOTONIC_CLOCK
+#define timestamp_t timespec
+typedef struct timestamp_t timestamp_t;
+
+static void capture_timestamp(timestamp_t *ts) {
+    clock_gettime(CLOCK_MONOTONIC, ts);
+}
+
+static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
+    int64_t result = MICROSECONDS_IN_SECOND * (end->tv_sec - start->tv_sec);
+    if (end->tv_nsec < start->tv_nsec) {
+        result -= MICROSECONDS_IN_SECOND;
+        result += (NANOSECONDS_IN_SECOND + end->tv_nsec - start->tv_nsec) / 1000;
+    } else {
+        result += (end->tv_nsec - start->tv_nsec) / 1000;
+    }
+    return result;
+}
+
+static uint64_t timestamp_usec(timestamp_t *ts) {
+    return (MICROSECONDS_IN_SECOND * ts->tv_sec) + (ts->tv_nsec / 1000);
+}
+#else
+#define timestamp_t timeval
+typedef struct timestamp_t timestamp_t;
+
+static void capture_timestamp(timestamp_t *ts) {
+    gettimeofday(ts, NULL);
+}
+
+static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
+    struct timeval diff;
+    timersub(end, start, &diff);
+    return (MICROSECONDS_IN_SECOND * diff.tv_sec) + diff.tv_usec;
+}
+
+static uint64_t timestamp_usec(timestamp_t *ts) {
+    return (MICROSECONDS_IN_SECOND * ts->tv_sec) + ts->tv_usec;
+}
+#endif
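
A note on delta_usec() above: when the end timestamp's nanosecond field is
smaller than the start's, one full second is borrowed before the nanosecond
difference is taken. A standalone sketch (plain C, not part of the diff) with
concrete numbers:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    #define MICROSECONDS_IN_SECOND 1000000
    #define NANOSECONDS_IN_SECOND 1000000000

    /* Same borrowing logic as the monotonic-clock branch of the patch. */
    static int64_t delta_usec(struct timespec *start, struct timespec *end) {
        int64_t result = MICROSECONDS_IN_SECOND * (end->tv_sec - start->tv_sec);
        if (end->tv_nsec < start->tv_nsec) {
            result -= MICROSECONDS_IN_SECOND;
            result += (NANOSECONDS_IN_SECOND + end->tv_nsec - start->tv_nsec) / 1000;
        } else {
            result += (end->tv_nsec - start->tv_nsec) / 1000;
        }
        return result;
    }

    int main(void) {
        /* 1.9s -> 2.1s: naive field-wise subtraction of tv_nsec would go negative */
        struct timespec start = { .tv_sec = 1, .tv_nsec = 900000000 };
        struct timespec end = { .tv_sec = 2, .tv_nsec = 100000000 };
        printf("%lld\n", (long long)delta_usec(&start, &end)); /* prints 200000 */
        return 0;
    }
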

 typedef struct {
     size_t total_samples;
     size_t caller_samples;
+    size_t seen_at_sample_number;
     st_table *edges;
     st_table *lines;
 } frame_data_t;

+typedef struct {
+    uint64_t timestamp_usec;
+    int64_t delta_usec;
+} sample_time_t;
+
 static struct {
     int running;
     int raw;
@@ -32,24 +99,42 @@ static struct {
     VALUE mode;
     VALUE interval;
     VALUE out;
+    VALUE metadata;
+    int ignore_gc;

     VALUE *raw_samples;
     size_t raw_samples_len;
     size_t raw_samples_capa;
     size_t raw_sample_index;

+    struct timestamp_t last_sample_at;
+    sample_time_t *raw_sample_times;
+    size_t raw_sample_times_len;
+    size_t raw_sample_times_capa;
+
     size_t overall_signals;
     size_t overall_samples;
     size_t during_gc;
+    size_t unrecorded_gc_samples;
+    size_t unrecorded_gc_marking_samples;
+    size_t unrecorded_gc_sweeping_samples;
     st_table *frames;

+    VALUE fake_frame_names[TOTAL_FAKE_FRAMES];
+    VALUE empty_string;
+
+    int buffer_count;
+    sample_time_t buffer_time;
     VALUE frames_buffer[BUF_SIZE];
     int lines_buffer[BUF_SIZE];
+
+    pthread_t target_thread;
 } _stackprof;

 static VALUE sym_object, sym_wall, sym_cpu, sym_custom, sym_name, sym_file, sym_line;
 static VALUE sym_samples, sym_total_samples, sym_missed_samples, sym_edges, sym_lines;
-static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_frames, sym_out, sym_aggregate;
+static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_metadata, sym_frames, sym_ignore_gc, sym_out;
+static VALUE sym_aggregate, sym_raw_sample_timestamps, sym_raw_timestamp_deltas, sym_state, sym_marking, sym_sweeping;
 static VALUE sym_gc_samples, objtracer;
 static VALUE gc_hook;
 static VALUE rb_mStackProf;
@@ -62,8 +147,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
 {
     struct sigaction sa;
     struct itimerval timer;
-    VALUE opts = Qnil, mode = Qnil, interval = Qnil, out = Qfalse;
+    VALUE opts = Qnil, mode = Qnil, interval = Qnil, metadata = rb_hash_new(), out = Qfalse;
+    int ignore_gc = 0;
     int raw = 0, aggregate = 1;
+    VALUE metadata_val;

     if (_stackprof.running)
         return Qfalse;
@@ -74,6 +161,17 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
     mode = rb_hash_aref(opts, sym_mode);
     interval = rb_hash_aref(opts, sym_interval);
     out = rb_hash_aref(opts, sym_out);
+    if (RTEST(rb_hash_aref(opts, sym_ignore_gc))) {
+        ignore_gc = 1;
+    }
+
+    metadata_val = rb_hash_aref(opts, sym_metadata);
+    if (RTEST(metadata_val)) {
+        if (!RB_TYPE_P(metadata_val, T_HASH))
+            rb_raise(rb_eArgError, "metadata should be a hash");
+
+        metadata = metadata_val;
+    }

     if (RTEST(rb_hash_aref(opts, sym_raw)))
         raw = 1;
@@ -82,6 +180,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
     }
     if (!RTEST(mode)) mode = sym_wall;

+    if (!NIL_P(interval) && (NUM2INT(interval) < 1 || NUM2INT(interval) >= MICROSECONDS_IN_SECOND)) {
+        rb_raise(rb_eArgError, "interval is a number of microseconds between 1 and 1 million");
+    }
+
     if (!_stackprof.frames) {
         _stackprof.frames = st_init_numtable();
         _stackprof.overall_signals = 0;
@@ -118,7 +220,14 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
     _stackprof.aggregate = aggregate;
     _stackprof.mode = mode;
     _stackprof.interval = interval;
+    _stackprof.ignore_gc = ignore_gc;
+    _stackprof.metadata = metadata;
     _stackprof.out = out;
+    _stackprof.target_thread = pthread_self();
+
+    if (raw) {
+        capture_timestamp(&_stackprof.last_sample_at);
+    }

     return Qtrue;
 }
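
Why the new bound on interval (1 to 999,999 µs): in the part of
stackprof_start() this diff does not touch, the interval is loaded into a
struct itimerval, whose tv_usec field must stay below one second. A hedged
sketch of that mapping (arm_sampling_timer is an illustrative name, not a
function in this file):

    #include <sys/time.h>

    /* Sketch: arm a repeating timer from a microsecond interval. tv_usec must
     * be in [0, 999999], hence the 1..(MICROSECONDS_IN_SECOND - 1) check. */
    static void arm_sampling_timer(long interval_usec) {
        struct itimerval timer;
        timer.it_interval.tv_sec = 0;
        timer.it_interval.tv_usec = interval_usec; /* fires repeatedly */
        timer.it_value = timer.it_interval;        /* first expiry */
        setitimer(ITIMER_REAL, &timer, 0);         /* SIGALRM, as in "wall" mode */
    }
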
@@ -152,13 +261,19 @@ stackprof_stop(VALUE self)
     return Qtrue;
 }

+#if SIZEOF_VOIDP == SIZEOF_LONG
+# define PTR2NUM(x) (LONG2NUM((long)(x)))
+#else
+# define PTR2NUM(x) (LL2NUM((LONG_LONG)(x)))
+#endif
+
 static int
 frame_edges_i(st_data_t key, st_data_t val, st_data_t arg)
 {
     VALUE edges = (VALUE)arg;

     intptr_t weight = (intptr_t)val;
-    rb_hash_aset(edges, rb_obj_id((VALUE)key), INT2FIX(weight));
+    rb_hash_aset(edges, PTR2NUM(key), INT2FIX(weight));
     return ST_CONTINUE;
 }

@@ -185,18 +300,26 @@ frame_i(st_data_t key, st_data_t val, st_data_t arg)
     VALUE name, file, edges, lines;
     VALUE line;

-    rb_hash_aset(results, rb_obj_id(frame), details);
+    rb_hash_aset(results, PTR2NUM(frame), details);

-    name = rb_profile_frame_full_label(frame);
-    rb_hash_aset(details, sym_name, name);
+    if (FIXNUM_P(frame)) {
+        name = _stackprof.fake_frame_names[FIX2INT(frame)];
+        file = _stackprof.empty_string;
+        line = INT2FIX(0);
+    } else {
+        name = rb_profile_frame_full_label(frame);

-    file = rb_profile_frame_absolute_path(frame);
-    if (NIL_P(file))
-        file = rb_profile_frame_path(frame);
-    rb_hash_aset(details, sym_file, file);
+        file = rb_profile_frame_absolute_path(frame);
+        if (NIL_P(file))
+            file = rb_profile_frame_path(frame);
+        line = rb_profile_frame_first_lineno(frame);
+    }

-    if ((line = rb_profile_frame_first_lineno(frame)) != INT2FIX(0))
+    rb_hash_aset(details, sym_name, name);
+    rb_hash_aset(details, sym_file, file);
+    if (line != INT2FIX(0)) {
         rb_hash_aset(details, sym_line, line);
+    }

     rb_hash_aset(details, sym_total_samples, SIZET2NUM(frame_data->total_samples));
     rb_hash_aset(details, sym_samples, SIZET2NUM(frame_data->caller_samples));
@@ -230,12 +353,15 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
         return Qnil;

     results = rb_hash_new();
-    rb_hash_aset(results, sym_version, DBL2NUM(1.1));
+    rb_hash_aset(results, sym_version, DBL2NUM(1.2));
     rb_hash_aset(results, sym_mode, _stackprof.mode);
     rb_hash_aset(results, sym_interval, _stackprof.interval);
     rb_hash_aset(results, sym_samples, SIZET2NUM(_stackprof.overall_samples));
     rb_hash_aset(results, sym_gc_samples, SIZET2NUM(_stackprof.during_gc));
     rb_hash_aset(results, sym_missed_samples, SIZET2NUM(_stackprof.overall_signals - _stackprof.overall_samples));
+    rb_hash_aset(results, sym_metadata, _stackprof.metadata);
+
+    _stackprof.metadata = Qnil;

     frames = rb_hash_new();
     rb_hash_aset(results, sym_frames, frames);
@@ -246,6 +372,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self)

     if (_stackprof.raw && _stackprof.raw_samples_len) {
         size_t len, n, o;
+        VALUE raw_sample_timestamps, raw_timestamp_deltas;
         VALUE raw_samples = rb_ary_new_capa(_stackprof.raw_samples_len);

         for (n = 0; n < _stackprof.raw_samples_len; n++) {
@@ -253,7 +380,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
             rb_ary_push(raw_samples, SIZET2NUM(len));

             for (o = 0, n++; o < len; n++, o++)
-                rb_ary_push(raw_samples, rb_obj_id(_stackprof.raw_samples[n]));
+                rb_ary_push(raw_samples, PTR2NUM(_stackprof.raw_samples[n]));
             rb_ary_push(raw_samples, SIZET2NUM((size_t)_stackprof.raw_samples[n]));
         }

@@ -262,9 +389,26 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
         _stackprof.raw_samples_len = 0;
         _stackprof.raw_samples_capa = 0;
         _stackprof.raw_sample_index = 0;
-        _stackprof.raw = 0;

         rb_hash_aset(results, sym_raw, raw_samples);
+
+        raw_sample_timestamps = rb_ary_new_capa(_stackprof.raw_sample_times_len);
+        raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_sample_times_len);
+
+        for (n = 0; n < _stackprof.raw_sample_times_len; n++) {
+            rb_ary_push(raw_sample_timestamps, ULL2NUM(_stackprof.raw_sample_times[n].timestamp_usec));
+            rb_ary_push(raw_timestamp_deltas, LL2NUM(_stackprof.raw_sample_times[n].delta_usec));
+        }
+
+        free(_stackprof.raw_sample_times);
+        _stackprof.raw_sample_times = NULL;
+        _stackprof.raw_sample_times_len = 0;
+        _stackprof.raw_sample_times_capa = 0;
+
+        rb_hash_aset(results, sym_raw_sample_timestamps, raw_sample_timestamps);
+        rb_hash_aset(results, sym_raw_timestamp_deltas, raw_timestamp_deltas);
+
+        _stackprof.raw = 0;
     }

     if (argc == 1)
@@ -272,11 +416,12 @@ stackprof_results(int argc, VALUE *argv, VALUE self)

     if (RTEST(_stackprof.out)) {
         VALUE file;
-        if (RB_TYPE_P(_stackprof.out, T_STRING)) {
-            file = rb_file_open_str(_stackprof.out, "w");
-        } else {
+        if (rb_respond_to(_stackprof.out, rb_intern("to_io"))) {
             file = rb_io_check_io(_stackprof.out);
+        } else {
+            file = rb_file_open_str(_stackprof.out, "w");
         }
+
         rb_marshal_dump(results, file);
         rb_io_flush(file);
         _stackprof.out = Qnil;
@@ -340,28 +485,39 @@ st_numtable_increment(st_table *table, st_data_t key, size_t increment)
 }

 void
-stackprof_record_sample()
+stackprof_record_sample_for_stack(int num, uint64_t sample_timestamp, int64_t timestamp_delta)
 {
-    int num, i, n;
+    int i, n;
     VALUE prev_frame = Qnil;

     _stackprof.overall_samples++;
-    num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);

-    if (_stackprof.raw) {
+    if (_stackprof.raw && num > 0) {
         int found = 0;

+        /* If there's no sample buffer allocated, then allocate one. The buffer
+         * format is the number of frames (num), then the list of frames (from
+         * `_stackprof.raw_samples`), followed by the number of times this
+         * particular stack has been seen in a row. Each "new" stack is added
+         * to the end of the buffer, but if the previous stack is the same as
+         * the current stack, the counter will be incremented. */
         if (!_stackprof.raw_samples) {
             _stackprof.raw_samples_capa = num * 100;
             _stackprof.raw_samples = malloc(sizeof(VALUE) * _stackprof.raw_samples_capa);
         }

-        if (_stackprof.raw_samples_capa <= _stackprof.raw_samples_len + num) {
+        /* If we can't fit all the samples in the buffer, double the buffer size. */
+        while (_stackprof.raw_samples_capa <= _stackprof.raw_samples_len + (num + 2)) {
             _stackprof.raw_samples_capa *= 2;
             _stackprof.raw_samples = realloc(_stackprof.raw_samples, sizeof(VALUE) * _stackprof.raw_samples_capa);
         }

+        /* If we've seen this stack before in the last sample, then increment the "seen" count. */
         if (_stackprof.raw_samples_len > 0 && _stackprof.raw_samples[_stackprof.raw_sample_index] == (VALUE)num) {
+            /* The number of samples could have been the same, but the stack
+             * might be different, so we need to check the stack here. Stacks
+             * in the raw buffer are stored in the opposite direction of stacks
+             * in the frames buffer that came from Ruby. */
             for (i = num-1, n = 0; i >= 0; i--, n++) {
                 VALUE frame = _stackprof.frames_buffer[i];
                 if (_stackprof.raw_samples[_stackprof.raw_sample_index + 1 + n] != frame)
@@ -373,7 +529,11 @@ stackprof_record_sample()
             }
         }

+        /* If we haven't seen the stack, then add it to the buffer along with
+         * the length of the stack and a 1 for the "seen" count */
         if (!found) {
+            /* Bump the `raw_sample_index` up so that the next iteration can
+             * find the previously recorded stack size. */
             _stackprof.raw_sample_index = _stackprof.raw_samples_len;
             _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)num;
             for (i = num-1; i >= 0; i--) {
@@ -382,6 +542,25 @@ stackprof_record_sample()
             }
             _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)1;
         }
+
+        /* If there's no timestamp delta buffer, allocate one */
+        if (!_stackprof.raw_sample_times) {
+            _stackprof.raw_sample_times_capa = 100;
+            _stackprof.raw_sample_times = malloc(sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
+            _stackprof.raw_sample_times_len = 0;
+        }
+
+        /* Double the buffer size if it's too small */
+        while (_stackprof.raw_sample_times_capa <= _stackprof.raw_sample_times_len + 1) {
+            _stackprof.raw_sample_times_capa *= 2;
+            _stackprof.raw_sample_times = realloc(_stackprof.raw_sample_times, sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
+        }
+
+        /* Store the time delta (which is the amount of microseconds between samples). */
+        _stackprof.raw_sample_times[_stackprof.raw_sample_times_len++] = (sample_time_t) {
+            .timestamp_usec = sample_timestamp,
+            .delta_usec = timestamp_delta,
+        };
     }

     for (i = 0; i < num; i++) {
@@ -389,7 +568,10 @@ stackprof_record_sample()
         VALUE frame = _stackprof.frames_buffer[i];
         frame_data_t *frame_data = sample_for(frame);

-        frame_data->total_samples++;
+        if (frame_data->seen_at_sample_number != _stackprof.overall_samples) {
+            frame_data->total_samples++;
+        }
+        frame_data->seen_at_sample_number = _stackprof.overall_samples;

         if (i == 0) {
             frame_data->caller_samples++;
@@ -400,37 +582,194 @@ stackprof_record_sample()
         }

         if (_stackprof.aggregate && line > 0) {
-            if (!frame_data->lines)
-                frame_data->lines = st_init_numtable();
             size_t half = (size_t)1<<(8*SIZEOF_SIZE_T/2);
             size_t increment = i == 0 ? half + 1 : half;
+            if (!frame_data->lines)
+                frame_data->lines = st_init_numtable();
             st_numtable_increment(frame_data->lines, (st_data_t)line, increment);
         }

         prev_frame = frame;
     }
+
+    if (_stackprof.raw) {
+        capture_timestamp(&_stackprof.last_sample_at);
+    }
+}
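
The raw buffer layout the comments above describe — [stack length, frames
root-to-leaf, repeat count], records laid end to end — can be decoded in a few
lines. A hypothetical standalone decoder (not part of the gem; uintptr_t stands
in for Ruby's VALUE):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Walk records of [stack_len, frame_1 .. frame_len, repeat_count]. */
    static void dump_raw_samples(const uintptr_t *buf, size_t len) {
        size_t n = 0;
        while (n < len) {
            size_t stack_len = (size_t)buf[n++];
            printf("stack of %zu frames:", stack_len);
            for (size_t i = 0; i < stack_len; i++)
                printf(" %#lx", (unsigned long)buf[n++]); /* frames, root first */
            printf(" x%zu\n", (size_t)buf[n++]);          /* consecutive repeats */
        }
    }

    int main(void) {
        /* two samples of the same 2-frame stack, then one 1-frame stack */
        uintptr_t buf[] = { 2, 0x100, 0x200, 2, 1, 0x100, 1 };
        dump_raw_samples(buf, sizeof(buf) / sizeof(buf[0]));
        return 0;
    }
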
+
+// buffer the current profile frames
+// This must be async-signal-safe
+// Returns immediately if another set of frames are already in the buffer
+void
+stackprof_buffer_sample(void)
+{
+    uint64_t start_timestamp = 0;
+    int64_t timestamp_delta = 0;
+    int num;
+
+    if (_stackprof.buffer_count > 0) {
+        // Another sample is already pending
+        return;
+    }
+
+    if (_stackprof.raw) {
+        struct timestamp_t t;
+        capture_timestamp(&t);
+        start_timestamp = timestamp_usec(&t);
+        timestamp_delta = delta_usec(&_stackprof.last_sample_at, &t);
+    }
+
+    num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);
+
+    _stackprof.buffer_count = num;
+    _stackprof.buffer_time.timestamp_usec = start_timestamp;
+    _stackprof.buffer_time.delta_usec = timestamp_delta;
+}
+
+void
+stackprof_record_gc_samples(void)
+{
+    int64_t delta_to_first_unrecorded_gc_sample = 0;
+    uint64_t start_timestamp = 0;
+    size_t i;
+    if (_stackprof.raw) {
+        struct timestamp_t t;
+        capture_timestamp(&t);
+        start_timestamp = timestamp_usec(&t);
+
+        // We don't know when the GC samples were actually marked, so let's
+        // assume that they were marked at a perfectly regular interval.
+        delta_to_first_unrecorded_gc_sample = delta_usec(&_stackprof.last_sample_at, &t) - (_stackprof.unrecorded_gc_samples - 1) * NUM2LONG(_stackprof.interval);
+        if (delta_to_first_unrecorded_gc_sample < 0) {
+            delta_to_first_unrecorded_gc_sample = 0;
+        }
+    }
+
+    for (i = 0; i < _stackprof.unrecorded_gc_samples; i++) {
+        int64_t timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval);
+
+        if (_stackprof.unrecorded_gc_marking_samples) {
+            _stackprof.frames_buffer[0] = FAKE_FRAME_MARK;
+            _stackprof.lines_buffer[0] = 0;
+            _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
+            _stackprof.lines_buffer[1] = 0;
+            _stackprof.unrecorded_gc_marking_samples--;
+
+            stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
+        } else if (_stackprof.unrecorded_gc_sweeping_samples) {
+            _stackprof.frames_buffer[0] = FAKE_FRAME_SWEEP;
+            _stackprof.lines_buffer[0] = 0;
+            _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
+            _stackprof.lines_buffer[1] = 0;
+
+            _stackprof.unrecorded_gc_sweeping_samples--;
+
+            stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
+        } else {
+            _stackprof.frames_buffer[0] = FAKE_FRAME_GC;
+            _stackprof.lines_buffer[0] = 0;
+            stackprof_record_sample_for_stack(1, start_timestamp, timestamp_delta);
+        }
+    }
+    _stackprof.during_gc += _stackprof.unrecorded_gc_samples;
+    _stackprof.unrecorded_gc_samples = 0;
+    _stackprof.unrecorded_gc_marking_samples = 0;
+    _stackprof.unrecorded_gc_sweeping_samples = 0;
+}
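
The back-dating above distributes the time that elapsed during GC: the first
synthetic sample absorbs whatever doesn't divide evenly, and the rest are
assumed to be exactly one interval apart. A standalone check with made-up
numbers (not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* Hypothetical: 1000 us interval, 3 timer ticks that arrived during
         * GC, 3500 us elapsed since the last recorded sample. */
        int64_t interval_usec = 1000, elapsed_usec = 3500, unrecorded = 3;
        int64_t first = elapsed_usec - (unrecorded - 1) * interval_usec;
        if (first < 0) first = 0; /* same clamp as the patch */
        for (int64_t i = 0; i < unrecorded; i++)
            printf("gc sample %lld: delta %lld us\n", (long long)i,
                   (long long)(i == 0 ? first : interval_usec));
        /* deltas 1500, 1000, 1000 -> they sum back to the 3500 us elapsed */
        return 0;
    }
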
679
+
680
+ // record the sample previously buffered by stackprof_buffer_sample
681
+ static void
682
+ stackprof_record_buffer(void)
683
+ {
684
+ stackprof_record_sample_for_stack(_stackprof.buffer_count, _stackprof.buffer_time.timestamp_usec, _stackprof.buffer_time.delta_usec);
685
+
686
+ // reset the buffer
687
+ _stackprof.buffer_count = 0;
688
+ }
689
+
690
+ static void
691
+ stackprof_sample_and_record(void)
692
+ {
693
+ stackprof_buffer_sample();
694
+ stackprof_record_buffer();
695
+ }
696
+
697
+ static void
698
+ stackprof_job_record_gc(void *data)
699
+ {
700
+ if (!_stackprof.running) return;
701
+
702
+ stackprof_record_gc_samples();
703
+ }
704
+
705
+ static void
706
+ stackprof_job_sample_and_record(void *data)
707
+ {
708
+ if (!_stackprof.running) return;
709
+
710
+ stackprof_sample_and_record();
412
711
  }
413
712
 
414
713
  static void
415
- stackprof_job_handler(void *data)
714
+ stackprof_job_record_buffer(void *data)
416
715
  {
417
- static int in_signal_handler = 0;
418
- if (in_signal_handler) return;
419
716
  if (!_stackprof.running) return;
420
717
 
421
- in_signal_handler++;
422
- stackprof_record_sample();
423
- in_signal_handler--;
718
+ stackprof_record_buffer();
424
719
  }
425
720
 
426
721
  static void
427
722
  stackprof_signal_handler(int sig, siginfo_t *sinfo, void *ucontext)
428
723
  {
724
+ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
725
+
429
726
  _stackprof.overall_signals++;
430
- if (rb_during_gc())
431
- _stackprof.during_gc++, _stackprof.overall_samples++;
432
- else
433
- rb_postponed_job_register_one(0, stackprof_job_handler, 0);
727
+
728
+ if (!_stackprof.running) return;
729
+
730
+ // There's a possibility that the signal handler is invoked *after* the Ruby
731
+ // VM has been shut down (e.g. after ruby_cleanup(0)). In this case, things
732
+ // that rely on global VM state (e.g. rb_during_gc) will segfault.
733
+ if (!ruby_vm_running) return;
734
+
735
+ if (_stackprof.mode == sym_wall) {
736
+ // In "wall" mode, the SIGALRM signal will arrive at an arbitrary thread.
737
+ // In order to provide more useful results, especially under threaded web
738
+ // servers, we want to forward this signal to the original thread
739
+ // StackProf was started from.
740
+ // According to POSIX.1-2008 TC1 pthread_kill and pthread_self should be
741
+ // async-signal-safe.
742
+ if (pthread_self() != _stackprof.target_thread) {
743
+ pthread_kill(_stackprof.target_thread, sig);
744
+ return;
745
+ }
746
+ } else {
747
+ if (!ruby_native_thread_p()) return;
748
+ }
749
+
750
+ if (pthread_mutex_trylock(&lock)) return;
751
+
752
+ if (!_stackprof.ignore_gc && rb_during_gc()) {
753
+ VALUE mode = rb_gc_latest_gc_info(sym_state);
754
+ if (mode == sym_marking) {
755
+ _stackprof.unrecorded_gc_marking_samples++;
756
+ } else if (mode == sym_sweeping) {
757
+ _stackprof.unrecorded_gc_sweeping_samples++;
758
+ }
759
+ _stackprof.unrecorded_gc_samples++;
760
+ rb_postponed_job_register_one(0, stackprof_job_record_gc, (void*)0);
761
+ } else {
762
+ if (stackprof_use_postponed_job) {
763
+ rb_postponed_job_register_one(0, stackprof_job_sample_and_record, (void*)0);
764
+ } else {
765
+ // Buffer a sample immediately, if an existing sample exists this will
766
+ // return immediately
767
+ stackprof_buffer_sample();
768
+ // Enqueue a job to record the sample
769
+ rb_postponed_job_register_one(0, stackprof_job_record_buffer, (void*)0);
770
+ }
771
+ }
772
+ pthread_mutex_unlock(&lock);
434
773
  }
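
The wall-mode forwarding above in isolation: a process-wide SIGALRM can be
delivered to any thread, so the handler re-sends it to the thread that started
the profiler, using only pthread_self() and pthread_kill() (async-signal-safe
per POSIX.1-2008 TC1) before doing anything else. A minimal sketch outside of
Ruby (illustrative names; compile with -lpthread):

    #include <pthread.h>
    #include <signal.h>

    static pthread_t target_thread;

    static void on_tick(int sig) {
        if (pthread_self() != target_thread) {
            pthread_kill(target_thread, sig); /* re-deliver on the right thread */
            return;
        }
        /* ...sample the profiled thread's stack here... */
    }

    int main(void) {
        struct sigaction sa;
        sa.sa_handler = on_tick;
        sigemptyset(&sa.sa_mask);
        sa.sa_flags = 0;
        target_thread = pthread_self();
        sigaction(SIGALRM, &sa, NULL);
        raise(SIGALRM); /* lands on this (target) thread, so no forwarding */
        return 0;
    }
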

 static void
@@ -439,7 +778,7 @@ stackprof_newobj_handler(VALUE tpval, void *data)
     _stackprof.overall_signals++;
     if (RTEST(_stackprof.interval) && _stackprof.overall_signals % NUM2LONG(_stackprof.interval))
         return;
-    stackprof_job_handler(0);
+    stackprof_sample_and_record();
 }

 static VALUE
@@ -449,7 +788,7 @@ stackprof_sample(VALUE self)
         return Qfalse;

     _stackprof.overall_signals++;
-    stackprof_job_handler(0);
+    stackprof_sample_and_record();
     return Qtrue;
 }

@@ -464,11 +803,18 @@ frame_mark_i(st_data_t key, st_data_t val, st_data_t arg)
 static void
 stackprof_gc_mark(void *data)
 {
+    if (RTEST(_stackprof.metadata))
+        rb_gc_mark(_stackprof.metadata);
+
     if (RTEST(_stackprof.out))
         rb_gc_mark(_stackprof.out);

     if (_stackprof.frames)
         st_foreach(_stackprof.frames, frame_mark_i, 0);
+
+    for (int i = 0; i < _stackprof.buffer_count; i++) {
+        rb_gc_mark(_stackprof.frames_buffer[i]);
+    }
 }

 static void
@@ -503,9 +849,32 @@ stackprof_atfork_child(void)
     stackprof_stop(rb_mStackProf);
 }

+static VALUE
+stackprof_use_postponed_job_l(VALUE self)
+{
+    stackprof_use_postponed_job = 1;
+    return Qnil;
+}
+
+static void
+stackprof_at_exit(ruby_vm_t* vm)
+{
+    ruby_vm_running = 0;
+}
+
 void
 Init_stackprof(void)
 {
+    size_t i;
+    /*
+     * As of Ruby 3.0, it should be safe to read stack frames at any time, unless YJIT is enabled
+     * See https://github.com/ruby/ruby/commit/0e276dc458f94d9d79a0f7c7669bde84abe80f21
+     */
+    stackprof_use_postponed_job = RUBY_API_VERSION_MAJOR < 3;
+
+    ruby_vm_running = 1;
+    ruby_vm_at_exit(stackprof_at_exit);
+
 #define S(name) sym_##name = ID2SYM(rb_intern(#name));
     S(object);
     S(custom);
@@ -524,14 +893,41 @@ Init_stackprof(void)
     S(mode);
     S(interval);
     S(raw);
+    S(raw_sample_timestamps);
+    S(raw_timestamp_deltas);
     S(out);
+    S(metadata);
+    S(ignore_gc);
     S(frames);
     S(aggregate);
+    S(state);
+    S(marking);
+    S(sweeping);
 #undef S

+    /* Need to run this to warm the symbol table before we call this during GC */
+    rb_gc_latest_gc_info(sym_state);
+
     gc_hook = Data_Wrap_Struct(rb_cObject, stackprof_gc_mark, NULL, &_stackprof);
     rb_global_variable(&gc_hook);

+    _stackprof.raw_samples = NULL;
+    _stackprof.raw_samples_len = 0;
+    _stackprof.raw_samples_capa = 0;
+    _stackprof.raw_sample_index = 0;
+
+    _stackprof.raw_sample_times = NULL;
+    _stackprof.raw_sample_times_len = 0;
+    _stackprof.raw_sample_times_capa = 0;
+
+    _stackprof.empty_string = rb_str_new_cstr("");
+    rb_global_variable(&_stackprof.empty_string);
+
+    for (i = 0; i < TOTAL_FAKE_FRAMES; i++) {
+        _stackprof.fake_frame_names[i] = rb_str_new_cstr(fake_frame_cstrs[i]);
+        rb_global_variable(&_stackprof.fake_frame_names[i]);
+    }
+
     rb_mStackProf = rb_define_module("StackProf");
     rb_define_singleton_method(rb_mStackProf, "running?", stackprof_running_p, 0);
     rb_define_singleton_method(rb_mStackProf, "run", stackprof_run, -1);
@@ -539,6 +935,7 @@ Init_stackprof(void)
     rb_define_singleton_method(rb_mStackProf, "stop", stackprof_stop, 0);
     rb_define_singleton_method(rb_mStackProf, "results", stackprof_results, -1);
     rb_define_singleton_method(rb_mStackProf, "sample", stackprof_sample, 0);
+    rb_define_singleton_method(rb_mStackProf, "use_postponed_job!", stackprof_use_postponed_job_l, 0);

     pthread_atfork(stackprof_atfork_prepare, stackprof_atfork_parent, stackprof_atfork_child);
 }