stackprof 0.2.11 → 0.2.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,24 +7,90 @@
7
7
  **********************************************************************/
8
8
 
9
9
  #include <ruby/ruby.h>
10
+ #include <ruby/version.h>
10
11
  #include <ruby/debug.h>
11
12
  #include <ruby/st.h>
12
13
  #include <ruby/io.h>
13
14
  #include <ruby/intern.h>
15
+ #include <ruby/vm.h>
14
16
  #include <signal.h>
15
17
  #include <sys/time.h>
18
+ #include <time.h>
16
19
  #include <pthread.h>
17
20
 
18
21
  #define BUF_SIZE 2048
22
+ #define MICROSECONDS_IN_SECOND 1000000
23
+ #define NANOSECONDS_IN_SECOND 1000000000
24
+
25
+ #define FAKE_FRAME_GC INT2FIX(0)
26
+ #define FAKE_FRAME_MARK INT2FIX(1)
27
+ #define FAKE_FRAME_SWEEP INT2FIX(2)
28
+
29
+ static const char *fake_frame_cstrs[] = {
30
+ "(garbage collection)",
31
+ "(marking)",
32
+ "(sweeping)",
33
+ };
34
+
35
+ static int stackprof_use_postponed_job = 1;
36
+ static int ruby_vm_running = 0;
37
+
38
+ #define TOTAL_FAKE_FRAMES (sizeof(fake_frame_cstrs) / sizeof(char *))
39
+
40
+ #ifdef _POSIX_MONOTONIC_CLOCK
41
+ #define timestamp_t timespec
42
+ typedef struct timestamp_t timestamp_t;
43
+
44
+ static void capture_timestamp(timestamp_t *ts) {
45
+ clock_gettime(CLOCK_MONOTONIC, ts);
46
+ }
47
+
48
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
49
+ int64_t result = MICROSECONDS_IN_SECOND * (end->tv_sec - start->tv_sec);
50
+ if (end->tv_nsec < start->tv_nsec) {
51
+ result -= MICROSECONDS_IN_SECOND;
52
+ result += (NANOSECONDS_IN_SECOND + end->tv_nsec - start->tv_nsec) / 1000;
53
+ } else {
54
+ result += (end->tv_nsec - start->tv_nsec) / 1000;
55
+ }
56
+ return result;
57
+ }
58
+
59
+ static uint64_t timestamp_usec(timestamp_t *ts) {
60
+ return (MICROSECONDS_IN_SECOND * ts->tv_sec) + (ts->tv_nsec / 1000);
61
+ }
62
+ #else
63
+ #define timestamp_t timeval
64
+ typedef struct timestamp_t timestamp_t;
65
+
66
+ static void capture_timestamp(timestamp_t *ts) {
67
+ gettimeofday(ts, NULL);
68
+ }
69
+
70
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
71
+ struct timeval diff;
72
+ timersub(end, start, &diff);
73
+ return (MICROSECONDS_IN_SECOND * diff.tv_sec) + diff.tv_usec;
74
+ }
75
+
76
+ static uint64_t timestamp_usec(timestamp_t *ts) {
77
+ return (MICROSECONDS_IN_SECOND * ts->tv_sec) + ts->tv_usec;
78
+ }
79
+ #endif
19
80
 
20
81
  typedef struct {
21
82
  size_t total_samples;
22
83
  size_t caller_samples;
23
- int already_accounted_in_total;
84
+ size_t seen_at_sample_number;
24
85
  st_table *edges;
25
86
  st_table *lines;
26
87
  } frame_data_t;
27
88
 
89
+ typedef struct {
90
+ uint64_t timestamp_usec;
91
+ int64_t delta_usec;
92
+ } sample_time_t;
93
+
28
94
  static struct {
29
95
  int running;
30
96
  int raw;
@@ -33,33 +99,42 @@ static struct {
33
99
  VALUE mode;
34
100
  VALUE interval;
35
101
  VALUE out;
102
+ VALUE metadata;
103
+ int ignore_gc;
36
104
 
37
105
  VALUE *raw_samples;
38
106
  size_t raw_samples_len;
39
107
  size_t raw_samples_capa;
40
108
  size_t raw_sample_index;
41
109
 
42
- struct timeval last_sample_at;
43
- int *raw_timestamp_deltas;
44
- size_t raw_timestamp_deltas_len;
45
- size_t raw_timestamp_deltas_capa;
110
+ struct timestamp_t last_sample_at;
111
+ sample_time_t *raw_sample_times;
112
+ size_t raw_sample_times_len;
113
+ size_t raw_sample_times_capa;
46
114
 
47
115
  size_t overall_signals;
48
116
  size_t overall_samples;
49
117
  size_t during_gc;
50
118
  size_t unrecorded_gc_samples;
119
+ size_t unrecorded_gc_marking_samples;
120
+ size_t unrecorded_gc_sweeping_samples;
51
121
  st_table *frames;
52
122
 
53
- VALUE fake_gc_frame;
54
- VALUE fake_gc_frame_name;
123
+ VALUE fake_frame_names[TOTAL_FAKE_FRAMES];
55
124
  VALUE empty_string;
125
+
126
+ int buffer_count;
127
+ sample_time_t buffer_time;
56
128
  VALUE frames_buffer[BUF_SIZE];
57
129
  int lines_buffer[BUF_SIZE];
130
+
131
+ pthread_t target_thread;
58
132
  } _stackprof;
59
133
 
60
134
  static VALUE sym_object, sym_wall, sym_cpu, sym_custom, sym_name, sym_file, sym_line;
61
135
  static VALUE sym_samples, sym_total_samples, sym_missed_samples, sym_edges, sym_lines;
62
- static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_frames, sym_out, sym_aggregate, sym_raw_timestamp_deltas;
136
+ static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_metadata, sym_frames, sym_ignore_gc, sym_out;
137
+ static VALUE sym_aggregate, sym_raw_sample_timestamps, sym_raw_timestamp_deltas, sym_state, sym_marking, sym_sweeping;
63
138
  static VALUE sym_gc_samples, objtracer;
64
139
  static VALUE gc_hook;
65
140
  static VALUE rb_mStackProf;
@@ -72,8 +147,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
72
147
  {
73
148
  struct sigaction sa;
74
149
  struct itimerval timer;
75
- VALUE opts = Qnil, mode = Qnil, interval = Qnil, out = Qfalse;
150
+ VALUE opts = Qnil, mode = Qnil, interval = Qnil, metadata = rb_hash_new(), out = Qfalse;
151
+ int ignore_gc = 0;
76
152
  int raw = 0, aggregate = 1;
153
+ VALUE metadata_val;
77
154
 
78
155
  if (_stackprof.running)
79
156
  return Qfalse;
@@ -84,6 +161,17 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
84
161
  mode = rb_hash_aref(opts, sym_mode);
85
162
  interval = rb_hash_aref(opts, sym_interval);
86
163
  out = rb_hash_aref(opts, sym_out);
164
+ if (RTEST(rb_hash_aref(opts, sym_ignore_gc))) {
165
+ ignore_gc = 1;
166
+ }
167
+
168
+ metadata_val = rb_hash_aref(opts, sym_metadata);
169
+ if (RTEST(metadata_val)) {
170
+ if (!RB_TYPE_P(metadata_val, T_HASH))
171
+ rb_raise(rb_eArgError, "metadata should be a hash");
172
+
173
+ metadata = metadata_val;
174
+ }
87
175
 
88
176
  if (RTEST(rb_hash_aref(opts, sym_raw)))
89
177
  raw = 1;
@@ -92,6 +180,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
92
180
  }
93
181
  if (!RTEST(mode)) mode = sym_wall;
94
182
 
183
+ if (!NIL_P(interval) && (NUM2INT(interval) < 1 || NUM2INT(interval) >= MICROSECONDS_IN_SECOND)) {
184
+ rb_raise(rb_eArgError, "interval is a number of microseconds between 1 and 1 million");
185
+ }
186
+
95
187
  if (!_stackprof.frames) {
96
188
  _stackprof.frames = st_init_numtable();
97
189
  _stackprof.overall_signals = 0;
@@ -128,10 +220,13 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
128
220
  _stackprof.aggregate = aggregate;
129
221
  _stackprof.mode = mode;
130
222
  _stackprof.interval = interval;
223
+ _stackprof.ignore_gc = ignore_gc;
224
+ _stackprof.metadata = metadata;
131
225
  _stackprof.out = out;
226
+ _stackprof.target_thread = pthread_self();
132
227
 
133
228
  if (raw) {
134
- gettimeofday(&_stackprof.last_sample_at, NULL);
229
+ capture_timestamp(&_stackprof.last_sample_at);
135
230
  }
136
231
 
137
232
  return Qtrue;
@@ -166,13 +261,19 @@ stackprof_stop(VALUE self)
166
261
  return Qtrue;
167
262
  }
168
263
 
264
+ #if SIZEOF_VOIDP == SIZEOF_LONG
265
+ # define PTR2NUM(x) (LONG2NUM((long)(x)))
266
+ #else
267
+ # define PTR2NUM(x) (LL2NUM((LONG_LONG)(x)))
268
+ #endif
269
+
169
270
  static int
170
271
  frame_edges_i(st_data_t key, st_data_t val, st_data_t arg)
171
272
  {
172
273
  VALUE edges = (VALUE)arg;
173
274
 
174
275
  intptr_t weight = (intptr_t)val;
175
- rb_hash_aset(edges, rb_obj_id((VALUE)key), INT2FIX(weight));
276
+ rb_hash_aset(edges, PTR2NUM(key), INT2FIX(weight));
176
277
  return ST_CONTINUE;
177
278
  }
178
279
 
@@ -199,10 +300,10 @@ frame_i(st_data_t key, st_data_t val, st_data_t arg)
199
300
  VALUE name, file, edges, lines;
200
301
  VALUE line;
201
302
 
202
- rb_hash_aset(results, rb_obj_id(frame), details);
303
+ rb_hash_aset(results, PTR2NUM(frame), details);
203
304
 
204
- if (frame == _stackprof.fake_gc_frame) {
205
- name = _stackprof.fake_gc_frame_name;
305
+ if (FIXNUM_P(frame)) {
306
+ name = _stackprof.fake_frame_names[FIX2INT(frame)];
206
307
  file = _stackprof.empty_string;
207
308
  line = INT2FIX(0);
208
309
  } else {
@@ -258,6 +359,9 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
258
359
  rb_hash_aset(results, sym_samples, SIZET2NUM(_stackprof.overall_samples));
259
360
  rb_hash_aset(results, sym_gc_samples, SIZET2NUM(_stackprof.during_gc));
260
361
  rb_hash_aset(results, sym_missed_samples, SIZET2NUM(_stackprof.overall_signals - _stackprof.overall_samples));
362
+ rb_hash_aset(results, sym_metadata, _stackprof.metadata);
363
+
364
+ _stackprof.metadata = Qnil;
261
365
 
262
366
  frames = rb_hash_new();
263
367
  rb_hash_aset(results, sym_frames, frames);
@@ -268,6 +372,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
268
372
 
269
373
  if (_stackprof.raw && _stackprof.raw_samples_len) {
270
374
  size_t len, n, o;
375
+ VALUE raw_sample_timestamps, raw_timestamp_deltas;
271
376
  VALUE raw_samples = rb_ary_new_capa(_stackprof.raw_samples_len);
272
377
 
273
378
  for (n = 0; n < _stackprof.raw_samples_len; n++) {
@@ -275,7 +380,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
275
380
  rb_ary_push(raw_samples, SIZET2NUM(len));
276
381
 
277
382
  for (o = 0, n++; o < len; n++, o++)
278
- rb_ary_push(raw_samples, rb_obj_id(_stackprof.raw_samples[n]));
383
+ rb_ary_push(raw_samples, PTR2NUM(_stackprof.raw_samples[n]));
279
384
  rb_ary_push(raw_samples, SIZET2NUM((size_t)_stackprof.raw_samples[n]));
280
385
  }
281
386
 
@@ -287,17 +392,20 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
287
392
 
288
393
  rb_hash_aset(results, sym_raw, raw_samples);
289
394
 
290
- VALUE raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_timestamp_deltas_len);
395
+ raw_sample_timestamps = rb_ary_new_capa(_stackprof.raw_sample_times_len);
396
+ raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_sample_times_len);
291
397
 
292
- for (n = 0; n < _stackprof.raw_timestamp_deltas_len; n++) {
293
- rb_ary_push(raw_timestamp_deltas, INT2FIX(_stackprof.raw_timestamp_deltas[n]));
398
+ for (n = 0; n < _stackprof.raw_sample_times_len; n++) {
399
+ rb_ary_push(raw_sample_timestamps, ULL2NUM(_stackprof.raw_sample_times[n].timestamp_usec));
400
+ rb_ary_push(raw_timestamp_deltas, LL2NUM(_stackprof.raw_sample_times[n].delta_usec));
294
401
  }
295
402
 
296
- free(_stackprof.raw_timestamp_deltas);
297
- _stackprof.raw_timestamp_deltas = NULL;
298
- _stackprof.raw_timestamp_deltas_len = 0;
299
- _stackprof.raw_timestamp_deltas_capa = 0;
403
+ free(_stackprof.raw_sample_times);
404
+ _stackprof.raw_sample_times = NULL;
405
+ _stackprof.raw_sample_times_len = 0;
406
+ _stackprof.raw_sample_times_capa = 0;
300
407
 
408
+ rb_hash_aset(results, sym_raw_sample_timestamps, raw_sample_timestamps);
301
409
  rb_hash_aset(results, sym_raw_timestamp_deltas, raw_timestamp_deltas);
302
410
 
303
411
  _stackprof.raw = 0;
@@ -308,11 +416,12 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
308
416
 
309
417
  if (RTEST(_stackprof.out)) {
310
418
  VALUE file;
311
- if (RB_TYPE_P(_stackprof.out, T_STRING)) {
312
- file = rb_file_open_str(_stackprof.out, "w");
313
- } else {
419
+ if (rb_respond_to(_stackprof.out, rb_intern("to_io"))) {
314
420
  file = rb_io_check_io(_stackprof.out);
421
+ } else {
422
+ file = rb_file_open_str(_stackprof.out, "w");
315
423
  }
424
+
316
425
  rb_marshal_dump(results, file);
317
426
  rb_io_flush(file);
318
427
  _stackprof.out = Qnil;
@@ -376,27 +485,39 @@ st_numtable_increment(st_table *table, st_data_t key, size_t increment)
376
485
  }
377
486
 
378
487
  void
379
- stackprof_record_sample_for_stack(int num, int timestamp_delta)
488
+ stackprof_record_sample_for_stack(int num, uint64_t sample_timestamp, int64_t timestamp_delta)
380
489
  {
381
490
  int i, n;
382
491
  VALUE prev_frame = Qnil;
383
492
 
384
493
  _stackprof.overall_samples++;
385
494
 
386
- if (_stackprof.raw) {
495
+ if (_stackprof.raw && num > 0) {
387
496
  int found = 0;
388
497
 
498
+ /* If there's no sample buffer allocated, then allocate one. The buffer
499
+ * format is the number of frames (num), then the list of frames (from
500
+ * `_stackprof.raw_samples`), followed by the number of times this
501
+ * particular stack has been seen in a row. Each "new" stack is added
502
+ * to the end of the buffer, but if the previous stack is the same as
503
+ * the current stack, the counter will be incremented. */
389
504
  if (!_stackprof.raw_samples) {
390
505
  _stackprof.raw_samples_capa = num * 100;
391
506
  _stackprof.raw_samples = malloc(sizeof(VALUE) * _stackprof.raw_samples_capa);
392
507
  }
393
508
 
509
+ /* If we can't fit all the samples in the buffer, double the buffer size. */
394
510
  while (_stackprof.raw_samples_capa <= _stackprof.raw_samples_len + (num + 2)) {
395
511
  _stackprof.raw_samples_capa *= 2;
396
512
  _stackprof.raw_samples = realloc(_stackprof.raw_samples, sizeof(VALUE) * _stackprof.raw_samples_capa);
397
513
  }
398
514
 
515
+ /* If we've seen this stack before in the last sample, then increment the "seen" count. */
399
516
  if (_stackprof.raw_samples_len > 0 && _stackprof.raw_samples[_stackprof.raw_sample_index] == (VALUE)num) {
517
+ /* The number of samples could have been the same, but the stack
518
+ * might be different, so we need to check the stack here. Stacks
519
+ * in the raw buffer are stored in the opposite direction of stacks
520
+ * in the frames buffer that came from Ruby. */
400
521
  for (i = num-1, n = 0; i >= 0; i--, n++) {
401
522
  VALUE frame = _stackprof.frames_buffer[i];
402
523
  if (_stackprof.raw_samples[_stackprof.raw_sample_index + 1 + n] != frame)
@@ -408,7 +529,11 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
408
529
  }
409
530
  }
410
531
 
532
+ /* If we haven't seen the stack, then add it to the buffer along with
533
+ * the length of the stack and a 1 for the "seen" count */
411
534
  if (!found) {
535
+ /* Bump the `raw_sample_index` up so that the next iteration can
536
+ * find the previously recorded stack size. */
412
537
  _stackprof.raw_sample_index = _stackprof.raw_samples_len;
413
538
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)num;
414
539
  for (i = num-1; i >= 0; i--) {
@@ -418,23 +543,24 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
418
543
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)1;
419
544
  }
420
545
 
421
- if (!_stackprof.raw_timestamp_deltas) {
422
- _stackprof.raw_timestamp_deltas_capa = 100;
423
- _stackprof.raw_timestamp_deltas = malloc(sizeof(int) * _stackprof.raw_timestamp_deltas_capa);
424
- _stackprof.raw_timestamp_deltas_len = 0;
546
+ /* If there's no timestamp delta buffer, allocate one */
547
+ if (!_stackprof.raw_sample_times) {
548
+ _stackprof.raw_sample_times_capa = 100;
549
+ _stackprof.raw_sample_times = malloc(sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
550
+ _stackprof.raw_sample_times_len = 0;
425
551
  }
426
552
 
427
- while (_stackprof.raw_timestamp_deltas_capa <= _stackprof.raw_timestamp_deltas_len + 1) {
428
- _stackprof.raw_timestamp_deltas_capa *= 2;
429
- _stackprof.raw_timestamp_deltas = realloc(_stackprof.raw_timestamp_deltas, sizeof(int) * _stackprof.raw_timestamp_deltas_capa);
553
+ /* Double the buffer size if it's too small */
554
+ while (_stackprof.raw_sample_times_capa <= _stackprof.raw_sample_times_len + 1) {
555
+ _stackprof.raw_sample_times_capa *= 2;
556
+ _stackprof.raw_sample_times = realloc(_stackprof.raw_sample_times, sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
430
557
  }
431
558
 
432
- _stackprof.raw_timestamp_deltas[_stackprof.raw_timestamp_deltas_len++] = timestamp_delta;
433
- }
434
-
435
- for (i = 0; i < num; i++) {
436
- VALUE frame = _stackprof.frames_buffer[i];
437
- sample_for(frame)->already_accounted_in_total = 0;
559
+ /* Store the time delta (which is the amount of microseconds between samples). */
560
+ _stackprof.raw_sample_times[_stackprof.raw_sample_times_len++] = (sample_time_t) {
561
+ .timestamp_usec = sample_timestamp,
562
+ .delta_usec = timestamp_delta,
563
+ };
438
564
  }
439
565
 
440
566
  for (i = 0; i < num; i++) {
@@ -442,9 +568,10 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
442
568
  VALUE frame = _stackprof.frames_buffer[i];
443
569
  frame_data_t *frame_data = sample_for(frame);
444
570
 
445
- if (!frame_data->already_accounted_in_total)
571
+ if (frame_data->seen_at_sample_number != _stackprof.overall_samples) {
446
572
  frame_data->total_samples++;
447
- frame_data->already_accounted_in_total = 1;
573
+ }
574
+ frame_data->seen_at_sample_number = _stackprof.overall_samples;
448
575
 
449
576
  if (i == 0) {
450
577
  frame_data->caller_samples++;
@@ -455,10 +582,10 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
455
582
  }
456
583
 
457
584
  if (_stackprof.aggregate && line > 0) {
458
- if (!frame_data->lines)
459
- frame_data->lines = st_init_numtable();
460
585
  size_t half = (size_t)1<<(8*SIZEOF_SIZE_T/2);
461
586
  size_t increment = i == 0 ? half + 1 : half;
587
+ if (!frame_data->lines)
588
+ frame_data->lines = st_init_numtable();
462
589
  st_numtable_increment(frame_data->lines, (st_data_t)line, increment);
463
590
  }
464
591
 
@@ -466,90 +593,183 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
466
593
  }
467
594
 
468
595
  if (_stackprof.raw) {
469
- gettimeofday(&_stackprof.last_sample_at, NULL);
596
+ capture_timestamp(&_stackprof.last_sample_at);
470
597
  }
471
598
  }
472
599
 
600
+ // buffer the current profile frames
601
+ // This must be async-signal-safe
602
+ // Returns immediately if another set of frames are already in the buffer
473
603
  void
474
- stackprof_record_sample()
604
+ stackprof_buffer_sample(void)
475
605
  {
476
- int timestamp_delta = 0;
606
+ uint64_t start_timestamp = 0;
607
+ int64_t timestamp_delta = 0;
608
+ int num;
609
+
610
+ if (_stackprof.buffer_count > 0) {
611
+ // Another sample is already pending
612
+ return;
613
+ }
614
+
477
615
  if (_stackprof.raw) {
478
- struct timeval t;
479
- gettimeofday(&t, NULL);
480
- struct timeval diff;
481
- timersub(&t, &_stackprof.last_sample_at, &diff);
482
- timestamp_delta = (1000 * diff.tv_sec) + diff.tv_usec;
616
+ struct timestamp_t t;
617
+ capture_timestamp(&t);
618
+ start_timestamp = timestamp_usec(&t);
619
+ timestamp_delta = delta_usec(&_stackprof.last_sample_at, &t);
483
620
  }
484
- int num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);
485
- stackprof_record_sample_for_stack(num, timestamp_delta);
621
+
622
+ num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);
623
+
624
+ _stackprof.buffer_count = num;
625
+ _stackprof.buffer_time.timestamp_usec = start_timestamp;
626
+ _stackprof.buffer_time.delta_usec = timestamp_delta;
486
627
  }
487
628
 
488
629
  void
489
- stackprof_record_gc_samples()
630
+ stackprof_record_gc_samples(void)
490
631
  {
491
- int delta_to_first_unrecorded_gc_sample = 0;
632
+ int64_t delta_to_first_unrecorded_gc_sample = 0;
633
+ uint64_t start_timestamp = 0;
634
+ size_t i;
492
635
  if (_stackprof.raw) {
493
- struct timeval t;
494
- gettimeofday(&t, NULL);
495
- struct timeval diff;
496
- timersub(&t, &_stackprof.last_sample_at, &diff);
636
+ struct timestamp_t t;
637
+ capture_timestamp(&t);
638
+ start_timestamp = timestamp_usec(&t);
497
639
 
498
640
  // We don't know when the GC samples were actually marked, so let's
499
641
  // assume that they were marked at a perfectly regular interval.
500
- delta_to_first_unrecorded_gc_sample = (1000 * diff.tv_sec + diff.tv_usec) - (_stackprof.unrecorded_gc_samples - 1) * _stackprof.interval;
642
+ delta_to_first_unrecorded_gc_sample = delta_usec(&_stackprof.last_sample_at, &t) - (_stackprof.unrecorded_gc_samples - 1) * NUM2LONG(_stackprof.interval);
501
643
  if (delta_to_first_unrecorded_gc_sample < 0) {
502
644
  delta_to_first_unrecorded_gc_sample = 0;
503
645
  }
504
646
  }
505
647
 
506
- int i;
507
-
508
- _stackprof.frames_buffer[0] = _stackprof.fake_gc_frame;
509
- _stackprof.lines_buffer[0] = 0;
510
-
511
648
  for (i = 0; i < _stackprof.unrecorded_gc_samples; i++) {
512
- int timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : _stackprof.interval;
513
- stackprof_record_sample_for_stack(1, timestamp_delta);
649
+ int64_t timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval);
650
+
651
+ if (_stackprof.unrecorded_gc_marking_samples) {
652
+ _stackprof.frames_buffer[0] = FAKE_FRAME_MARK;
653
+ _stackprof.lines_buffer[0] = 0;
654
+ _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
655
+ _stackprof.lines_buffer[1] = 0;
656
+ _stackprof.unrecorded_gc_marking_samples--;
657
+
658
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
659
+ } else if (_stackprof.unrecorded_gc_sweeping_samples) {
660
+ _stackprof.frames_buffer[0] = FAKE_FRAME_SWEEP;
661
+ _stackprof.lines_buffer[0] = 0;
662
+ _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
663
+ _stackprof.lines_buffer[1] = 0;
664
+
665
+ _stackprof.unrecorded_gc_sweeping_samples--;
666
+
667
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
668
+ } else {
669
+ _stackprof.frames_buffer[0] = FAKE_FRAME_GC;
670
+ _stackprof.lines_buffer[0] = 0;
671
+ stackprof_record_sample_for_stack(1, start_timestamp, timestamp_delta);
672
+ }
514
673
  }
515
674
  _stackprof.during_gc += _stackprof.unrecorded_gc_samples;
516
675
  _stackprof.unrecorded_gc_samples = 0;
676
+ _stackprof.unrecorded_gc_marking_samples = 0;
677
+ _stackprof.unrecorded_gc_sweeping_samples = 0;
517
678
  }
518
679
 
680
+ // record the sample previously buffered by stackprof_buffer_sample
519
681
  static void
520
- stackprof_gc_job_handler(void *data)
682
+ stackprof_record_buffer(void)
683
+ {
684
+ stackprof_record_sample_for_stack(_stackprof.buffer_count, _stackprof.buffer_time.timestamp_usec, _stackprof.buffer_time.delta_usec);
685
+
686
+ // reset the buffer
687
+ _stackprof.buffer_count = 0;
688
+ }
689
+
690
+ static void
691
+ stackprof_sample_and_record(void)
692
+ {
693
+ stackprof_buffer_sample();
694
+ stackprof_record_buffer();
695
+ }
696
+
697
+ static void
698
+ stackprof_job_record_gc(void *data)
521
699
  {
522
- static int in_signal_handler = 0;
523
- if (in_signal_handler) return;
524
700
  if (!_stackprof.running) return;
525
701
 
526
- in_signal_handler++;
527
702
  stackprof_record_gc_samples();
528
- in_signal_handler--;
529
703
  }
530
704
 
531
705
  static void
532
- stackprof_job_handler(void *data)
706
+ stackprof_job_sample_and_record(void *data)
533
707
  {
534
- static int in_signal_handler = 0;
535
- if (in_signal_handler) return;
536
708
  if (!_stackprof.running) return;
537
709
 
538
- in_signal_handler++;
539
- stackprof_record_sample();
540
- in_signal_handler--;
710
+ stackprof_sample_and_record();
711
+ }
712
+
713
+ static void
714
+ stackprof_job_record_buffer(void *data)
715
+ {
716
+ if (!_stackprof.running) return;
717
+
718
+ stackprof_record_buffer();
541
719
  }
542
720
 
543
721
  static void
544
722
  stackprof_signal_handler(int sig, siginfo_t *sinfo, void *ucontext)
545
723
  {
724
+ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
725
+
546
726
  _stackprof.overall_signals++;
547
- if (rb_during_gc()) {
727
+
728
+ if (!_stackprof.running) return;
729
+
730
+ // There's a possibility that the signal handler is invoked *after* the Ruby
731
+ // VM has been shut down (e.g. after ruby_cleanup(0)). In this case, things
732
+ // that rely on global VM state (e.g. rb_during_gc) will segfault.
733
+ if (!ruby_vm_running) return;
734
+
735
+ if (_stackprof.mode == sym_wall) {
736
+ // In "wall" mode, the SIGALRM signal will arrive at an arbitrary thread.
737
+ // In order to provide more useful results, especially under threaded web
738
+ // servers, we want to forward this signal to the original thread
739
+ // StackProf was started from.
740
+ // According to POSIX.1-2008 TC1 pthread_kill and pthread_self should be
741
+ // async-signal-safe.
742
+ if (pthread_self() != _stackprof.target_thread) {
743
+ pthread_kill(_stackprof.target_thread, sig);
744
+ return;
745
+ }
746
+ } else {
747
+ if (!ruby_native_thread_p()) return;
748
+ }
749
+
750
+ if (pthread_mutex_trylock(&lock)) return;
751
+
752
+ if (!_stackprof.ignore_gc && rb_during_gc()) {
753
+ VALUE mode = rb_gc_latest_gc_info(sym_state);
754
+ if (mode == sym_marking) {
755
+ _stackprof.unrecorded_gc_marking_samples++;
756
+ } else if (mode == sym_sweeping) {
757
+ _stackprof.unrecorded_gc_sweeping_samples++;
758
+ }
548
759
  _stackprof.unrecorded_gc_samples++;
549
- rb_postponed_job_register_one(0, stackprof_gc_job_handler, (void*)0);
760
+ rb_postponed_job_register_one(0, stackprof_job_record_gc, (void*)0);
550
761
  } else {
551
- rb_postponed_job_register_one(0, stackprof_job_handler, (void*)0);
762
+ if (stackprof_use_postponed_job) {
763
+ rb_postponed_job_register_one(0, stackprof_job_sample_and_record, (void*)0);
764
+ } else {
765
+ // Buffer a sample immediately, if an existing sample exists this will
766
+ // return immediately
767
+ stackprof_buffer_sample();
768
+ // Enqueue a job to record the sample
769
+ rb_postponed_job_register_one(0, stackprof_job_record_buffer, (void*)0);
770
+ }
552
771
  }
772
+ pthread_mutex_unlock(&lock);
553
773
  }
554
774
 
555
775
  static void
@@ -558,7 +778,7 @@ stackprof_newobj_handler(VALUE tpval, void *data)
558
778
  _stackprof.overall_signals++;
559
779
  if (RTEST(_stackprof.interval) && _stackprof.overall_signals % NUM2LONG(_stackprof.interval))
560
780
  return;
561
- stackprof_job_handler(0);
781
+ stackprof_sample_and_record();
562
782
  }
563
783
 
564
784
  static VALUE
@@ -568,7 +788,7 @@ stackprof_sample(VALUE self)
568
788
  return Qfalse;
569
789
 
570
790
  _stackprof.overall_signals++;
571
- stackprof_job_handler(0);
791
+ stackprof_sample_and_record();
572
792
  return Qtrue;
573
793
  }
574
794
 
@@ -583,11 +803,18 @@ frame_mark_i(st_data_t key, st_data_t val, st_data_t arg)
583
803
  static void
584
804
  stackprof_gc_mark(void *data)
585
805
  {
806
+ if (RTEST(_stackprof.metadata))
807
+ rb_gc_mark(_stackprof.metadata);
808
+
586
809
  if (RTEST(_stackprof.out))
587
810
  rb_gc_mark(_stackprof.out);
588
811
 
589
812
  if (_stackprof.frames)
590
813
  st_foreach(_stackprof.frames, frame_mark_i, 0);
814
+
815
+ for (int i = 0; i < _stackprof.buffer_count; i++) {
816
+ rb_gc_mark(_stackprof.frames_buffer[i]);
817
+ }
591
818
  }
592
819
 
593
820
  static void
@@ -622,9 +849,32 @@ stackprof_atfork_child(void)
622
849
  stackprof_stop(rb_mStackProf);
623
850
  }
624
851
 
852
+ static VALUE
853
+ stackprof_use_postponed_job_l(VALUE self)
854
+ {
855
+ stackprof_use_postponed_job = 1;
856
+ return Qnil;
857
+ }
858
+
859
+ static void
860
+ stackprof_at_exit(ruby_vm_t* vm)
861
+ {
862
+ ruby_vm_running = 0;
863
+ }
864
+
625
865
  void
626
866
  Init_stackprof(void)
627
867
  {
868
+ size_t i;
869
+ /*
870
+ * As of Ruby 3.0, it should be safe to read stack frames at any time, unless YJIT is enabled
871
+ * See https://github.com/ruby/ruby/commit/0e276dc458f94d9d79a0f7c7669bde84abe80f21
872
+ */
873
+ stackprof_use_postponed_job = RUBY_API_VERSION_MAJOR < 3;
874
+
875
+ ruby_vm_running = 1;
876
+ ruby_vm_at_exit(stackprof_at_exit);
877
+
628
878
  #define S(name) sym_##name = ID2SYM(rb_intern(#name));
629
879
  S(object);
630
880
  S(custom);
@@ -643,12 +893,21 @@ Init_stackprof(void)
643
893
  S(mode);
644
894
  S(interval);
645
895
  S(raw);
896
+ S(raw_sample_timestamps);
646
897
  S(raw_timestamp_deltas);
647
898
  S(out);
899
+ S(metadata);
900
+ S(ignore_gc);
648
901
  S(frames);
649
902
  S(aggregate);
903
+ S(state);
904
+ S(marking);
905
+ S(sweeping);
650
906
  #undef S
651
907
 
908
+ /* Need to run this to warm the symbol table before we call this during GC */
909
+ rb_gc_latest_gc_info(sym_state);
910
+
652
911
  gc_hook = Data_Wrap_Struct(rb_cObject, stackprof_gc_mark, NULL, &_stackprof);
653
912
  rb_global_variable(&gc_hook);
654
913
 
@@ -657,16 +916,18 @@ Init_stackprof(void)
657
916
  _stackprof.raw_samples_capa = 0;
658
917
  _stackprof.raw_sample_index = 0;
659
918
 
660
- _stackprof.raw_timestamp_deltas = NULL;
661
- _stackprof.raw_timestamp_deltas_len = 0;
662
- _stackprof.raw_timestamp_deltas_capa = 0;
919
+ _stackprof.raw_sample_times = NULL;
920
+ _stackprof.raw_sample_times_len = 0;
921
+ _stackprof.raw_sample_times_capa = 0;
663
922
 
664
- _stackprof.fake_gc_frame = INT2FIX(0x9C);
665
923
  _stackprof.empty_string = rb_str_new_cstr("");
666
- _stackprof.fake_gc_frame_name = rb_str_new_cstr("(garbage collection)");
667
- rb_global_variable(&_stackprof.fake_gc_frame_name);
668
924
  rb_global_variable(&_stackprof.empty_string);
669
925
 
926
+ for (i = 0; i < TOTAL_FAKE_FRAMES; i++) {
927
+ _stackprof.fake_frame_names[i] = rb_str_new_cstr(fake_frame_cstrs[i]);
928
+ rb_global_variable(&_stackprof.fake_frame_names[i]);
929
+ }
930
+
670
931
  rb_mStackProf = rb_define_module("StackProf");
671
932
  rb_define_singleton_method(rb_mStackProf, "running?", stackprof_running_p, 0);
672
933
  rb_define_singleton_method(rb_mStackProf, "run", stackprof_run, -1);
@@ -674,6 +935,7 @@ Init_stackprof(void)
674
935
  rb_define_singleton_method(rb_mStackProf, "stop", stackprof_stop, 0);
675
936
  rb_define_singleton_method(rb_mStackProf, "results", stackprof_results, -1);
676
937
  rb_define_singleton_method(rb_mStackProf, "sample", stackprof_sample, 0);
938
+ rb_define_singleton_method(rb_mStackProf, "use_postponed_job!", stackprof_use_postponed_job_l, 0);
677
939
 
678
940
  pthread_atfork(stackprof_atfork_prepare, stackprof_atfork_parent, stackprof_atfork_child);
679
941
  }