stackprof 0.2.12 → 0.2.26

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,15 +7,76 @@
7
7
  **********************************************************************/
8
8
 
9
9
  #include <ruby/ruby.h>
10
+ #include <ruby/version.h>
10
11
  #include <ruby/debug.h>
11
12
  #include <ruby/st.h>
12
13
  #include <ruby/io.h>
13
14
  #include <ruby/intern.h>
15
+ #include <ruby/vm.h>
14
16
  #include <signal.h>
15
17
  #include <sys/time.h>
18
+ #include <time.h>
16
19
  #include <pthread.h>
17
20
 
18
21
  #define BUF_SIZE 2048
22
+ #define MICROSECONDS_IN_SECOND 1000000
23
+ #define NANOSECONDS_IN_SECOND 1000000000
24
+
25
+ #define FAKE_FRAME_GC INT2FIX(0)
26
+ #define FAKE_FRAME_MARK INT2FIX(1)
27
+ #define FAKE_FRAME_SWEEP INT2FIX(2)
28
+
29
+ static const char *fake_frame_cstrs[] = {
30
+ "(garbage collection)",
31
+ "(marking)",
32
+ "(sweeping)",
33
+ };
34
+
35
+ static int stackprof_use_postponed_job = 1;
36
+ static int ruby_vm_running = 0;
37
+
38
+ #define TOTAL_FAKE_FRAMES (sizeof(fake_frame_cstrs) / sizeof(char *))
39
+
40
+ #ifdef _POSIX_MONOTONIC_CLOCK
41
+ #define timestamp_t timespec
42
+ typedef struct timestamp_t timestamp_t;
43
+
44
+ static void capture_timestamp(timestamp_t *ts) {
45
+ clock_gettime(CLOCK_MONOTONIC, ts);
46
+ }
47
+
48
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
49
+ int64_t result = MICROSECONDS_IN_SECOND * (end->tv_sec - start->tv_sec);
50
+ if (end->tv_nsec < start->tv_nsec) {
51
+ result -= MICROSECONDS_IN_SECOND;
52
+ result += (NANOSECONDS_IN_SECOND + end->tv_nsec - start->tv_nsec) / 1000;
53
+ } else {
54
+ result += (end->tv_nsec - start->tv_nsec) / 1000;
55
+ }
56
+ return result;
57
+ }
58
+
59
+ static uint64_t timestamp_usec(timestamp_t *ts) {
60
+ return (MICROSECONDS_IN_SECOND * ts->tv_sec) + (ts->tv_nsec / 1000);
61
+ }
62
+ #else
63
+ #define timestamp_t timeval
64
+ typedef struct timestamp_t timestamp_t;
65
+
66
+ static void capture_timestamp(timestamp_t *ts) {
67
+ gettimeofday(ts, NULL);
68
+ }
69
+
70
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
71
+ struct timeval diff;
72
+ timersub(end, start, &diff);
73
+ return (MICROSECONDS_IN_SECOND * diff.tv_sec) + diff.tv_usec;
74
+ }
75
+
76
+ static uint64_t timestamp_usec(timestamp_t *ts) {
77
+ return (MICROSECONDS_IN_SECOND * ts->tv_sec) + ts->tv_usec;
78
+ }
79
+ #endif
19
80
 
20
81
  typedef struct {
21
82
  size_t total_samples;
@@ -25,6 +86,11 @@ typedef struct {
25
86
  st_table *lines;
26
87
  } frame_data_t;
27
88
 
89
+ typedef struct {
90
+ uint64_t timestamp_usec;
91
+ int64_t delta_usec;
92
+ } sample_time_t;
93
+
28
94
  static struct {
29
95
  int running;
30
96
  int raw;
@@ -33,33 +99,44 @@ static struct {
33
99
  VALUE mode;
34
100
  VALUE interval;
35
101
  VALUE out;
102
+ VALUE metadata;
103
+ int ignore_gc;
36
104
 
37
- VALUE *raw_samples;
105
+ uint64_t *raw_samples;
38
106
  size_t raw_samples_len;
39
107
  size_t raw_samples_capa;
40
108
  size_t raw_sample_index;
41
109
 
42
- struct timeval last_sample_at;
43
- int *raw_timestamp_deltas;
44
- size_t raw_timestamp_deltas_len;
45
- size_t raw_timestamp_deltas_capa;
110
+ struct timestamp_t last_sample_at;
111
+ sample_time_t *raw_sample_times;
112
+ size_t raw_sample_times_len;
113
+ size_t raw_sample_times_capa;
46
114
 
47
115
  size_t overall_signals;
48
116
  size_t overall_samples;
49
117
  size_t during_gc;
50
118
  size_t unrecorded_gc_samples;
119
+ size_t unrecorded_gc_marking_samples;
120
+ size_t unrecorded_gc_sweeping_samples;
51
121
  st_table *frames;
52
122
 
53
- VALUE fake_gc_frame;
54
- VALUE fake_gc_frame_name;
123
+ timestamp_t gc_start_timestamp;
124
+
125
+ VALUE fake_frame_names[TOTAL_FAKE_FRAMES];
55
126
  VALUE empty_string;
127
+
128
+ int buffer_count;
129
+ sample_time_t buffer_time;
56
130
  VALUE frames_buffer[BUF_SIZE];
57
131
  int lines_buffer[BUF_SIZE];
132
+
133
+ pthread_t target_thread;
58
134
  } _stackprof;
59
135
 
60
136
  static VALUE sym_object, sym_wall, sym_cpu, sym_custom, sym_name, sym_file, sym_line;
61
137
  static VALUE sym_samples, sym_total_samples, sym_missed_samples, sym_edges, sym_lines;
62
- static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_frames, sym_out, sym_aggregate, sym_raw_timestamp_deltas;
138
+ static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_raw_lines, sym_metadata, sym_frames, sym_ignore_gc, sym_out;
139
+ static VALUE sym_aggregate, sym_raw_sample_timestamps, sym_raw_timestamp_deltas, sym_state, sym_marking, sym_sweeping;
63
140
  static VALUE sym_gc_samples, objtracer;
64
141
  static VALUE gc_hook;
65
142
  static VALUE rb_mStackProf;
@@ -72,8 +149,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
72
149
  {
73
150
  struct sigaction sa;
74
151
  struct itimerval timer;
75
- VALUE opts = Qnil, mode = Qnil, interval = Qnil, out = Qfalse;
152
+ VALUE opts = Qnil, mode = Qnil, interval = Qnil, metadata = rb_hash_new(), out = Qfalse;
153
+ int ignore_gc = 0;
76
154
  int raw = 0, aggregate = 1;
155
+ VALUE metadata_val;
77
156
 
78
157
  if (_stackprof.running)
79
158
  return Qfalse;
@@ -84,6 +163,17 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
84
163
  mode = rb_hash_aref(opts, sym_mode);
85
164
  interval = rb_hash_aref(opts, sym_interval);
86
165
  out = rb_hash_aref(opts, sym_out);
166
+ if (RTEST(rb_hash_aref(opts, sym_ignore_gc))) {
167
+ ignore_gc = 1;
168
+ }
169
+
170
+ metadata_val = rb_hash_aref(opts, sym_metadata);
171
+ if (RTEST(metadata_val)) {
172
+ if (!RB_TYPE_P(metadata_val, T_HASH))
173
+ rb_raise(rb_eArgError, "metadata should be a hash");
174
+
175
+ metadata = metadata_val;
176
+ }
87
177
 
88
178
  if (RTEST(rb_hash_aref(opts, sym_raw)))
89
179
  raw = 1;
@@ -92,6 +182,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
92
182
  }
93
183
  if (!RTEST(mode)) mode = sym_wall;
94
184
 
185
+ if (!NIL_P(interval) && (NUM2INT(interval) < 1 || NUM2INT(interval) >= MICROSECONDS_IN_SECOND)) {
186
+ rb_raise(rb_eArgError, "interval is a number of microseconds between 1 and 1 million");
187
+ }
188
+
95
189
  if (!_stackprof.frames) {
96
190
  _stackprof.frames = st_init_numtable();
97
191
  _stackprof.overall_signals = 0;
@@ -128,10 +222,13 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
128
222
  _stackprof.aggregate = aggregate;
129
223
  _stackprof.mode = mode;
130
224
  _stackprof.interval = interval;
225
+ _stackprof.ignore_gc = ignore_gc;
226
+ _stackprof.metadata = metadata;
131
227
  _stackprof.out = out;
228
+ _stackprof.target_thread = pthread_self();
132
229
 
133
230
  if (raw) {
134
- gettimeofday(&_stackprof.last_sample_at, NULL);
231
+ capture_timestamp(&_stackprof.last_sample_at);
135
232
  }
136
233
 
137
234
  return Qtrue;
@@ -166,13 +263,19 @@ stackprof_stop(VALUE self)
166
263
  return Qtrue;
167
264
  }
168
265
 
266
+ #if SIZEOF_VOIDP == SIZEOF_LONG
267
+ # define PTR2NUM(x) (LONG2NUM((long)(x)))
268
+ #else
269
+ # define PTR2NUM(x) (LL2NUM((LONG_LONG)(x)))
270
+ #endif
271
+
169
272
  static int
170
273
  frame_edges_i(st_data_t key, st_data_t val, st_data_t arg)
171
274
  {
172
275
  VALUE edges = (VALUE)arg;
173
276
 
174
277
  intptr_t weight = (intptr_t)val;
175
- rb_hash_aset(edges, rb_obj_id((VALUE)key), INT2FIX(weight));
278
+ rb_hash_aset(edges, PTR2NUM(key), INT2FIX(weight));
176
279
  return ST_CONTINUE;
177
280
  }
178
281
 
@@ -199,10 +302,10 @@ frame_i(st_data_t key, st_data_t val, st_data_t arg)
199
302
  VALUE name, file, edges, lines;
200
303
  VALUE line;
201
304
 
202
- rb_hash_aset(results, rb_obj_id(frame), details);
305
+ rb_hash_aset(results, PTR2NUM(frame), details);
203
306
 
204
- if (frame == _stackprof.fake_gc_frame) {
205
- name = _stackprof.fake_gc_frame_name;
307
+ if (FIXNUM_P(frame)) {
308
+ name = _stackprof.fake_frame_names[FIX2INT(frame)];
206
309
  file = _stackprof.empty_string;
207
310
  line = INT2FIX(0);
208
311
  } else {
@@ -258,6 +361,9 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
258
361
  rb_hash_aset(results, sym_samples, SIZET2NUM(_stackprof.overall_samples));
259
362
  rb_hash_aset(results, sym_gc_samples, SIZET2NUM(_stackprof.during_gc));
260
363
  rb_hash_aset(results, sym_missed_samples, SIZET2NUM(_stackprof.overall_signals - _stackprof.overall_samples));
364
+ rb_hash_aset(results, sym_metadata, _stackprof.metadata);
365
+
366
+ _stackprof.metadata = Qnil;
261
367
 
262
368
  frames = rb_hash_new();
263
369
  rb_hash_aset(results, sym_frames, frames);
@@ -268,16 +374,25 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
268
374
 
269
375
  if (_stackprof.raw && _stackprof.raw_samples_len) {
270
376
  size_t len, n, o;
271
- VALUE raw_timestamp_deltas;
377
+ VALUE raw_sample_timestamps, raw_timestamp_deltas;
272
378
  VALUE raw_samples = rb_ary_new_capa(_stackprof.raw_samples_len);
379
+ VALUE raw_lines = rb_ary_new_capa(_stackprof.raw_samples_len);
273
380
 
274
381
  for (n = 0; n < _stackprof.raw_samples_len; n++) {
275
382
  len = (size_t)_stackprof.raw_samples[n];
276
383
  rb_ary_push(raw_samples, SIZET2NUM(len));
384
+ rb_ary_push(raw_lines, SIZET2NUM(len));
385
+
386
+ for (o = 0, n++; o < len; n++, o++) {
387
+ // Line is in the upper 16 bits
388
+ rb_ary_push(raw_lines, INT2NUM(_stackprof.raw_samples[n] >> 48));
389
+
390
+ VALUE frame = _stackprof.raw_samples[n] & ~((uint64_t)0xFFFF << 48);
391
+ rb_ary_push(raw_samples, PTR2NUM(frame));
392
+ }
277
393
 
278
- for (o = 0, n++; o < len; n++, o++)
279
- rb_ary_push(raw_samples, rb_obj_id(_stackprof.raw_samples[n]));
280
394
  rb_ary_push(raw_samples, SIZET2NUM((size_t)_stackprof.raw_samples[n]));
395
+ rb_ary_push(raw_lines, SIZET2NUM((size_t)_stackprof.raw_samples[n]));
281
396
  }
282
397
 
283
398
  free(_stackprof.raw_samples);
@@ -287,18 +402,22 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
287
402
  _stackprof.raw_sample_index = 0;
288
403
 
289
404
  rb_hash_aset(results, sym_raw, raw_samples);
405
+ rb_hash_aset(results, sym_raw_lines, raw_lines);
290
406
 
291
- raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_timestamp_deltas_len);
407
+ raw_sample_timestamps = rb_ary_new_capa(_stackprof.raw_sample_times_len);
408
+ raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_sample_times_len);
292
409
 
293
- for (n = 0; n < _stackprof.raw_timestamp_deltas_len; n++) {
294
- rb_ary_push(raw_timestamp_deltas, INT2FIX(_stackprof.raw_timestamp_deltas[n]));
410
+ for (n = 0; n < _stackprof.raw_sample_times_len; n++) {
411
+ rb_ary_push(raw_sample_timestamps, ULL2NUM(_stackprof.raw_sample_times[n].timestamp_usec));
412
+ rb_ary_push(raw_timestamp_deltas, LL2NUM(_stackprof.raw_sample_times[n].delta_usec));
295
413
  }
296
414
 
297
- free(_stackprof.raw_timestamp_deltas);
298
- _stackprof.raw_timestamp_deltas = NULL;
299
- _stackprof.raw_timestamp_deltas_len = 0;
300
- _stackprof.raw_timestamp_deltas_capa = 0;
415
+ free(_stackprof.raw_sample_times);
416
+ _stackprof.raw_sample_times = NULL;
417
+ _stackprof.raw_sample_times_len = 0;
418
+ _stackprof.raw_sample_times_capa = 0;
301
419
 
420
+ rb_hash_aset(results, sym_raw_sample_timestamps, raw_sample_timestamps);
302
421
  rb_hash_aset(results, sym_raw_timestamp_deltas, raw_timestamp_deltas);
303
422
 
304
423
  _stackprof.raw = 0;
@@ -309,11 +428,12 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
309
428
 
310
429
  if (RTEST(_stackprof.out)) {
311
430
  VALUE file;
312
- if (RB_TYPE_P(_stackprof.out, T_STRING)) {
313
- file = rb_file_open_str(_stackprof.out, "w");
314
- } else {
431
+ if (rb_respond_to(_stackprof.out, rb_intern("to_io"))) {
315
432
  file = rb_io_check_io(_stackprof.out);
433
+ } else {
434
+ file = rb_file_open_str(_stackprof.out, "w");
316
435
  }
436
+
317
437
  rb_marshal_dump(results, file);
318
438
  rb_io_flush(file);
319
439
  _stackprof.out = Qnil;
@@ -377,30 +497,47 @@ st_numtable_increment(st_table *table, st_data_t key, size_t increment)
377
497
  }
378
498
 
379
499
  void
380
- stackprof_record_sample_for_stack(int num, int timestamp_delta)
500
+ stackprof_record_sample_for_stack(int num, uint64_t sample_timestamp, int64_t timestamp_delta)
381
501
  {
382
502
  int i, n;
383
503
  VALUE prev_frame = Qnil;
384
504
 
385
505
  _stackprof.overall_samples++;
386
506
 
387
- if (_stackprof.raw) {
507
+ if (_stackprof.raw && num > 0) {
388
508
  int found = 0;
389
509
 
510
+ /* If there's no sample buffer allocated, then allocate one. The buffer
511
+ * format is the number of frames (num), then the list of frames (from
512
+ * `_stackprof.raw_samples`), followed by the number of times this
513
+ * particular stack has been seen in a row. Each "new" stack is added
514
+ * to the end of the buffer, but if the previous stack is the same as
515
+ * the current stack, the counter will be incremented. */
390
516
  if (!_stackprof.raw_samples) {
391
517
  _stackprof.raw_samples_capa = num * 100;
392
518
  _stackprof.raw_samples = malloc(sizeof(VALUE) * _stackprof.raw_samples_capa);
393
519
  }
394
520
 
521
+ /* If we can't fit all the samples in the buffer, double the buffer size. */
395
522
  while (_stackprof.raw_samples_capa <= _stackprof.raw_samples_len + (num + 2)) {
396
523
  _stackprof.raw_samples_capa *= 2;
397
524
  _stackprof.raw_samples = realloc(_stackprof.raw_samples, sizeof(VALUE) * _stackprof.raw_samples_capa);
398
525
  }
399
526
 
527
+ /* If we've seen this stack before in the last sample, then increment the "seen" count. */
400
528
  if (_stackprof.raw_samples_len > 0 && _stackprof.raw_samples[_stackprof.raw_sample_index] == (VALUE)num) {
529
+ /* The number of samples could have been the same, but the stack
530
+ * might be different, so we need to check the stack here. Stacks
531
+ * in the raw buffer are stored in the opposite direction of stacks
532
+ * in the frames buffer that came from Ruby. */
401
533
  for (i = num-1, n = 0; i >= 0; i--, n++) {
402
534
  VALUE frame = _stackprof.frames_buffer[i];
403
- if (_stackprof.raw_samples[_stackprof.raw_sample_index + 1 + n] != frame)
535
+ int line = _stackprof.lines_buffer[i];
536
+
537
+ // Encode the line in to the upper 16 bits.
538
+ uint64_t key = ((uint64_t)line << 48) | (uint64_t)frame;
539
+
540
+ if (_stackprof.raw_samples[_stackprof.raw_sample_index + 1 + n] != key)
404
541
  break;
405
542
  }
406
543
  if (i == -1) {
@@ -409,28 +546,43 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
409
546
  }
410
547
  }
411
548
 
549
+ /* If we haven't seen the stack, then add it to the buffer along with
550
+ * the length of the stack and a 1 for the "seen" count */
412
551
  if (!found) {
552
+ /* Bump the `raw_sample_index` up so that the next iteration can
553
+ * find the previously recorded stack size. */
413
554
  _stackprof.raw_sample_index = _stackprof.raw_samples_len;
414
555
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)num;
415
556
  for (i = num-1; i >= 0; i--) {
416
557
  VALUE frame = _stackprof.frames_buffer[i];
417
- _stackprof.raw_samples[_stackprof.raw_samples_len++] = frame;
558
+ int line = _stackprof.lines_buffer[i];
559
+
560
+ // Encode the line in to the upper 16 bits.
561
+ uint64_t key = ((uint64_t)line << 48) | (uint64_t)frame;
562
+
563
+ _stackprof.raw_samples[_stackprof.raw_samples_len++] = key;
418
564
  }
419
565
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)1;
420
566
  }
421
567
 
422
- if (!_stackprof.raw_timestamp_deltas) {
423
- _stackprof.raw_timestamp_deltas_capa = 100;
424
- _stackprof.raw_timestamp_deltas = malloc(sizeof(int) * _stackprof.raw_timestamp_deltas_capa);
425
- _stackprof.raw_timestamp_deltas_len = 0;
568
+ /* If there's no timestamp delta buffer, allocate one */
569
+ if (!_stackprof.raw_sample_times) {
570
+ _stackprof.raw_sample_times_capa = 100;
571
+ _stackprof.raw_sample_times = malloc(sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
572
+ _stackprof.raw_sample_times_len = 0;
426
573
  }
427
574
 
428
- while (_stackprof.raw_timestamp_deltas_capa <= _stackprof.raw_timestamp_deltas_len + 1) {
429
- _stackprof.raw_timestamp_deltas_capa *= 2;
430
- _stackprof.raw_timestamp_deltas = realloc(_stackprof.raw_timestamp_deltas, sizeof(int) * _stackprof.raw_timestamp_deltas_capa);
575
+ /* Double the buffer size if it's too small */
576
+ while (_stackprof.raw_sample_times_capa <= _stackprof.raw_sample_times_len + 1) {
577
+ _stackprof.raw_sample_times_capa *= 2;
578
+ _stackprof.raw_sample_times = realloc(_stackprof.raw_sample_times, sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
431
579
  }
432
580
 
433
- _stackprof.raw_timestamp_deltas[_stackprof.raw_timestamp_deltas_len++] = timestamp_delta;
581
+ /* Store the time delta (which is the amount of microseconds between samples). */
582
+ _stackprof.raw_sample_times[_stackprof.raw_sample_times_len++] = (sample_time_t) {
583
+ .timestamp_usec = sample_timestamp,
584
+ .delta_usec = timestamp_delta,
585
+ };
434
586
  }
435
587
 
436
588
  for (i = 0; i < num; i++) {
@@ -463,90 +615,187 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
463
615
  }
464
616
 
465
617
  if (_stackprof.raw) {
466
- gettimeofday(&_stackprof.last_sample_at, NULL);
618
+ capture_timestamp(&_stackprof.last_sample_at);
467
619
  }
468
620
  }
469
621
 
622
+ // buffer the current profile frames
623
+ // This must be async-signal-safe
624
+ // Returns immediately if another set of frames are already in the buffer
470
625
  void
471
- stackprof_record_sample()
626
+ stackprof_buffer_sample(void)
472
627
  {
473
- int timestamp_delta = 0;
628
+ uint64_t start_timestamp = 0;
629
+ int64_t timestamp_delta = 0;
474
630
  int num;
631
+
632
+ if (_stackprof.buffer_count > 0) {
633
+ // Another sample is already pending
634
+ return;
635
+ }
636
+
475
637
  if (_stackprof.raw) {
476
- struct timeval t;
477
- struct timeval diff;
478
- gettimeofday(&t, NULL);
479
- timersub(&t, &_stackprof.last_sample_at, &diff);
480
- timestamp_delta = (1000 * diff.tv_sec) + diff.tv_usec;
638
+ struct timestamp_t t;
639
+ capture_timestamp(&t);
640
+ start_timestamp = timestamp_usec(&t);
641
+ timestamp_delta = delta_usec(&_stackprof.last_sample_at, &t);
481
642
  }
643
+
482
644
  num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);
483
- stackprof_record_sample_for_stack(num, timestamp_delta);
645
+
646
+ _stackprof.buffer_count = num;
647
+ _stackprof.buffer_time.timestamp_usec = start_timestamp;
648
+ _stackprof.buffer_time.delta_usec = timestamp_delta;
484
649
  }
485
650
 
651
+ // Postponed job
486
652
  void
487
- stackprof_record_gc_samples()
653
+ stackprof_record_gc_samples(void)
488
654
  {
489
- int delta_to_first_unrecorded_gc_sample = 0;
490
- int i;
655
+ int64_t delta_to_first_unrecorded_gc_sample = 0;
656
+ uint64_t start_timestamp = 0;
657
+ size_t i;
491
658
  if (_stackprof.raw) {
492
- struct timeval t;
493
- struct timeval diff;
494
- gettimeofday(&t, NULL);
495
- timersub(&t, &_stackprof.last_sample_at, &diff);
659
+ struct timestamp_t t = _stackprof.gc_start_timestamp;
660
+ start_timestamp = timestamp_usec(&t);
496
661
 
497
662
  // We don't know when the GC samples were actually marked, so let's
498
663
  // assume that they were marked at a perfectly regular interval.
499
- delta_to_first_unrecorded_gc_sample = (1000 * diff.tv_sec + diff.tv_usec) - (_stackprof.unrecorded_gc_samples - 1) * _stackprof.interval;
664
+ delta_to_first_unrecorded_gc_sample = delta_usec(&_stackprof.last_sample_at, &t) - (_stackprof.unrecorded_gc_samples - 1) * NUM2LONG(_stackprof.interval);
500
665
  if (delta_to_first_unrecorded_gc_sample < 0) {
501
666
  delta_to_first_unrecorded_gc_sample = 0;
502
667
  }
503
668
  }
504
669
 
505
- _stackprof.frames_buffer[0] = _stackprof.fake_gc_frame;
506
- _stackprof.lines_buffer[0] = 0;
507
-
508
670
  for (i = 0; i < _stackprof.unrecorded_gc_samples; i++) {
509
- int timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : _stackprof.interval;
510
- stackprof_record_sample_for_stack(1, timestamp_delta);
671
+ int64_t timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval);
672
+
673
+ if (_stackprof.unrecorded_gc_marking_samples) {
674
+ _stackprof.frames_buffer[0] = FAKE_FRAME_MARK;
675
+ _stackprof.lines_buffer[0] = 0;
676
+ _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
677
+ _stackprof.lines_buffer[1] = 0;
678
+ _stackprof.unrecorded_gc_marking_samples--;
679
+
680
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
681
+ } else if (_stackprof.unrecorded_gc_sweeping_samples) {
682
+ _stackprof.frames_buffer[0] = FAKE_FRAME_SWEEP;
683
+ _stackprof.lines_buffer[0] = 0;
684
+ _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
685
+ _stackprof.lines_buffer[1] = 0;
686
+
687
+ _stackprof.unrecorded_gc_sweeping_samples--;
688
+
689
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
690
+ } else {
691
+ _stackprof.frames_buffer[0] = FAKE_FRAME_GC;
692
+ _stackprof.lines_buffer[0] = 0;
693
+ stackprof_record_sample_for_stack(1, start_timestamp, timestamp_delta);
694
+ }
511
695
  }
512
696
  _stackprof.during_gc += _stackprof.unrecorded_gc_samples;
513
697
  _stackprof.unrecorded_gc_samples = 0;
698
+ _stackprof.unrecorded_gc_marking_samples = 0;
699
+ _stackprof.unrecorded_gc_sweeping_samples = 0;
700
+ }
701
+
702
+ // record the sample previously buffered by stackprof_buffer_sample
703
+ static void
704
+ stackprof_record_buffer(void)
705
+ {
706
+ stackprof_record_sample_for_stack(_stackprof.buffer_count, _stackprof.buffer_time.timestamp_usec, _stackprof.buffer_time.delta_usec);
707
+
708
+ // reset the buffer
709
+ _stackprof.buffer_count = 0;
514
710
  }
515
711
 
516
712
  static void
517
- stackprof_gc_job_handler(void *data)
713
+ stackprof_sample_and_record(void)
714
+ {
715
+ stackprof_buffer_sample();
716
+ stackprof_record_buffer();
717
+ }
718
+
719
+ static void
720
+ stackprof_job_record_gc(void *data)
518
721
  {
519
- static int in_signal_handler = 0;
520
- if (in_signal_handler) return;
521
722
  if (!_stackprof.running) return;
522
723
 
523
- in_signal_handler++;
524
724
  stackprof_record_gc_samples();
525
- in_signal_handler--;
526
725
  }
527
726
 
528
727
  static void
529
- stackprof_job_handler(void *data)
728
+ stackprof_job_sample_and_record(void *data)
729
+ {
730
+ if (!_stackprof.running) return;
731
+
732
+ stackprof_sample_and_record();
733
+ }
734
+
735
+ static void
736
+ stackprof_job_record_buffer(void *data)
530
737
  {
531
- static int in_signal_handler = 0;
532
- if (in_signal_handler) return;
533
738
  if (!_stackprof.running) return;
534
739
 
535
- in_signal_handler++;
536
- stackprof_record_sample();
537
- in_signal_handler--;
740
+ stackprof_record_buffer();
538
741
  }
539
742
 
540
743
  static void
541
744
  stackprof_signal_handler(int sig, siginfo_t *sinfo, void *ucontext)
542
745
  {
746
+ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
747
+
543
748
  _stackprof.overall_signals++;
544
- if (rb_during_gc()) {
749
+
750
+ if (!_stackprof.running) return;
751
+
752
+ // There's a possibility that the signal handler is invoked *after* the Ruby
753
+ // VM has been shut down (e.g. after ruby_cleanup(0)). In this case, things
754
+ // that rely on global VM state (e.g. rb_during_gc) will segfault.
755
+ if (!ruby_vm_running) return;
756
+
757
+ if (_stackprof.mode == sym_wall) {
758
+ // In "wall" mode, the SIGALRM signal will arrive at an arbitrary thread.
759
+ // In order to provide more useful results, especially under threaded web
760
+ // servers, we want to forward this signal to the original thread
761
+ // StackProf was started from.
762
+ // According to POSIX.1-2008 TC1 pthread_kill and pthread_self should be
763
+ // async-signal-safe.
764
+ if (pthread_self() != _stackprof.target_thread) {
765
+ pthread_kill(_stackprof.target_thread, sig);
766
+ return;
767
+ }
768
+ } else {
769
+ if (!ruby_native_thread_p()) return;
770
+ }
771
+
772
+ if (pthread_mutex_trylock(&lock)) return;
773
+
774
+ if (!_stackprof.ignore_gc && rb_during_gc()) {
775
+ VALUE mode = rb_gc_latest_gc_info(sym_state);
776
+ if (mode == sym_marking) {
777
+ _stackprof.unrecorded_gc_marking_samples++;
778
+ } else if (mode == sym_sweeping) {
779
+ _stackprof.unrecorded_gc_sweeping_samples++;
780
+ }
781
+ if(!_stackprof.unrecorded_gc_samples) {
782
+ // record start
783
+ capture_timestamp(&_stackprof.gc_start_timestamp);
784
+ }
545
785
  _stackprof.unrecorded_gc_samples++;
546
- rb_postponed_job_register_one(0, stackprof_gc_job_handler, (void*)0);
786
+ rb_postponed_job_register_one(0, stackprof_job_record_gc, (void*)0);
547
787
  } else {
548
- rb_postponed_job_register_one(0, stackprof_job_handler, (void*)0);
788
+ if (stackprof_use_postponed_job) {
789
+ rb_postponed_job_register_one(0, stackprof_job_sample_and_record, (void*)0);
790
+ } else {
791
+ // Buffer a sample immediately, if an existing sample exists this will
792
+ // return immediately
793
+ stackprof_buffer_sample();
794
+ // Enqueue a job to record the sample
795
+ rb_postponed_job_register_one(0, stackprof_job_record_buffer, (void*)0);
796
+ }
549
797
  }
798
+ pthread_mutex_unlock(&lock);
550
799
  }
551
800
 
552
801
  static void
@@ -555,7 +804,7 @@ stackprof_newobj_handler(VALUE tpval, void *data)
555
804
  _stackprof.overall_signals++;
556
805
  if (RTEST(_stackprof.interval) && _stackprof.overall_signals % NUM2LONG(_stackprof.interval))
557
806
  return;
558
- stackprof_job_handler(0);
807
+ stackprof_sample_and_record();
559
808
  }
560
809
 
561
810
  static VALUE
@@ -565,7 +814,7 @@ stackprof_sample(VALUE self)
565
814
  return Qfalse;
566
815
 
567
816
  _stackprof.overall_signals++;
568
- stackprof_job_handler(0);
817
+ stackprof_sample_and_record();
569
818
  return Qtrue;
570
819
  }
571
820
 
@@ -580,11 +829,25 @@ frame_mark_i(st_data_t key, st_data_t val, st_data_t arg)
580
829
  static void
581
830
  stackprof_gc_mark(void *data)
582
831
  {
832
+ if (RTEST(_stackprof.metadata))
833
+ rb_gc_mark(_stackprof.metadata);
834
+
583
835
  if (RTEST(_stackprof.out))
584
836
  rb_gc_mark(_stackprof.out);
585
837
 
586
838
  if (_stackprof.frames)
587
839
  st_foreach(_stackprof.frames, frame_mark_i, 0);
840
+
841
+ int i;
842
+ for (i = 0; i < _stackprof.buffer_count; i++) {
843
+ rb_gc_mark(_stackprof.frames_buffer[i]);
844
+ }
845
+ }
846
+
847
+ static size_t
848
+ stackprof_memsize(const void *data)
849
+ {
850
+ return sizeof(_stackprof);
588
851
  }
589
852
 
590
853
  static void
@@ -619,9 +882,41 @@ stackprof_atfork_child(void)
619
882
  stackprof_stop(rb_mStackProf);
620
883
  }
621
884
 
885
+ static VALUE
886
+ stackprof_use_postponed_job_l(VALUE self)
887
+ {
888
+ stackprof_use_postponed_job = 1;
889
+ return Qnil;
890
+ }
891
+
892
+ static void
893
+ stackprof_at_exit(ruby_vm_t* vm)
894
+ {
895
+ ruby_vm_running = 0;
896
+ }
897
+
898
+ static const rb_data_type_t stackprof_type = {
899
+ "StackProf",
900
+ {
901
+ stackprof_gc_mark,
902
+ NULL,
903
+ stackprof_memsize,
904
+ }
905
+ };
906
+
622
907
  void
623
908
  Init_stackprof(void)
624
909
  {
910
+ size_t i;
911
+ /*
912
+ * As of Ruby 3.0, it should be safe to read stack frames at any time, unless YJIT is enabled
913
+ * See https://github.com/ruby/ruby/commit/0e276dc458f94d9d79a0f7c7669bde84abe80f21
914
+ */
915
+ stackprof_use_postponed_job = RUBY_API_VERSION_MAJOR < 3;
916
+
917
+ ruby_vm_running = 1;
918
+ ruby_vm_at_exit(stackprof_at_exit);
919
+
625
920
  #define S(name) sym_##name = ID2SYM(rb_intern(#name));
626
921
  S(object);
627
922
  S(custom);
@@ -640,30 +935,42 @@ Init_stackprof(void)
640
935
  S(mode);
641
936
  S(interval);
642
937
  S(raw);
938
+ S(raw_lines);
939
+ S(raw_sample_timestamps);
643
940
  S(raw_timestamp_deltas);
644
941
  S(out);
942
+ S(metadata);
943
+ S(ignore_gc);
645
944
  S(frames);
646
945
  S(aggregate);
946
+ S(state);
947
+ S(marking);
948
+ S(sweeping);
647
949
  #undef S
648
950
 
649
- gc_hook = Data_Wrap_Struct(rb_cObject, stackprof_gc_mark, NULL, &_stackprof);
951
+ /* Need to run this to warm the symbol table before we call this during GC */
952
+ rb_gc_latest_gc_info(sym_state);
953
+
650
954
  rb_global_variable(&gc_hook);
955
+ gc_hook = TypedData_Wrap_Struct(rb_cObject, &stackprof_type, &_stackprof);
651
956
 
652
957
  _stackprof.raw_samples = NULL;
653
958
  _stackprof.raw_samples_len = 0;
654
959
  _stackprof.raw_samples_capa = 0;
655
960
  _stackprof.raw_sample_index = 0;
656
961
 
657
- _stackprof.raw_timestamp_deltas = NULL;
658
- _stackprof.raw_timestamp_deltas_len = 0;
659
- _stackprof.raw_timestamp_deltas_capa = 0;
962
+ _stackprof.raw_sample_times = NULL;
963
+ _stackprof.raw_sample_times_len = 0;
964
+ _stackprof.raw_sample_times_capa = 0;
660
965
 
661
- _stackprof.fake_gc_frame = INT2FIX(0x9C);
662
966
  _stackprof.empty_string = rb_str_new_cstr("");
663
- _stackprof.fake_gc_frame_name = rb_str_new_cstr("(garbage collection)");
664
- rb_global_variable(&_stackprof.fake_gc_frame_name);
665
967
  rb_global_variable(&_stackprof.empty_string);
666
968
 
969
+ for (i = 0; i < TOTAL_FAKE_FRAMES; i++) {
970
+ _stackprof.fake_frame_names[i] = rb_str_new_cstr(fake_frame_cstrs[i]);
971
+ rb_global_variable(&_stackprof.fake_frame_names[i]);
972
+ }
973
+
667
974
  rb_mStackProf = rb_define_module("StackProf");
668
975
  rb_define_singleton_method(rb_mStackProf, "running?", stackprof_running_p, 0);
669
976
  rb_define_singleton_method(rb_mStackProf, "run", stackprof_run, -1);
@@ -671,6 +978,7 @@ Init_stackprof(void)
671
978
  rb_define_singleton_method(rb_mStackProf, "stop", stackprof_stop, 0);
672
979
  rb_define_singleton_method(rb_mStackProf, "results", stackprof_results, -1);
673
980
  rb_define_singleton_method(rb_mStackProf, "sample", stackprof_sample, 0);
981
+ rb_define_singleton_method(rb_mStackProf, "use_postponed_job!", stackprof_use_postponed_job_l, 0);
674
982
 
675
983
  pthread_atfork(stackprof_atfork_prepare, stackprof_atfork_parent, stackprof_atfork_child);
676
984
  }