stackprof 0.2.12 → 0.2.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,15 +7,76 @@
7
7
  **********************************************************************/
8
8
 
9
9
  #include <ruby/ruby.h>
10
+ #include <ruby/version.h>
10
11
  #include <ruby/debug.h>
11
12
  #include <ruby/st.h>
12
13
  #include <ruby/io.h>
13
14
  #include <ruby/intern.h>
15
+ #include <ruby/vm.h>
14
16
  #include <signal.h>
15
17
  #include <sys/time.h>
18
+ #include <time.h>
16
19
  #include <pthread.h>
17
20
 
18
21
  #define BUF_SIZE 2048
22
+ #define MICROSECONDS_IN_SECOND 1000000
23
+ #define NANOSECONDS_IN_SECOND 1000000000
24
+
25
+ #define FAKE_FRAME_GC INT2FIX(0)
26
+ #define FAKE_FRAME_MARK INT2FIX(1)
27
+ #define FAKE_FRAME_SWEEP INT2FIX(2)
28
+
29
+ static const char *fake_frame_cstrs[] = {
30
+ "(garbage collection)",
31
+ "(marking)",
32
+ "(sweeping)",
33
+ };
34
+
35
+ static int stackprof_use_postponed_job = 1;
36
+ static int ruby_vm_running = 0;
37
+
38
+ #define TOTAL_FAKE_FRAMES (sizeof(fake_frame_cstrs) / sizeof(char *))
39
+
40
+ #ifdef _POSIX_MONOTONIC_CLOCK
41
+ #define timestamp_t timespec
42
+ typedef struct timestamp_t timestamp_t;
43
+
44
+ static void capture_timestamp(timestamp_t *ts) {
45
+ clock_gettime(CLOCK_MONOTONIC, ts);
46
+ }
47
+
48
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
49
+ int64_t result = MICROSECONDS_IN_SECOND * (end->tv_sec - start->tv_sec);
50
+ if (end->tv_nsec < start->tv_nsec) {
51
+ result -= MICROSECONDS_IN_SECOND;
52
+ result += (NANOSECONDS_IN_SECOND + end->tv_nsec - start->tv_nsec) / 1000;
53
+ } else {
54
+ result += (end->tv_nsec - start->tv_nsec) / 1000;
55
+ }
56
+ return result;
57
+ }
58
+
59
+ static uint64_t timestamp_usec(timestamp_t *ts) {
60
+ return (MICROSECONDS_IN_SECOND * ts->tv_sec) + (ts->tv_nsec / 1000);
61
+ }
62
+ #else
63
+ #define timestamp_t timeval
64
+ typedef struct timestamp_t timestamp_t;
65
+
66
+ static void capture_timestamp(timestamp_t *ts) {
67
+ gettimeofday(ts, NULL);
68
+ }
69
+
70
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
71
+ struct timeval diff;
72
+ timersub(end, start, &diff);
73
+ return (MICROSECONDS_IN_SECOND * diff.tv_sec) + diff.tv_usec;
74
+ }
75
+
76
+ static uint64_t timestamp_usec(timestamp_t *ts) {
77
+ return (MICROSECONDS_IN_SECOND * ts->tv_sec) + ts->tv_usec;
78
+ }
79
+ #endif
19
80
 
20
81
  typedef struct {
21
82
  size_t total_samples;
@@ -25,6 +86,11 @@ typedef struct {
25
86
  st_table *lines;
26
87
  } frame_data_t;
27
88
 
89
+ typedef struct {
90
+ uint64_t timestamp_usec;
91
+ int64_t delta_usec;
92
+ } sample_time_t;
93
+
28
94
  static struct {
29
95
  int running;
30
96
  int raw;
@@ -33,33 +99,44 @@ static struct {
33
99
  VALUE mode;
34
100
  VALUE interval;
35
101
  VALUE out;
102
+ VALUE metadata;
103
+ int ignore_gc;
36
104
 
37
- VALUE *raw_samples;
105
+ uint64_t *raw_samples;
38
106
  size_t raw_samples_len;
39
107
  size_t raw_samples_capa;
40
108
  size_t raw_sample_index;
41
109
 
42
- struct timeval last_sample_at;
43
- int *raw_timestamp_deltas;
44
- size_t raw_timestamp_deltas_len;
45
- size_t raw_timestamp_deltas_capa;
110
+ struct timestamp_t last_sample_at;
111
+ sample_time_t *raw_sample_times;
112
+ size_t raw_sample_times_len;
113
+ size_t raw_sample_times_capa;
46
114
 
47
115
  size_t overall_signals;
48
116
  size_t overall_samples;
49
117
  size_t during_gc;
50
118
  size_t unrecorded_gc_samples;
119
+ size_t unrecorded_gc_marking_samples;
120
+ size_t unrecorded_gc_sweeping_samples;
51
121
  st_table *frames;
52
122
 
53
- VALUE fake_gc_frame;
54
- VALUE fake_gc_frame_name;
123
+ timestamp_t gc_start_timestamp;
124
+
125
+ VALUE fake_frame_names[TOTAL_FAKE_FRAMES];
55
126
  VALUE empty_string;
127
+
128
+ int buffer_count;
129
+ sample_time_t buffer_time;
56
130
  VALUE frames_buffer[BUF_SIZE];
57
131
  int lines_buffer[BUF_SIZE];
132
+
133
+ pthread_t target_thread;
58
134
  } _stackprof;
59
135
 
60
136
  static VALUE sym_object, sym_wall, sym_cpu, sym_custom, sym_name, sym_file, sym_line;
61
137
  static VALUE sym_samples, sym_total_samples, sym_missed_samples, sym_edges, sym_lines;
62
- static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_frames, sym_out, sym_aggregate, sym_raw_timestamp_deltas;
138
+ static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_raw_lines, sym_metadata, sym_frames, sym_ignore_gc, sym_out;
139
+ static VALUE sym_aggregate, sym_raw_sample_timestamps, sym_raw_timestamp_deltas, sym_state, sym_marking, sym_sweeping;
63
140
  static VALUE sym_gc_samples, objtracer;
64
141
  static VALUE gc_hook;
65
142
  static VALUE rb_mStackProf;
@@ -72,8 +149,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
72
149
  {
73
150
  struct sigaction sa;
74
151
  struct itimerval timer;
75
- VALUE opts = Qnil, mode = Qnil, interval = Qnil, out = Qfalse;
152
+ VALUE opts = Qnil, mode = Qnil, interval = Qnil, metadata = rb_hash_new(), out = Qfalse;
153
+ int ignore_gc = 0;
76
154
  int raw = 0, aggregate = 1;
155
+ VALUE metadata_val;
77
156
 
78
157
  if (_stackprof.running)
79
158
  return Qfalse;
@@ -84,6 +163,17 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
84
163
  mode = rb_hash_aref(opts, sym_mode);
85
164
  interval = rb_hash_aref(opts, sym_interval);
86
165
  out = rb_hash_aref(opts, sym_out);
166
+ if (RTEST(rb_hash_aref(opts, sym_ignore_gc))) {
167
+ ignore_gc = 1;
168
+ }
169
+
170
+ metadata_val = rb_hash_aref(opts, sym_metadata);
171
+ if (RTEST(metadata_val)) {
172
+ if (!RB_TYPE_P(metadata_val, T_HASH))
173
+ rb_raise(rb_eArgError, "metadata should be a hash");
174
+
175
+ metadata = metadata_val;
176
+ }
87
177
 
88
178
  if (RTEST(rb_hash_aref(opts, sym_raw)))
89
179
  raw = 1;
@@ -92,6 +182,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
92
182
  }
93
183
  if (!RTEST(mode)) mode = sym_wall;
94
184
 
185
+ if (!NIL_P(interval) && (NUM2INT(interval) < 1 || NUM2INT(interval) >= MICROSECONDS_IN_SECOND)) {
186
+ rb_raise(rb_eArgError, "interval is a number of microseconds between 1 and 1 million");
187
+ }
188
+
95
189
  if (!_stackprof.frames) {
96
190
  _stackprof.frames = st_init_numtable();
97
191
  _stackprof.overall_signals = 0;
@@ -128,10 +222,13 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
128
222
  _stackprof.aggregate = aggregate;
129
223
  _stackprof.mode = mode;
130
224
  _stackprof.interval = interval;
225
+ _stackprof.ignore_gc = ignore_gc;
226
+ _stackprof.metadata = metadata;
131
227
  _stackprof.out = out;
228
+ _stackprof.target_thread = pthread_self();
132
229
 
133
230
  if (raw) {
134
- gettimeofday(&_stackprof.last_sample_at, NULL);
231
+ capture_timestamp(&_stackprof.last_sample_at);
135
232
  }
136
233
 
137
234
  return Qtrue;
@@ -166,13 +263,19 @@ stackprof_stop(VALUE self)
166
263
  return Qtrue;
167
264
  }
168
265
 
266
+ #if SIZEOF_VOIDP == SIZEOF_LONG
267
+ # define PTR2NUM(x) (LONG2NUM((long)(x)))
268
+ #else
269
+ # define PTR2NUM(x) (LL2NUM((LONG_LONG)(x)))
270
+ #endif
271
+
169
272
  static int
170
273
  frame_edges_i(st_data_t key, st_data_t val, st_data_t arg)
171
274
  {
172
275
  VALUE edges = (VALUE)arg;
173
276
 
174
277
  intptr_t weight = (intptr_t)val;
175
- rb_hash_aset(edges, rb_obj_id((VALUE)key), INT2FIX(weight));
278
+ rb_hash_aset(edges, PTR2NUM(key), INT2FIX(weight));
176
279
  return ST_CONTINUE;
177
280
  }
178
281
 
@@ -199,10 +302,10 @@ frame_i(st_data_t key, st_data_t val, st_data_t arg)
199
302
  VALUE name, file, edges, lines;
200
303
  VALUE line;
201
304
 
202
- rb_hash_aset(results, rb_obj_id(frame), details);
305
+ rb_hash_aset(results, PTR2NUM(frame), details);
203
306
 
204
- if (frame == _stackprof.fake_gc_frame) {
205
- name = _stackprof.fake_gc_frame_name;
307
+ if (FIXNUM_P(frame)) {
308
+ name = _stackprof.fake_frame_names[FIX2INT(frame)];
206
309
  file = _stackprof.empty_string;
207
310
  line = INT2FIX(0);
208
311
  } else {
@@ -258,6 +361,9 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
258
361
  rb_hash_aset(results, sym_samples, SIZET2NUM(_stackprof.overall_samples));
259
362
  rb_hash_aset(results, sym_gc_samples, SIZET2NUM(_stackprof.during_gc));
260
363
  rb_hash_aset(results, sym_missed_samples, SIZET2NUM(_stackprof.overall_signals - _stackprof.overall_samples));
364
+ rb_hash_aset(results, sym_metadata, _stackprof.metadata);
365
+
366
+ _stackprof.metadata = Qnil;
261
367
 
262
368
  frames = rb_hash_new();
263
369
  rb_hash_aset(results, sym_frames, frames);
@@ -268,16 +374,25 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
268
374
 
269
375
  if (_stackprof.raw && _stackprof.raw_samples_len) {
270
376
  size_t len, n, o;
271
- VALUE raw_timestamp_deltas;
377
+ VALUE raw_sample_timestamps, raw_timestamp_deltas;
272
378
  VALUE raw_samples = rb_ary_new_capa(_stackprof.raw_samples_len);
379
+ VALUE raw_lines = rb_ary_new_capa(_stackprof.raw_samples_len);
273
380
 
274
381
  for (n = 0; n < _stackprof.raw_samples_len; n++) {
275
382
  len = (size_t)_stackprof.raw_samples[n];
276
383
  rb_ary_push(raw_samples, SIZET2NUM(len));
384
+ rb_ary_push(raw_lines, SIZET2NUM(len));
385
+
386
+ for (o = 0, n++; o < len; n++, o++) {
387
+ // Line is in the upper 16 bits
388
+ rb_ary_push(raw_lines, INT2NUM(_stackprof.raw_samples[n] >> 48));
389
+
390
+ VALUE frame = _stackprof.raw_samples[n] & ~((uint64_t)0xFFFF << 48);
391
+ rb_ary_push(raw_samples, PTR2NUM(frame));
392
+ }
277
393
 
278
- for (o = 0, n++; o < len; n++, o++)
279
- rb_ary_push(raw_samples, rb_obj_id(_stackprof.raw_samples[n]));
280
394
  rb_ary_push(raw_samples, SIZET2NUM((size_t)_stackprof.raw_samples[n]));
395
+ rb_ary_push(raw_lines, SIZET2NUM((size_t)_stackprof.raw_samples[n]));
281
396
  }
282
397
 
283
398
  free(_stackprof.raw_samples);
@@ -287,18 +402,22 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
287
402
  _stackprof.raw_sample_index = 0;
288
403
 
289
404
  rb_hash_aset(results, sym_raw, raw_samples);
405
+ rb_hash_aset(results, sym_raw_lines, raw_lines);
290
406
 
291
- raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_timestamp_deltas_len);
407
+ raw_sample_timestamps = rb_ary_new_capa(_stackprof.raw_sample_times_len);
408
+ raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_sample_times_len);
292
409
 
293
- for (n = 0; n < _stackprof.raw_timestamp_deltas_len; n++) {
294
- rb_ary_push(raw_timestamp_deltas, INT2FIX(_stackprof.raw_timestamp_deltas[n]));
410
+ for (n = 0; n < _stackprof.raw_sample_times_len; n++) {
411
+ rb_ary_push(raw_sample_timestamps, ULL2NUM(_stackprof.raw_sample_times[n].timestamp_usec));
412
+ rb_ary_push(raw_timestamp_deltas, LL2NUM(_stackprof.raw_sample_times[n].delta_usec));
295
413
  }
296
414
 
297
- free(_stackprof.raw_timestamp_deltas);
298
- _stackprof.raw_timestamp_deltas = NULL;
299
- _stackprof.raw_timestamp_deltas_len = 0;
300
- _stackprof.raw_timestamp_deltas_capa = 0;
415
+ free(_stackprof.raw_sample_times);
416
+ _stackprof.raw_sample_times = NULL;
417
+ _stackprof.raw_sample_times_len = 0;
418
+ _stackprof.raw_sample_times_capa = 0;
301
419
 
420
+ rb_hash_aset(results, sym_raw_sample_timestamps, raw_sample_timestamps);
302
421
  rb_hash_aset(results, sym_raw_timestamp_deltas, raw_timestamp_deltas);
303
422
 
304
423
  _stackprof.raw = 0;
@@ -309,11 +428,12 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
309
428
 
310
429
  if (RTEST(_stackprof.out)) {
311
430
  VALUE file;
312
- if (RB_TYPE_P(_stackprof.out, T_STRING)) {
313
- file = rb_file_open_str(_stackprof.out, "w");
314
- } else {
431
+ if (rb_respond_to(_stackprof.out, rb_intern("to_io"))) {
315
432
  file = rb_io_check_io(_stackprof.out);
433
+ } else {
434
+ file = rb_file_open_str(_stackprof.out, "w");
316
435
  }
436
+
317
437
  rb_marshal_dump(results, file);
318
438
  rb_io_flush(file);
319
439
  _stackprof.out = Qnil;
@@ -377,30 +497,47 @@ st_numtable_increment(st_table *table, st_data_t key, size_t increment)
377
497
  }
378
498
 
379
499
  void
380
- stackprof_record_sample_for_stack(int num, int timestamp_delta)
500
+ stackprof_record_sample_for_stack(int num, uint64_t sample_timestamp, int64_t timestamp_delta)
381
501
  {
382
502
  int i, n;
383
503
  VALUE prev_frame = Qnil;
384
504
 
385
505
  _stackprof.overall_samples++;
386
506
 
387
- if (_stackprof.raw) {
507
+ if (_stackprof.raw && num > 0) {
388
508
  int found = 0;
389
509
 
510
+ /* If there's no sample buffer allocated, then allocate one. The buffer
511
+ * format is the number of frames (num), then the list of frames (from
512
+ * `_stackprof.raw_samples`), followed by the number of times this
513
+ * particular stack has been seen in a row. Each "new" stack is added
514
+ * to the end of the buffer, but if the previous stack is the same as
515
+ * the current stack, the counter will be incremented. */
390
516
  if (!_stackprof.raw_samples) {
391
517
  _stackprof.raw_samples_capa = num * 100;
392
518
  _stackprof.raw_samples = malloc(sizeof(VALUE) * _stackprof.raw_samples_capa);
393
519
  }
394
520
 
521
+ /* If we can't fit all the samples in the buffer, double the buffer size. */
395
522
  while (_stackprof.raw_samples_capa <= _stackprof.raw_samples_len + (num + 2)) {
396
523
  _stackprof.raw_samples_capa *= 2;
397
524
  _stackprof.raw_samples = realloc(_stackprof.raw_samples, sizeof(VALUE) * _stackprof.raw_samples_capa);
398
525
  }
399
526
 
527
+ /* If we've seen this stack before in the last sample, then increment the "seen" count. */
400
528
  if (_stackprof.raw_samples_len > 0 && _stackprof.raw_samples[_stackprof.raw_sample_index] == (VALUE)num) {
529
+ /* The number of samples could have been the same, but the stack
530
+ * might be different, so we need to check the stack here. Stacks
531
+ * in the raw buffer are stored in the opposite direction of stacks
532
+ * in the frames buffer that came from Ruby. */
401
533
  for (i = num-1, n = 0; i >= 0; i--, n++) {
402
534
  VALUE frame = _stackprof.frames_buffer[i];
403
- if (_stackprof.raw_samples[_stackprof.raw_sample_index + 1 + n] != frame)
535
+ int line = _stackprof.lines_buffer[i];
536
+
537
+ // Encode the line in to the upper 16 bits.
538
+ uint64_t key = ((uint64_t)line << 48) | (uint64_t)frame;
539
+
540
+ if (_stackprof.raw_samples[_stackprof.raw_sample_index + 1 + n] != key)
404
541
  break;
405
542
  }
406
543
  if (i == -1) {
@@ -409,28 +546,43 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
409
546
  }
410
547
  }
411
548
 
549
+ /* If we haven't seen the stack, then add it to the buffer along with
550
+ * the length of the stack and a 1 for the "seen" count */
412
551
  if (!found) {
552
+ /* Bump the `raw_sample_index` up so that the next iteration can
553
+ * find the previously recorded stack size. */
413
554
  _stackprof.raw_sample_index = _stackprof.raw_samples_len;
414
555
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)num;
415
556
  for (i = num-1; i >= 0; i--) {
416
557
  VALUE frame = _stackprof.frames_buffer[i];
417
- _stackprof.raw_samples[_stackprof.raw_samples_len++] = frame;
558
+ int line = _stackprof.lines_buffer[i];
559
+
560
+ // Encode the line in to the upper 16 bits.
561
+ uint64_t key = ((uint64_t)line << 48) | (uint64_t)frame;
562
+
563
+ _stackprof.raw_samples[_stackprof.raw_samples_len++] = key;
418
564
  }
419
565
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)1;
420
566
  }
421
567
 
422
- if (!_stackprof.raw_timestamp_deltas) {
423
- _stackprof.raw_timestamp_deltas_capa = 100;
424
- _stackprof.raw_timestamp_deltas = malloc(sizeof(int) * _stackprof.raw_timestamp_deltas_capa);
425
- _stackprof.raw_timestamp_deltas_len = 0;
568
+ /* If there's no timestamp delta buffer, allocate one */
569
+ if (!_stackprof.raw_sample_times) {
570
+ _stackprof.raw_sample_times_capa = 100;
571
+ _stackprof.raw_sample_times = malloc(sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
572
+ _stackprof.raw_sample_times_len = 0;
426
573
  }
427
574
 
428
- while (_stackprof.raw_timestamp_deltas_capa <= _stackprof.raw_timestamp_deltas_len + 1) {
429
- _stackprof.raw_timestamp_deltas_capa *= 2;
430
- _stackprof.raw_timestamp_deltas = realloc(_stackprof.raw_timestamp_deltas, sizeof(int) * _stackprof.raw_timestamp_deltas_capa);
575
+ /* Double the buffer size if it's too small */
576
+ while (_stackprof.raw_sample_times_capa <= _stackprof.raw_sample_times_len + 1) {
577
+ _stackprof.raw_sample_times_capa *= 2;
578
+ _stackprof.raw_sample_times = realloc(_stackprof.raw_sample_times, sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
431
579
  }
432
580
 
433
- _stackprof.raw_timestamp_deltas[_stackprof.raw_timestamp_deltas_len++] = timestamp_delta;
581
+ /* Store the time delta (which is the amount of microseconds between samples). */
582
+ _stackprof.raw_sample_times[_stackprof.raw_sample_times_len++] = (sample_time_t) {
583
+ .timestamp_usec = sample_timestamp,
584
+ .delta_usec = timestamp_delta,
585
+ };
434
586
  }
435
587
 
436
588
  for (i = 0; i < num; i++) {
@@ -463,90 +615,187 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
463
615
  }
464
616
 
465
617
  if (_stackprof.raw) {
466
- gettimeofday(&_stackprof.last_sample_at, NULL);
618
+ capture_timestamp(&_stackprof.last_sample_at);
467
619
  }
468
620
  }
469
621
 
622
+ // buffer the current profile frames
623
+ // This must be async-signal-safe
624
+ // Returns immediately if another set of frames are already in the buffer
470
625
  void
471
- stackprof_record_sample()
626
+ stackprof_buffer_sample(void)
472
627
  {
473
- int timestamp_delta = 0;
628
+ uint64_t start_timestamp = 0;
629
+ int64_t timestamp_delta = 0;
474
630
  int num;
631
+
632
+ if (_stackprof.buffer_count > 0) {
633
+ // Another sample is already pending
634
+ return;
635
+ }
636
+
475
637
  if (_stackprof.raw) {
476
- struct timeval t;
477
- struct timeval diff;
478
- gettimeofday(&t, NULL);
479
- timersub(&t, &_stackprof.last_sample_at, &diff);
480
- timestamp_delta = (1000 * diff.tv_sec) + diff.tv_usec;
638
+ struct timestamp_t t;
639
+ capture_timestamp(&t);
640
+ start_timestamp = timestamp_usec(&t);
641
+ timestamp_delta = delta_usec(&_stackprof.last_sample_at, &t);
481
642
  }
643
+
482
644
  num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);
483
- stackprof_record_sample_for_stack(num, timestamp_delta);
645
+
646
+ _stackprof.buffer_count = num;
647
+ _stackprof.buffer_time.timestamp_usec = start_timestamp;
648
+ _stackprof.buffer_time.delta_usec = timestamp_delta;
484
649
  }
485
650
 
651
+ // Postponed job
486
652
  void
487
- stackprof_record_gc_samples()
653
+ stackprof_record_gc_samples(void)
488
654
  {
489
- int delta_to_first_unrecorded_gc_sample = 0;
490
- int i;
655
+ int64_t delta_to_first_unrecorded_gc_sample = 0;
656
+ uint64_t start_timestamp = 0;
657
+ size_t i;
491
658
  if (_stackprof.raw) {
492
- struct timeval t;
493
- struct timeval diff;
494
- gettimeofday(&t, NULL);
495
- timersub(&t, &_stackprof.last_sample_at, &diff);
659
+ struct timestamp_t t = _stackprof.gc_start_timestamp;
660
+ start_timestamp = timestamp_usec(&t);
496
661
 
497
662
  // We don't know when the GC samples were actually marked, so let's
498
663
  // assume that they were marked at a perfectly regular interval.
499
- delta_to_first_unrecorded_gc_sample = (1000 * diff.tv_sec + diff.tv_usec) - (_stackprof.unrecorded_gc_samples - 1) * _stackprof.interval;
664
+ delta_to_first_unrecorded_gc_sample = delta_usec(&_stackprof.last_sample_at, &t) - (_stackprof.unrecorded_gc_samples - 1) * NUM2LONG(_stackprof.interval);
500
665
  if (delta_to_first_unrecorded_gc_sample < 0) {
501
666
  delta_to_first_unrecorded_gc_sample = 0;
502
667
  }
503
668
  }
504
669
 
505
- _stackprof.frames_buffer[0] = _stackprof.fake_gc_frame;
506
- _stackprof.lines_buffer[0] = 0;
507
-
508
670
  for (i = 0; i < _stackprof.unrecorded_gc_samples; i++) {
509
- int timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : _stackprof.interval;
510
- stackprof_record_sample_for_stack(1, timestamp_delta);
671
+ int64_t timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval);
672
+
673
+ if (_stackprof.unrecorded_gc_marking_samples) {
674
+ _stackprof.frames_buffer[0] = FAKE_FRAME_MARK;
675
+ _stackprof.lines_buffer[0] = 0;
676
+ _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
677
+ _stackprof.lines_buffer[1] = 0;
678
+ _stackprof.unrecorded_gc_marking_samples--;
679
+
680
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
681
+ } else if (_stackprof.unrecorded_gc_sweeping_samples) {
682
+ _stackprof.frames_buffer[0] = FAKE_FRAME_SWEEP;
683
+ _stackprof.lines_buffer[0] = 0;
684
+ _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
685
+ _stackprof.lines_buffer[1] = 0;
686
+
687
+ _stackprof.unrecorded_gc_sweeping_samples--;
688
+
689
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
690
+ } else {
691
+ _stackprof.frames_buffer[0] = FAKE_FRAME_GC;
692
+ _stackprof.lines_buffer[0] = 0;
693
+ stackprof_record_sample_for_stack(1, start_timestamp, timestamp_delta);
694
+ }
511
695
  }
512
696
  _stackprof.during_gc += _stackprof.unrecorded_gc_samples;
513
697
  _stackprof.unrecorded_gc_samples = 0;
698
+ _stackprof.unrecorded_gc_marking_samples = 0;
699
+ _stackprof.unrecorded_gc_sweeping_samples = 0;
700
+ }
701
+
702
+ // record the sample previously buffered by stackprof_buffer_sample
703
+ static void
704
+ stackprof_record_buffer(void)
705
+ {
706
+ stackprof_record_sample_for_stack(_stackprof.buffer_count, _stackprof.buffer_time.timestamp_usec, _stackprof.buffer_time.delta_usec);
707
+
708
+ // reset the buffer
709
+ _stackprof.buffer_count = 0;
514
710
  }
515
711
 
516
712
  static void
517
- stackprof_gc_job_handler(void *data)
713
+ stackprof_sample_and_record(void)
714
+ {
715
+ stackprof_buffer_sample();
716
+ stackprof_record_buffer();
717
+ }
718
+
719
+ static void
720
+ stackprof_job_record_gc(void *data)
518
721
  {
519
- static int in_signal_handler = 0;
520
- if (in_signal_handler) return;
521
722
  if (!_stackprof.running) return;
522
723
 
523
- in_signal_handler++;
524
724
  stackprof_record_gc_samples();
525
- in_signal_handler--;
526
725
  }
527
726
 
528
727
  static void
529
- stackprof_job_handler(void *data)
728
+ stackprof_job_sample_and_record(void *data)
729
+ {
730
+ if (!_stackprof.running) return;
731
+
732
+ stackprof_sample_and_record();
733
+ }
734
+
735
+ static void
736
+ stackprof_job_record_buffer(void *data)
530
737
  {
531
- static int in_signal_handler = 0;
532
- if (in_signal_handler) return;
533
738
  if (!_stackprof.running) return;
534
739
 
535
- in_signal_handler++;
536
- stackprof_record_sample();
537
- in_signal_handler--;
740
+ stackprof_record_buffer();
538
741
  }
539
742
 
540
743
  static void
541
744
  stackprof_signal_handler(int sig, siginfo_t *sinfo, void *ucontext)
542
745
  {
746
+ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
747
+
543
748
  _stackprof.overall_signals++;
544
- if (rb_during_gc()) {
749
+
750
+ if (!_stackprof.running) return;
751
+
752
+ // There's a possibility that the signal handler is invoked *after* the Ruby
753
+ // VM has been shut down (e.g. after ruby_cleanup(0)). In this case, things
754
+ // that rely on global VM state (e.g. rb_during_gc) will segfault.
755
+ if (!ruby_vm_running) return;
756
+
757
+ if (_stackprof.mode == sym_wall) {
758
+ // In "wall" mode, the SIGALRM signal will arrive at an arbitrary thread.
759
+ // In order to provide more useful results, especially under threaded web
760
+ // servers, we want to forward this signal to the original thread
761
+ // StackProf was started from.
762
+ // According to POSIX.1-2008 TC1 pthread_kill and pthread_self should be
763
+ // async-signal-safe.
764
+ if (pthread_self() != _stackprof.target_thread) {
765
+ pthread_kill(_stackprof.target_thread, sig);
766
+ return;
767
+ }
768
+ } else {
769
+ if (!ruby_native_thread_p()) return;
770
+ }
771
+
772
+ if (pthread_mutex_trylock(&lock)) return;
773
+
774
+ if (!_stackprof.ignore_gc && rb_during_gc()) {
775
+ VALUE mode = rb_gc_latest_gc_info(sym_state);
776
+ if (mode == sym_marking) {
777
+ _stackprof.unrecorded_gc_marking_samples++;
778
+ } else if (mode == sym_sweeping) {
779
+ _stackprof.unrecorded_gc_sweeping_samples++;
780
+ }
781
+ if(!_stackprof.unrecorded_gc_samples) {
782
+ // record start
783
+ capture_timestamp(&_stackprof.gc_start_timestamp);
784
+ }
545
785
  _stackprof.unrecorded_gc_samples++;
546
- rb_postponed_job_register_one(0, stackprof_gc_job_handler, (void*)0);
786
+ rb_postponed_job_register_one(0, stackprof_job_record_gc, (void*)0);
547
787
  } else {
548
- rb_postponed_job_register_one(0, stackprof_job_handler, (void*)0);
788
+ if (stackprof_use_postponed_job) {
789
+ rb_postponed_job_register_one(0, stackprof_job_sample_and_record, (void*)0);
790
+ } else {
791
+ // Buffer a sample immediately, if an existing sample exists this will
792
+ // return immediately
793
+ stackprof_buffer_sample();
794
+ // Enqueue a job to record the sample
795
+ rb_postponed_job_register_one(0, stackprof_job_record_buffer, (void*)0);
796
+ }
549
797
  }
798
+ pthread_mutex_unlock(&lock);
550
799
  }
551
800
 
552
801
  static void
@@ -555,7 +804,7 @@ stackprof_newobj_handler(VALUE tpval, void *data)
555
804
  _stackprof.overall_signals++;
556
805
  if (RTEST(_stackprof.interval) && _stackprof.overall_signals % NUM2LONG(_stackprof.interval))
557
806
  return;
558
- stackprof_job_handler(0);
807
+ stackprof_sample_and_record();
559
808
  }
560
809
 
561
810
  static VALUE
@@ -565,7 +814,7 @@ stackprof_sample(VALUE self)
565
814
  return Qfalse;
566
815
 
567
816
  _stackprof.overall_signals++;
568
- stackprof_job_handler(0);
817
+ stackprof_sample_and_record();
569
818
  return Qtrue;
570
819
  }
571
820
 
@@ -580,11 +829,25 @@ frame_mark_i(st_data_t key, st_data_t val, st_data_t arg)
580
829
  static void
581
830
  stackprof_gc_mark(void *data)
582
831
  {
832
+ if (RTEST(_stackprof.metadata))
833
+ rb_gc_mark(_stackprof.metadata);
834
+
583
835
  if (RTEST(_stackprof.out))
584
836
  rb_gc_mark(_stackprof.out);
585
837
 
586
838
  if (_stackprof.frames)
587
839
  st_foreach(_stackprof.frames, frame_mark_i, 0);
840
+
841
+ int i;
842
+ for (i = 0; i < _stackprof.buffer_count; i++) {
843
+ rb_gc_mark(_stackprof.frames_buffer[i]);
844
+ }
845
+ }
846
+
847
+ static size_t
848
+ stackprof_memsize(const void *data)
849
+ {
850
+ return sizeof(_stackprof);
588
851
  }
589
852
 
590
853
  static void
@@ -619,9 +882,41 @@ stackprof_atfork_child(void)
619
882
  stackprof_stop(rb_mStackProf);
620
883
  }
621
884
 
885
+ static VALUE
886
+ stackprof_use_postponed_job_l(VALUE self)
887
+ {
888
+ stackprof_use_postponed_job = 1;
889
+ return Qnil;
890
+ }
891
+
892
+ static void
893
+ stackprof_at_exit(ruby_vm_t* vm)
894
+ {
895
+ ruby_vm_running = 0;
896
+ }
897
+
898
+ static const rb_data_type_t stackprof_type = {
899
+ "StackProf",
900
+ {
901
+ stackprof_gc_mark,
902
+ NULL,
903
+ stackprof_memsize,
904
+ }
905
+ };
906
+
622
907
  void
623
908
  Init_stackprof(void)
624
909
  {
910
+ size_t i;
911
+ /*
912
+ * As of Ruby 3.0, it should be safe to read stack frames at any time, unless YJIT is enabled
913
+ * See https://github.com/ruby/ruby/commit/0e276dc458f94d9d79a0f7c7669bde84abe80f21
914
+ */
915
+ stackprof_use_postponed_job = RUBY_API_VERSION_MAJOR < 3;
916
+
917
+ ruby_vm_running = 1;
918
+ ruby_vm_at_exit(stackprof_at_exit);
919
+
625
920
  #define S(name) sym_##name = ID2SYM(rb_intern(#name));
626
921
  S(object);
627
922
  S(custom);
@@ -640,30 +935,42 @@ Init_stackprof(void)
640
935
  S(mode);
641
936
  S(interval);
642
937
  S(raw);
938
+ S(raw_lines);
939
+ S(raw_sample_timestamps);
643
940
  S(raw_timestamp_deltas);
644
941
  S(out);
942
+ S(metadata);
943
+ S(ignore_gc);
645
944
  S(frames);
646
945
  S(aggregate);
946
+ S(state);
947
+ S(marking);
948
+ S(sweeping);
647
949
  #undef S
648
950
 
649
- gc_hook = Data_Wrap_Struct(rb_cObject, stackprof_gc_mark, NULL, &_stackprof);
951
+ /* Need to run this to warm the symbol table before we call this during GC */
952
+ rb_gc_latest_gc_info(sym_state);
953
+
650
954
  rb_global_variable(&gc_hook);
955
+ gc_hook = TypedData_Wrap_Struct(rb_cObject, &stackprof_type, &_stackprof);
651
956
 
652
957
  _stackprof.raw_samples = NULL;
653
958
  _stackprof.raw_samples_len = 0;
654
959
  _stackprof.raw_samples_capa = 0;
655
960
  _stackprof.raw_sample_index = 0;
656
961
 
657
- _stackprof.raw_timestamp_deltas = NULL;
658
- _stackprof.raw_timestamp_deltas_len = 0;
659
- _stackprof.raw_timestamp_deltas_capa = 0;
962
+ _stackprof.raw_sample_times = NULL;
963
+ _stackprof.raw_sample_times_len = 0;
964
+ _stackprof.raw_sample_times_capa = 0;
660
965
 
661
- _stackprof.fake_gc_frame = INT2FIX(0x9C);
662
966
  _stackprof.empty_string = rb_str_new_cstr("");
663
- _stackprof.fake_gc_frame_name = rb_str_new_cstr("(garbage collection)");
664
- rb_global_variable(&_stackprof.fake_gc_frame_name);
665
967
  rb_global_variable(&_stackprof.empty_string);
666
968
 
969
+ for (i = 0; i < TOTAL_FAKE_FRAMES; i++) {
970
+ _stackprof.fake_frame_names[i] = rb_str_new_cstr(fake_frame_cstrs[i]);
971
+ rb_global_variable(&_stackprof.fake_frame_names[i]);
972
+ }
973
+
667
974
  rb_mStackProf = rb_define_module("StackProf");
668
975
  rb_define_singleton_method(rb_mStackProf, "running?", stackprof_running_p, 0);
669
976
  rb_define_singleton_method(rb_mStackProf, "run", stackprof_run, -1);
@@ -671,6 +978,7 @@ Init_stackprof(void)
671
978
  rb_define_singleton_method(rb_mStackProf, "stop", stackprof_stop, 0);
672
979
  rb_define_singleton_method(rb_mStackProf, "results", stackprof_results, -1);
673
980
  rb_define_singleton_method(rb_mStackProf, "sample", stackprof_sample, 0);
981
+ rb_define_singleton_method(rb_mStackProf, "use_postponed_job!", stackprof_use_postponed_job_l, 0);
674
982
 
675
983
  pthread_atfork(stackprof_atfork_prepare, stackprof_atfork_parent, stackprof_atfork_child);
676
984
  }