stackprof 0.2.11 → 0.2.25

Unified diff of ext/stackprof/stackprof.c between gem versions 0.2.11 and 0.2.25.
@@ -7,24 +7,90 @@
7
7
  **********************************************************************/
8
8
 
9
9
  #include <ruby/ruby.h>
10
+ #include <ruby/version.h>
10
11
  #include <ruby/debug.h>
11
12
  #include <ruby/st.h>
12
13
  #include <ruby/io.h>
13
14
  #include <ruby/intern.h>
15
+ #include <ruby/vm.h>
14
16
  #include <signal.h>
15
17
  #include <sys/time.h>
18
+ #include <time.h>
16
19
  #include <pthread.h>
17
20
 
18
21
  #define BUF_SIZE 2048
22
+ #define MICROSECONDS_IN_SECOND 1000000
23
+ #define NANOSECONDS_IN_SECOND 1000000000
24
+
25
+ #define FAKE_FRAME_GC INT2FIX(0)
26
+ #define FAKE_FRAME_MARK INT2FIX(1)
27
+ #define FAKE_FRAME_SWEEP INT2FIX(2)
28
+
29
+ static const char *fake_frame_cstrs[] = {
30
+ "(garbage collection)",
31
+ "(marking)",
32
+ "(sweeping)",
33
+ };
34
+
35
+ static int stackprof_use_postponed_job = 1;
36
+ static int ruby_vm_running = 0;
37
+
38
+ #define TOTAL_FAKE_FRAMES (sizeof(fake_frame_cstrs) / sizeof(char *))
39
+
40
+ #ifdef _POSIX_MONOTONIC_CLOCK
41
+ #define timestamp_t timespec
42
+ typedef struct timestamp_t timestamp_t;
43
+
44
+ static void capture_timestamp(timestamp_t *ts) {
45
+ clock_gettime(CLOCK_MONOTONIC, ts);
46
+ }
47
+
48
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
49
+ int64_t result = MICROSECONDS_IN_SECOND * (end->tv_sec - start->tv_sec);
50
+ if (end->tv_nsec < start->tv_nsec) {
51
+ result -= MICROSECONDS_IN_SECOND;
52
+ result += (NANOSECONDS_IN_SECOND + end->tv_nsec - start->tv_nsec) / 1000;
53
+ } else {
54
+ result += (end->tv_nsec - start->tv_nsec) / 1000;
55
+ }
56
+ return result;
57
+ }
58
+
59
+ static uint64_t timestamp_usec(timestamp_t *ts) {
60
+ return (MICROSECONDS_IN_SECOND * ts->tv_sec) + (ts->tv_nsec / 1000);
61
+ }
62
+ #else
63
+ #define timestamp_t timeval
64
+ typedef struct timestamp_t timestamp_t;
65
+
66
+ static void capture_timestamp(timestamp_t *ts) {
67
+ gettimeofday(ts, NULL);
68
+ }
69
+
70
+ static int64_t delta_usec(timestamp_t *start, timestamp_t *end) {
71
+ struct timeval diff;
72
+ timersub(end, start, &diff);
73
+ return (MICROSECONDS_IN_SECOND * diff.tv_sec) + diff.tv_usec;
74
+ }
75
+
76
+ static uint64_t timestamp_usec(timestamp_t *ts) {
77
+ return (MICROSECONDS_IN_SECOND * ts.tv_sec) + diff.tv_usec
78
+ }
79
+ #endif
19
80
 
20
81
  typedef struct {
21
82
  size_t total_samples;
22
83
  size_t caller_samples;
23
- int already_accounted_in_total;
84
+ size_t seen_at_sample_number;
24
85
  st_table *edges;
25
86
  st_table *lines;
26
87
  } frame_data_t;
27
88
 
89
/* Timing information captured alongside each raw-mode sample. */
typedef struct {
    uint64_t timestamp_usec; /* absolute capture time, in microseconds */
    int64_t delta_usec;      /* microseconds elapsed since the previous sample */
} sample_time_t;
93
+
28
94
  static struct {
29
95
  int running;
30
96
  int raw;
@@ -33,33 +99,42 @@ static struct {
33
99
  VALUE mode;
34
100
  VALUE interval;
35
101
  VALUE out;
102
+ VALUE metadata;
103
+ int ignore_gc;
36
104
 
37
105
  VALUE *raw_samples;
38
106
  size_t raw_samples_len;
39
107
  size_t raw_samples_capa;
40
108
  size_t raw_sample_index;
41
109
 
42
- struct timeval last_sample_at;
43
- int *raw_timestamp_deltas;
44
- size_t raw_timestamp_deltas_len;
45
- size_t raw_timestamp_deltas_capa;
110
+ struct timestamp_t last_sample_at;
111
+ sample_time_t *raw_sample_times;
112
+ size_t raw_sample_times_len;
113
+ size_t raw_sample_times_capa;
46
114
 
47
115
  size_t overall_signals;
48
116
  size_t overall_samples;
49
117
  size_t during_gc;
50
118
  size_t unrecorded_gc_samples;
119
+ size_t unrecorded_gc_marking_samples;
120
+ size_t unrecorded_gc_sweeping_samples;
51
121
  st_table *frames;
52
122
 
53
- VALUE fake_gc_frame;
54
- VALUE fake_gc_frame_name;
123
+ VALUE fake_frame_names[TOTAL_FAKE_FRAMES];
55
124
  VALUE empty_string;
125
+
126
+ int buffer_count;
127
+ sample_time_t buffer_time;
56
128
  VALUE frames_buffer[BUF_SIZE];
57
129
  int lines_buffer[BUF_SIZE];
130
+
131
+ pthread_t target_thread;
58
132
  } _stackprof;
59
133
 
60
134
  static VALUE sym_object, sym_wall, sym_cpu, sym_custom, sym_name, sym_file, sym_line;
61
135
  static VALUE sym_samples, sym_total_samples, sym_missed_samples, sym_edges, sym_lines;
62
- static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_frames, sym_out, sym_aggregate, sym_raw_timestamp_deltas;
136
+ static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_metadata, sym_frames, sym_ignore_gc, sym_out;
137
+ static VALUE sym_aggregate, sym_raw_sample_timestamps, sym_raw_timestamp_deltas, sym_state, sym_marking, sym_sweeping;
63
138
  static VALUE sym_gc_samples, objtracer;
64
139
  static VALUE gc_hook;
65
140
  static VALUE rb_mStackProf;
@@ -72,8 +147,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
72
147
  {
73
148
  struct sigaction sa;
74
149
  struct itimerval timer;
75
- VALUE opts = Qnil, mode = Qnil, interval = Qnil, out = Qfalse;
150
+ VALUE opts = Qnil, mode = Qnil, interval = Qnil, metadata = rb_hash_new(), out = Qfalse;
151
+ int ignore_gc = 0;
76
152
  int raw = 0, aggregate = 1;
153
+ VALUE metadata_val;
77
154
 
78
155
  if (_stackprof.running)
79
156
  return Qfalse;
@@ -84,6 +161,17 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
84
161
  mode = rb_hash_aref(opts, sym_mode);
85
162
  interval = rb_hash_aref(opts, sym_interval);
86
163
  out = rb_hash_aref(opts, sym_out);
164
+ if (RTEST(rb_hash_aref(opts, sym_ignore_gc))) {
165
+ ignore_gc = 1;
166
+ }
167
+
168
+ metadata_val = rb_hash_aref(opts, sym_metadata);
169
+ if (RTEST(metadata_val)) {
170
+ if (!RB_TYPE_P(metadata_val, T_HASH))
171
+ rb_raise(rb_eArgError, "metadata should be a hash");
172
+
173
+ metadata = metadata_val;
174
+ }
87
175
 
88
176
  if (RTEST(rb_hash_aref(opts, sym_raw)))
89
177
  raw = 1;
@@ -92,6 +180,10 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
92
180
  }
93
181
  if (!RTEST(mode)) mode = sym_wall;
94
182
 
183
+ if (!NIL_P(interval) && (NUM2INT(interval) < 1 || NUM2INT(interval) >= MICROSECONDS_IN_SECOND)) {
184
+ rb_raise(rb_eArgError, "interval is a number of microseconds between 1 and 1 million");
185
+ }
186
+
95
187
  if (!_stackprof.frames) {
96
188
  _stackprof.frames = st_init_numtable();
97
189
  _stackprof.overall_signals = 0;
@@ -128,10 +220,13 @@ stackprof_start(int argc, VALUE *argv, VALUE self)
128
220
  _stackprof.aggregate = aggregate;
129
221
  _stackprof.mode = mode;
130
222
  _stackprof.interval = interval;
223
+ _stackprof.ignore_gc = ignore_gc;
224
+ _stackprof.metadata = metadata;
131
225
  _stackprof.out = out;
226
+ _stackprof.target_thread = pthread_self();
132
227
 
133
228
  if (raw) {
134
- gettimeofday(&_stackprof.last_sample_at, NULL);
229
+ capture_timestamp(&_stackprof.last_sample_at);
135
230
  }
136
231
 
137
232
  return Qtrue;
@@ -166,13 +261,19 @@ stackprof_stop(VALUE self)
166
261
  return Qtrue;
167
262
  }
168
263
 
264
+ #if SIZEOF_VOIDP == SIZEOF_LONG
265
+ # define PTR2NUM(x) (LONG2NUM((long)(x)))
266
+ #else
267
+ # define PTR2NUM(x) (LL2NUM((LONG_LONG)(x)))
268
+ #endif
269
+
169
270
  static int
170
271
  frame_edges_i(st_data_t key, st_data_t val, st_data_t arg)
171
272
  {
172
273
  VALUE edges = (VALUE)arg;
173
274
 
174
275
  intptr_t weight = (intptr_t)val;
175
- rb_hash_aset(edges, rb_obj_id((VALUE)key), INT2FIX(weight));
276
+ rb_hash_aset(edges, PTR2NUM(key), INT2FIX(weight));
176
277
  return ST_CONTINUE;
177
278
  }
178
279
 
@@ -199,10 +300,10 @@ frame_i(st_data_t key, st_data_t val, st_data_t arg)
199
300
  VALUE name, file, edges, lines;
200
301
  VALUE line;
201
302
 
202
- rb_hash_aset(results, rb_obj_id(frame), details);
303
+ rb_hash_aset(results, PTR2NUM(frame), details);
203
304
 
204
- if (frame == _stackprof.fake_gc_frame) {
205
- name = _stackprof.fake_gc_frame_name;
305
+ if (FIXNUM_P(frame)) {
306
+ name = _stackprof.fake_frame_names[FIX2INT(frame)];
206
307
  file = _stackprof.empty_string;
207
308
  line = INT2FIX(0);
208
309
  } else {
@@ -258,6 +359,9 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
258
359
  rb_hash_aset(results, sym_samples, SIZET2NUM(_stackprof.overall_samples));
259
360
  rb_hash_aset(results, sym_gc_samples, SIZET2NUM(_stackprof.during_gc));
260
361
  rb_hash_aset(results, sym_missed_samples, SIZET2NUM(_stackprof.overall_signals - _stackprof.overall_samples));
362
+ rb_hash_aset(results, sym_metadata, _stackprof.metadata);
363
+
364
+ _stackprof.metadata = Qnil;
261
365
 
262
366
  frames = rb_hash_new();
263
367
  rb_hash_aset(results, sym_frames, frames);
@@ -268,6 +372,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
268
372
 
269
373
  if (_stackprof.raw && _stackprof.raw_samples_len) {
270
374
  size_t len, n, o;
375
+ VALUE raw_sample_timestamps, raw_timestamp_deltas;
271
376
  VALUE raw_samples = rb_ary_new_capa(_stackprof.raw_samples_len);
272
377
 
273
378
  for (n = 0; n < _stackprof.raw_samples_len; n++) {
@@ -275,7 +380,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
275
380
  rb_ary_push(raw_samples, SIZET2NUM(len));
276
381
 
277
382
  for (o = 0, n++; o < len; n++, o++)
278
- rb_ary_push(raw_samples, rb_obj_id(_stackprof.raw_samples[n]));
383
+ rb_ary_push(raw_samples, PTR2NUM(_stackprof.raw_samples[n]));
279
384
  rb_ary_push(raw_samples, SIZET2NUM((size_t)_stackprof.raw_samples[n]));
280
385
  }
281
386
 
@@ -287,17 +392,20 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
287
392
 
288
393
  rb_hash_aset(results, sym_raw, raw_samples);
289
394
 
290
- VALUE raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_timestamp_deltas_len);
395
+ raw_sample_timestamps = rb_ary_new_capa(_stackprof.raw_sample_times_len);
396
+ raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_sample_times_len);
291
397
 
292
- for (n = 0; n < _stackprof.raw_timestamp_deltas_len; n++) {
293
- rb_ary_push(raw_timestamp_deltas, INT2FIX(_stackprof.raw_timestamp_deltas[n]));
398
+ for (n = 0; n < _stackprof.raw_sample_times_len; n++) {
399
+ rb_ary_push(raw_sample_timestamps, ULL2NUM(_stackprof.raw_sample_times[n].timestamp_usec));
400
+ rb_ary_push(raw_timestamp_deltas, LL2NUM(_stackprof.raw_sample_times[n].delta_usec));
294
401
  }
295
402
 
296
- free(_stackprof.raw_timestamp_deltas);
297
- _stackprof.raw_timestamp_deltas = NULL;
298
- _stackprof.raw_timestamp_deltas_len = 0;
299
- _stackprof.raw_timestamp_deltas_capa = 0;
403
+ free(_stackprof.raw_sample_times);
404
+ _stackprof.raw_sample_times = NULL;
405
+ _stackprof.raw_sample_times_len = 0;
406
+ _stackprof.raw_sample_times_capa = 0;
300
407
 
408
+ rb_hash_aset(results, sym_raw_sample_timestamps, raw_sample_timestamps);
301
409
  rb_hash_aset(results, sym_raw_timestamp_deltas, raw_timestamp_deltas);
302
410
 
303
411
  _stackprof.raw = 0;
@@ -308,11 +416,12 @@ stackprof_results(int argc, VALUE *argv, VALUE self)
308
416
 
309
417
  if (RTEST(_stackprof.out)) {
310
418
  VALUE file;
311
- if (RB_TYPE_P(_stackprof.out, T_STRING)) {
312
- file = rb_file_open_str(_stackprof.out, "w");
313
- } else {
419
+ if (rb_respond_to(_stackprof.out, rb_intern("to_io"))) {
314
420
  file = rb_io_check_io(_stackprof.out);
421
+ } else {
422
+ file = rb_file_open_str(_stackprof.out, "w");
315
423
  }
424
+
316
425
  rb_marshal_dump(results, file);
317
426
  rb_io_flush(file);
318
427
  _stackprof.out = Qnil;
@@ -376,27 +485,39 @@ st_numtable_increment(st_table *table, st_data_t key, size_t increment)
376
485
  }
377
486
 
378
487
  void
379
- stackprof_record_sample_for_stack(int num, int timestamp_delta)
488
+ stackprof_record_sample_for_stack(int num, uint64_t sample_timestamp, int64_t timestamp_delta)
380
489
  {
381
490
  int i, n;
382
491
  VALUE prev_frame = Qnil;
383
492
 
384
493
  _stackprof.overall_samples++;
385
494
 
386
- if (_stackprof.raw) {
495
+ if (_stackprof.raw && num > 0) {
387
496
  int found = 0;
388
497
 
498
+ /* If there's no sample buffer allocated, then allocate one. The buffer
499
+ * format is the number of frames (num), then the list of frames (from
500
+ * `_stackprof.raw_samples`), followed by the number of times this
501
+ * particular stack has been seen in a row. Each "new" stack is added
502
+ * to the end of the buffer, but if the previous stack is the same as
503
+ * the current stack, the counter will be incremented. */
389
504
  if (!_stackprof.raw_samples) {
390
505
  _stackprof.raw_samples_capa = num * 100;
391
506
  _stackprof.raw_samples = malloc(sizeof(VALUE) * _stackprof.raw_samples_capa);
392
507
  }
393
508
 
509
+ /* If we can't fit all the samples in the buffer, double the buffer size. */
394
510
  while (_stackprof.raw_samples_capa <= _stackprof.raw_samples_len + (num + 2)) {
395
511
  _stackprof.raw_samples_capa *= 2;
396
512
  _stackprof.raw_samples = realloc(_stackprof.raw_samples, sizeof(VALUE) * _stackprof.raw_samples_capa);
397
513
  }
398
514
 
515
+ /* If we've seen this stack before in the last sample, then increment the "seen" count. */
399
516
  if (_stackprof.raw_samples_len > 0 && _stackprof.raw_samples[_stackprof.raw_sample_index] == (VALUE)num) {
517
+ /* The number of samples could have been the same, but the stack
518
+ * might be different, so we need to check the stack here. Stacks
519
+ * in the raw buffer are stored in the opposite direction of stacks
520
+ * in the frames buffer that came from Ruby. */
400
521
  for (i = num-1, n = 0; i >= 0; i--, n++) {
401
522
  VALUE frame = _stackprof.frames_buffer[i];
402
523
  if (_stackprof.raw_samples[_stackprof.raw_sample_index + 1 + n] != frame)
@@ -408,7 +529,11 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
408
529
  }
409
530
  }
410
531
 
532
+ /* If we haven't seen the stack, then add it to the buffer along with
533
+ * the length of the stack and a 1 for the "seen" count */
411
534
  if (!found) {
535
+ /* Bump the `raw_sample_index` up so that the next iteration can
536
+ * find the previously recorded stack size. */
412
537
  _stackprof.raw_sample_index = _stackprof.raw_samples_len;
413
538
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)num;
414
539
  for (i = num-1; i >= 0; i--) {
@@ -418,23 +543,24 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
418
543
  _stackprof.raw_samples[_stackprof.raw_samples_len++] = (VALUE)1;
419
544
  }
420
545
 
421
- if (!_stackprof.raw_timestamp_deltas) {
422
- _stackprof.raw_timestamp_deltas_capa = 100;
423
- _stackprof.raw_timestamp_deltas = malloc(sizeof(int) * _stackprof.raw_timestamp_deltas_capa);
424
- _stackprof.raw_timestamp_deltas_len = 0;
546
+ /* If there's no timestamp delta buffer, allocate one */
547
+ if (!_stackprof.raw_sample_times) {
548
+ _stackprof.raw_sample_times_capa = 100;
549
+ _stackprof.raw_sample_times = malloc(sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
550
+ _stackprof.raw_sample_times_len = 0;
425
551
  }
426
552
 
427
- while (_stackprof.raw_timestamp_deltas_capa <= _stackprof.raw_timestamp_deltas_len + 1) {
428
- _stackprof.raw_timestamp_deltas_capa *= 2;
429
- _stackprof.raw_timestamp_deltas = realloc(_stackprof.raw_timestamp_deltas, sizeof(int) * _stackprof.raw_timestamp_deltas_capa);
553
+ /* Double the buffer size if it's too small */
554
+ while (_stackprof.raw_sample_times_capa <= _stackprof.raw_sample_times_len + 1) {
555
+ _stackprof.raw_sample_times_capa *= 2;
556
+ _stackprof.raw_sample_times = realloc(_stackprof.raw_sample_times, sizeof(sample_time_t) * _stackprof.raw_sample_times_capa);
430
557
  }
431
558
 
432
- _stackprof.raw_timestamp_deltas[_stackprof.raw_timestamp_deltas_len++] = timestamp_delta;
433
- }
434
-
435
- for (i = 0; i < num; i++) {
436
- VALUE frame = _stackprof.frames_buffer[i];
437
- sample_for(frame)->already_accounted_in_total = 0;
559
+ /* Store the time delta (which is the amount of microseconds between samples). */
560
+ _stackprof.raw_sample_times[_stackprof.raw_sample_times_len++] = (sample_time_t) {
561
+ .timestamp_usec = sample_timestamp,
562
+ .delta_usec = timestamp_delta,
563
+ };
438
564
  }
439
565
 
440
566
  for (i = 0; i < num; i++) {
@@ -442,9 +568,10 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
442
568
  VALUE frame = _stackprof.frames_buffer[i];
443
569
  frame_data_t *frame_data = sample_for(frame);
444
570
 
445
- if (!frame_data->already_accounted_in_total)
571
+ if (frame_data->seen_at_sample_number != _stackprof.overall_samples) {
446
572
  frame_data->total_samples++;
447
- frame_data->already_accounted_in_total = 1;
573
+ }
574
+ frame_data->seen_at_sample_number = _stackprof.overall_samples;
448
575
 
449
576
  if (i == 0) {
450
577
  frame_data->caller_samples++;
@@ -455,10 +582,10 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
455
582
  }
456
583
 
457
584
  if (_stackprof.aggregate && line > 0) {
458
- if (!frame_data->lines)
459
- frame_data->lines = st_init_numtable();
460
585
  size_t half = (size_t)1<<(8*SIZEOF_SIZE_T/2);
461
586
  size_t increment = i == 0 ? half + 1 : half;
587
+ if (!frame_data->lines)
588
+ frame_data->lines = st_init_numtable();
462
589
  st_numtable_increment(frame_data->lines, (st_data_t)line, increment);
463
590
  }
464
591
 
@@ -466,90 +593,183 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta)
466
593
  }
467
594
 
468
595
  if (_stackprof.raw) {
469
- gettimeofday(&_stackprof.last_sample_at, NULL);
596
+ capture_timestamp(&_stackprof.last_sample_at);
470
597
  }
471
598
  }
472
599
 
600
+ // buffer the current profile frames
601
+ // This must be async-signal-safe
602
+ // Returns immediately if another set of frames are already in the buffer
473
603
  void
474
- stackprof_record_sample()
604
+ stackprof_buffer_sample(void)
475
605
  {
476
- int timestamp_delta = 0;
606
+ uint64_t start_timestamp = 0;
607
+ int64_t timestamp_delta = 0;
608
+ int num;
609
+
610
+ if (_stackprof.buffer_count > 0) {
611
+ // Another sample is already pending
612
+ return;
613
+ }
614
+
477
615
  if (_stackprof.raw) {
478
- struct timeval t;
479
- gettimeofday(&t, NULL);
480
- struct timeval diff;
481
- timersub(&t, &_stackprof.last_sample_at, &diff);
482
- timestamp_delta = (1000 * diff.tv_sec) + diff.tv_usec;
616
+ struct timestamp_t t;
617
+ capture_timestamp(&t);
618
+ start_timestamp = timestamp_usec(&t);
619
+ timestamp_delta = delta_usec(&_stackprof.last_sample_at, &t);
483
620
  }
484
- int num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);
485
- stackprof_record_sample_for_stack(num, timestamp_delta);
621
+
622
+ num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer);
623
+
624
+ _stackprof.buffer_count = num;
625
+ _stackprof.buffer_time.timestamp_usec = start_timestamp;
626
+ _stackprof.buffer_time.delta_usec = timestamp_delta;
486
627
  }
487
628
 
488
629
  void
489
- stackprof_record_gc_samples()
630
+ stackprof_record_gc_samples(void)
490
631
  {
491
- int delta_to_first_unrecorded_gc_sample = 0;
632
+ int64_t delta_to_first_unrecorded_gc_sample = 0;
633
+ uint64_t start_timestamp = 0;
634
+ size_t i;
492
635
  if (_stackprof.raw) {
493
- struct timeval t;
494
- gettimeofday(&t, NULL);
495
- struct timeval diff;
496
- timersub(&t, &_stackprof.last_sample_at, &diff);
636
+ struct timestamp_t t;
637
+ capture_timestamp(&t);
638
+ start_timestamp = timestamp_usec(&t);
497
639
 
498
640
  // We don't know when the GC samples were actually marked, so let's
499
641
  // assume that they were marked at a perfectly regular interval.
500
- delta_to_first_unrecorded_gc_sample = (1000 * diff.tv_sec + diff.tv_usec) - (_stackprof.unrecorded_gc_samples - 1) * _stackprof.interval;
642
+ delta_to_first_unrecorded_gc_sample = delta_usec(&_stackprof.last_sample_at, &t) - (_stackprof.unrecorded_gc_samples - 1) * NUM2LONG(_stackprof.interval);
501
643
  if (delta_to_first_unrecorded_gc_sample < 0) {
502
644
  delta_to_first_unrecorded_gc_sample = 0;
503
645
  }
504
646
  }
505
647
 
506
- int i;
507
-
508
- _stackprof.frames_buffer[0] = _stackprof.fake_gc_frame;
509
- _stackprof.lines_buffer[0] = 0;
510
-
511
648
  for (i = 0; i < _stackprof.unrecorded_gc_samples; i++) {
512
- int timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : _stackprof.interval;
513
- stackprof_record_sample_for_stack(1, timestamp_delta);
649
+ int64_t timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval);
650
+
651
+ if (_stackprof.unrecorded_gc_marking_samples) {
652
+ _stackprof.frames_buffer[0] = FAKE_FRAME_MARK;
653
+ _stackprof.lines_buffer[0] = 0;
654
+ _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
655
+ _stackprof.lines_buffer[1] = 0;
656
+ _stackprof.unrecorded_gc_marking_samples--;
657
+
658
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
659
+ } else if (_stackprof.unrecorded_gc_sweeping_samples) {
660
+ _stackprof.frames_buffer[0] = FAKE_FRAME_SWEEP;
661
+ _stackprof.lines_buffer[0] = 0;
662
+ _stackprof.frames_buffer[1] = FAKE_FRAME_GC;
663
+ _stackprof.lines_buffer[1] = 0;
664
+
665
+ _stackprof.unrecorded_gc_sweeping_samples--;
666
+
667
+ stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta);
668
+ } else {
669
+ _stackprof.frames_buffer[0] = FAKE_FRAME_GC;
670
+ _stackprof.lines_buffer[0] = 0;
671
+ stackprof_record_sample_for_stack(1, start_timestamp, timestamp_delta);
672
+ }
514
673
  }
515
674
  _stackprof.during_gc += _stackprof.unrecorded_gc_samples;
516
675
  _stackprof.unrecorded_gc_samples = 0;
676
+ _stackprof.unrecorded_gc_marking_samples = 0;
677
+ _stackprof.unrecorded_gc_sweeping_samples = 0;
517
678
  }
518
679
 
680
+ // record the sample previously buffered by stackprof_buffer_sample
519
681
  static void
520
- stackprof_gc_job_handler(void *data)
682
+ stackprof_record_buffer(void)
683
+ {
684
+ stackprof_record_sample_for_stack(_stackprof.buffer_count, _stackprof.buffer_time.timestamp_usec, _stackprof.buffer_time.delta_usec);
685
+
686
+ // reset the buffer
687
+ _stackprof.buffer_count = 0;
688
+ }
689
+
690
+ static void
691
+ stackprof_sample_and_record(void)
692
+ {
693
+ stackprof_buffer_sample();
694
+ stackprof_record_buffer();
695
+ }
696
+
697
+ static void
698
+ stackprof_job_record_gc(void *data)
521
699
  {
522
- static int in_signal_handler = 0;
523
- if (in_signal_handler) return;
524
700
  if (!_stackprof.running) return;
525
701
 
526
- in_signal_handler++;
527
702
  stackprof_record_gc_samples();
528
- in_signal_handler--;
529
703
  }
530
704
 
531
705
  static void
532
- stackprof_job_handler(void *data)
706
+ stackprof_job_sample_and_record(void *data)
533
707
  {
534
- static int in_signal_handler = 0;
535
- if (in_signal_handler) return;
536
708
  if (!_stackprof.running) return;
537
709
 
538
- in_signal_handler++;
539
- stackprof_record_sample();
540
- in_signal_handler--;
710
+ stackprof_sample_and_record();
711
+ }
712
+
713
+ static void
714
+ stackprof_job_record_buffer(void *data)
715
+ {
716
+ if (!_stackprof.running) return;
717
+
718
+ stackprof_record_buffer();
541
719
  }
542
720
 
543
721
  static void
544
722
  stackprof_signal_handler(int sig, siginfo_t *sinfo, void *ucontext)
545
723
  {
724
+ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
725
+
546
726
  _stackprof.overall_signals++;
547
- if (rb_during_gc()) {
727
+
728
+ if (!_stackprof.running) return;
729
+
730
+ // There's a possibility that the signal handler is invoked *after* the Ruby
731
+ // VM has been shut down (e.g. after ruby_cleanup(0)). In this case, things
732
+ // that rely on global VM state (e.g. rb_during_gc) will segfault.
733
+ if (!ruby_vm_running) return;
734
+
735
+ if (_stackprof.mode == sym_wall) {
736
+ // In "wall" mode, the SIGALRM signal will arrive at an arbitrary thread.
737
+ // In order to provide more useful results, especially under threaded web
738
+ // servers, we want to forward this signal to the original thread
739
+ // StackProf was started from.
740
+ // According to POSIX.1-2008 TC1 pthread_kill and pthread_self should be
741
+ // async-signal-safe.
742
+ if (pthread_self() != _stackprof.target_thread) {
743
+ pthread_kill(_stackprof.target_thread, sig);
744
+ return;
745
+ }
746
+ } else {
747
+ if (!ruby_native_thread_p()) return;
748
+ }
749
+
750
+ if (pthread_mutex_trylock(&lock)) return;
751
+
752
+ if (!_stackprof.ignore_gc && rb_during_gc()) {
753
+ VALUE mode = rb_gc_latest_gc_info(sym_state);
754
+ if (mode == sym_marking) {
755
+ _stackprof.unrecorded_gc_marking_samples++;
756
+ } else if (mode == sym_sweeping) {
757
+ _stackprof.unrecorded_gc_sweeping_samples++;
758
+ }
548
759
  _stackprof.unrecorded_gc_samples++;
549
- rb_postponed_job_register_one(0, stackprof_gc_job_handler, (void*)0);
760
+ rb_postponed_job_register_one(0, stackprof_job_record_gc, (void*)0);
550
761
  } else {
551
- rb_postponed_job_register_one(0, stackprof_job_handler, (void*)0);
762
+ if (stackprof_use_postponed_job) {
763
+ rb_postponed_job_register_one(0, stackprof_job_sample_and_record, (void*)0);
764
+ } else {
765
+ // Buffer a sample immediately, if an existing sample exists this will
766
+ // return immediately
767
+ stackprof_buffer_sample();
768
+ // Enqueue a job to record the sample
769
+ rb_postponed_job_register_one(0, stackprof_job_record_buffer, (void*)0);
770
+ }
552
771
  }
772
+ pthread_mutex_unlock(&lock);
553
773
  }
554
774
 
555
775
  static void
@@ -558,7 +778,7 @@ stackprof_newobj_handler(VALUE tpval, void *data)
558
778
  _stackprof.overall_signals++;
559
779
  if (RTEST(_stackprof.interval) && _stackprof.overall_signals % NUM2LONG(_stackprof.interval))
560
780
  return;
561
- stackprof_job_handler(0);
781
+ stackprof_sample_and_record();
562
782
  }
563
783
 
564
784
  static VALUE
@@ -568,7 +788,7 @@ stackprof_sample(VALUE self)
568
788
  return Qfalse;
569
789
 
570
790
  _stackprof.overall_signals++;
571
- stackprof_job_handler(0);
791
+ stackprof_sample_and_record();
572
792
  return Qtrue;
573
793
  }
574
794
 
@@ -583,11 +803,18 @@ frame_mark_i(st_data_t key, st_data_t val, st_data_t arg)
583
803
  static void
584
804
  stackprof_gc_mark(void *data)
585
805
  {
806
+ if (RTEST(_stackprof.metadata))
807
+ rb_gc_mark(_stackprof.metadata);
808
+
586
809
  if (RTEST(_stackprof.out))
587
810
  rb_gc_mark(_stackprof.out);
588
811
 
589
812
  if (_stackprof.frames)
590
813
  st_foreach(_stackprof.frames, frame_mark_i, 0);
814
+
815
+ for (int i = 0; i < _stackprof.buffer_count; i++) {
816
+ rb_gc_mark(_stackprof.frames_buffer[i]);
817
+ }
591
818
  }
592
819
 
593
820
  static void
@@ -622,9 +849,32 @@ stackprof_atfork_child(void)
622
849
  stackprof_stop(rb_mStackProf);
623
850
  }
624
851
 
852
+ static VALUE
853
+ stackprof_use_postponed_job_l(VALUE self)
854
+ {
855
+ stackprof_use_postponed_job = 1;
856
+ return Qnil;
857
+ }
858
+
859
+ static void
860
+ stackprof_at_exit(ruby_vm_t* vm)
861
+ {
862
+ ruby_vm_running = 0;
863
+ }
864
+
625
865
  void
626
866
  Init_stackprof(void)
627
867
  {
868
+ size_t i;
869
+ /*
870
+ * As of Ruby 3.0, it should be safe to read stack frames at any time, unless YJIT is enabled
871
+ * See https://github.com/ruby/ruby/commit/0e276dc458f94d9d79a0f7c7669bde84abe80f21
872
+ */
873
+ stackprof_use_postponed_job = RUBY_API_VERSION_MAJOR < 3;
874
+
875
+ ruby_vm_running = 1;
876
+ ruby_vm_at_exit(stackprof_at_exit);
877
+
628
878
  #define S(name) sym_##name = ID2SYM(rb_intern(#name));
629
879
  S(object);
630
880
  S(custom);
@@ -643,12 +893,21 @@ Init_stackprof(void)
643
893
  S(mode);
644
894
  S(interval);
645
895
  S(raw);
896
+ S(raw_sample_timestamps);
646
897
  S(raw_timestamp_deltas);
647
898
  S(out);
899
+ S(metadata);
900
+ S(ignore_gc);
648
901
  S(frames);
649
902
  S(aggregate);
903
+ S(state);
904
+ S(marking);
905
+ S(sweeping);
650
906
  #undef S
651
907
 
908
+ /* Need to run this to warm the symbol table before we call this during GC */
909
+ rb_gc_latest_gc_info(sym_state);
910
+
652
911
  gc_hook = Data_Wrap_Struct(rb_cObject, stackprof_gc_mark, NULL, &_stackprof);
653
912
  rb_global_variable(&gc_hook);
654
913
 
@@ -657,16 +916,18 @@ Init_stackprof(void)
657
916
  _stackprof.raw_samples_capa = 0;
658
917
  _stackprof.raw_sample_index = 0;
659
918
 
660
- _stackprof.raw_timestamp_deltas = NULL;
661
- _stackprof.raw_timestamp_deltas_len = 0;
662
- _stackprof.raw_timestamp_deltas_capa = 0;
919
+ _stackprof.raw_sample_times = NULL;
920
+ _stackprof.raw_sample_times_len = 0;
921
+ _stackprof.raw_sample_times_capa = 0;
663
922
 
664
- _stackprof.fake_gc_frame = INT2FIX(0x9C);
665
923
  _stackprof.empty_string = rb_str_new_cstr("");
666
- _stackprof.fake_gc_frame_name = rb_str_new_cstr("(garbage collection)");
667
- rb_global_variable(&_stackprof.fake_gc_frame_name);
668
924
  rb_global_variable(&_stackprof.empty_string);
669
925
 
926
+ for (i = 0; i < TOTAL_FAKE_FRAMES; i++) {
927
+ _stackprof.fake_frame_names[i] = rb_str_new_cstr(fake_frame_cstrs[i]);
928
+ rb_global_variable(&_stackprof.fake_frame_names[i]);
929
+ }
930
+
670
931
  rb_mStackProf = rb_define_module("StackProf");
671
932
  rb_define_singleton_method(rb_mStackProf, "running?", stackprof_running_p, 0);
672
933
  rb_define_singleton_method(rb_mStackProf, "run", stackprof_run, -1);
@@ -674,6 +935,7 @@ Init_stackprof(void)
674
935
  rb_define_singleton_method(rb_mStackProf, "stop", stackprof_stop, 0);
675
936
  rb_define_singleton_method(rb_mStackProf, "results", stackprof_results, -1);
676
937
  rb_define_singleton_method(rb_mStackProf, "sample", stackprof_sample, 0);
938
+ rb_define_singleton_method(rb_mStackProf, "use_postponed_job!", stackprof_use_postponed_job_l, 0);
677
939
 
678
940
  pthread_atfork(stackprof_atfork_prepare, stackprof_atfork_parent, stackprof_atfork_child);
679
941
  }