fluent-plugin-perf-tools 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.rubocop.yml +26 -0
  4. data/.ruby-version +1 -0
  5. data/CHANGELOG.md +5 -0
  6. data/CODE_OF_CONDUCT.md +84 -0
  7. data/Gemfile +5 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +43 -0
  10. data/Rakefile +17 -0
  11. data/bin/console +15 -0
  12. data/bin/setup +8 -0
  13. data/fluent-plugin-perf-tools.gemspec +48 -0
  14. data/lib/fluent/plugin/in_perf_tools.rb +42 -0
  15. data/lib/fluent/plugin/perf_tools/cachestat.rb +65 -0
  16. data/lib/fluent/plugin/perf_tools/command.rb +30 -0
  17. data/lib/fluent/plugin/perf_tools/version.rb +9 -0
  18. data/lib/fluent/plugin/perf_tools.rb +11 -0
  19. data/perf-tools/LICENSE +339 -0
  20. data/perf-tools/README.md +205 -0
  21. data/perf-tools/bin/bitesize +1 -0
  22. data/perf-tools/bin/cachestat +1 -0
  23. data/perf-tools/bin/execsnoop +1 -0
  24. data/perf-tools/bin/funccount +1 -0
  25. data/perf-tools/bin/funcgraph +1 -0
  26. data/perf-tools/bin/funcslower +1 -0
  27. data/perf-tools/bin/functrace +1 -0
  28. data/perf-tools/bin/iolatency +1 -0
  29. data/perf-tools/bin/iosnoop +1 -0
  30. data/perf-tools/bin/killsnoop +1 -0
  31. data/perf-tools/bin/kprobe +1 -0
  32. data/perf-tools/bin/opensnoop +1 -0
  33. data/perf-tools/bin/perf-stat-hist +1 -0
  34. data/perf-tools/bin/reset-ftrace +1 -0
  35. data/perf-tools/bin/syscount +1 -0
  36. data/perf-tools/bin/tcpretrans +1 -0
  37. data/perf-tools/bin/tpoint +1 -0
  38. data/perf-tools/bin/uprobe +1 -0
  39. data/perf-tools/deprecated/README.md +1 -0
  40. data/perf-tools/deprecated/execsnoop-proc +150 -0
  41. data/perf-tools/deprecated/execsnoop-proc.8 +80 -0
  42. data/perf-tools/deprecated/execsnoop-proc_example.txt +46 -0
  43. data/perf-tools/disk/bitesize +175 -0
  44. data/perf-tools/examples/bitesize_example.txt +63 -0
  45. data/perf-tools/examples/cachestat_example.txt +58 -0
  46. data/perf-tools/examples/execsnoop_example.txt +153 -0
  47. data/perf-tools/examples/funccount_example.txt +126 -0
  48. data/perf-tools/examples/funcgraph_example.txt +2178 -0
  49. data/perf-tools/examples/funcslower_example.txt +110 -0
  50. data/perf-tools/examples/functrace_example.txt +341 -0
  51. data/perf-tools/examples/iolatency_example.txt +350 -0
  52. data/perf-tools/examples/iosnoop_example.txt +302 -0
  53. data/perf-tools/examples/killsnoop_example.txt +62 -0
  54. data/perf-tools/examples/kprobe_example.txt +379 -0
  55. data/perf-tools/examples/opensnoop_example.txt +47 -0
  56. data/perf-tools/examples/perf-stat-hist_example.txt +149 -0
  57. data/perf-tools/examples/reset-ftrace_example.txt +88 -0
  58. data/perf-tools/examples/syscount_example.txt +297 -0
  59. data/perf-tools/examples/tcpretrans_example.txt +93 -0
  60. data/perf-tools/examples/tpoint_example.txt +210 -0
  61. data/perf-tools/examples/uprobe_example.txt +321 -0
  62. data/perf-tools/execsnoop +292 -0
  63. data/perf-tools/fs/cachestat +167 -0
  64. data/perf-tools/images/perf-tools_2016.png +0 -0
  65. data/perf-tools/iolatency +296 -0
  66. data/perf-tools/iosnoop +296 -0
  67. data/perf-tools/kernel/funccount +146 -0
  68. data/perf-tools/kernel/funcgraph +259 -0
  69. data/perf-tools/kernel/funcslower +248 -0
  70. data/perf-tools/kernel/functrace +192 -0
  71. data/perf-tools/kernel/kprobe +270 -0
  72. data/perf-tools/killsnoop +263 -0
  73. data/perf-tools/man/man8/bitesize.8 +70 -0
  74. data/perf-tools/man/man8/cachestat.8 +111 -0
  75. data/perf-tools/man/man8/execsnoop.8 +104 -0
  76. data/perf-tools/man/man8/funccount.8 +76 -0
  77. data/perf-tools/man/man8/funcgraph.8 +166 -0
  78. data/perf-tools/man/man8/funcslower.8 +129 -0
  79. data/perf-tools/man/man8/functrace.8 +123 -0
  80. data/perf-tools/man/man8/iolatency.8 +116 -0
  81. data/perf-tools/man/man8/iosnoop.8 +169 -0
  82. data/perf-tools/man/man8/killsnoop.8 +100 -0
  83. data/perf-tools/man/man8/kprobe.8 +162 -0
  84. data/perf-tools/man/man8/opensnoop.8 +113 -0
  85. data/perf-tools/man/man8/perf-stat-hist.8 +111 -0
  86. data/perf-tools/man/man8/reset-ftrace.8 +49 -0
  87. data/perf-tools/man/man8/syscount.8 +96 -0
  88. data/perf-tools/man/man8/tcpretrans.8 +93 -0
  89. data/perf-tools/man/man8/tpoint.8 +140 -0
  90. data/perf-tools/man/man8/uprobe.8 +168 -0
  91. data/perf-tools/misc/perf-stat-hist +223 -0
  92. data/perf-tools/net/tcpretrans +311 -0
  93. data/perf-tools/opensnoop +280 -0
  94. data/perf-tools/syscount +192 -0
  95. data/perf-tools/system/tpoint +232 -0
  96. data/perf-tools/tools/reset-ftrace +123 -0
  97. data/perf-tools/user/uprobe +390 -0
  98. metadata +349 -0
@@ -0,0 +1,88 @@
1
+ Demonstrations of reset-ftrace, the Linux ftrace tool.
2
+
3
+
4
+ You will probably never need this tool. If you kill -9 an ftrace-based tool,
5
+ leaving the kernel in a tracing enabled state, you could try using this tool
6
+ to reset ftrace and disable tracing. Make sure no other ftrace sessions are
7
+ in use on your system, or it will kill those.
8
+
9
+ Here's an example:
10
+
11
+ # ./opensnoop
12
+ Tracing open()s. Ctrl-C to end.
13
+ ERROR: ftrace may be in use by PID 2197 /var/tmp/.ftrace-lock
14
+
15
+ I tried to run opensnoop, but there's a lock file for PID 2197. Checking if it
16
+ exists:
17
+
18
+ # ps -fp 2197
19
+ UID PID PPID C STIME TTY TIME CMD
20
+ #
21
+
22
+ No.
23
+
24
+ I also know that no one is using ftrace on this system. So I'll use reset-ftrace
25
+ to clean up this lock file and ftrace state:
26
+
27
+ # ./reset-ftrace
28
+ ERROR: ftrace lock (/var/tmp/.ftrace-lock) exists. It shows ftrace may be in use by PID 2197.
29
+ Double check to see if that PID is still active. If not, consider using -f to force a reset. Exiting.
30
+
31
+ ... except it's complaining about the lock file too. I'm already sure that this
32
+ PID doesn't exist, so I'll add the -f option:
33
+
34
+ # ./reset-ftrace -f
35
+ Reseting ftrace state...
36
+
37
+ current_tracer, before:
38
+ 1 nop
39
+ current_tracer, after:
40
+ 1 nop
41
+
42
+ set_ftrace_filter, before:
43
+ 1 #### all functions enabled ####
44
+ set_ftrace_filter, after:
45
+ 1 #### all functions enabled ####
46
+
47
+ set_ftrace_pid, before:
48
+ 1 no pid
49
+ set_ftrace_pid, after:
50
+ 1 no pid
51
+
52
+ kprobe_events, before:
53
+ kprobe_events, after:
54
+
55
+ Done.
56
+
57
+ The output shows what has been reset, including the before and after state of
58
+ these files.
59
+
60
+ Now that ftrace is reset, tracing tools work again. Trying iosnoop:
61
+
62
+ # ./iosnoop
63
+ Tracing block I/O. Ctrl-C to end.
64
+ COMM PID TYPE DEV BLOCK BYTES LATms
65
+ supervise 1689 W 202,1 17039664 4096 0.58
66
+ supervise 1689 W 202,1 17039672 4096 0.47
67
+ supervise 1694 W 202,1 17039744 4096 0.98
68
+ supervise 1694 W 202,1 17039752 4096 0.74
69
+ supervise 1684 W 202,1 17039760 4096 0.63
70
+ [...]
71
+
72
+ Fixed.
73
+
74
+ Note that reset-ftrace currently only resets a few methods of enabling
75
+ tracing, such as set_ftrace_filter and kprobe_events. Static tracepoints could
76
+ be enabled individually, and this script currently doesn't find and disable
77
+ those.
78
+
79
+
80
+ Use -h to print the USAGE message:
81
+
82
+ # ./reset-ftrace -h
83
+ USAGE: reset-ftrace [-fhq]
84
+ -f # force: delete ftrace lock file
85
+ -q # quiet: reset, but say nothing
86
+ -h # this usage message
87
+ eg,
88
+ reset-ftrace # disable active ftrace session
@@ -0,0 +1,297 @@
1
+ Demonstrations of syscount, the Linux perf_events version.
2
+
3
+
4
+ The first mode I use is "-c", where it behaves like "strace -c", but for the
5
+ entire system (all processes) and with much lower overhead:
6
+
7
+ # ./syscount -c
8
+ Tracing... Ctrl-C to end.
9
+ ^Csleep: Interrupt
10
+ SYSCALL COUNT
11
+ accept 1
12
+ getsockopt 1
13
+ setsid 1
14
+ chdir 2
15
+ getcwd 2
16
+ getpeername 2
17
+ getsockname 2
18
+ setgid 2
19
+ setgroups 2
20
+ setpgid 2
21
+ setuid 2
22
+ getpgrp 4
23
+ getpid 4
24
+ rename 4
25
+ setitimer 4
26
+ setrlimit 4
27
+ setsockopt 4
28
+ statfs 4
29
+ set_tid_address 5
30
+ readlink 6
31
+ set_robust_list 6
32
+ nanosleep 7
33
+ newuname 7
34
+ faccessat 8
35
+ futex 10
36
+ clock_gettime 16
37
+ newlstat 20
38
+ pipe 20
39
+ epoll_wait 24
40
+ getrlimit 25
41
+ socket 27
42
+ connect 29
43
+ exit_group 30
44
+ getppid 31
45
+ dup2 34
46
+ wait4 51
47
+ fcntl 58
48
+ getegid 72
49
+ getgid 72
50
+ getuid 72
51
+ geteuid 75
52
+ perf_event_open 100
53
+ munmap 121
54
+ gettimeofday 216
55
+ access 266
56
+ ioctl 340
57
+ poll 348
58
+ sendto 374
59
+ mprotect 414
60
+ brk 597
61
+ rt_sigaction 632
62
+ recvfrom 664
63
+ lseek 749
64
+ newfstatat 2922
65
+ openat 2925
66
+ newfstat 3229
67
+ newstat 4334
68
+ open 4534
69
+ fchdir 5845
70
+ getdents 5854
71
+ read 7673
72
+ close 7728
73
+ select 9633
74
+ rt_sigprocmask 19886
75
+ write 34581
76
+
77
+ While tracing, the write() syscall was executed 34,581 times.
78
+
79
+ This mode uses "perf stat" to count the syscalls:* tracepoints in-kernel.
80
+
81
+
82
+ You can add a duration (-d) and limit the number shown (-t):
83
+
84
+ # ./syscount -cd 5 -t 10
85
+ Tracing for 5 seconds. Top 10 only...
86
+ SYSCALL COUNT
87
+ gettimeofday 1009
88
+ write 3583
89
+ read 8174
90
+ openat 21550
91
+ newfstat 21558
92
+ open 21824
93
+ fchdir 43098
94
+ getdents 43106
95
+ close 43694
96
+ newfstatat 110936
97
+
98
+ While tracing for 5 seconds, the newfstatat() syscall was executed 110,936
99
+ times.
100
+
101
+
102
+ Without the -c, syscount shows syscalls by process name:
103
+
104
+ # ./syscount -d 5 -t 10
105
+ Tracing for 5 seconds. Top 10 only...
106
+ [ perf record: Woken up 66 times to write data ]
107
+ [ perf record: Captured and wrote 16.513 MB perf.data (~721455 samples) ]
108
+ COMM COUNT
109
+ stat 450
110
+ perl 537
111
+ catalina.sh 1700
112
+ postgres 2094
113
+ run 2362
114
+ :6946 4764
115
+ ps 5961
116
+ sshd 45796
117
+ find 61039
118
+
119
+ So processes named "find" called 61,039 syscalls during the 5 seconds of
120
+ tracing.
121
+
122
+ Note that this mode writes a perf.data file. This is higher overhead for a
123
+ few reasons:
124
+
125
+ - all data is passed from kernel to user space, which eats CPU for the memory
126
+ copy. Note that it is buffered in an efficient way by perf_events, which
127
+ wakes up and context switches only a small number of times: 66 in this case,
128
+ to hand 16 Mbytes of trace data to user space.
129
+ - data is post-processed in user space, eating more CPU.
130
+ - data is stored on the file system in the perf.data file, consuming available
131
+ storage.
132
+
133
+ This will be improved in future kernels, but it is difficult to improve this
134
+ much further in existing kernels. For example, using a pipe to "perf script"
135
+ instead of writing perf.data can have issues with feedback loops, where
136
+ perf traces itself. This syscount version goes to lengths to avoid tracing
137
+ its own perf, but the trip via perf.data is necessary
138
+ right now, given the existing functionality
139
+ in older kernels.
140
+
141
+
142
+ Running without options shows syscalls by process name until Ctrl-C:
143
+
144
+ # ./syscount
145
+ Tracing... Ctrl-C to end.
146
+ ^C[ perf record: Woken up 39 times to write data ]
147
+ [ perf record: Captured and wrote 9.644 MB perf.data (~421335 samples) ]
148
+ COMM COUNT
149
+ apache2 8
150
+ apacheLogParser 13
151
+ platformservice 16
152
+ snmpd 16
153
+ ntpd 21
154
+ multilog 66
155
+ supervise 84
156
+ dirname 102
157
+ echo 102
158
+ svstat 108
159
+ cut 111
160
+ bash 113
161
+ grep 132
162
+ xargs 132
163
+ redis-server 190
164
+ sed 192
165
+ setuidgid 294
166
+ stat 450
167
+ perl 537
168
+ catalina.sh 1275
169
+ postgres 1736
170
+ run 2352
171
+ :7396 4527
172
+ ps 5925
173
+ sshd 20154
174
+ find 28700
175
+
176
+ Note again it is writing a perf.data file to do this.
177
+
178
+
179
+ The -v option adds process IDs:
180
+
181
+ # ./syscount -v
182
+ Tracing... Ctrl-C to end.
183
+ ^C[ perf record: Woken up 48 times to write data ]
184
+ [ perf record: Captured and wrote 12.114 MB perf.data (~529276 samples) ]
185
+ PID COMM COUNT
186
+ 3599 apacheLogParser 3
187
+ 7977 xargs 3
188
+ 7982 supervise 3
189
+ 7993 xargs 3
190
+ 3575 apache2 4
191
+ 1311 ntpd 6
192
+ 3135 postgres 6
193
+ 3600 apacheLogParser 6
194
+ 3210 platformservice 8
195
+ 6503 sshd 9
196
+ 7978 :7978 9
197
+ 7994 run 9
198
+ 7968 :7968 11
199
+ 7984 run 11
200
+ 1451 snmpd 16
201
+ 3040 svscan 17
202
+ 3066 postgres 17
203
+ 3133 postgres 24
204
+ 3134 postgres 24
205
+ 3136 postgres 24
206
+ 3061 multilog 29
207
+ 3055 supervise 30
208
+ 7979 bash 31
209
+ 7977 echo 34
210
+ 7981 dirname 34
211
+ 7993 echo 34
212
+ 7968 svstat 36
213
+ 7984 svstat 36
214
+ 7975 cut 37
215
+ 7991 cut 37
216
+ 9857 bash 37
217
+ 7967 :7967 40
218
+ 7983 run 40
219
+ 7972 :7972 41
220
+ 7976 xargs 41
221
+ 7988 run 41
222
+ 7992 xargs 41
223
+ 7969 :7969 42
224
+ 7976 :7976 42
225
+ 7985 run 42
226
+ 7992 run 42
227
+ 7973 :7973 43
228
+ 7974 :7974 43
229
+ 7989 run 43
230
+ 7990 run 43
231
+ 7973 grep 44
232
+ 7989 grep 44
233
+ 7975 :7975 45
234
+ 7991 run 45
235
+ 7970 :7970 51
236
+ 7986 run 51
237
+ 7981 catalina.sh 52
238
+ 7974 sed 64
239
+ 7990 sed 64
240
+ 3455 postgres 66
241
+ 7971 :7971 66
242
+ 7987 run 66
243
+ 7966 :7966 96
244
+ 7966 setuidgid 98
245
+ 3064 redis-server 110
246
+ 7970 stat 150
247
+ 7986 stat 150
248
+ 7969 perl 179
249
+ 7985 perl 179
250
+ 7982 run 341
251
+ 7966 catalina.sh 373
252
+ 7980 postgres 432
253
+ 7972 ps 1971
254
+ 7988 ps 1983
255
+ 9832 sshd 37511
256
+ 7979 find 51040
257
+
258
+ Once you've found a process ID of interest, you can use "-c" and "-p PID" to
259
+ show syscall names. This also switches to "perf stat" mode for in-kernel
260
+ counts, and lower overhead:
261
+
262
+ # ./syscount -cp 7979
263
+ Tracing PID 7979... Ctrl-C to end.
264
+ ^CSYSCALL COUNT
265
+ brk 10
266
+ newfstat 2171
267
+ open 2171
268
+ newfstatat 2175
269
+ openat 2175
270
+ close 4346
271
+ fchdir 4346
272
+ getdents 4351
273
+ write 25482
274
+
275
+ So the most frequent syscall by PID 7979 was write().
276
+
277
+
278
+ Use -h to print the USAGE message:
279
+
280
+ # ./syscount -h
281
+ USAGE: syscount [-chv] [-t top] {-p PID|-d seconds|command}
282
+ syscount # count by process name
283
+ -c # show counts by syscall name
284
+ -h # this usage message
285
+ -v # verbose: shows PID
286
+ -p PID # trace this PID only
287
+ -d seconds # duration of trace
288
+ -t num # show top number only
289
+ command # run and trace this command
290
+ eg,
291
+ syscount # syscalls by process name
292
+ syscount -c # syscalls by syscall name
293
+ syscount -d 5 # trace for 5 seconds
294
+ syscount -cp 923 # syscall names for PID 923
295
+ syscount -c ls # syscall names for "ls"
296
+
297
+ See the man page and example file for more info.
@@ -0,0 +1,93 @@
1
+ Demonstrations of tcpretrans, the Linux ftrace version.
2
+
3
+
4
+ Tracing TCP retransmits on a busy server:
5
+
6
+ # ./tcpretrans
7
+ TIME PID LADDR:LPORT -- RADDR:RPORT STATE
8
+ 05:16:44 3375 10.150.18.225:53874 R> 10.105.152.3:6001 ESTABLISHED
9
+ 05:16:44 3375 10.150.18.225:53874 R> 10.105.152.3:6001 ESTABLISHED
10
+ 05:16:54 4028 10.150.18.225:6002 R> 10.150.30.249:1710 ESTABLISHED
11
+ 05:16:54 4028 10.150.18.225:6002 R> 10.150.30.249:1710 ESTABLISHED
12
+ 05:16:54 4028 10.150.18.225:6002 R> 10.150.30.249:1710 ESTABLISHED
13
+ 05:16:54 4028 10.150.18.225:6002 R> 10.150.30.249:1710 ESTABLISHED
14
+ 05:16:54 4028 10.150.18.225:6002 R> 10.150.30.249:1710 ESTABLISHED
15
+ 05:16:54 4028 10.150.18.225:6002 R> 10.150.30.249:1710 ESTABLISHED
16
+ 05:16:54 4028 10.150.18.225:6002 R> 10.150.30.249:1710 ESTABLISHED
17
+ 05:16:55 0 10.150.18.225:47115 R> 10.71.171.158:6001 ESTABLISHED
18
+ 05:16:58 0 10.150.18.225:44388 R> 10.103.130.120:6001 ESTABLISHED
19
+ 05:16:58 0 10.150.18.225:44388 R> 10.103.130.120:6001 ESTABLISHED
20
+ 05:16:58 0 10.150.18.225:44388 R> 10.103.130.120:6001 ESTABLISHED
21
+ 05:16:59 0 10.150.18.225:56086 R> 10.150.32.107:6001 ESTABLISHED
22
+ 05:16:59 0 10.150.18.225:56086 R> 10.150.32.107:6001 ESTABLISHED
23
+ ^C
24
+ Ending tracing...
25
+
26
+ This shows TCP retransmits by dynamically tracing the kernel function that does
27
+ the retransmit. This is a low overhead approach.
28
+
29
+ The PID may or may not make sense: it's showing the PID that was on-CPU,
30
+ however, retransmits are often timer-based, where it's the kernel that is
31
+ on-CPU.
32
+
33
+ The STATE column shows the TCP state for the socket performing the retransmit.
34
+ The "--" column is the packet type. "R>" for retransmit.
35
+
36
+
37
+ Kernel stack traces can be included with -s, which may show the type of
38
+ retransmit:
39
+
40
+ # ./tcpretrans -s
41
+ TIME PID LADDR:LPORT -- RADDR:RPORT STATE
42
+ 06:21:10 19516 10.144.107.151:22 R> 10.13.106.251:32167 ESTABLISHED
43
+ => tcp_fastretrans_alert
44
+ => tcp_ack
45
+ => tcp_rcv_established
46
+ => tcp_v4_do_rcv
47
+ => tcp_v4_rcv
48
+ => ip_local_deliver_finish
49
+ => ip_local_deliver
50
+ => ip_rcv_finish
51
+ => ip_rcv
52
+ => __netif_receive_skb
53
+ => netif_receive_skb
54
+ => handle_incoming_queue
55
+ => xennet_poll
56
+ => net_rx_action
57
+ => __do_softirq
58
+ => call_softirq
59
+ => do_softirq
60
+ => irq_exit
61
+ => xen_evtchn_do_upcall
62
+ => xen_do_hypervisor_callback
63
+
64
+ This looks like a fast retransmit (inclusion of tcp_fastretrans_alert(), and
65
+ being based on receiving an ACK, rather than a timer).
66
+
67
+
68
+ The -l option will include TCP tail loss probe events (TLP; see
69
+ http://lwn.net/Articles/542642/). Eg:
70
+
71
+ # ./tcpretrans -l
72
+ TIME PID LADDR:LPORT -- RADDR:RPORT STATE
73
+ 21:56:06 0 10.100.155.200:22 R> 10.10.237.72:18554 LAST_ACK
74
+ 21:56:08 0 10.100.155.200:22 R> 10.10.237.72:18554 LAST_ACK
75
+ 21:56:10 16452 10.100.155.200:22 R> 10.10.237.72:18554 LAST_ACK
76
+ 21:56:10 0 10.100.155.200:22 L> 10.10.237.72:46408 LAST_ACK
77
+ 21:56:10 0 10.100.155.200:22 R> 10.10.237.72:46408 LAST_ACK
78
+ 21:56:12 0 10.100.155.200:22 R> 10.10.237.72:46408 LAST_ACK
79
+ 21:56:13 0 10.100.155.200:22 R> 10.10.237.72:46408 LAST_ACK
80
+ ^C
81
+ Ending tracing...
82
+
83
+ Look for "L>" in the type column ("--") for TLP events.
84
+
85
+
86
+ Use -h to print the USAGE message:
87
+
88
+ # ./tcpretrans -h
89
+ USAGE: tcpretrans [-hs]
90
+ -h # help message
91
+ -s # print stack traces
92
+ eg,
93
+ tcpretrans # trace TCP retransmits
@@ -0,0 +1,210 @@
1
+ Demonstrations of tpoint, the Linux ftrace version.
2
+
3
+
4
+ Let's trace block:block_rq_issue, to see block device (disk) I/O requests:
5
+
6
+ # ./tpoint block:block_rq_issue
7
+ Tracing block:block_rq_issue. Ctrl-C to end.
8
+ supervise-1692 [001] d... 7269912.982162: block_rq_issue: 202,1 W 0 () 17039656 + 8 [supervise]
9
+ supervise-1696 [000] d... 7269912.982243: block_rq_issue: 202,1 W 0 () 12862264 + 8 [supervise]
10
+ cksum-12994 [000] d... 7269913.317924: block_rq_issue: 202,1 R 0 () 9357056 + 72 [cksum]
11
+ cksum-12994 [000] d... 7269913.319013: block_rq_issue: 202,1 R 0 () 2977536 + 144 [cksum]
12
+ cksum-12994 [000] d... 7269913.320217: block_rq_issue: 202,1 R 0 () 2986240 + 216 [cksum]
13
+ cksum-12994 [000] d... 7269913.321677: block_rq_issue: 202,1 R 0 () 620344 + 56 [cksum]
14
+ cksum-12994 [001] d... 7269913.329309: block_rq_issue: 202,1 R 0 () 9107912 + 88 [cksum]
15
+ cksum-12994 [001] d... 7269913.340133: block_rq_issue: 202,1 R 0 () 3147008 + 248 [cksum]
16
+ cksum-12994 [001] d... 7269913.354551: block_rq_issue: 202,1 R 0 () 11583488 + 256 [cksum]
17
+ cksum-12994 [001] d... 7269913.379904: block_rq_issue: 202,1 R 0 () 11583744 + 256 [cksum]
18
+ [...]
19
+ ^C
20
+ Ending tracing...
21
+
22
+ Great, that was easy!
23
+
24
+ perf_events can do this as well, and is better in many ways, including a more
25
+ efficient buffering strategy, and multi-user access. It's not that easy to do
26
+ this one-liner in perf_events, however. An equivalent for recent kernels is:
27
+
28
+ perf record --no-buffer -e block:block_rq_issue -a -o - | PAGER=cat stdbuf -oL perf script -i -
29
+
30
+ Older kernels, use -D instead of --no-buffer. Even better is to set the buffer
31
+ page size to a sufficient grouping (using -m), to minimize overheads, at the
32
+ expense of liveliness of updates. Note that stack traces (-g) don't work on
33
+ my systems with this perf one-liner, however, they do work with tpoint -s.
34
+
35
+
36
+ Column headings can be printed using -H:
37
+
38
+ # ./tpoint -H block:block_rq_issue
39
+ Tracing block:block_rq_issue. Ctrl-C to end.
40
+ # tracer: nop
41
+ #
42
+ # entries-in-buffer/entries-written: 0/0 #P:2
43
+ #
44
+ # _-----=> irqs-off
45
+ # / _----=> need-resched
46
+ # | / _---=> hardirq/softirq
47
+ # || / _--=> preempt-depth
48
+ # ||| / delay
49
+ # TASK-PID CPU# |||| TIMESTAMP FUNCTION
50
+ # | | | |||| | |
51
+ supervise-1697 [000] d... 7270545.340856: block_rq_issue: 202,1 W 0 () 12862464 + 8 [supervise]
52
+ supervise-1697 [000] d... 7270545.341256: block_rq_issue: 202,1 W 0 () 12862472 + 8 [supervise]
53
+ supervise-1690 [000] d... 7270545.342363: block_rq_issue: 202,1 W 0 () 17040368 + 8 [supervise]
54
+ [...]
55
+
56
+ They are also documented in the Linux kernel source under:
57
+ Documentation/trace/ftrace.txt.
58
+
59
+
60
+ How about stacks traces for those block_rq_issue events? Adding -s:
61
+
62
+ # ./tpoint -s block:block_rq_issue
63
+ Tracing block:block_rq_issue. Ctrl-C to end.
64
+ supervise-1691 [000] d... 7269511.079179: block_rq_issue: 202,1 W 0 () 17040232 + 8 [supervise]
65
+ supervise-1691 [000] d... 7269511.079188: <stack trace>
66
+ => blk_peek_request
67
+ => do_blkif_request
68
+ => __blk_run_queue
69
+ => queue_unplugged
70
+ => blk_flush_plug_list
71
+ => blk_finish_plug
72
+ => ext4_writepages
73
+ => do_writepages
74
+ => __filemap_fdatawrite_range
75
+ => filemap_flush
76
+ => ext4_alloc_da_blocks
77
+ => ext4_rename
78
+ => vfs_rename
79
+ => SYSC_renameat2
80
+ => SyS_renameat2
81
+ => SyS_rename
82
+ => system_call_fastpath
83
+ cksum-7428 [000] d... 7269511.331778: block_rq_issue: 202,1 R 0 () 9006848 + 208 [cksum]
84
+ cksum-7428 [000] d... 7269511.331784: <stack trace>
85
+ => blk_peek_request
86
+ => do_blkif_request
87
+ => __blk_run_queue
88
+ => queue_unplugged
89
+ => blk_flush_plug_list
90
+ => blk_finish_plug
91
+ => __do_page_cache_readahead
92
+ => ondemand_readahead
93
+ => page_cache_async_readahead
94
+ => generic_file_read_iter
95
+ => new_sync_read
96
+ => vfs_read
97
+ => SyS_read
98
+ => system_call_fastpath
99
+ cksum-7428 [000] d... 7269511.332631: block_rq_issue: 202,1 R 0 () 620992 + 200 [cksum]
100
+ cksum-7428 [000] d... 7269511.332639: <stack trace>
101
+ => blk_peek_request
102
+ => do_blkif_request
103
+ => __blk_run_queue
104
+ => queue_unplugged
105
+ => blk_flush_plug_list
106
+ => blk_finish_plug
107
+ => __do_page_cache_readahead
108
+ => ondemand_readahead
109
+ => page_cache_sync_readahead
110
+ => generic_file_read_iter
111
+ => new_sync_read
112
+ => vfs_read
113
+ => SyS_read
114
+ => system_call_fastpath
115
+ ^C
116
+ Ending tracing...
117
+
118
+ Easy. Now I can read the ancestry to understand what actually led to issuing
119
+ a block device (disk) I/O.
120
+
121
+
122
+ Here's insertion onto the block I/O queue (better matches processes):
123
+
124
+ # ./tpoint -s block:block_rq_insert
125
+ Tracing block:block_rq_insert. Ctrl-C to end.
126
+ cksum-11908 [000] d... 7269834.882517: block_rq_insert: 202,1 R 0 () 736304 + 256 [cksum]
127
+ cksum-11908 [000] d... 7269834.882528: <stack trace>
128
+ => __elv_add_request
129
+ => blk_flush_plug_list
130
+ => blk_finish_plug
131
+ => __do_page_cache_readahead
132
+ => ondemand_readahead
133
+ => page_cache_sync_readahead
134
+ => generic_file_read_iter
135
+ => new_sync_read
136
+ => vfs_read
137
+ => SyS_read
138
+ => system_call_fastpath
139
+ [...]
140
+
141
+
142
+ You can also add tracepoint filters. To see what variables you can use, use -v:
143
+
144
+ # ./tpoint -v block:block_rq_issue
145
+ name: block_rq_issue
146
+ ID: 942
147
+ format:
148
+ field:unsigned short common_type; offset:0; size:2; signed:0;
149
+ field:unsigned char common_flags; offset:2; size:1; signed:0;
150
+ field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
151
+ field:int common_pid; offset:4; size:4; signed:1;
152
+
153
+ field:dev_t dev; offset:8; size:4; signed:0;
154
+ field:sector_t sector; offset:16; size:8; signed:0;
155
+ field:unsigned int nr_sector; offset:24; size:4; signed:0;
156
+ field:unsigned int bytes; offset:28; size:4; signed:0;
157
+ field:char rwbs[8]; offset:32; size:8; signed:1;
158
+ field:char comm[16]; offset:40; size:16; signed:1;
159
+ field:__data_loc char[] cmd; offset:56; size:4; signed:1;
160
+
161
+ print fmt: "%d,%d %s %u (%s) %llu + %u [%s]", ((unsigned int) ((REC->dev) >> 20)), ((unsigned int) ((REC->dev) & ((1U << 20) - 1))), REC->rwbs, REC->bytes, __get_str(cmd), (unsigned long long)REC->sector, REC->nr_sector, REC->comm
162
+
163
+
164
+ Now I'll add a filter to check that the rwbs field (I/O type) includes an "R",
165
+ making it a read:
166
+
167
+ # ./tpoint -s block:block_rq_insert 'rwbs ~ "*R*"'
168
+ cksum-11908 [000] d... 7269839.919098: block_rq_insert: 202,1 R 0 () 736560 + 136 [cksum]
169
+ cksum-11908 [000] d... 7269839.919107: <stack trace>
170
+ => __elv_add_request
171
+ => blk_flush_plug_list
172
+ => blk_finish_plug
173
+ => __do_page_cache_readahead
174
+ => ondemand_readahead
175
+ => page_cache_async_readahead
176
+ => generic_file_read_iter
177
+ => new_sync_read
178
+ => vfs_read
179
+ => SyS_read
180
+ => system_call_fastpath
181
+ [...]
182
+
183
+
184
+ Use -h to print the USAGE message:
185
+
186
+ # ./tpoint -h
187
+ USAGE: tpoint [-hHsv] [-d secs] [-p PID] [-L TID] tracepoint [filter]
188
+ tpoint -l
189
+ -d seconds # trace duration, and use buffers
190
+ -p PID # PID to match on events
191
+ -L TID # thread id to match on events
192
+ -v # view format file (don't trace)
193
+ -H # include column headers
194
+ -l # list all tracepoints
195
+ -s # show kernel stack traces
196
+ -h # this usage message
197
+
198
+ Note that these examples may need modification to match your kernel
199
+ version's function names and platform's register usage.
200
+ eg,
201
+ tpoint -l | grep open
202
+ # find tracepoints containing "open"
203
+ tpoint syscalls:sys_enter_open
204
+ # trace open() syscall entry
205
+ tpoint block:block_rq_issue
206
+ # trace block I/O issue
207
+ tpoint -s block:block_rq_issue
208
+ # show kernel stacks
209
+
210
+ See the man page and example file for more info.