pcp-mcp 1.3.2__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pcp_mcp/errors.py CHANGED
@@ -21,6 +21,10 @@ class PCPMetricNotFoundError(PCPError):
 def handle_pcp_error(e: Exception, operation: str) -> ToolError:
     """Convert PCP/httpx exceptions to MCP ToolErrors.
 
+    Uses isinstance() checks instead of match/case class patterns for resilience
+    against module reloading (e.g., FastMCP's FileSystemProvider), which can
+    create different class identities that break structural pattern matching.
+
     Args:
         e: The exception to convert.
         operation: Description of the operation that failed.
@@ -28,20 +32,23 @@ def handle_pcp_error(e: Exception, operation: str) -> ToolError:
     Returns:
         A ToolError with an appropriate message.
     """
-    match e:
-        case httpx.ConnectError():
-            return ToolError("Cannot connect to pmproxy. Is it running? (systemctl start pmproxy)")
-        case httpx.HTTPStatusError() as he if he.response.status_code == 400:
-            return ToolError(f"Bad request during {operation}: {he.response.text}")
-        case httpx.HTTPStatusError() as he if he.response.status_code == 404:
+    if isinstance(e, httpx.ConnectError):
+        return ToolError("Cannot connect to pmproxy. Is it running? (systemctl start pmproxy)")
+
+    if isinstance(e, httpx.HTTPStatusError):
+        if e.response.status_code == 400:
+            return ToolError(f"Bad request during {operation}: {e.response.text}")
+        if e.response.status_code == 404:
             return ToolError(f"Metric not found during {operation}")
-        case httpx.HTTPStatusError() as he:
-            return ToolError(f"pmproxy error ({he.response.status_code}): {he.response.text}")
-        case httpx.TimeoutException():
-            return ToolError(f"Request timed out during {operation}")
-        case PCPConnectionError():
-            return ToolError(str(e))
-        case PCPMetricNotFoundError():
-            return ToolError(f"Metric not found: {e}")
-        case _:
-            return ToolError(f"Error during {operation}: {e}")
+        return ToolError(f"pmproxy error ({e.response.status_code}): {e.response.text}")
+
+    if isinstance(e, httpx.TimeoutException):
+        return ToolError(f"Request timed out during {operation}")
+
+    if isinstance(e, PCPMetricNotFoundError):
+        return ToolError(f"Metric not found: {e}")
+
+    if isinstance(e, PCPConnectionError):
+        return ToolError(str(e))
+
+    return ToolError(f"Error during {operation}: {e}")
pcp_mcp/prompts/__init__.py CHANGED
@@ -4,305 +4,33 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
-from pcp_mcp.icons import (
-    ICON_CPU,
-    ICON_DIAGNOSE,
-    ICON_DISK,
-    ICON_MEMORY,
-    ICON_NETWORK,
-    TAGS_CPU,
-    TAGS_DIAGNOSE,
-    TAGS_DISK,
-    TAGS_MEMORY,
-    TAGS_NETWORK,
-)
+from pcp_mcp.prompts.cpu import analyze_cpu_usage
+from pcp_mcp.prompts.diagnose import diagnose_slow_system
+from pcp_mcp.prompts.disk import find_io_bottleneck
+from pcp_mcp.prompts.memory import investigate_memory_usage
+from pcp_mcp.prompts.network import check_network_performance
 
 if TYPE_CHECKING:
     from fastmcp import FastMCP
 
+__all__ = [
+    "diagnose_slow_system",
+    "investigate_memory_usage",
+    "find_io_bottleneck",
+    "analyze_cpu_usage",
+    "check_network_performance",
+    "register_prompts",
+]
+
 
 def register_prompts(mcp: FastMCP) -> None:
-    """Register diagnostic prompts with the MCP server.
+    """Register all prompts with the MCP server.
 
     Args:
         mcp: The FastMCP server instance.
     """
-
-    @mcp.prompt(icons=[ICON_DIAGNOSE], tags=TAGS_DIAGNOSE)
-    def diagnose_slow_system() -> str:
-        """Diagnose why a system is running slowly.
-
-        Returns a structured investigation workflow to identify performance
-        bottlenecks by examining CPU, memory, disk, and network metrics.
-        """
-        return """Investigate system slowness:
-
-1. Get baseline: get_system_snapshot(categories=["cpu", "memory", "load", "disk", "network"])
-
-2. Interpret the assessment fields:
-   - If cpu.assessment mentions "I/O wait": Disk bottleneck (skip to step 4)
-   - If cpu.assessment mentions "user/system": CPU bottleneck (go to step 3)
-   - If memory.assessment mentions "swap": Memory pressure (go to step 5)
-   - If load.assessment shows high load: Check load vs ncpu ratio
-
-3. Find CPU hogs:
-   - Run: get_process_top(sort_by="cpu", limit=10)
-   - Identify processes with high cpu_percent
-   - Note: cpu_percent > 100% means multi-threaded (e.g., 200% = 2 cores)
-
-4. Check disk I/O bottleneck:
-   - If disk.assessment shows high read/write rates
-   - Run: search_metrics("disk.dev") to see per-device metrics
-   - Run: get_process_top(sort_by="io", limit=10) to find I/O-heavy processes
-   - Cross-check: Does kernel.all.cpu.wait.total correlate with disk activity?
-
-5. Check memory pressure:
-   - If memory.assessment indicates swapping
-   - Run: get_process_top(sort_by="memory", limit=20)
-   - Look for large rss_bytes processes
-   - Check if swap usage is growing
-
-6. Check network saturation:
-   - If network.assessment shows high throughput
-   - Run: search_metrics("network.interface") for per-interface breakdown
-   - Look for interface errors or packet drops
-
-7. Report findings:
-   - Primary bottleneck (CPU/disk/memory/network)
-   - Specific culprits (process names, PIDs)
-   - Quantified impact (e.g., "process X using 45% CPU on 8-core system")
-   - Recommendations (kill process, add RAM, optimize queries, etc.)
-"""
-
-    @mcp.prompt(icons=[ICON_MEMORY], tags=TAGS_MEMORY)
-    def investigate_memory_usage() -> str:
-        """Investigate memory consumption and identify memory pressure.
-
-        Returns a workflow to analyze memory utilization, identify memory
-        hogs, and distinguish between normal cache usage and actual pressure.
-        """
-        return """Memory investigation workflow:
-
-1. Get memory overview:
-   - Run: get_system_snapshot(categories=["memory"])
-   - Read memory.assessment field for quick diagnosis
-
-2. Interpret memory metrics:
-   - mem.util.available is KEY metric (not "free"!)
-   - Large cache is NORMAL (Linux uses free RAM for cache)
-   - Swapping = BAD (indicates memory pressure)
-   - Check used_percent vs swap usage
-
-3. Assessment-based actions:
-   - "Memory pressure" → Go to step 4
-   - "Cache is large" → Normal, but check top consumers anyway (step 4)
-   - "Swapping actively" → CRITICAL, go to step 4 immediately
-
-4. Find memory consumers:
-   - Run: get_process_top(sort_by="memory", limit=20)
-   - Note processes with high rss_bytes
-   - Calculate: rss_percent shows memory impact
-   - Look for unexpected memory hogs (leaked memory, runaway processes)
-
-5. Detailed memory breakdown:
-   - Run: search_metrics("mem.util") for full breakdown
-   - Check: mem.util.slab (kernel memory)
-   - Check: mem.util.anonpages (process private memory)
-   - Check: mem.util.swapCached (pages swapped but still in RAM)
-
-6. NUMA systems (if applicable):
-   - Run: search_metrics("mem.numa") to check per-node allocation
-   - Look for imbalanced NUMA usage
-
-7. Report:
-   - Total memory: X GB
-   - Used: Y% (Z GB used, W GB available)
-   - Top 5 memory consumers with RSS sizes
-   - Swap status: active/inactive, growth rate if swapping
-   - Recommendation:
-     * No pressure + large cache = Normal
-     * High usage + no swap = Monitor but OK
-     * Active swapping = Add RAM or reduce load
-     * Single process consuming >50% = Investigate for memory leak
-"""
-
-    @mcp.prompt(icons=[ICON_DISK], tags=TAGS_DISK)
-    def find_io_bottleneck() -> str:
-        """Find disk I/O bottlenecks and identify processes causing high I/O.
-
-        Returns a workflow to diagnose disk performance issues, identify
-        hot devices, and find I/O-intensive processes.
-        """
-        return """Disk I/O investigation:
-
-1. Get system-wide I/O snapshot:
-   - Run: get_system_snapshot(categories=["disk", "cpu"])
-   - Check disk.assessment for read/write rates
-   - Check cpu.assessment for iowait_percent
-
-2. Interpret I/O metrics:
-   - High iowait_percent (>20%) = CPU waiting for disk
-   - Read vs write imbalance may indicate backup, logging, or database queries
-   - Sustained high I/O (>100 MB/s on HDD, >500 MB/s on SSD) = saturated
-
-3. Identify hot disks:
-   - Run: search_metrics("disk.dev")
-   - Run: query_metrics(["disk.dev.read_bytes", "disk.dev.write_bytes"])
-   - Note: These are COUNTERS, use get_system_snapshot for rates
-   - Look for specific devices with disproportionate activity
-
-4. Find I/O-heavy processes:
-   - Run: get_process_top(sort_by="io", limit=10, sample_interval=2.0)
-   - Note: Longer sample_interval (2-5s) gives more accurate I/O rates
-   - Identify processes with high io_read_bytes_sec or io_write_bytes_sec
-
-5. Correlate with CPU iowait:
-   - If cpu.iowait_percent is high AND disk I/O is high:
-     → Confirmed disk bottleneck
-   - If disk I/O is high BUT iowait is low:
-     → Fast storage keeping up (SSD/NVMe)
-   - If iowait is high BUT disk I/O is low:
-     → May be network storage (NFS) or storage controller issue
-
-6. Check for I/O patterns:
-   - Bursty I/O: Scheduled jobs, backups, log rotation
-   - Sustained I/O: Database, file server, streaming
-   - Random I/O: Database seeks (slow on HDD, fast on SSD)
-   - Sequential I/O: Backups, large file copies
-
-7. Advanced: Check per-partition I/O (if needed):
-   - Run: search_metrics("disk.partitions")
-   - Useful for systems with multiple partitions on same disk
-
-8. Report:
-   - Busiest disks by name (e.g., sda, nvme0n1)
-   - Read vs write breakdown (e.g., "80% reads, 20% writes")
-   - Top 3-5 processes causing I/O with rates
-   - I/O pattern: bursty vs sustained, random vs sequential
-   - Bottleneck severity: iowait % and queue depth
-   - Recommendations:
-     * High random I/O on HDD → Migrate to SSD
-     * Single process saturating disk → Optimize queries/access patterns
-     * Multiple processes fighting for I/O → I/O scheduler tuning or workload separation
-     * Backup/batch jobs during business hours → Reschedule
-"""
-
-    @mcp.prompt(icons=[ICON_CPU], tags=TAGS_CPU)
-    def analyze_cpu_usage() -> str:
-        """Analyze CPU utilization patterns and identify CPU-bound processes.
-
-        Returns a workflow to diagnose high CPU usage, distinguish between
-        user-space and kernel CPU time, and identify optimization opportunities.
-        """
-        return """CPU usage analysis workflow:
-
-1. Get CPU baseline:
-   - Run: get_system_snapshot(categories=["cpu", "load"])
-   - Read cpu.assessment for quick diagnosis
-   - Note: ncpu value (number of CPUs/cores)
-
-2. Interpret CPU metrics:
-   - user_percent: Application code execution
-   - system_percent: Kernel/syscall overhead
-   - idle_percent: Unused CPU capacity
-   - iowait_percent: CPU waiting for I/O (NOT CPU-bound if high)
-   - Load average: Runnable + waiting processes (compare to ncpu)
-
-3. CPU pattern classification:
-   - High user + low system = CPU-intensive application (normal)
-   - High system + low user = Kernel overhead (syscalls, context switches)
-   - High iowait = NOT a CPU problem, it's disk/storage (see find_io_bottleneck)
-   - Load > ncpu = More demand than capacity (may include I/O wait)
-
-4. Find CPU hogs:
-   - Run: get_process_top(sort_by="cpu", limit=15)
-   - Note: cpu_percent > 100% means multi-core usage (e.g., 400% = 4 cores)
-   - Identify unexpected high CPU consumers
-
-5. Per-CPU breakdown (if needed):
-   - Run: search_metrics("kernel.percpu.cpu")
-   - Useful for: Thread affinity issues, interrupt handling imbalance
-   - Look for: One CPU at 100% while others idle (poor parallelization)
-
-6. Check for CPU saturation indicators:
-   - Run: query_metrics(["kernel.all.runnable", "kernel.all.pswitch"])
-   - High runnable count: More threads than cores (contention)
-   - High pswitch (context switches): Thread thrashing
-
-7. Distinguish workload types:
-   - Compute-bound: High user%, low syscalls (scientific, encoding, crypto)
-   - I/O-bound: High iowait%, moderate user% (databases, file processing)
-   - System-bound: High system%, moderate user% (network servers, many syscalls)
-
-8. Report:
-   - CPU utilization breakdown: X% user, Y% system, Z% iowait, W% idle
-   - Load average: 1/5/15 min values vs ncpu (e.g., "load 8.5 on 8-core = 106%")
-   - Top 5 CPU consumers with cpu_percent and command names
-   - CPU pattern: compute-bound / I/O-bound / system-bound
-   - Saturation indicators: runnable queue, context switches
-   - Recommendations:
-     * Low idle + high load → Add CPU capacity or optimize hot processes
-     * High iowait → Disk bottleneck, not CPU (see I/O investigation)
-     * High system% → Profile syscalls, reduce I/O frequency, optimize locking
-     * Single-threaded bottleneck → Parallelize if possible
-     * Many small processes → Reduce process spawning overhead
-"""
-
-    @mcp.prompt(icons=[ICON_NETWORK], tags=TAGS_NETWORK)
-    def check_network_performance() -> str:
-        """Check network performance and identify bandwidth/error issues.
-
-        Returns a workflow to analyze network throughput, identify saturated
-        interfaces, and detect packet loss or errors.
-        """
-        return """Network performance investigation:
-
-1. Get network overview:
-   - Run: get_system_snapshot(categories=["network"])
-   - Read network.assessment for quick diagnosis
-   - Note: Rates are per-second (bytes/sec, packets/sec)
-
-2. Interpret network metrics:
-   - in_bytes_sec / out_bytes_sec: Throughput (compare to link speed)
-   - in_packets_sec / out_packets_sec: Packet rate
-   - Assessment field indicates saturation or errors
-
-3. Per-interface breakdown:
-   - Run: search_metrics("network.interface")
-   - Run: query_metrics(["network.interface.in.bytes", "network.interface.out.bytes"])
-   - Note: These are COUNTERS, use get_system_snapshot for rates
-   - Identify busy interfaces vs idle interfaces (e.g., eth0 busy, lo idle)
-
-4. Check for errors and drops:
-   - Run: query_metrics(["network.interface.in.errors", "network.interface.out.errors"])
-   - Run: query_metrics(["network.interface.in.drops", "network.interface.out.drops"])
-   - Non-zero errors = Hardware, driver, or cable issues
-   - Non-zero drops = Buffer overflow (traffic exceeds processing capacity)
-
-5. Calculate interface saturation:
-   - Compare throughput to link speed (e.g., 950 Mbps on 1 Gbps link = 95%)
-   - Sustained >80% = Approaching saturation
-   - Bursts >95% = Temporarily saturated
-
-6. Find network-heavy processes (indirect):
-   - PCP proc.* namespace doesn't have per-process network metrics
-   - Use system tools: netstat, ss, iftop (outside PCP)
-   - Or correlate: High network I/O often correlates with high CPU/disk I/O
-
-7. Check protocol-level stats (if needed):
-   - Run: search_metrics("network.tcp")
-   - Run: search_metrics("network.udp")
-   - Look for: Retransmissions, failed connections, buffer overflows
-
-8. Report:
-   - Per-interface throughput (e.g., "eth0: 850 Mbps in, 120 Mbps out")
-   - Link utilization % (if link speed known)
-   - Errors/drops: Count and affected interfaces
-   - Traffic pattern: Symmetric (similar in/out) vs asymmetric (download/upload heavy)
-   - Packet rate: Normal vs abnormal (tiny packets = inefficient, possible attack)
-   - Recommendations:
-     * High utilization + no errors → Upgrade link or load balance
-     * Errors/drops present → Check cables, NIC drivers, switch ports
-     * Asymmetric traffic → Normal for client (download heavy) or server (upload heavy)
-     * High packet rate + low byte rate → Small packets (check for SYN flood, fragmentation)
-"""
+    mcp.add_prompt(diagnose_slow_system)
+    mcp.add_prompt(investigate_memory_usage)
+    mcp.add_prompt(find_io_bottleneck)
+    mcp.add_prompt(analyze_cpu_usage)
+    mcp.add_prompt(check_network_performance)
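
With prompts now defined at module level, registration reduces to add_prompt calls. A minimal wiring sketch, assuming only the names visible in this diff (the "pcp" server label is a made-up example):

from fastmcp import FastMCP

from pcp_mcp.prompts import register_prompts

mcp = FastMCP("pcp")   # hypothetical server name
register_prompts(mcp)  # attaches all five diagnostic prompts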
pcp_mcp/prompts/cpu.py ADDED
@@ -0,0 +1,69 @@
+"""Analyze CPU usage prompt."""
+
+from __future__ import annotations
+
+from fastmcp.prompts import prompt
+
+from pcp_mcp.icons import ICON_CPU, TAGS_CPU
+
+
+@prompt(icons=[ICON_CPU], tags=TAGS_CPU)
+def analyze_cpu_usage() -> str:
+    """Analyze CPU utilization patterns and identify CPU-bound processes.
+
+    Returns a workflow to diagnose high CPU usage, distinguish between
+    user-space and kernel CPU time, and identify optimization opportunities.
+    """
+    return """CPU usage analysis workflow:
+
+1. Get CPU baseline:
+   - Run: get_system_snapshot(categories=["cpu", "load"])
+   - Read cpu.assessment for quick diagnosis
+   - Note: ncpu value (number of CPUs/cores)
+
+2. Interpret CPU metrics:
+   - user_percent: Application code execution
+   - system_percent: Kernel/syscall overhead
+   - idle_percent: Unused CPU capacity
+   - iowait_percent: CPU waiting for I/O (NOT CPU-bound if high)
+   - Load average: Runnable + waiting processes (compare to ncpu)
+
+3. CPU pattern classification:
+   - High user + low system = CPU-intensive application (normal)
+   - High system + low user = Kernel overhead (syscalls, context switches)
+   - High iowait = NOT a CPU problem, it's disk/storage (see find_io_bottleneck)
+   - Load > ncpu = More demand than capacity (may include I/O wait)
+
+4. Find CPU hogs:
+   - Run: get_process_top(sort_by="cpu", limit=15)
+   - Note: cpu_percent > 100% means multi-core usage (e.g., 400% = 4 cores)
+   - Identify unexpected high CPU consumers
+
+5. Per-CPU breakdown (if needed):
+   - Run: search_metrics("kernel.percpu.cpu")
+   - Useful for: Thread affinity issues, interrupt handling imbalance
+   - Look for: One CPU at 100% while others idle (poor parallelization)
+
+6. Check for CPU saturation indicators:
+   - Run: query_metrics(["kernel.all.runnable", "kernel.all.pswitch"])
+   - High runnable count: More threads than cores (contention)
+   - High pswitch (context switches): Thread thrashing
+
+7. Distinguish workload types:
+   - Compute-bound: High user%, low syscalls (scientific, encoding, crypto)
+   - I/O-bound: High iowait%, moderate user% (databases, file processing)
+   - System-bound: High system%, moderate user% (network servers, many syscalls)
+
+8. Report:
+   - CPU utilization breakdown: X% user, Y% system, Z% iowait, W% idle
+   - Load average: 1/5/15 min values vs ncpu (e.g., "load 8.5 on 8-core = 106%")
+   - Top 5 CPU consumers with cpu_percent and command names
+   - CPU pattern: compute-bound / I/O-bound / system-bound
+   - Saturation indicators: runnable queue, context switches
+   - Recommendations:
+     * Low idle + high load → Add CPU capacity or optimize hot processes
+     * High iowait → Disk bottleneck, not CPU (see I/O investigation)
+     * High system% → Profile syscalls, reduce I/O frequency, optimize locking
+     * Single-threaded bottleneck → Parallelize if possible
+     * Many small processes → Reduce process spawning overhead
+"""
pcp_mcp/prompts/diagnose.py ADDED
@@ -0,0 +1,54 @@
+"""Diagnose slow system prompt."""
+
+from __future__ import annotations
+
+from fastmcp.prompts import prompt
+
+from pcp_mcp.icons import ICON_DIAGNOSE, TAGS_DIAGNOSE
+
+
+@prompt(icons=[ICON_DIAGNOSE], tags=TAGS_DIAGNOSE)
+def diagnose_slow_system() -> str:
+    """Diagnose why a system is running slowly.
+
+    Returns a structured investigation workflow to identify performance
+    bottlenecks by examining CPU, memory, disk, and network metrics.
+    """
+    return """Investigate system slowness:
+
+1. Get baseline: get_system_snapshot(categories=["cpu", "memory", "load", "disk", "network"])
+
+2. Interpret the assessment fields:
+   - If cpu.assessment mentions "I/O wait": Disk bottleneck (skip to step 4)
+   - If cpu.assessment mentions "user/system": CPU bottleneck (go to step 3)
+   - If memory.assessment mentions "swap": Memory pressure (go to step 5)
+   - If load.assessment shows high load: Check load vs ncpu ratio
+
+3. Find CPU hogs:
+   - Run: get_process_top(sort_by="cpu", limit=10)
+   - Identify processes with high cpu_percent
+   - Note: cpu_percent > 100% means multi-threaded (e.g., 200% = 2 cores)
+
+4. Check disk I/O bottleneck:
+   - If disk.assessment shows high read/write rates
+   - Run: search_metrics("disk.dev") to see per-device metrics
+   - Run: get_process_top(sort_by="io", limit=10) to find I/O-heavy processes
+   - Cross-check: Does kernel.all.cpu.wait.total correlate with disk activity?
+
+5. Check memory pressure:
+   - If memory.assessment indicates swapping
+   - Run: get_process_top(sort_by="memory", limit=20)
+   - Look for large rss_bytes processes
+   - Check if swap usage is growing
+
+6. Check network saturation:
+   - If network.assessment shows high throughput
+   - Run: search_metrics("network.interface") for per-interface breakdown
+   - Look for interface errors or packet drops
+
+7. Report findings:
+   - Primary bottleneck (CPU/disk/memory/network)
+   - Specific culprits (process names, PIDs)
+   - Quantified impact (e.g., "process X using 45% CPU on 8-core system")
+   - Recommendations (kill process, add RAM, optimize queries, etc.)
+"""
pcp_mcp/prompts/disk.py ADDED
@@ -0,0 +1,69 @@
+"""Find I/O bottleneck prompt."""
+
+from __future__ import annotations
+
+from fastmcp.prompts import prompt
+
+from pcp_mcp.icons import ICON_DISK, TAGS_DISK
+
+
+@prompt(icons=[ICON_DISK], tags=TAGS_DISK)
+def find_io_bottleneck() -> str:
+    """Find disk I/O bottlenecks and identify processes causing high I/O.
+
+    Returns a workflow to diagnose disk performance issues, identify
+    hot devices, and find I/O-intensive processes.
+    """
+    return """Disk I/O investigation:
+
+1. Get system-wide I/O snapshot:
+   - Run: get_system_snapshot(categories=["disk", "cpu"])
+   - Check disk.assessment for read/write rates
+   - Check cpu.assessment for iowait_percent
+
+2. Interpret I/O metrics:
+   - High iowait_percent (>20%) = CPU waiting for disk
+   - Read vs write imbalance may indicate backup, logging, or database queries
+   - Sustained high I/O (>100 MB/s on HDD, >500 MB/s on SSD) = saturated
+
+3. Identify hot disks:
+   - Run: search_metrics("disk.dev")
+   - Run: query_metrics(["disk.dev.read_bytes", "disk.dev.write_bytes"])
+   - Note: These are COUNTERS, use get_system_snapshot for rates
+   - Look for specific devices with disproportionate activity
+
+4. Find I/O-heavy processes:
+   - Run: get_process_top(sort_by="io", limit=10, sample_interval=2.0)
+   - Note: Longer sample_interval (2-5s) gives more accurate I/O rates
+   - Identify processes with high io_read_bytes_sec or io_write_bytes_sec
+
+5. Correlate with CPU iowait:
+   - If cpu.iowait_percent is high AND disk I/O is high:
+     → Confirmed disk bottleneck
+   - If disk I/O is high BUT iowait is low:
+     → Fast storage keeping up (SSD/NVMe)
+   - If iowait is high BUT disk I/O is low:
+     → May be network storage (NFS) or storage controller issue
+
+6. Check for I/O patterns:
+   - Bursty I/O: Scheduled jobs, backups, log rotation
+   - Sustained I/O: Database, file server, streaming
+   - Random I/O: Database seeks (slow on HDD, fast on SSD)
+   - Sequential I/O: Backups, large file copies
+
+7. Advanced: Check per-partition I/O (if needed):
+   - Run: search_metrics("disk.partitions")
+   - Useful for systems with multiple partitions on same disk
+
+8. Report:
+   - Busiest disks by name (e.g., sda, nvme0n1)
+   - Read vs write breakdown (e.g., "80% reads, 20% writes")
+   - Top 3-5 processes causing I/O with rates
+   - I/O pattern: bursty vs sustained, random vs sequential
+   - Bottleneck severity: iowait % and queue depth
+   - Recommendations:
+     * High random I/O on HDD → Migrate to SSD
+     * Single process saturating disk → Optimize queries/access patterns
+     * Multiple processes fighting for I/O → I/O scheduler tuning or workload separation
+     * Backup/batch jobs during business hours → Reschedule
+"""
pcp_mcp/prompts/memory.py ADDED
@@ -0,0 +1,60 @@
+"""Investigate memory usage prompt."""
+
+from __future__ import annotations
+
+from fastmcp.prompts import prompt
+
+from pcp_mcp.icons import ICON_MEMORY, TAGS_MEMORY
+
+
+@prompt(icons=[ICON_MEMORY], tags=TAGS_MEMORY)
+def investigate_memory_usage() -> str:
+    """Investigate memory consumption and identify memory pressure.
+
+    Returns a workflow to analyze memory utilization, identify memory
+    hogs, and distinguish between normal cache usage and actual pressure.
+    """
+    return """Memory investigation workflow:
+
+1. Get memory overview:
+   - Run: get_system_snapshot(categories=["memory"])
+   - Read memory.assessment field for quick diagnosis
+
+2. Interpret memory metrics:
+   - mem.util.available is KEY metric (not "free"!)
+   - Large cache is NORMAL (Linux uses free RAM for cache)
+   - Swapping = BAD (indicates memory pressure)
+   - Check used_percent vs swap usage
+
+3. Assessment-based actions:
+   - "Memory pressure" → Go to step 4
+   - "Cache is large" → Normal, but check top consumers anyway (step 4)
+   - "Swapping actively" → CRITICAL, go to step 4 immediately
+
+4. Find memory consumers:
+   - Run: get_process_top(sort_by="memory", limit=20)
+   - Note processes with high rss_bytes
+   - Calculate: rss_percent shows memory impact
+   - Look for unexpected memory hogs (leaked memory, runaway processes)
+
+5. Detailed memory breakdown:
+   - Run: search_metrics("mem.util") for full breakdown
+   - Check: mem.util.slab (kernel memory)
+   - Check: mem.util.anonpages (process private memory)
+   - Check: mem.util.swapCached (pages swapped but still in RAM)
+
+6. NUMA systems (if applicable):
+   - Run: search_metrics("mem.numa") to check per-node allocation
+   - Look for imbalanced NUMA usage
+
+7. Report:
+   - Total memory: X GB
+   - Used: Y% (Z GB used, W GB available)
+   - Top 5 memory consumers with RSS sizes
+   - Swap status: active/inactive, growth rate if swapping
+   - Recommendation:
+     * No pressure + large cache = Normal
+     * High usage + no swap = Monitor but OK
+     * Active swapping = Add RAM or reduce load
+     * Single process consuming >50% = Investigate for memory leak
+"""