pcp_mcp-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,295 @@
1
+ """Diagnostic prompts for guided troubleshooting workflows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from fastmcp import FastMCP
9
+
10
+
11
+ def register_prompts(mcp: FastMCP) -> None:
12
+ """Register diagnostic prompts with the MCP server.
13
+
14
+ Args:
15
+ mcp: The FastMCP server instance.
16
+ """
17
+
18
+ @mcp.prompt()
19
+ def diagnose_slow_system() -> str:
20
+ """Diagnose why a system is running slowly.
21
+
22
+ Returns a structured investigation workflow to identify performance
23
+ bottlenecks by examining CPU, memory, disk, and network metrics.
24
+ """
25
+ return """Investigate system slowness:
26
+
27
+ 1. Get baseline: get_system_snapshot(categories=["cpu", "memory", "load", "disk", "network"])
28
+
29
+ 2. Interpret the assessment fields:
30
+ - If cpu.assessment mentions "I/O wait": Disk bottleneck (skip to step 4)
31
+ - If cpu.assessment mentions "user/system": CPU bottleneck (go to step 3)
32
+ - If memory.assessment mentions "swap": Memory pressure (go to step 5)
33
+ - If load.assessment shows high load: Check load vs ncpu ratio
34
+
35
+ 3. Find CPU hogs:
36
+ - Run: get_process_top(sort_by="cpu", limit=10)
37
+ - Identify processes with high cpu_percent
38
+ - Note: cpu_percent > 100% means multi-threaded (e.g., 200% = 2 cores)
39
+
40
+ 4. Check disk I/O bottleneck:
41
+ - If disk.assessment shows high read/write rates
42
+ - Run: search_metrics("disk.dev") to see per-device metrics
43
+ - Run: get_process_top(sort_by="io", limit=10) to find I/O-heavy processes
44
+ - Cross-check: Does kernel.all.cpu.wait.total correlate with disk activity?
45
+
46
+ 5. Check memory pressure:
47
+ - If memory.assessment indicates swapping
48
+ - Run: get_process_top(sort_by="memory", limit=20)
49
+ - Look for processes with large rss_bytes
50
+ - Check if swap usage is growing
51
+
52
+ 6. Check network saturation:
53
+ - If network.assessment shows high throughput
54
+ - Run: search_metrics("network.interface") for per-interface breakdown
55
+ - Look for interface errors or packet drops
56
+
57
+ 7. Report findings:
58
+ - Primary bottleneck (CPU/disk/memory/network)
59
+ - Specific culprits (process names, PIDs)
60
+ - Quantified impact (e.g., "process X using 45% CPU on 8-core system")
61
+ - Recommendations (kill process, add RAM, optimize queries, etc.)
62
+ """
63
+
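For orientation, here is a minimal sketch of how a workflow prompt such as diagnose_slow_system can be exercised in-process once registered; the fastmcp Client usage, the server name, and the pcp_mcp.prompts module path are assumptions, not details taken from this diff.

```python
# Hedged sketch (fastmcp 2.x assumed; module path and server name are guesses).
import asyncio

from fastmcp import Client, FastMCP
from pcp_mcp.prompts import register_prompts  # module path assumed

mcp = FastMCP("pcp-mcp")
register_prompts(mcp)

async def main() -> None:
    async with Client(mcp) as client:
        prompts = await client.list_prompts()
        print([p.name for p in prompts])  # should include "diagnose_slow_system"
        result = await client.get_prompt("diagnose_slow_system")
        # The workflow text comes back as the first prompt message.
        print(result.messages[0].content.text)

asyncio.run(main())
```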
64
+ @mcp.prompt()
65
+ def investigate_memory_usage() -> str:
66
+ """Investigate memory consumption and identify memory pressure.
67
+
68
+ Returns a workflow to analyze memory utilization, identify memory
69
+ hogs, and distinguish between normal cache usage and actual pressure.
70
+ """
71
+ return """Memory investigation workflow:
72
+
73
+ 1. Get memory overview:
74
+ - Run: get_system_snapshot(categories=["memory"])
75
+ - Read memory.assessment field for quick diagnosis
76
+
77
+ 2. Interpret memory metrics:
78
+ - mem.util.available is the KEY metric (not "free"!)
79
+ - Large cache is NORMAL (Linux uses free RAM for cache)
80
+ - Swapping = BAD (indicates memory pressure)
81
+ - Check used_percent vs swap usage
82
+
83
+ 3. Assessment-based actions:
84
+ - "Memory pressure" → Go to step 4
85
+ - "Cache is large" → Normal, but check top consumers anyway (step 4)
86
+ - "Swapping actively" → CRITICAL, go to step 4 immediately
87
+
88
+ 4. Find memory consumers:
89
+ - Run: get_process_top(sort_by="memory", limit=20)
90
+ - Note processes with high rss_bytes
91
+ - Check rss_percent for each process's memory impact as a percentage
92
+ - Look for unexpected memory hogs (leaked memory, runaway processes)
93
+
94
+ 5. Detailed memory breakdown:
95
+ - Run: search_metrics("mem.util") for full breakdown
96
+ - Check: mem.util.slab (kernel memory)
97
+ - Check: mem.util.anonpages (process private memory)
98
+ - Check: mem.util.swapCached (pages swapped but still in RAM)
99
+
100
+ 6. NUMA systems (if applicable):
101
+ - Run: search_metrics("mem.numa") to check per-node allocation
102
+ - Look for imbalanced NUMA usage
103
+
104
+ 7. Report:
105
+ - Total memory: X GB
106
+ - Used: Y% (Z GB used, W GB available)
107
+ - Top 5 memory consumers with RSS sizes
108
+ - Swap status: active/inactive, growth rate if swapping
109
+ - Recommendation:
110
+ * No pressure + large cache = Normal
111
+ * High usage + no swap = Monitor but OK
112
+ * Active swapping = Add RAM or reduce load
113
+ * Single process consuming >50% = Investigate for memory leak
114
+ """
115
+
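The figures in step 7 are straightforward arithmetic over the mem.util.* values, which PCP reports in KB; a small illustrative sketch, with the helper name and field choices being hypothetical:

```python
# Hypothetical helper (not part of this package): derive the step-7 report
# figures from raw mem.util.* values, which PCP exposes in kilobytes.
def summarize_memory(physmem_kb: float, available_kb: float,
                     swap_total_kb: float, swap_free_kb: float) -> dict:
    used_kb = physmem_kb - available_kb  # memory not available to applications
    return {
        "total_gb": round(physmem_kb / 1e6, 1),
        "used_percent": round(100 * used_kb / physmem_kb, 1),
        "available_gb": round(available_kb / 1e6, 1),
        "swap_used_gb": round((swap_total_kb - swap_free_kb) / 1e6, 1),
    }

# 32 GB host with 6 GB available and 1 GB of swap in use:
print(summarize_memory(32_000_000, 6_000_000, 8_000_000, 7_000_000))
# {'total_gb': 32.0, 'used_percent': 81.2, 'available_gb': 6.0, 'swap_used_gb': 1.0}
```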
116
+ @mcp.prompt()
117
+ def find_io_bottleneck() -> str:
118
+ """Find disk I/O bottlenecks and identify processes causing high I/O.
119
+
120
+ Returns a workflow to diagnose disk performance issues, identify
121
+ hot devices, and find I/O-intensive processes.
122
+ """
123
+ return """Disk I/O investigation:
124
+
125
+ 1. Get system-wide I/O snapshot:
126
+ - Run: get_system_snapshot(categories=["disk", "cpu"])
127
+ - Check disk.assessment for read/write rates
128
+ - Check cpu.assessment for iowait_percent
129
+
130
+ 2. Interpret I/O metrics:
131
+ - High iowait_percent (>20%) = CPU waiting for disk
132
+ - Read vs write imbalance may indicate backup, logging, or database queries
133
+ - Sustained high I/O (>100 MB/s on HDD, >500 MB/s on SSD) = saturated
134
+
135
+ 3. Identify hot disks:
136
+ - Run: search_metrics("disk.dev")
137
+ - Run: query_metrics(["disk.dev.read_bytes", "disk.dev.write_bytes"])
138
+ - Note: These are COUNTERS, use get_system_snapshot for rates
139
+ - Look for specific devices with disproportionate activity
140
+
141
+ 4. Find I/O-heavy processes:
142
+ - Run: get_process_top(sort_by="io", limit=10, sample_interval=2.0)
143
+ - Note: Longer sample_interval (2-5s) gives more accurate I/O rates
144
+ - Identify processes with high io_read_bytes_sec or io_write_bytes_sec
145
+
146
+ 5. Correlate with CPU iowait:
147
+ - If cpu.iowait_percent is high AND disk I/O is high:
148
+ → Confirmed disk bottleneck
149
+ - If disk I/O is high BUT iowait is low:
150
+ → Fast storage keeping up (SSD/NVMe)
151
+ - If iowait is high BUT disk I/O is low:
152
+ → May be network storage (NFS) or storage controller issue
153
+
154
+ 6. Check for I/O patterns:
155
+ - Bursty I/O: Scheduled jobs, backups, log rotation
156
+ - Sustained I/O: Database, file server, streaming
157
+ - Random I/O: Database seeks (slow on HDD, fast on SSD)
158
+ - Sequential I/O: Backups, large file copies
159
+
160
+ 7. Advanced: Check per-partition I/O (if needed):
161
+ - Run: search_metrics("disk.partitions")
162
+ - Useful for systems with multiple partitions on same disk
163
+
164
+ 8. Report:
165
+ - Busiest disks by name (e.g., sda, nvme0n1)
166
+ - Read vs write breakdown (e.g., "80% reads, 20% writes")
167
+ - Top 3-5 processes causing I/O with rates
168
+ - I/O pattern: bursty vs sustained, random vs sequential
169
+ - Bottleneck severity: iowait % and queue depth
170
+ - Recommendations:
171
+ * High random I/O on HDD → Migrate to SSD
172
+ * Single process saturating disk → Optimize queries/access patterns
173
+ * Multiple processes fighting for I/O → I/O scheduler tuning or workload separation
174
+ * Backup/batch jobs during business hours → Reschedule
175
+ """
176
+
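Step 5's correlation rules reduce to a small decision table; a sketch of that logic, with thresholds taken from the rules of thumb above (heuristics, not values defined by this package):

```python
# Illustrative classifier for step 5; thresholds mirror the heuristics above.
def classify_io_bottleneck(iowait_percent: float, disk_mb_per_sec: float,
                           busy_threshold_mb: float = 100.0) -> str:
    disk_busy = disk_mb_per_sec >= busy_threshold_mb
    if iowait_percent > 20 and disk_busy:
        return "confirmed disk bottleneck"
    if disk_busy:
        return "heavy I/O but storage keeping up (likely SSD/NVMe)"
    if iowait_percent > 20:
        return "high iowait with little local disk I/O: check NFS or controller"
    return "no disk bottleneck indicated"

print(classify_io_bottleneck(iowait_percent=35.0, disk_mb_per_sec=180.0))
# confirmed disk bottleneck
```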
177
+ @mcp.prompt()
178
+ def analyze_cpu_usage() -> str:
179
+ """Analyze CPU utilization patterns and identify CPU-bound processes.
180
+
181
+ Returns a workflow to diagnose high CPU usage, distinguish between
182
+ user-space and kernel CPU time, and identify optimization opportunities.
183
+ """
184
+ return """CPU usage analysis workflow:
185
+
186
+ 1. Get CPU baseline:
187
+ - Run: get_system_snapshot(categories=["cpu", "load"])
188
+ - Read cpu.assessment for quick diagnosis
189
+ - Note: ncpu value (number of CPUs/cores)
190
+
191
+ 2. Interpret CPU metrics:
192
+ - user_percent: Application code execution
193
+ - system_percent: Kernel/syscall overhead
194
+ - idle_percent: Unused CPU capacity
195
+ - iowait_percent: CPU waiting for I/O (NOT CPU-bound if high)
196
+ - Load average: Runnable + waiting processes (compare to ncpu)
197
+
198
+ 3. CPU pattern classification:
199
+ - High user + low system = CPU-intensive application (normal)
200
+ - High system + low user = Kernel overhead (syscalls, context switches)
201
+ - High iowait = NOT a CPU problem, it's disk/storage (see find_io_bottleneck)
202
+ - Load > ncpu = More demand than capacity (may include I/O wait)
203
+
204
+ 4. Find CPU hogs:
205
+ - Run: get_process_top(sort_by="cpu", limit=15)
206
+ - Note: cpu_percent > 100% means multi-core usage (e.g., 400% = 4 cores)
207
+ - Identify unexpected high CPU consumers
208
+
209
+ 5. Per-CPU breakdown (if needed):
210
+ - Run: search_metrics("kernel.percpu.cpu")
211
+ - Useful for: Thread affinity issues, interrupt handling imbalance
212
+ - Look for: One CPU at 100% while others idle (poor parallelization)
213
+
214
+ 6. Check for CPU saturation indicators:
215
+ - Run: query_metrics(["kernel.all.runnable", "kernel.all.pswitch"])
216
+ - High runnable count: More threads than cores (contention)
217
+ - High pswitch (context switches): Thread thrashing
218
+
219
+ 7. Distinguish workload types:
220
+ - Compute-bound: High user%, low syscalls (scientific, encoding, crypto)
221
+ - I/O-bound: High iowait%, moderate user% (databases, file processing)
222
+ - System-bound: High system%, moderate user% (network servers, many syscalls)
223
+
224
+ 8. Report:
225
+ - CPU utilization breakdown: X% user, Y% system, Z% iowait, W% idle
226
+ - Load average: 1/5/15 min values vs ncpu (e.g., "load 8.5 on 8-core = 106%")
227
+ - Top 5 CPU consumers with cpu_percent and command names
228
+ - CPU pattern: compute-bound / I/O-bound / system-bound
229
+ - Saturation indicators: runnable queue, context switches
230
+ - Recommendations:
231
+ * Low idle + high load → Add CPU capacity or optimize hot processes
232
+ * High iowait → Disk bottleneck, not CPU (see I/O investigation)
233
+ * High system% → Profile syscalls, reduce I/O frequency, optimize locking
234
+ * Single-threaded bottleneck → Parallelize if possible
235
+ * Many small processes → Reduce process spawning overhead
236
+ """
237
+
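The load-versus-ncpu comparison in steps 2 and 8, and the meaning of cpu_percent above 100%, are just ratios; a quick illustrative sketch:

```python
# Illustrative arithmetic for the step-8 report lines.
def load_saturation(load_1m: float, ncpu: int) -> str:
    pct = 100 * load_1m / ncpu
    return f"load {load_1m} on {ncpu}-core = {pct:.0f}%"

print(load_saturation(8.5, 8))  # load 8.5 on 8-core = 106%

# cpu_percent is per-core based, so values above 100% mean a process is
# spanning multiple cores (400% is roughly four busy cores).
print(f"~{400 / 100:.0f} cores busy")  # ~4 cores busy
```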
238
+ @mcp.prompt()
239
+ def check_network_performance() -> str:
240
+ """Check network performance and identify bandwidth/error issues.
241
+
242
+ Returns a workflow to analyze network throughput, identify saturated
243
+ interfaces, and detect packet loss or errors.
244
+ """
245
+ return """Network performance investigation:
246
+
247
+ 1. Get network overview:
248
+ - Run: get_system_snapshot(categories=["network"])
249
+ - Read network.assessment for quick diagnosis
250
+ - Note: Rates are per-second (bytes/sec, packets/sec)
251
+
252
+ 2. Interpret network metrics:
253
+ - in_bytes_sec / out_bytes_sec: Throughput (compare to link speed)
254
+ - in_packets_sec / out_packets_sec: Packet rate
255
+ - Assessment field indicates saturation or errors
256
+
257
+ 3. Per-interface breakdown:
258
+ - Run: search_metrics("network.interface")
259
+ - Run: query_metrics(["network.interface.in.bytes", "network.interface.out.bytes"])
260
+ - Note: These are COUNTERS, use get_system_snapshot for rates
261
+ - Identify busy interfaces vs idle interfaces (e.g., eth0 busy, lo idle)
262
+
263
+ 4. Check for errors and drops:
264
+ - Run: query_metrics(["network.interface.in.errors", "network.interface.out.errors"])
265
+ - Run: query_metrics(["network.interface.in.drops", "network.interface.out.drops"])
266
+ - Non-zero errors = Hardware, driver, or cable issues
267
+ - Non-zero drops = Buffer overflow (traffic exceeds processing capacity)
268
+
269
+ 5. Calculate interface saturation:
270
+ - Compare throughput to link speed (e.g., 950 Mbps on 1 Gbps link = 95%)
271
+ - Sustained >80% = Approaching saturation
272
+ - Bursts >95% = Temporarily saturated
273
+
274
+ 6. Find network-heavy processes (indirect):
275
+ - The PCP proc.* namespace has no per-process network metrics
276
+ - Use system tools: netstat, ss, iftop (outside PCP)
277
+ - Or infer indirectly: high network throughput often coincides with high CPU or disk I/O
278
+
279
+ 7. Check protocol-level stats (if needed):
280
+ - Run: search_metrics("network.tcp")
281
+ - Run: search_metrics("network.udp")
282
+ - Look for: Retransmissions, failed connections, buffer overflows
283
+
284
+ 8. Report:
285
+ - Per-interface throughput (e.g., "eth0: 850 Mbps in, 120 Mbps out")
286
+ - Link utilization % (if link speed known)
287
+ - Errors/drops: Count and affected interfaces
288
+ - Traffic pattern: Symmetric (similar in/out) vs asymmetric (download/upload heavy)
289
+ - Packet rate: Normal vs abnormal (tiny packets = inefficient, possible attack)
290
+ - Recommendations:
291
+ * High utilization + no errors → Upgrade link or load balance
292
+ * Errors/drops present → Check cables, NIC drivers, switch ports
293
+ * Asymmetric traffic → Normal for client (download heavy) or server (upload heavy)
294
+ * High packet rate + low byte rate → Small packets (check for SYN flood, fragmentation)
295
+ """
@@ -0,0 +1,21 @@
1
+ """Resource registration for the PCP MCP server."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from fastmcp import FastMCP
9
+
10
+
11
+ def register_resources(mcp: FastMCP) -> None:
12
+ """Register all resources with the MCP server.
13
+
14
+ Args:
15
+ mcp: The FastMCP server instance.
16
+ """
17
+ from pcp_mcp.resources.catalog import register_catalog_resources
18
+ from pcp_mcp.resources.health import register_health_resources
19
+
20
+ register_health_resources(mcp)
21
+ register_catalog_resources(mcp)
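For context, a minimal sketch of how register_resources and the prompt registration above might be assembled into a runnable server; the pcp_mcp.prompts module path and the server name are assumptions inferred from this diff, not verified entry points.

```python
# Assembly sketch (the prompts module path and server name are assumptions;
# only pcp_mcp.resources is confirmed by the imports above).
from fastmcp import FastMCP

from pcp_mcp.prompts import register_prompts
from pcp_mcp.resources import register_resources

mcp = FastMCP("pcp-mcp")
register_prompts(mcp)
register_resources(mcp)

if __name__ == "__main__":
    mcp.run()  # defaults to the stdio transport in fastmcp
```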
@@ -0,0 +1,233 @@
1
+ """Catalog resources for common metrics and namespaces."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ from fastmcp import Context
8
+
9
+ if TYPE_CHECKING:
10
+ from fastmcp import FastMCP
11
+
12
+
13
+ def register_catalog_resources(mcp: FastMCP) -> None:
14
+ """Register catalog resources with the MCP server.
15
+
16
+ Args:
17
+ mcp: The FastMCP server instance.
18
+ """
19
+
20
+ @mcp.resource("pcp://metrics/common")
21
+ def common_metrics_catalog() -> str:
22
+ """Catalog of commonly used metric groups.
23
+
24
+ Returns a structured guide to the most useful PCP metrics organized
25
+ by troubleshooting domain.
26
+ """
27
+ return """# Common PCP Metric Groups
28
+
29
+ ## CPU Performance
30
+ - kernel.all.cpu.user → User-space CPU time (counter) ⚠️
31
+ - kernel.all.cpu.sys → Kernel CPU time (counter) ⚠️
32
+ - kernel.all.cpu.idle → Idle CPU time (counter) ⚠️
33
+ - kernel.all.cpu.wait.total → I/O wait time (counter) ⚠️ High = disk bottleneck
34
+ - kernel.all.load → Load average (1, 5, 15 min) [instances: 1, 5, 15]
35
+ - kernel.all.runnable → Runnable processes (instant)
36
+ - kernel.all.nprocs → Total processes (instant)
37
+ - hinv.ncpu → Number of CPUs (instant)
38
+
39
+ ## Memory
40
+ - mem.physmem → Total physical memory in KB (instant)
41
+ - mem.util.used → Used memory in KB (instant)
42
+ - mem.util.free → Free memory in KB (instant)
43
+ - mem.util.available → Available for apps in KB (instant) ⭐ Use this, not "free"
44
+ - mem.util.cached → Cached data in KB (instant)
45
+ - mem.util.bufmem → Buffer memory in KB (instant)
46
+ - mem.util.swapTotal → Total swap in KB (instant)
47
+ - mem.util.swapFree → Free swap in KB (instant)
48
+ - mem.util.slab → Kernel slab allocator in KB (instant)
49
+
50
+ ## Disk I/O
51
+ - disk.all.read_bytes → Total bytes read (counter) ⚠️
52
+ - disk.all.write_bytes → Total bytes written (counter) ⚠️
53
+ - disk.all.read → Total read operations (counter) ⚠️
54
+ - disk.all.write → Total write operations (counter) ⚠️
55
+ - disk.dev.read_bytes → Per-disk reads in bytes [instances: sda, sdb, ...] (counter) ⚠️
56
+ - disk.dev.write_bytes → Per-disk writes in bytes [instances: sda, sdb, ...] (counter) ⚠️
57
+ - disk.dev.avactive → Time spent on I/O in ms (counter) ⚠️ Rate ≈ device utilization
58
+
59
+ ## Network
60
+ - network.interface.in.bytes → Bytes received [instances: eth0, lo, ...] (counter) ⚠️
61
+ - network.interface.out.bytes → Bytes sent [instances: eth0, lo, ...] (counter) ⚠️
62
+ - network.interface.in.packets → Packets received [instances] (counter) ⚠️
63
+ - network.interface.out.packets → Packets sent [instances] (counter) ⚠️
64
+ - network.interface.in.errors → Receive errors [instances] (counter) ⚠️
65
+ - network.interface.out.errors → Transmit errors [instances] (counter) ⚠️
66
+
67
+ ## Process Metrics (⚠️ Use get_process_top instead of raw queries)
68
+ - proc.psinfo.pid → Process ID [instances: PIDs]
69
+ - proc.psinfo.cmd → Command name [instances: PIDs]
70
+ - proc.psinfo.psargs → Full command line [instances: PIDs]
71
+ - proc.memory.rss → Resident set size in KB [instances: PIDs] (instant)
72
+ - proc.memory.vmsize → Virtual memory size in KB [instances: PIDs] (instant)
73
+ - proc.psinfo.utime → User CPU time in ms [instances: PIDs] (counter) ⚠️
74
+ - proc.psinfo.stime → System CPU time in ms [instances: PIDs] (counter) ⚠️
75
+ - proc.io.read_bytes → Process I/O reads in bytes [instances: PIDs] (counter) ⚠️
76
+ - proc.io.write_bytes → Process I/O writes in bytes [instances: PIDs] (counter) ⚠️
77
+
78
+ ## System Health
79
+ - kernel.all.uptime → System uptime in seconds (instant)
80
+ - kernel.all.nusers → Logged-in users (instant)
81
+ - pmcd.agent.status → PMDA agent health [instances: agent names] (instant)
82
+ - pmcd.pmlogger.host → Active pmlogger hosts [instances] (instant)
83
+
84
+ ## Container Metrics (requires cgroups PMDA)
85
+ - cgroup.cpuacct.usage → CPU usage per cgroup [instances: cgroup paths] (counter) ⚠️
86
+ - cgroup.memory.usage → Memory usage per cgroup [instances: cgroup paths] (instant)
87
+ - cgroup.blkio.io_service_bytes → I/O per cgroup [instances: cgroup paths] (counter) ⚠️
88
+
89
+ ---
90
+
91
+ ## Legend
92
+ ⚠️ = COUNTER METRIC - Use get_system_snapshot() or get_process_top() for rates
93
+ ⭐ = Recommended over alternatives
94
+ [instances] = Returns multiple values (per-CPU, per-disk, per-process, etc.)
95
+ (instant) = Instantaneous gauge value
96
+ (counter) = Cumulative counter since boot
97
+ """
98
+
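The ⚠️ legend above exists because counters only become useful once two samples are differenced over an interval; a hedged sketch of that conversion, reusing the fetch() payload shape that metric_namespaces parses below (the client API shape is an assumption):

```python
# Hedged sketch: turn a cumulative counter into a per-second rate by sampling
# twice. The payload shape (values -> instances -> value) mirrors the parsing
# in metric_namespaces below and is an assumption about this client.
import asyncio

async def counter_rate(client, metric: str, interval: float = 1.0) -> float:
    first = await client.fetch([metric])
    await asyncio.sleep(interval)
    second = await client.fetch([metric])

    def total(payload: dict) -> float:
        values = payload.get("values", [])
        if not values:
            return 0.0
        return sum(inst.get("value", 0) for inst in values[0].get("instances", []))

    return (total(second) - total(first)) / interval

# e.g. write rate across all disks, in the metric's native units per second:
# rate = await counter_rate(client, "disk.all.write_bytes", interval=2.0)
```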
99
+ @mcp.resource("pcp://namespaces")
100
+ async def metric_namespaces(ctx: Context) -> str:
101
+ """List available PCP metric namespaces discovered from the live system.
102
+
103
+ Queries the connected PCP server to enumerate top-level namespaces
104
+ and active PMDAs, showing exactly what's available on this system.
105
+ """
106
+ from pcp_mcp.context import get_client
107
+ from pcp_mcp.errors import handle_pcp_error
108
+
109
+ client = get_client(ctx)
110
+
111
+ try:
112
+ all_metrics = await client.search("")
113
+ namespaces = sorted(
114
+ {m.get("name", "").split(".")[0] for m in all_metrics if m.get("name")}
115
+ )
116
+
117
+ pmda_status = await client.fetch(["pmcd.agent.status"])
118
+ active_pmdas = []
119
+ for metric in pmda_status.get("values", []):
120
+ for inst in metric.get("instances", []):
121
+ instance_id = inst.get("instance")
122
+ status = inst.get("value")
123
+ if instance_id is not None and instance_id != -1 and status == 0:
124
+ active_pmdas.append(str(instance_id))
125
+
126
+ except Exception as e:
127
+ raise handle_pcp_error(e, "discovering namespaces") from e
128
+
129
+ output = f"""# PCP Metric Namespaces (Live Discovery)
130
+
131
+ Connected to: {client.target_host}
132
+ Active PMDAs: {len(active_pmdas)}
133
+ Top-level namespaces: {len(namespaces)}
134
+
135
+ ## Available Namespaces
136
+
137
+ """
138
+
139
+ namespace_docs = {
140
+ "kernel": "System-wide kernel statistics (CPU, load, interrupts, uptime)",
141
+ "mem": "Memory subsystem (physmem, swap, cache, buffers, NUMA)",
142
+ "disk": "Disk I/O (aggregates, per-device, partitions, device mapper)",
143
+ "network": "Network interfaces and protocols (TCP, UDP, IP)",
144
+ "proc": "Per-process metrics ⚠️ Use get_process_top instead of raw queries",
145
+ "hinv": "Hardware inventory (ncpu, physmem, architecture - static info)",
146
+ "pmcd": "PCP daemon health (agent status, clients, control)",
147
+ "pmproxy": "pmproxy daemon metrics (if pmproxy PMDA loaded)",
148
+ "cgroup": "Container/cgroup metrics (CPU, memory, I/O per cgroup)",
149
+ "containers": "Container metrics (Docker, Podman via PMDA)",
150
+ "filesys": "Filesystem metrics (capacity, used, free per mount point)",
151
+ "nfs": "NFS version-agnostic metrics",
152
+ "nfs3": "NFSv3 client and server metrics",
153
+ "nfs4": "NFSv4 client and server metrics",
154
+ "swap": "Swap device metrics (activity per swap device)",
155
+ "quota": "Filesystem quota metrics",
156
+ "xfs": "XFS filesystem-specific metrics",
157
+ "btrfs": "Btrfs filesystem-specific metrics",
158
+ "zfs": "ZFS filesystem-specific metrics",
159
+ "kvm": "KVM hypervisor metrics (guest VMs)",
160
+ "libvirt": "libvirt virtualization metrics",
161
+ "redis": "Redis server metrics (via redis PMDA)",
162
+ "postgresql": "PostgreSQL database metrics (via postgresql PMDA)",
163
+ "mysql": "MySQL database metrics (via mysql PMDA)",
164
+ "nginx": "nginx web server metrics",
165
+ "apache": "Apache web server metrics",
166
+ "haproxy": "HAProxy load balancer metrics",
167
+ "elasticsearch": "Elasticsearch metrics",
168
+ "mongodb": "MongoDB metrics",
169
+ "bcc": "eBPF-based advanced profiling (BPF PMDA - requires kernel 4.1+)",
170
+ "hotproc": "Hot process tracking (automatically tracks top resource consumers)",
171
+ "mmv": "Memory-mapped value metrics (custom app instrumentation)",
172
+ "sysfs": "Linux sysfs metrics",
173
+ "event": "System event tracing",
174
+ "ipc": "Inter-process communication metrics (SysV IPC)",
175
+ "jbd2": "JBD2 journal metrics (ext4 filesystem journaling)",
176
+ "rpc": "RPC statistics",
177
+ "acct": "Process accounting metrics",
178
+ "fchost": "Fibre Channel host metrics",
179
+ "tape": "Tape device metrics",
180
+ "hyperv": "Hyper-V guest metrics",
181
+ }
182
+
183
+ for ns in namespaces:
184
+ doc = namespace_docs.get(ns, "Namespace provided by PMDA (no built-in description)")
185
+ output += f"- **{ns}.***: {doc}\n"
186
+
187
+ output += f"""
188
+ ## Active PMDAs on This System
189
+
190
+ {", ".join(active_pmdas) if active_pmdas else "Unable to enumerate PMDAs"}
191
+
192
+ Status 0 = Running, non-zero = Error
193
+
194
+ ## Namespace Categories
195
+
196
+ ### Core System (always available)
197
+ kernel, mem, disk, network, proc, hinv, pmcd
198
+
199
+ ### Filesystems
200
+ filesys, xfs, btrfs, zfs, quota, swap
201
+
202
+ ### Virtualization
203
+ kvm, libvirt, containers, cgroup, hyperv
204
+
205
+ ### Databases
206
+ redis, postgresql, mysql, elasticsearch, mongodb
207
+
208
+ ### Web Servers
209
+ nginx, apache, haproxy
210
+
211
+ ### Advanced
212
+ bcc (eBPF), hotproc (auto-tracking), mmv (custom metrics), event (tracing)
213
+
214
+ ## Discovery Workflow
215
+
216
+ 1. **Explore a namespace**: search_metrics("{namespaces[0] if namespaces else "kernel"}")
217
+ 2. **Count metrics in namespace**: search_metrics("disk") to see all disk.* metrics
218
+ 3. **Get metric details**: describe_metric("full.metric.name")
219
+ 4. **Query specific metrics**: query_metrics(["name1", "name2"])
220
+
221
+ ## Navigation Strategy
222
+
223
+ **Top-down** (recommended for troubleshooting):
224
+ 1. Start with get_system_snapshot() → Identifies problem domain
225
+ 2. Drill into relevant namespace (e.g., "disk" issue → search_metrics("disk.dev"))
226
+ 3. Query specific metrics with query_metrics([...])
227
+
228
+ **Bottom-up** (exploring new system):
229
+ 1. Browse this pcp://namespaces resource → See what's available
230
+ 2. search_metrics("interesting.namespace") → Explore subtree
231
+ 3. describe_metric("full.name") → Understand semantics
232
+ """
233
+ return output
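A short sketch of reading this resource in-process; the fastmcp Client.read_resource call and the returned content shape are assumptions about fastmcp, not something this diff shows:

```python
# Hedged sketch: read the pcp://namespaces resource through an in-process client.
import asyncio

from fastmcp import Client

async def show_namespaces(mcp) -> None:
    async with Client(mcp) as client:
        contents = await client.read_resource("pcp://namespaces")
        print(contents[0].text)  # the markdown document built above

# asyncio.run(show_namespaces(mcp))  # with an assembled FastMCP server instance
```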
@@ -0,0 +1,74 @@
1
+ """Health summary resource for quick system status."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime, timezone
6
+ from typing import TYPE_CHECKING
7
+
8
+ from fastmcp import Context
9
+
10
+ from pcp_mcp.context import get_client
11
+ from pcp_mcp.tools.system import COUNTER_METRICS, SNAPSHOT_METRICS
12
+ from pcp_mcp.utils.builders import (
13
+ build_cpu_metrics,
14
+ build_load_metrics,
15
+ build_memory_metrics,
16
+ )
17
+
18
+ if TYPE_CHECKING:
19
+ from fastmcp import FastMCP
20
+
21
+
22
+ def register_health_resources(mcp: FastMCP) -> None:
23
+ """Register health resources with the MCP server."""
24
+
25
+ @mcp.resource("pcp://health")
26
+ async def health_summary(ctx: Context) -> str:
27
+ """Quick system health summary.
28
+
29
+ Returns a text summary of CPU, memory, and load status suitable
30
+ for quick health checks. For detailed metrics, use the
31
+ get_system_snapshot tool instead.
32
+ """
33
+ client = get_client(ctx)
34
+
35
+ metrics = SNAPSHOT_METRICS["cpu"] + SNAPSHOT_METRICS["memory"] + SNAPSHOT_METRICS["load"]
36
+
37
+ try:
38
+ data = await client.fetch_with_rates(metrics, COUNTER_METRICS, sample_interval=1.0)
39
+ except Exception as e:
40
+ return f"Error fetching health data: {e}"
41
+
42
+ cpu = build_cpu_metrics(data)
43
+ memory = build_memory_metrics(data)
44
+ load = build_load_metrics(data)
45
+
46
+ timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
47
+
48
+ return f"""# System Health Summary
49
+ Host: {client.target_host}
50
+ Time: {timestamp}
51
+
52
+ ## CPU
53
+ - User: {cpu.user_percent}%
54
+ - System: {cpu.system_percent}%
55
+ - Idle: {cpu.idle_percent}%
56
+ - I/O Wait: {cpu.iowait_percent}%
57
+ - CPUs: {cpu.ncpu}
58
+ - Assessment: {cpu.assessment}
59
+
60
+ ## Memory
61
+ - Used: {memory.used_percent}% ({memory.used_bytes / 1e9:.1f} / {memory.total_bytes / 1e9:.1f} GB)
62
+ - Available: {memory.available_bytes / 1e9:.1f} GB
63
+ - Cached: {memory.cached_bytes / 1e9:.1f} GB
64
+ - Swap: {memory.swap_used_bytes / 1e9:.1f} GB / {memory.swap_total_bytes / 1e9:.1f} GB
65
+ - Assessment: {memory.assessment}
66
+
67
+ ## Load
68
+ - 1 min: {load.load_1m}
69
+ - 5 min: {load.load_5m}
70
+ - 15 min: {load.load_15m}
71
+ - Runnable: {load.runnable}
72
+ - Processes: {load.nprocs}
73
+ - Assessment: {load.assessment}
74
+ """