pcp-mcp 1.3.1__tar.gz → 1.4.0__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/PKG-INFO +2 -10
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/README.md +0 -8
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/pyproject.toml +2 -2
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/AGENTS.md +0 -1
- pcp_mcp-1.4.0/src/pcp_mcp/errors.py +54 -0
- pcp_mcp-1.4.0/src/pcp_mcp/prompts/__init__.py +36 -0
- pcp_mcp-1.4.0/src/pcp_mcp/prompts/cpu.py +69 -0
- pcp_mcp-1.4.0/src/pcp_mcp/prompts/diagnose.py +54 -0
- pcp_mcp-1.4.0/src/pcp_mcp/prompts/disk.py +69 -0
- pcp_mcp-1.4.0/src/pcp_mcp/prompts/memory.py +60 -0
- pcp_mcp-1.4.0/src/pcp_mcp/prompts/network.py +67 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/server.py +9 -14
- pcp_mcp-1.4.0/src/pcp_mcp/tools/__init__.py +37 -0
- pcp_mcp-1.4.0/src/pcp_mcp/tools/metrics.py +203 -0
- pcp_mcp-1.4.0/src/pcp_mcp/tools/system.py +623 -0
- pcp_mcp-1.3.1/src/pcp_mcp/errors.py +0 -47
- pcp_mcp-1.3.1/src/pcp_mcp/prompts/__init__.py +0 -308
- pcp_mcp-1.3.1/src/pcp_mcp/resources/__init__.py +0 -21
- pcp_mcp-1.3.1/src/pcp_mcp/resources/catalog.py +0 -307
- pcp_mcp-1.3.1/src/pcp_mcp/resources/health.py +0 -117
- pcp_mcp-1.3.1/src/pcp_mcp/tools/__init__.py +0 -21
- pcp_mcp-1.3.1/src/pcp_mcp/tools/metrics.py +0 -189
- pcp_mcp-1.3.1/src/pcp_mcp/tools/system.py +0 -592
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/__init__.py +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/client.py +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/config.py +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/context.py +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/icons.py +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/middleware.py +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/models.py +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/py.typed +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/tools/AGENTS.md +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/utils/__init__.py +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/utils/builders.py +0 -0
- {pcp_mcp-1.3.1 → pcp_mcp-1.4.0}/src/pcp_mcp/utils/extractors.py +0 -0
--- pcp_mcp-1.3.1/PKG-INFO
+++ pcp_mcp-1.4.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pcp-mcp
-Version: 1.3.1
+Version: 1.4.0
 Summary: MCP server for Performance Co-Pilot
 Keywords: mcp,pcp,performance-co-pilot,monitoring,model-context-protocol
 Author: Major Hayden
@@ -18,7 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: System :: Monitoring
 Classifier: Typing :: Typed
 Requires-Dist: cachetools>=5.0
-Requires-Dist: fastmcp>=
+Requires-Dist: fastmcp>=3.0.0b1
 Requires-Dist: httpx>=0.27
 Requires-Dist: pydantic-settings>=2.0.0
 Requires-Dist: typing-extensions>=4.0 ; python_full_version < '3.11'
@@ -189,14 +189,6 @@ For remote monitoring:
 → Uses describe_metric(name="kernel.all.load")
 ```
 
-## 📚 Resources
-
-Browse metrics via MCP resources:
-
-- `pcp://health` - Quick system health summary
-- `pcp://metrics/common` - Catalog of commonly used metrics
-- `pcp://namespaces` - Live-discovered metric namespaces
-
 ## 💡 Use Cases
 
 ### Performance Troubleshooting
--- pcp_mcp-1.3.1/README.md
+++ pcp_mcp-1.4.0/README.md
@@ -160,14 +160,6 @@ For remote monitoring:
 → Uses describe_metric(name="kernel.all.load")
 ```
 
-## 📚 Resources
-
-Browse metrics via MCP resources:
-
-- `pcp://health` - Quick system health summary
-- `pcp://metrics/common` - Catalog of commonly used metrics
-- `pcp://namespaces` - Live-discovered metric namespaces
-
 ## 💡 Use Cases
 
 ### Performance Troubleshooting
--- pcp_mcp-1.3.1/pyproject.toml
+++ pcp_mcp-1.4.0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pcp-mcp"
-version = "1.3.1"
+version = "1.4.0"
 description = "MCP server for Performance Co-Pilot"
 readme = "README.md"
 license = "MIT"
@@ -22,7 +22,7 @@ classifiers = [
 ]
 dependencies = [
     "cachetools>=5.0",
-    "fastmcp>=
+    "fastmcp>=3.0.0b1",
     "httpx>=0.27",
    "pydantic-settings>=2.0.0",
     "typing_extensions>=4.0; python_version < '3.11'",
--- pcp_mcp-1.3.1/src/pcp_mcp/AGENTS.md
+++ pcp_mcp-1.4.0/src/pcp_mcp/AGENTS.md
@@ -18,7 +18,6 @@ pcp_mcp/
 ├── middleware.py   # Request caching middleware
 ├── icons.py        # System assessment icons (emoji mappings)
 ├── tools/          # MCP tools (see tools/AGENTS.md)
-├── resources/      # MCP resources (health.py, catalog.py)
 ├── utils/          # Extractors, builders
 └── prompts/        # LLM system prompts
 ```
--- /dev/null
+++ pcp_mcp-1.4.0/src/pcp_mcp/errors.py
@@ -0,0 +1,54 @@
+"""Error mapping from httpx to MCP ToolErrors."""
+
+from __future__ import annotations
+
+import httpx
+from fastmcp.exceptions import ToolError
+
+
+class PCPError(Exception):
+    """Base PCP error."""
+
+
+class PCPConnectionError(PCPError):
+    """Cannot connect to pmproxy."""
+
+
+class PCPMetricNotFoundError(PCPError):
+    """Metric does not exist."""
+
+
+def handle_pcp_error(e: Exception, operation: str) -> ToolError:
+    """Convert PCP/httpx exceptions to MCP ToolErrors.
+
+    Uses isinstance() checks instead of match/case class patterns for resilience
+    against module reloading (e.g., FastMCP's FileSystemProvider), which can
+    create different class identities that break structural pattern matching.
+
+    Args:
+        e: The exception to convert.
+        operation: Description of the operation that failed.
+
+    Returns:
+        A ToolError with an appropriate message.
+    """
+    if isinstance(e, httpx.ConnectError):
+        return ToolError("Cannot connect to pmproxy. Is it running? (systemctl start pmproxy)")
+
+    if isinstance(e, httpx.HTTPStatusError):
+        if e.response.status_code == 400:
+            return ToolError(f"Bad request during {operation}: {e.response.text}")
+        if e.response.status_code == 404:
+            return ToolError(f"Metric not found during {operation}")
+        return ToolError(f"pmproxy error ({e.response.status_code}): {e.response.text}")
+
+    if isinstance(e, httpx.TimeoutException):
+        return ToolError(f"Request timed out during {operation}")
+
+    if isinstance(e, PCPMetricNotFoundError):
+        return ToolError(f"Metric not found: {e}")
+
+    if isinstance(e, PCPConnectionError):
+        return ToolError(str(e))
+
+    return ToolError(f"Error during {operation}: {e}")
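Note that `handle_pcp_error` returns a `ToolError` rather than raising it, so the caller decides when to raise. A minimal sketch of that call pattern, assuming a hypothetical `fetch_values` coroutine on the client (the real tool bodies in `tools/metrics.py` are not shown in this diff):

```python
# Sketch only -- not package code. `client.fetch_values` is a hypothetical
# helper standing in for whatever PCPClient method a real tool would call.
from fastmcp.exceptions import ToolError

from pcp_mcp.errors import handle_pcp_error


async def query_metrics_sketch(client, names: list[str]) -> dict:
    """Fetch metric values, mapping transport failures to ToolErrors."""
    try:
        return await client.fetch_values(names)  # hypothetical PCPClient call
    except ToolError:
        raise  # already a user-facing error; pass it through unchanged
    except Exception as e:
        # handle_pcp_error returns a ToolError with a readable message
        # instead of a raw httpx traceback; raising it surfaces that message.
        raise handle_pcp_error(e, "query_metrics") from e
```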
--- /dev/null
+++ pcp_mcp-1.4.0/src/pcp_mcp/prompts/__init__.py
@@ -0,0 +1,36 @@
+"""Diagnostic prompts for guided troubleshooting workflows."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pcp_mcp.prompts.cpu import analyze_cpu_usage
+from pcp_mcp.prompts.diagnose import diagnose_slow_system
+from pcp_mcp.prompts.disk import find_io_bottleneck
+from pcp_mcp.prompts.memory import investigate_memory_usage
+from pcp_mcp.prompts.network import check_network_performance
+
+if TYPE_CHECKING:
+    from fastmcp import FastMCP
+
+__all__ = [
+    "diagnose_slow_system",
+    "investigate_memory_usage",
+    "find_io_bottleneck",
+    "analyze_cpu_usage",
+    "check_network_performance",
+    "register_prompts",
+]
+
+
+def register_prompts(mcp: FastMCP) -> None:
+    """Register all prompts with the MCP server.
+
+    Args:
+        mcp: The FastMCP server instance.
+    """
+    mcp.add_prompt(diagnose_slow_system)
+    mcp.add_prompt(investigate_memory_usage)
+    mcp.add_prompt(find_io_bottleneck)
+    mcp.add_prompt(analyze_cpu_usage)
+    mcp.add_prompt(check_network_performance)
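`register_prompts` keeps a manual wiring path alongside the filesystem auto-discovery added in `server.py` (see below). A minimal sketch of using it directly, restricted to the `FastMCP` and `add_prompt` calls already visible in this diff; the server name is a placeholder:

```python
# Sketch: manual prompt registration, as an alternative to the
# FileSystemProvider auto-discovery wired up in server.py.
from fastmcp import FastMCP

from pcp_mcp.prompts import register_prompts

mcp = FastMCP("pcp-mcp-sketch")  # placeholder server name
register_prompts(mcp)
```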
--- /dev/null
+++ pcp_mcp-1.4.0/src/pcp_mcp/prompts/cpu.py
@@ -0,0 +1,69 @@
+"""Analyze CPU usage prompt."""
+
+from __future__ import annotations
+
+from fastmcp.prompts import prompt
+
+from pcp_mcp.icons import ICON_CPU, TAGS_CPU
+
+
+@prompt(icons=[ICON_CPU], tags=TAGS_CPU)
+def analyze_cpu_usage() -> str:
+    """Analyze CPU utilization patterns and identify CPU-bound processes.
+
+    Returns a workflow to diagnose high CPU usage, distinguish between
+    user-space and kernel CPU time, and identify optimization opportunities.
+    """
+    return """CPU usage analysis workflow:
+
+1. Get CPU baseline:
+   - Run: get_system_snapshot(categories=["cpu", "load"])
+   - Read cpu.assessment for quick diagnosis
+   - Note: ncpu value (number of CPUs/cores)
+
+2. Interpret CPU metrics:
+   - user_percent: Application code execution
+   - system_percent: Kernel/syscall overhead
+   - idle_percent: Unused CPU capacity
+   - iowait_percent: CPU waiting for I/O (NOT CPU-bound if high)
+   - Load average: Runnable + waiting processes (compare to ncpu)
+
+3. CPU pattern classification:
+   - High user + low system = CPU-intensive application (normal)
+   - High system + low user = Kernel overhead (syscalls, context switches)
+   - High iowait = NOT a CPU problem, it's disk/storage (see find_io_bottleneck)
+   - Load > ncpu = More demand than capacity (may include I/O wait)
+
+4. Find CPU hogs:
+   - Run: get_process_top(sort_by="cpu", limit=15)
+   - Note: cpu_percent > 100% means multi-core usage (e.g., 400% = 4 cores)
+   - Identify unexpected high CPU consumers
+
+5. Per-CPU breakdown (if needed):
+   - Run: search_metrics("kernel.percpu.cpu")
+   - Useful for: Thread affinity issues, interrupt handling imbalance
+   - Look for: One CPU at 100% while others idle (poor parallelization)
+
+6. Check for CPU saturation indicators:
+   - Run: query_metrics(["kernel.all.runnable", "kernel.all.pswitch"])
+   - High runnable count: More threads than cores (contention)
+   - High pswitch (context switches): Thread thrashing
+
+7. Distinguish workload types:
+   - Compute-bound: High user%, low syscalls (scientific, encoding, crypto)
+   - I/O-bound: High iowait%, moderate user% (databases, file processing)
+   - System-bound: High system%, moderate user% (network servers, many syscalls)
+
+8. Report:
+   - CPU utilization breakdown: X% user, Y% system, Z% iowait, W% idle
+   - Load average: 1/5/15 min values vs ncpu (e.g., "load 8.5 on 8-core = 106%")
+   - Top 5 CPU consumers with cpu_percent and command names
+   - CPU pattern: compute-bound / I/O-bound / system-bound
+   - Saturation indicators: runnable queue, context switches
+   - Recommendations:
+     * Low idle + high load → Add CPU capacity or optimize hot processes
+     * High iowait → Disk bottleneck, not CPU (see I/O investigation)
+     * High system% → Profile syscalls, reduce I/O frequency, optimize locking
+     * Single-threaded bottleneck → Parallelize if possible
+     * Many small processes → Reduce process spawning overhead
+"""
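The load-versus-`ncpu` comparison in steps 2-3 is the part most often misread; the helper below is an illustration only (not part of the package), with thresholds chosen as assumptions:

```python
# Illustration only: compare the 1-minute load average to the core count,
# as steps 2-3 of the workflow describe. The 0.7 "headroom" cutoff is an assumption.
def classify_load(load_1min: float, ncpu: int) -> str:
    ratio = load_1min / ncpu
    if ratio < 0.7:
        return f"headroom ({ratio:.0%} of capacity)"
    if ratio <= 1.0:
        return f"busy but keeping up ({ratio:.0%})"
    return f"saturated: more runnable work than cores ({ratio:.0%})"


print(classify_load(8.5, 8))  # the prompt's own example: load 8.5 on 8 cores ≈ 106%
```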
--- /dev/null
+++ pcp_mcp-1.4.0/src/pcp_mcp/prompts/diagnose.py
@@ -0,0 +1,54 @@
+"""Diagnose slow system prompt."""
+
+from __future__ import annotations
+
+from fastmcp.prompts import prompt
+
+from pcp_mcp.icons import ICON_DIAGNOSE, TAGS_DIAGNOSE
+
+
+@prompt(icons=[ICON_DIAGNOSE], tags=TAGS_DIAGNOSE)
+def diagnose_slow_system() -> str:
+    """Diagnose why a system is running slowly.
+
+    Returns a structured investigation workflow to identify performance
+    bottlenecks by examining CPU, memory, disk, and network metrics.
+    """
+    return """Investigate system slowness:
+
+1. Get baseline: get_system_snapshot(categories=["cpu", "memory", "load", "disk", "network"])
+
+2. Interpret the assessment fields:
+   - If cpu.assessment mentions "I/O wait": Disk bottleneck (skip to step 4)
+   - If cpu.assessment mentions "user/system": CPU bottleneck (go to step 3)
+   - If memory.assessment mentions "swap": Memory pressure (go to step 5)
+   - If load.assessment shows high load: Check load vs ncpu ratio
+
+3. Find CPU hogs:
+   - Run: get_process_top(sort_by="cpu", limit=10)
+   - Identify processes with high cpu_percent
+   - Note: cpu_percent > 100% means multi-threaded (e.g., 200% = 2 cores)
+
+4. Check disk I/O bottleneck:
+   - If disk.assessment shows high read/write rates
+   - Run: search_metrics("disk.dev") to see per-device metrics
+   - Run: get_process_top(sort_by="io", limit=10) to find I/O-heavy processes
+   - Cross-check: Does kernel.all.cpu.wait.total correlate with disk activity?
+
+5. Check memory pressure:
+   - If memory.assessment indicates swapping
+   - Run: get_process_top(sort_by="memory", limit=20)
+   - Look for large rss_bytes processes
+   - Check if swap usage is growing
+
+6. Check network saturation:
+   - If network.assessment shows high throughput
+   - Run: search_metrics("network.interface") for per-interface breakdown
+   - Look for interface errors or packet drops
+
+7. Report findings:
+   - Primary bottleneck (CPU/disk/memory/network)
+   - Specific culprits (process names, PIDs)
+   - Quantified impact (e.g., "process X using 45% CPU on 8-core system")
+   - Recommendations (kill process, add RAM, optimize queries, etc.)
+"""
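Step 2 routes the investigation by string-matching the assessment text; a hypothetical helper (not in the package) that mirrors that branching, with the substrings taken from the prompt itself:

```python
# Hypothetical routing helper mirroring step 2 of diagnose_slow_system.
def next_step(cpu_assessment: str, memory_assessment: str) -> str:
    if "I/O wait" in cpu_assessment:
        return "step 4: disk I/O bottleneck"
    if "swap" in memory_assessment.lower():
        return "step 5: memory pressure"
    if "user" in cpu_assessment or "system" in cpu_assessment:
        return "step 3: find CPU hogs"
    return "step 6: check network saturation"
```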
--- /dev/null
+++ pcp_mcp-1.4.0/src/pcp_mcp/prompts/disk.py
@@ -0,0 +1,69 @@
+"""Find I/O bottleneck prompt."""
+
+from __future__ import annotations
+
+from fastmcp.prompts import prompt
+
+from pcp_mcp.icons import ICON_DISK, TAGS_DISK
+
+
+@prompt(icons=[ICON_DISK], tags=TAGS_DISK)
+def find_io_bottleneck() -> str:
+    """Find disk I/O bottlenecks and identify processes causing high I/O.
+
+    Returns a workflow to diagnose disk performance issues, identify
+    hot devices, and find I/O-intensive processes.
+    """
+    return """Disk I/O investigation:
+
+1. Get system-wide I/O snapshot:
+   - Run: get_system_snapshot(categories=["disk", "cpu"])
+   - Check disk.assessment for read/write rates
+   - Check cpu.assessment for iowait_percent
+
+2. Interpret I/O metrics:
+   - High iowait_percent (>20%) = CPU waiting for disk
+   - Read vs write imbalance may indicate backup, logging, or database queries
+   - Sustained high I/O (>100 MB/s on HDD, >500 MB/s on SSD) = saturated
+
+3. Identify hot disks:
+   - Run: search_metrics("disk.dev")
+   - Run: query_metrics(["disk.dev.read_bytes", "disk.dev.write_bytes"])
+   - Note: These are COUNTERS, use get_system_snapshot for rates
+   - Look for specific devices with disproportionate activity
+
+4. Find I/O-heavy processes:
+   - Run: get_process_top(sort_by="io", limit=10, sample_interval=2.0)
+   - Note: Longer sample_interval (2-5s) gives more accurate I/O rates
+   - Identify processes with high io_read_bytes_sec or io_write_bytes_sec
+
+5. Correlate with CPU iowait:
+   - If cpu.iowait_percent is high AND disk I/O is high:
+     → Confirmed disk bottleneck
+   - If disk I/O is high BUT iowait is low:
+     → Fast storage keeping up (SSD/NVMe)
+   - If iowait is high BUT disk I/O is low:
+     → May be network storage (NFS) or storage controller issue
+
+6. Check for I/O patterns:
+   - Bursty I/O: Scheduled jobs, backups, log rotation
+   - Sustained I/O: Database, file server, streaming
+   - Random I/O: Database seeks (slow on HDD, fast on SSD)
+   - Sequential I/O: Backups, large file copies
+
+7. Advanced: Check per-partition I/O (if needed):
+   - Run: search_metrics("disk.partitions")
+   - Useful for systems with multiple partitions on same disk
+
+8. Report:
+   - Busiest disks by name (e.g., sda, nvme0n1)
+   - Read vs write breakdown (e.g., "80% reads, 20% writes")
+   - Top 3-5 processes causing I/O with rates
+   - I/O pattern: bursty vs sustained, random vs sequential
+   - Bottleneck severity: iowait % and queue depth
+   - Recommendations:
+     * High random I/O on HDD → Migrate to SSD
+     * Single process saturating disk → Optimize queries/access patterns
+     * Multiple processes fighting for I/O → I/O scheduler tuning or workload separation
+     * Backup/batch jobs during business hours → Reschedule
+"""
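Step 3's reminder that `disk.dev.*_bytes` are counters matters because a single sample is meaningless; a rate needs two samples and the interval between them. A small sketch of that conversion with fabricated values:

```python
# Sketch: turn two samples of a monotonically increasing counter into a rate.
def counter_rate(prev: int, curr: int, interval_s: float) -> float:
    """Bytes per second between two counter samples taken interval_s apart."""
    return (curr - prev) / interval_s


# Fabricated read_bytes samples taken 2 seconds apart.
print(f"{counter_rate(1_200_000_000, 1_450_000_000, 2.0) / 1e6:.0f} MB/s")  # 125 MB/s
```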
--- /dev/null
+++ pcp_mcp-1.4.0/src/pcp_mcp/prompts/memory.py
@@ -0,0 +1,60 @@
+"""Investigate memory usage prompt."""
+
+from __future__ import annotations
+
+from fastmcp.prompts import prompt
+
+from pcp_mcp.icons import ICON_MEMORY, TAGS_MEMORY
+
+
+@prompt(icons=[ICON_MEMORY], tags=TAGS_MEMORY)
+def investigate_memory_usage() -> str:
+    """Investigate memory consumption and identify memory pressure.
+
+    Returns a workflow to analyze memory utilization, identify memory
+    hogs, and distinguish between normal cache usage and actual pressure.
+    """
+    return """Memory investigation workflow:
+
+1. Get memory overview:
+   - Run: get_system_snapshot(categories=["memory"])
+   - Read memory.assessment field for quick diagnosis
+
+2. Interpret memory metrics:
+   - mem.util.available is KEY metric (not "free"!)
+   - Large cache is NORMAL (Linux uses free RAM for cache)
+   - Swapping = BAD (indicates memory pressure)
+   - Check used_percent vs swap usage
+
+3. Assessment-based actions:
+   - "Memory pressure" → Go to step 4
+   - "Cache is large" → Normal, but check top consumers anyway (step 4)
+   - "Swapping actively" → CRITICAL, go to step 4 immediately
+
+4. Find memory consumers:
+   - Run: get_process_top(sort_by="memory", limit=20)
+   - Note processes with high rss_bytes
+   - Calculate: rss_percent shows memory impact
+   - Look for unexpected memory hogs (leaked memory, runaway processes)
+
+5. Detailed memory breakdown:
+   - Run: search_metrics("mem.util") for full breakdown
+   - Check: mem.util.slab (kernel memory)
+   - Check: mem.util.anonpages (process private memory)
+   - Check: mem.util.swapCached (pages swapped but still in RAM)
+
+6. NUMA systems (if applicable):
+   - Run: search_metrics("mem.numa") to check per-node allocation
+   - Look for imbalanced NUMA usage
+
+7. Report:
+   - Total memory: X GB
+   - Used: Y% (Z GB used, W GB available)
+   - Top 5 memory consumers with RSS sizes
+   - Swap status: active/inactive, growth rate if swapping
+   - Recommendation:
+     * No pressure + large cache = Normal
+     * High usage + no swap = Monitor but OK
+     * Active swapping = Add RAM or reduce load
+     * Single process consuming >50% = Investigate for memory leak
+"""
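Step 2's point that `mem.util.available` (not "free") is the key metric is clearer with numbers; the values below are fabricated for illustration:

```python
# Illustration only: a healthy host can show very little "free" memory while
# most RAM is reclaimable cache, which is why the prompt keys off "available".
total_kb = 16_000_000
free_kb = 400_000          # alarming in isolation
available_kb = 9_200_000   # includes cache the kernel can reclaim

used_percent = 100 * (total_kb - available_kb) / total_kb
print(f"~{used_percent:.0f}% used despite only {free_kb // 1024} MB 'free'")  # roughly 42%
```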
--- /dev/null
+++ pcp_mcp-1.4.0/src/pcp_mcp/prompts/network.py
@@ -0,0 +1,67 @@
+"""Check network performance prompt."""
+
+from __future__ import annotations
+
+from fastmcp.prompts import prompt
+
+from pcp_mcp.icons import ICON_NETWORK, TAGS_NETWORK
+
+
+@prompt(icons=[ICON_NETWORK], tags=TAGS_NETWORK)
+def check_network_performance() -> str:
+    """Check network performance and identify bandwidth/error issues.
+
+    Returns a workflow to analyze network throughput, identify saturated
+    interfaces, and detect packet loss or errors.
+    """
+    return """Network performance investigation:
+
+1. Get network overview:
+   - Run: get_system_snapshot(categories=["network"])
+   - Read network.assessment for quick diagnosis
+   - Note: Rates are per-second (bytes/sec, packets/sec)
+
+2. Interpret network metrics:
+   - in_bytes_sec / out_bytes_sec: Throughput (compare to link speed)
+   - in_packets_sec / out_packets_sec: Packet rate
+   - Assessment field indicates saturation or errors
+
+3. Per-interface breakdown:
+   - Run: search_metrics("network.interface")
+   - Run: query_metrics(["network.interface.in.bytes", "network.interface.out.bytes"])
+   - Note: These are COUNTERS, use get_system_snapshot for rates
+   - Identify busy interfaces vs idle interfaces (e.g., eth0 busy, lo idle)
+
+4. Check for errors and drops:
+   - Run: query_metrics(["network.interface.in.errors", "network.interface.out.errors"])
+   - Run: query_metrics(["network.interface.in.drops", "network.interface.out.drops"])
+   - Non-zero errors = Hardware, driver, or cable issues
+   - Non-zero drops = Buffer overflow (traffic exceeds processing capacity)
+
+5. Calculate interface saturation:
+   - Compare throughput to link speed (e.g., 950 Mbps on 1 Gbps link = 95%)
+   - Sustained >80% = Approaching saturation
+   - Bursts >95% = Temporarily saturated
+
+6. Find network-heavy processes (indirect):
+   - PCP proc.* namespace doesn't have per-process network metrics
+   - Use system tools: netstat, ss, iftop (outside PCP)
+   - Or correlate: High network I/O often correlates with high CPU/disk I/O
+
+7. Check protocol-level stats (if needed):
+   - Run: search_metrics("network.tcp")
+   - Run: search_metrics("network.udp")
+   - Look for: Retransmissions, failed connections, buffer overflows
+
+8. Report:
+   - Per-interface throughput (e.g., "eth0: 850 Mbps in, 120 Mbps out")
+   - Link utilization % (if link speed known)
+   - Errors/drops: Count and affected interfaces
+   - Traffic pattern: Symmetric (similar in/out) vs asymmetric (download/upload heavy)
+   - Packet rate: Normal vs abnormal (tiny packets = inefficient, possible attack)
+   - Recommendations:
+     * High utilization + no errors → Upgrade link or load balance
+     * Errors/drops present → Check cables, NIC drivers, switch ports
+     * Asymmetric traffic → Normal for client (download heavy) or server (upload heavy)
+     * High packet rate + low byte rate → Small packets (check for SYN flood, fragmentation)
+"""
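Step 5 compares byte rates to link speed, which PCP does not report by itself; a sketch of that arithmetic, with the link speed supplied as an outside assumption:

```python
# Sketch of the step-5 saturation check. The byte rate comes from the snapshot;
# the link speed (1 Gbps here) must come from elsewhere (ethtool, inventory).
def link_utilization_percent(bytes_per_sec: float, link_speed_mbps: int) -> float:
    return 100 * (bytes_per_sec * 8) / (link_speed_mbps * 1_000_000)


print(f"{link_utilization_percent(118_750_000, 1000):.0f}%")  # 950 Mbps on a 1 Gbps link ≈ 95%
```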
--- pcp_mcp-1.3.1/src/pcp_mcp/server.py
+++ pcp_mcp-1.4.0/src/pcp_mcp/server.py
@@ -4,10 +4,12 @@ from __future__ import annotations
 
 from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
+from pathlib import Path
 from typing import Any
 
 from fastmcp import FastMCP
 from fastmcp.server.middleware.logging import StructuredLoggingMiddleware
+from fastmcp.server.providers import FileSystemProvider
 
 from pcp_mcp.client import PCPClient
 from pcp_mcp.config import PCPMCPSettings
@@ -108,13 +110,6 @@ Tools:
 - get_system_snapshot: System overview (CPU, memory, disk, network) - USE THIS FIRST
 - get_process_top: Top processes by resource consumption
 
-Resources:
-- pcp://health - Quick system health summary
-- pcp://host/{{hostname}}/health - Per-host health summary (template)
-- pcp://metric/{{name}}/info - Detailed metric metadata (template)
-- pcp://metrics/common - Catalog of commonly used metrics
-- pcp://namespaces - Dynamically discovered metric namespaces
-
 Prompts (invoke for guided troubleshooting workflows):
 - diagnose_slow_system: Complete slowness investigation
 - investigate_memory_usage: Memory pressure analysis
@@ -133,12 +128,12 @@ Prompts (invoke for guided troubleshooting workflows):
     )
     mcp.add_middleware(MetricCacheMiddleware())
 
-
-
-
-
-
-
-
+    # Auto-discover tools and prompts from filesystem
+    base_dir = Path(__file__).parent
+    provider = FileSystemProvider(
+        root=base_dir,
+        reload=False,
+    )
+    mcp.add_provider(provider)
 
     return mcp
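The `FileSystemProvider` block replaces seven removed lines (136-142) whose content is not rendered in this diff view. A condensed sketch of the new path, limited to the imports and calls visible in the hunk; constructing `FastMCP` directly with a placeholder name is an assumption here:

```python
# Condensed sketch of the auto-discovery wiring added in server.py.
from pathlib import Path

from fastmcp import FastMCP
from fastmcp.server.providers import FileSystemProvider

mcp = FastMCP("pcp-mcp-sketch")  # placeholder; the package builds its server in a factory function
mcp.add_provider(FileSystemProvider(root=Path(__file__).parent, reload=False))
```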
--- /dev/null
+++ pcp_mcp-1.4.0/src/pcp_mcp/tools/__init__.py
@@ -0,0 +1,37 @@
+"""Tool registration for the PCP MCP server."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from fastmcp import FastMCP
+
+
+def register_tools(mcp: FastMCP) -> None:
+    """Register all tools with the MCP server.
+
+    Args:
+        mcp: The FastMCP server instance.
+    """
+    from pcp_mcp.tools.metrics import (
+        describe_metric,
+        query_metrics,
+        search_metrics,
+    )
+    from pcp_mcp.tools.system import (
+        get_filesystem_usage,
+        get_process_top,
+        get_system_snapshot,
+        quick_health,
+        smart_diagnose,
+    )
+
+    mcp.add_tool(query_metrics)
+    mcp.add_tool(search_metrics)
+    mcp.add_tool(describe_metric)
+    mcp.add_tool(get_system_snapshot)
+    mcp.add_tool(quick_health)
+    mcp.add_tool(get_process_top)
+    mcp.add_tool(smart_diagnose)
+    mcp.add_tool(get_filesystem_usage)