vmware-debug 1.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_server/__init__.py +1 -0
- mcp_server/server.py +78 -0
- vmware_debug/__init__.py +8 -0
- vmware_debug/cli.py +85 -0
- vmware_debug/envelope.py +173 -0
- vmware_debug/mcp/__init__.py +2 -0
- vmware_debug/mcp/tools.py +33 -0
- vmware_debug/ops/__init__.py +1 -0
- vmware_debug/ops/timeline.py +312 -0
- vmware_debug-1.6.1.dist-info/METADATA +52 -0
- vmware_debug-1.6.1.dist-info/RECORD +13 -0
- vmware_debug-1.6.1.dist-info/WHEEL +4 -0
- vmware_debug-1.6.1.dist-info/entry_points.txt +3 -0
mcp_server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""stdio MCP server package for vmware-debug."""
|
mcp_server/server.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""vmware-debug MCP server entry point.
|
|
2
|
+
|
|
3
|
+
Tools are defined in vmware_debug.mcp.tools (so audit logs see skill=debug).
|
|
4
|
+
This module wires them into a FastMCP server and provides the stdio entry point.
|
|
5
|
+
|
|
6
|
+
Note: signatures here use typing.Optional, never PEP 604 ``X | None`` — FastMCP
|
|
7
|
+
reflects these at registration and ``X | None`` crashes on Python 3.10 + older
|
|
8
|
+
mcp/pydantic (CLAUDE.md pitfall #33).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import sys
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from mcp.server.fastmcp import FastMCP
|
|
15
|
+
|
|
16
|
+
from vmware_debug.mcp import tools as t
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def build_server() -> FastMCP:
    """Create the FastMCP instance and register the two read-only debug tools.

    Both registered callables are thin forwarders to ``vmware_debug.mcp.tools``.
    Their docstrings are reproduced unchanged (presumably they are what MCP
    clients see as the tool description — confirm against FastMCP).
    """
    mcp_app = FastMCP("vmware-debug")

    @mcp_app.tool(name="incident_timeline")
    def _incident_timeline_impl(
        events: list[dict],
        bin_seconds: Optional[float] = None,
        z_threshold: float = 2.0,
        top_n: int = 5,
    ) -> dict:
        """[READ] Correlate already-fetched VMware events into one incident view.

        WHEN: after you've pulled events for an incident from the data-source
        skills (vmware-monitor event_list/alarm_list, vmware-aria alerts/anomaly,
        vmware-log-insight log_search/log_aggregate, vmware-nsx) — feed them all
        here to find what correlates and where to look next. This tool does NOT
        fetch anything itself; it has no vCenter/network access.

        INPUT: events = list of event envelopes, each {ts, source, severity,
        entity, text, fields} (ts may be ISO-8601, epoch seconds, or millis;
        severity is normalised). Optional: bin_seconds (time-bin width; auto if
        omitted), z_threshold (spike sensitivity, default 2.0), top_n (max
        hypotheses, default 5).

        RETURNS: {event_count, window, spikes (anomalous time bins), hypotheses
        (ranked root-cause candidates, each with a suggested_check), next_checks
        (concrete ideas for what to investigate next, including which skill/tool
        to run)}.

        GOTCHAS: read-only and stateless — nothing is executed. Remediation is
        routed to vmware-aiops (single fix) or vmware-pilot (multi-step, gated).
        A malformed event raises ValueError naming its index."""
        return t.incident_timeline(
            events,
            bin_seconds=bin_seconds,
            z_threshold=z_threshold,
            top_n=top_n,
        )

    @mcp_app.tool(name="list_symptom_categories")
    def _list_symptom_categories_impl() -> list[dict]:
        """[READ] List the symptom categories vmware-debug recognises and, for
        each, example keywords and the suggested next check (which skill/tool to
        run). Takes no parameters. Use this when you don't yet know what to look
        at — it turns "something's wrong" into concrete investigation steps.
        Read-only; no network access."""
        catalogue = t.list_symptom_categories()
        return catalogue

    return mcp_app
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def main() -> None:
    """Entry point for `vmware-debug-mcp` (stdio transport).

    Exits with an explanatory message on interpreters older than 3.11;
    otherwise builds the server and runs it.
    """
    interpreter_too_old = sys.version_info < (3, 11)
    if interpreter_too_old:
        message = (
            "vmware-debug-mcp requires Python >= 3.11 (FastMCP schema reflection "
            "is unreliable on 3.10). Reinstall under 3.11+: "
            "uv tool install --python 3.11 vmware-debug"
        )
        sys.exit(message)
    server = build_server()
    server.run()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
if __name__ == "__main__":
|
|
78
|
+
main()
|
vmware_debug/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""vmware-debug — VMware diagnostic brain.
|
|
2
|
+
|
|
3
|
+
Read-only incident triage: correlate events from monitor/aria/log-insight/nsx
|
|
4
|
+
into a unified timeline, detect spikes, rank root-cause hypotheses, and route
|
|
5
|
+
remediation to vmware-aiops / vmware-pilot. Never writes; never executes fixes.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "1.6.1"
|
vmware_debug/cli.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""vmware-debug CLI — read-only incident triage from the terminal.
|
|
2
|
+
|
|
3
|
+
Correlation is local and offline: feed it events you've already collected (a
|
|
4
|
+
JSON file or stdin) and it returns a ranked timeline + next-check ideas. The
|
|
5
|
+
`mcp` subcommand starts the stdio MCP server (entry point that does not touch
|
|
6
|
+
the network, so it works behind corporate TLS proxies — CLAUDE.md pitfall #25).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import sys
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
import typer
|
|
17
|
+
from rich.console import Console
|
|
18
|
+
from rich.table import Table
|
|
19
|
+
|
|
20
|
+
from vmware_debug import __version__
|
|
21
|
+
from vmware_debug.mcp.tools import incident_timeline, list_symptom_categories
|
|
22
|
+
|
|
23
|
+
app = typer.Typer(
|
|
24
|
+
add_completion=False,
|
|
25
|
+
help="VMware diagnostic brain — read-only incident triage and root-cause routing.",
|
|
26
|
+
)
|
|
27
|
+
console = Console()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@app.command()
def version() -> None:
    """Print the installed version."""
    banner = f"vmware-debug {__version__}"
    console.print(banner)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@app.command()
def categories() -> None:
    """List the symptom categories debug recognises and what to check for each."""
    # One table row per catalogue entry: name, joined keywords, next check.
    overview = Table(title="vmware-debug symptom categories")
    overview.add_column("category", style="cyan")
    overview.add_column("example keywords")
    overview.add_column("suggested check", style="green")
    for entry in list_symptom_categories():
        keyword_cell = ", ".join(entry["example_keywords"])
        overview.add_row(entry["category"], keyword_cell, entry["suggested_check"])
    console.print(overview)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@app.command()
def triage(
    events_file: Optional[Path] = typer.Option(
        None, "--events", "-e", help="JSON file of event envelopes; reads stdin if omitted."
    ),
    bin_seconds: Optional[float] = typer.Option(None, help="Time-bin width; auto if omitted."),
    top_n: int = typer.Option(5, help="Max hypotheses to return."),
) -> None:
    """Correlate a set of pre-collected events into a ranked incident timeline."""
    # Function-scope import: only the error paths below need it. Exception text
    # can contain square brackets copied from event data (e.g. "[/var/log]"),
    # which Rich would otherwise try to interpret as markup tags.
    from rich.markup import escape

    # Read the payload. The file is decoded as UTF-8 explicitly rather than with
    # the platform's locale encoding; an unreadable or undecodable file is
    # reported as a usage error (exit code 2) instead of a traceback.
    try:
        raw = events_file.read_text(encoding="utf-8") if events_file else sys.stdin.read()
    except (OSError, UnicodeDecodeError) as exc:
        console.print(f"[red]Could not read events:[/red] {escape(str(exc))}")
        raise typer.Exit(code=2) from exc

    try:
        events = json.loads(raw)
    except json.JSONDecodeError as exc:
        console.print(
            f"[red]Invalid JSON:[/red] {escape(str(exc))}. "
            "Provide a JSON array of event envelopes."
        )
        raise typer.Exit(code=2) from exc
    if not isinstance(events, list):
        console.print("[red]Expected a JSON array of event envelopes.[/red]")
        raise typer.Exit(code=2)

    # incident_timeline raises ValueError (naming the offending index) when an
    # event cannot be normalised; surface that as exit code 2 as well.
    try:
        result = incident_timeline(events, bin_seconds=bin_seconds, top_n=top_n)
    except ValueError as exc:
        console.print(f"[red]Could not correlate events:[/red] {escape(str(exc))}")
        raise typer.Exit(code=2) from exc

    console.print_json(data=result)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@app.command()
def mcp() -> None:
    """Start the stdio MCP server (no network access; proxy-safe)."""
    # Deferred import: the server module is loaded only when this subcommand runs.
    from mcp_server import server as _server

    _server.main()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
if __name__ == "__main__":
|
|
85
|
+
app()
|
vmware_debug/envelope.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""The unified event envelope — the contract between vmware-debug and every
|
|
2
|
+
data-source skill (monitor, aria, log-insight, nsx, ...).
|
|
3
|
+
|
|
4
|
+
vmware-debug deliberately has NO runtime dependency on the other skill packages
|
|
5
|
+
(CLAUDE.md pitfall #21/#32: no hidden cross-skill coupling, no version lockstep).
|
|
6
|
+
Instead the orchestrating agent fetches events with each skill's own read tools
|
|
7
|
+
and hands them to debug's correlator as plain dicts. This module normalises
|
|
8
|
+
those heterogeneous dicts into one immutable ``Event`` shape so the timeline /
|
|
9
|
+
spike / hypothesis logic can stay source-agnostic and unit-testable.
|
|
10
|
+
|
|
11
|
+
Envelope shape (also documented in references/event-envelope.md):
|
|
12
|
+
|
|
13
|
+
{
|
|
14
|
+
"ts": <ISO8601 string | epoch seconds | epoch millis>,
|
|
15
|
+
"source": "monitor" | "aria" | "loginsight" | "nsx" | ...,
|
|
16
|
+
"severity": "critical" | "error" | "warning" | "info" | "unknown",
|
|
17
|
+
"entity": "vm-web01" | "host-12" | "" ,
|
|
18
|
+
"text": "<human-readable message>",
|
|
19
|
+
"fields": { ... source-specific extras ... }
|
|
20
|
+
}
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from dataclasses import dataclass, field
|
|
26
|
+
from datetime import datetime, timezone
|
|
27
|
+
|
|
28
|
+
# Canonical severities, ordered by weight (higher = more severe). Used both for
|
|
29
|
+
# normalisation and for hypothesis scoring.
|
|
30
|
+
SEVERITY_WEIGHT: dict[str, int] = {
|
|
31
|
+
"critical": 5,
|
|
32
|
+
"error": 4,
|
|
33
|
+
"warning": 3,
|
|
34
|
+
"info": 1,
|
|
35
|
+
"unknown": 0,
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
# Common vendor spellings mapped onto the canonical set. Lower-cased on lookup.
|
|
39
|
+
_SEVERITY_ALIASES: dict[str, str] = {
|
|
40
|
+
"crit": "critical",
|
|
41
|
+
"critical": "critical",
|
|
42
|
+
"fatal": "critical",
|
|
43
|
+
"alert": "critical",
|
|
44
|
+
"emergency": "critical",
|
|
45
|
+
"err": "error",
|
|
46
|
+
"error": "error",
|
|
47
|
+
"red": "error",
|
|
48
|
+
"warn": "warning",
|
|
49
|
+
"warning": "warning",
|
|
50
|
+
"yellow": "warning",
|
|
51
|
+
"notice": "info",
|
|
52
|
+
"info": "info",
|
|
53
|
+
"information": "info",
|
|
54
|
+
"informational": "info",
|
|
55
|
+
"green": "info",
|
|
56
|
+
"debug": "info",
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(frozen=True)
class Event:
    """A single normalised observation placed on the incident timeline.

    Instances are frozen (attribute assignment raises).
    """

    # Epoch seconds (UTC).
    ts: float
    # Name of the originating data source / skill.
    source: str
    # Severity label (canonical after normalisation).
    severity: str
    # Affected object; empty string when none was supplied.
    entity: str
    # Human-readable message; empty string when none was supplied.
    text: str
    # Source-specific extras; each instance gets its own empty dict by default.
    fields: dict = field(default_factory=dict)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def normalize_severity(raw: object) -> str:
    """Translate an arbitrary severity token into a canonical severity name.

    The token is stringified, stripped and lower-cased before the alias lookup.
    ``None`` and any token without an alias entry both yield ``"unknown"``.
    """
    if raw is not None:
        token = str(raw).strip().lower()
        if token in _SEVERITY_ALIASES:
            return _SEVERITY_ALIASES[token]
    return "unknown"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# Numeric timestamps below this (epoch seconds for ~1973-03) are implausible for
# VMware incident data and almost certainly a parse error (e.g. a bare year like
# "2020" -> 1970). Rejected loudly rather than landing silently at the epoch.
_MIN_PLAUSIBLE_EPOCH = 10**8


def parse_timestamp(raw: object) -> float:
    """Parse a timestamp into epoch seconds (UTC).

    Accepted inputs:

    * ``int`` / ``float`` epoch seconds, or epoch milliseconds (auto-detected
      by magnitude: values above 1e11 are divided by 1000).
    * Strings holding such a number. A string that is a *plausible* epoch is
      always read as a number, so a millisecond value like ``"1712011230000"``
      is never mistaken for a compact ISO date (newer ``fromisoformat``
      implementations would otherwise read it as a date in the year 1712).
    * ISO-8601 strings, with or without a trailing ``Z``. Naive datetimes are
      taken to be UTC.

    Args:
        raw: The timestamp value taken from an event dict.

    Returns:
        Epoch seconds as a float.

    Raises:
        ValueError: for ``bool`` (an ``int`` subclass, excluded explicitly),
            NaN / infinity, empty strings, numbers below the plausibility
            floor (e.g. a bare year), unparseable strings, and unsupported types.
    """
    import math  # function-scope so the module's import block stays untouched

    if isinstance(raw, bool):
        raise ValueError(f"unparseable timestamp: {raw!r}")
    if isinstance(raw, (int, float)):
        try:
            value = float(raw)
        except OverflowError as exc:
            # An int too large for a float cannot be a real timestamp.
            raise ValueError(f"implausible epoch timestamp: {raw!r}") from exc
        if not math.isfinite(value):
            # NaN compares False against every bound below and would slip through.
            raise ValueError(f"unparseable timestamp: {raw!r}")
        # 1e11 is the plausibility floor (1e8 s) expressed in milliseconds, so
        # anything larger is treated as millis; as seconds it would lie
        # thousands of years in the future.
        if value > 1e11:
            value /= 1000.0
        if value < _MIN_PLAUSIBLE_EPOCH:
            raise ValueError(f"implausible epoch timestamp: {raw!r}")
        return value
    if isinstance(raw, str):
        text = raw.strip()
        if not text:
            raise ValueError("empty timestamp")
        try:
            numeric = float(text)
        except ValueError:
            numeric = None
        if numeric is not None:
            if not math.isfinite(numeric):
                raise ValueError(f"unparseable timestamp: {raw!r}")
            try:
                return parse_timestamp(numeric)
            except ValueError:
                # Not a plausible epoch; it may still be an all-digit compact
                # ISO date, so give the ISO parser a chance before rejecting.
                pass
        # Only a trailing designator means UTC; rewrite it as an explicit
        # offset for interpreters whose fromisoformat() does not accept "Z".
        iso_text = text[:-1] + "+00:00" if text[-1] in "Zz" else text
        try:
            dt = datetime.fromisoformat(iso_text)
        except ValueError:
            if numeric is not None:
                raise ValueError(f"implausible epoch timestamp: {raw!r}") from None
            raise ValueError(f"unparseable timestamp: {raw!r}") from None
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt.timestamp()
    raise ValueError(f"unparseable timestamp: {raw!r}")
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _first(d: dict, *keys: str) -> object:
|
|
122
|
+
"""Return the first present, non-None value among ``keys``."""
|
|
123
|
+
for k in keys:
|
|
124
|
+
if k in d and d[k] is not None:
|
|
125
|
+
return d[k]
|
|
126
|
+
return None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def normalize_event(raw: dict, source: str | None = None) -> Event:
    """Normalise one source-specific event dict into an :class:`Event`.

    For each envelope slot the first present, non-None key from a list of
    accepted spellings is used. Keys outside those lists are preserved under
    ``Event.fields``. When ``raw`` itself carries a ``fields`` dict (the
    envelope's own extras container), its entries are merged into
    ``Event.fields`` rather than nested as ``fields["fields"]``; a top-level
    extra with the same name takes precedence.

    Args:
        raw: The event dict as returned by a data-source skill.
        source: Optional source label that overrides any ``source``/``skill``
            key in ``raw``.

    Returns:
        The normalised, immutable event.

    Raises:
        ValueError: if no timestamp key is present or the timestamp cannot be
            parsed.
    """
    # Accepted spellings per slot. The "known" set below is derived from these
    # so the lookup lists and the extras filter cannot drift apart.
    ts_keys = ("ts", "timestamp", "time", "createTime", "startTimeUTC")
    source_keys = ("source", "skill")
    severity_keys = ("severity", "criticality", "level", "status")
    entity_keys = (
        "entity", "entity_name", "resourceName", "vm", "vm_name", "object", "host",
    )
    text_keys = ("text", "message", "msg", "description", "fullFormattedMessage")

    ts_raw = _first(raw, *ts_keys)
    if ts_raw is None:
        raise ValueError(f"event has no timestamp field: {raw!r}")

    src = source or _first(raw, *source_keys) or "unknown"
    sev = normalize_severity(_first(raw, *severity_keys))
    entity = _first(raw, *entity_keys)
    text = _first(raw, *text_keys)

    known = {*ts_keys, *source_keys, *severity_keys, *entity_keys, *text_keys}
    nested = raw.get("fields")
    merge_nested = isinstance(nested, dict)
    # Start from the envelope's own extras (copied, so the caller's dict is
    # never shared with the Event), then layer top-level extras on top.
    extras = dict(nested) if merge_nested else {}
    for key, value in raw.items():
        if key in known or (merge_nested and key == "fields"):
            continue
        extras[key] = value

    return Event(
        ts=parse_timestamp(ts_raw),
        source=str(src),
        severity=sev,
        entity=str(entity) if entity is not None else "",
        text=str(text) if text is not None else "",
        fields=extras,
    )
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def normalize_events(raw_events: list[dict], source: str | None = None) -> list[Event]:
    """Normalise every raw event in order; nothing is skipped.

    The first event that fails (ValueError, AttributeError or TypeError from
    the single-event normaliser) aborts the batch with a ValueError whose
    message names the event's index.
    """

    def _convert(index: int, item: dict) -> Event:
        try:
            return normalize_event(item, source)
        except (ValueError, AttributeError, TypeError) as exc:
            raise ValueError(f"event[{index}] could not be normalised: {exc}") from exc

    return [_convert(index, item) for index, item in enumerate(raw_events)]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""vmware-debug MCP tool logic — pure, read-only correlation. No network, no
|
|
2
|
+
writes, no cross-skill imports. The agent fetches events with the other skills'
|
|
3
|
+
read tools and passes them here as plain dicts (the unified event envelope)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from vmware_debug.envelope import normalize_events
|
|
10
|
+
from vmware_debug.ops.timeline import category_routing
|
|
11
|
+
from vmware_debug.ops.timeline import incident_timeline as _incident_timeline
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def incident_timeline(
    events: list[dict],
    bin_seconds: Optional[float] = None,
    z_threshold: float = 2.0,
    top_n: int = 5,
) -> dict:
    """Normalise raw event envelopes, then run the correlation engine on them.

    ``events`` is a list of event envelopes (see references/event-envelope.md).
    The remaining arguments are forwarded unchanged to the engine. A ValueError
    naming the offending index is raised if an event can't be normalised.
    """
    engine_options = {
        "bin_seconds": bin_seconds,
        "z_threshold": z_threshold,
        "top_n": top_n,
    }
    return _incident_timeline(normalize_events(events), **engine_options)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def list_symptom_categories() -> list[dict]:
    """Return the symptom-category catalogue produced by ``category_routing``."""
    catalogue = category_routing()
    return catalogue
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Diagnostic operations for vmware-debug."""
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""Incident correlation engine — the heart of vmware-debug.
|
|
2
|
+
|
|
3
|
+
Pure functions over a list of normalised :class:`~vmware_debug.envelope.Event`.
|
|
4
|
+
No I/O, no network, no cross-skill imports: the orchestrating agent fetches
|
|
5
|
+
events via each data-source skill's read tools and feeds them here. That keeps
|
|
6
|
+
the valuable logic (timeline merge, spike detection, hypothesis ranking, and
|
|
7
|
+
next-check suggestions) self-contained and unit-testable.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
|
|
14
|
+
from vmware_debug.envelope import SEVERITY_WEIGHT, Event
|
|
15
|
+
|
|
16
|
+
# A symptom taxonomy: keyword signatures -> (category, which skill/tool to look
|
|
17
|
+
# at next). This is what lets debug "give a valuable idea even when the user
|
|
18
|
+
# doesn't know what to check". Keywords are matched case-insensitively against
|
|
19
|
+
# event text + entity. Order matters only for the human-readable label; scoring
|
|
20
|
+
# counts every category that matches.
|
|
21
|
+
_CATEGORY_SIGNATURES: tuple[tuple[str, tuple[str, ...], str], ...] = (
|
|
22
|
+
(
|
|
23
|
+
"storage",
|
|
24
|
+
("datastore", "scsi", "latency", "vsan", "lun", "naa.", "apd", "pdl",
|
|
25
|
+
"disk full", "no space", "vmfs", "iscsi"),
|
|
26
|
+
"vmware-storage (datastore/vsan health) + vmware-log-insight (search "
|
|
27
|
+
"the host's vmkernel for scsi/apd events around the spike)",
|
|
28
|
+
),
|
|
29
|
+
(
|
|
30
|
+
"network",
|
|
31
|
+
("vmotion", "vnic", "dvswitch", "dvs ", "uplink", "link down", "mtu",
|
|
32
|
+
"firewall", "dfw", "segment", "tier-0", "tier-1", "bgp", "packet drop"),
|
|
33
|
+
"vmware-nsx / vmware-nsx-security (run a traceflow between the affected "
|
|
34
|
+
"endpoints; check DFW rule hits) + vmware-log-insight (network logs)",
|
|
35
|
+
),
|
|
36
|
+
(
|
|
37
|
+
"compute",
|
|
38
|
+
("cpu ready", "memory", "balloon", "swap", "contention", "overcommit",
|
|
39
|
+
"numa"),
|
|
40
|
+
"vmware-aria (CPU-ready / memory-contention metrics + anomalies for the "
|
|
41
|
+
"VM and its host) ",
|
|
42
|
+
),
|
|
43
|
+
(
|
|
44
|
+
"ha_drs",
|
|
45
|
+
("ha ", "high availability", "drs", "failover", "admission control",
|
|
46
|
+
"host isolation", "heartbeat"),
|
|
47
|
+
"vmware-monitor (cluster + host health, recent HA/DRS events) + "
|
|
48
|
+
"vmware-aiops (cluster state)",
|
|
49
|
+
),
|
|
50
|
+
(
|
|
51
|
+
"power_lifecycle",
|
|
52
|
+
("power on", "power off", "failed to start", "boot", "vmx", "ovf",
|
|
53
|
+
"deploy", "clone", "snapshot", "consolidate"),
|
|
54
|
+
"vmware-aiops (VM task status, snapshot tree) + vmware-monitor (the VM's "
|
|
55
|
+
"recent events)",
|
|
56
|
+
),
|
|
57
|
+
(
|
|
58
|
+
"auth",
|
|
59
|
+
("login", "authentication", "permission", "denied", "unauthorized",
|
|
60
|
+
"401", "403", "token", "certificate", "tls"),
|
|
61
|
+
"check the service account + credentials in config/.env; verify the "
|
|
62
|
+
"target's certificate/time sync",
|
|
63
|
+
),
|
|
64
|
+
(
|
|
65
|
+
"platform",
|
|
66
|
+
("vpxd", "hostd", "service", "restart", "crash", "core dump", "503",
|
|
67
|
+
"not responding", "disconnected"),
|
|
68
|
+
"vmware-monitor (host connection state + service health) + "
|
|
69
|
+
"vmware-log-insight (vpxd/hostd logs around the first error)",
|
|
70
|
+
),
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass(frozen=True)
class Bucket:
    """One fixed-width time bin together with its event tallies."""

    # Bin boundaries in the same unit as Event.ts.
    start: float
    end: float
    # Total number of events that fell into the bin.
    count: int
    # Per-severity and per-source event counts; a fresh dict per instance by default.
    by_severity: dict[str, int] = field(default_factory=dict)
    by_source: dict[str, int] = field(default_factory=dict)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass(frozen=True)
class Spike:
    """A time bin flagged because its event count is anomalously high."""

    # Boundaries and event count copied from the flagged bin.
    start: float
    end: float
    count: int
    # How far the bin's count sits above the mean, in standard deviations.
    zscore: float
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass(frozen=True)
class Hypothesis:
    """A ranked root-cause candidate: its evidence plus a suggested next step."""

    # Symptom category name ("uncategorized" when nothing matched).
    category: str
    # Ranking score for the category.
    score: float
    # One-line human-readable description of the evidence.
    summary: str
    # Number of events backing the hypothesis.
    evidence_count: int
    # Timestamps of the earliest and latest supporting events.
    first_seen: float
    last_seen: float
    # Excerpt of a representative event message.
    sample_text: str
    # What to investigate next (which skill/tool to run).
    suggested_check: str
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def build_timeline(events: list[Event]) -> list[Event]:
    """Return a new list of the events in ascending ``ts`` order.

    The sort is stable: events with equal timestamps keep their input order.
    The input list is left untouched.
    """
    chronological = list(events)
    chronological.sort(key=lambda event: event.ts)
    return chronological
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def bin_events(events: list[Event], bin_seconds: float) -> list[Bucket]:
    """Group events into consecutive fixed-width bins spanning [min_ts, max_ts].

    Bins start at the earliest event's timestamp; empty bins in between are
    still emitted with a count of zero. An empty input returns an empty list.

    Raises:
        ValueError: if events are supplied and ``bin_seconds`` is not positive.
    """
    if not events:
        return []
    if bin_seconds <= 0:
        raise ValueError("bin_seconds must be positive")

    timeline = build_timeline(events)
    origin = timeline[0].ts
    last_slot = int((timeline[-1].ts - origin) // bin_seconds)

    totals = [0] * (last_slot + 1)
    severity_tallies: list[dict] = [{} for _ in totals]
    source_tallies: list[dict] = [{} for _ in totals]
    for event in timeline:
        # Clamp so floating-point rounding can never index past the final bin.
        slot = min(int((event.ts - origin) // bin_seconds), last_slot)
        totals[slot] += 1
        per_severity = severity_tallies[slot]
        per_severity[event.severity] = per_severity.get(event.severity, 0) + 1
        per_source = source_tallies[slot]
        per_source[event.source] = per_source.get(event.source, 0) + 1

    buckets = []
    for slot, total in enumerate(totals):
        buckets.append(
            Bucket(
                start=origin + slot * bin_seconds,
                end=origin + (slot + 1) * bin_seconds,
                count=total,
                by_severity=severity_tallies[slot],
                by_source=source_tallies[slot],
            )
        )
    return buckets
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def detect_spikes(buckets: list[Bucket], z_threshold: float = 2.0) -> list[Spike]:
    """Report the bins whose z-score is at least ``z_threshold``.

    The z-score is ``(count - mean) / stddev`` using the mean and population
    standard deviation over all supplied bins. Nothing is reported when there
    are fewer than three bins (no meaningful baseline) or when every bin has
    the same count (zero deviation).
    """
    total_bins = len(buckets)
    if total_bins < 3:
        return []

    mean = sum(bucket.count for bucket in buckets) / total_bins
    variance = sum((bucket.count - mean) ** 2 for bucket in buckets) / total_bins
    stddev = variance**0.5
    if stddev == 0:
        return []

    scored = ((bucket, (bucket.count - mean) / stddev) for bucket in buckets)
    return [
        Spike(start=bucket.start, end=bucket.end, count=bucket.count, zscore=z)
        for bucket, z in scored
        if z >= z_threshold
    ]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _categorize(text: str, entity: str) -> list[tuple[str, str]]:
    """Return a (category, suggested_check) pair for every matching signature.

    A signature matches when any of its keywords occurs as a substring of the
    lower-cased ``"<text> <entity>"`` string. Results follow catalogue order.
    """
    haystack = f"{text} {entity}".lower()
    return [
        (category, suggestion)
        for category, keywords, suggestion in _CATEGORY_SIGNATURES
        if any(keyword in haystack for keyword in keywords)
    ]
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def rank_hypotheses(events: list[Event], top_n: int = 5) -> list[Hypothesis]:
    """Cluster events by symptom category and rank them as root-cause candidates.

    Score = sum of severity weights of the events in the category. Ties are
    broken by recency (a category whose evidence is more recent ranks higher).
    An event matching several categories counts towards each of them. Events
    matching no category are grouped under "uncategorized" so they remain
    visible rather than dropped.

    Args:
        events: Normalised events, in any order.
        top_n: Maximum number of hypotheses to return. Zero or a negative
            value yields an empty list (a negative value used to slice from
            the end and silently drop only the lowest-ranked entries).

    Returns:
        At most ``top_n`` hypotheses, best first.
    """
    if top_n <= 0:
        return []

    fallback_check = (
        "no category matched — widen the search window or pull events "
        "from another source (monitor events, aria alerts, log-insight)"
    )

    groups: dict[str, dict] = {}
    for e in events:
        weight = SEVERITY_WEIGHT.get(e.severity, 0)
        for category, suggestion in _categorize(e.text, e.entity) or [("uncategorized", "")]:
            g = groups.get(category)
            if g is None:
                # The suggestion is fixed per category, so record it once.
                g = groups[category] = {"score": 0.0, "events": [], "suggestion": suggestion}
            g["score"] += weight
            g["events"].append(e)

    hypotheses = []
    for category, g in groups.items():
        evs = build_timeline(g["events"])
        # max() keeps the earliest event among those sharing the top severity.
        worst = max(evs, key=lambda e: SEVERITY_WEIGHT.get(e.severity, 0))
        hypotheses.append(
            Hypothesis(
                category=category,
                score=g["score"],
                summary=(
                    f"{len(evs)} {category} event(s); most severe is "
                    f"'{worst.severity}' from {worst.source}"
                ),
                evidence_count=len(evs),
                first_seen=evs[0].ts,
                last_seen=evs[-1].ts,
                sample_text=worst.text[:200],
                suggested_check=g["suggestion"] or fallback_check,
            )
        )
    hypotheses.sort(key=lambda h: (h.score, h.last_seen), reverse=True)
    return hypotheses[:top_n]
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _auto_bin_seconds(events: list[Event]) -> float:
|
|
224
|
+
"""Pick a sensible bin width from the event span (~30 bins, min 1s)."""
|
|
225
|
+
ordered = build_timeline(events)
|
|
226
|
+
span = ordered[-1].ts - ordered[0].ts
|
|
227
|
+
if span <= 0:
|
|
228
|
+
return 1.0
|
|
229
|
+
return max(1.0, span / 30.0)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def incident_timeline(
    events: list[Event],
    *,
    bin_seconds: float | None = None,
    z_threshold: float = 2.0,
    top_n: int = 5,
) -> dict:
    """Run the whole correlation: window summary, spikes and ranked hypotheses.

    The result is a plain dict of JSON-serialisable values, suitable for an MCP
    tool response. With no events it is an explicit "no events" payload that
    carries a suggestion, rather than an empty/ambiguous one. A falsy
    ``bin_seconds`` (``None`` or 0) means the bin width is chosen automatically.
    """
    if not events:
        starting_hint = (
            "No events supplied. Pull a starting set: vmware-monitor "
            "event_list / alarm_list for the affected entity, then "
            "vmware-log-insight log_search around the reported time."
        )
        return {
            "event_count": 0,
            "window": None,
            "spikes": [],
            "hypotheses": [],
            "next_checks": [starting_hint],
        }

    timeline = build_timeline(events)
    width = bin_seconds if bin_seconds else _auto_bin_seconds(timeline)
    spikes = detect_spikes(bin_events(timeline, width), z_threshold=z_threshold)
    ranked = rank_hypotheses(timeline, top_n=top_n)

    spike_rows = []
    for spike in spikes:
        spike_rows.append(
            {
                "start": spike.start,
                "end": spike.end,
                "count": spike.count,
                "zscore": round(spike.zscore, 2),
            }
        )

    hypothesis_rows = []
    next_checks = []
    for hyp in ranked:
        hypothesis_rows.append(
            {
                "category": hyp.category,
                "score": hyp.score,
                "summary": hyp.summary,
                "evidence_count": hyp.evidence_count,
                "first_seen": hyp.first_seen,
                "last_seen": hyp.last_seen,
                "sample_text": hyp.sample_text,
                "suggested_check": hyp.suggested_check,
            }
        )
        # Only recognised categories contribute a concrete next check.
        if hyp.category != "uncategorized":
            next_checks.append(hyp.suggested_check)

    if not next_checks:
        next_checks = [
            "No known symptom pattern matched. Widen the time window, or pull "
            "metrics from vmware-aria (anomalies) and logs from "
            "vmware-log-insight to enrich the timeline."
        ]

    window = {"start": timeline[0].ts, "end": timeline[-1].ts, "bin_seconds": width}
    return {
        "event_count": len(timeline),
        "window": window,
        "spikes": spike_rows,
        "hypotheses": hypothesis_rows,
        "next_checks": next_checks,
    }
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
# Reads the private signature table but is itself importable by tests that
# assert the catalogue stays in sync with the routing playbooks.
def known_categories() -> list[str]:
    """List the names of the symptom categories debug can route, in catalogue order."""
    return [signature[0] for signature in _CATEGORY_SIGNATURES]
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def category_routing() -> list[dict]:
    """Describe every symptom category: name, sample keywords and next-check idea.

    At most the first six keywords of each category are included. Backs the
    discovery tool that answers "what can you help me check?", and gives a
    regression test something to compare against the playbooks.
    """
    routing = []
    for name, keywords, suggestion in _CATEGORY_SIGNATURES:
        sample = list(keywords[:6])
        routing.append(
            {"category": name, "example_keywords": sample, "suggested_check": suggestion}
        )
    return routing
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vmware-debug
|
|
3
|
+
Version: 1.6.1
|
|
4
|
+
Summary: VMware diagnostic brain — read-only incident triage, log/event correlation, and root-cause routing across the VMware skill family
|
|
5
|
+
Author-email: Wei Zhou <wei-wz.zhou@broadcom.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: ai-ops,debug,diagnostics,mcp,rca,troubleshooting,vmware,vsphere
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Topic :: System :: Monitoring
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Requires-Dist: mcp[cli]<2.0,>=1.10
|
|
14
|
+
Requires-Dist: rich<15.0,>=13.0
|
|
15
|
+
Requires-Dist: typer<1.0,>=0.12
|
|
16
|
+
Requires-Dist: vmware-policy<2.0,>=1.0.0
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
<!-- mcp-name: io.github.zw008/vmware-debug -->
|
|
20
|
+
|
|
21
|
+
# VMware Debug
|
|
22
|
+
|
|
23
|
+
> ⚠️ **Work in progress** — the core (event correlation engine, MCP tools, CLI)
|
|
24
|
+
> is built and tested; README, `server.json`, full reference docs, and packaging
|
|
25
|
+
> polish are still landing. Not yet published to PyPI.
|
|
26
|
+
|
|
27
|
+
> **Disclaimer**: Community-maintained open-source project, **not affiliated with,
|
|
28
|
+
> endorsed by, or sponsored by VMware, Inc. or Broadcom Inc.** "VMware" and
|
|
29
|
+
> "vSphere" are trademarks of Broadcom. Source is publicly auditable under the MIT
|
|
30
|
+
> license.
|
|
31
|
+
|
|
32
|
+
The diagnostic brain of the VMware skill family. You bring the symptom (an error,
|
|
33
|
+
a log dump, a slow VM); this skill runs a systematic investigation, correlates
|
|
34
|
+
events from the other skills into one timeline, ranks root-cause hypotheses, and
|
|
35
|
+
tells you what to check next. It is **read-only** — it never changes anything and
|
|
36
|
+
never executes fixes. Remediation is routed to `vmware-aiops` (single op) or
|
|
37
|
+
`vmware-pilot` (multi-step, gated), mirroring the `vmware-harden → vmware-pilot`
|
|
38
|
+
advisor/executor split.
|
|
39
|
+
|
|
40
|
+
See [`skills/vmware-debug/SKILL.md`](skills/vmware-debug/SKILL.md) for the full
|
|
41
|
+
methodology, the event-envelope contract, and symptom routing.
|
|
42
|
+
|
|
43
|
+
## MCP tools
|
|
44
|
+
|
|
45
|
+
| Tool | What |
|
|
46
|
+
|---|---|
|
|
47
|
+
| `incident_timeline` | [READ] Correlate pre-fetched events → timeline + spikes + ranked hypotheses + next-check ideas |
|
|
48
|
+
| `list_symptom_categories` | [READ] List recognised symptom categories + what to check for each |
|
|
49
|
+
|
|
50
|
+
## License
|
|
51
|
+
|
|
52
|
+
MIT.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
mcp_server/__init__.py,sha256=QZpQxriDdqDCYaS-HP7McsaPbTPbHwcamIwu-7UxY-E,49
|
|
2
|
+
mcp_server/server.py,sha256=dsQhAD4AR42IYPGtPuadzUTVvEu7Yl-l4OeBGuLSPJw,3188
|
|
3
|
+
vmware_debug/__init__.py,sha256=NG481KQ7ilwOHB36cwloCjUBkfMd0-zAXKKVNGmxMP0,309
|
|
4
|
+
vmware_debug/cli.py,sha256=bObdZFEQ3SVHwxMqKDxcac_9cOcvofOsmyGtItr5Hjg,2818
|
|
5
|
+
vmware_debug/envelope.py,sha256=rPW5xRb8n_vio6q5i3JXy0tvmIZjmNZ84kc_8ToXctg,6368
|
|
6
|
+
vmware_debug/mcp/__init__.py,sha256=6EYkBo6LUZj2gtTLcjCInqUT441nNqai-XU1bO6uWyc,149
|
|
7
|
+
vmware_debug/mcp/tools.py,sha256=RIumf88g3suRGQW_XqCRvufiyiNL04j5OHmeRKPjtS8,1214
|
|
8
|
+
vmware_debug/ops/__init__.py,sha256=Co9u0zZexST7-IOUBOAS5eq5RHkaEeI2FqfYijzhI1g,46
|
|
9
|
+
vmware_debug/ops/timeline.py,sha256=mZNQNTiQxdKGakh2cZAv3jjIzLCCb3OPifjQAGTvnLc,11273
|
|
10
|
+
vmware_debug-1.6.1.dist-info/METADATA,sha256=-nOEynNcGJJhQ7_nMMt-d1MPFvYjP4cf_Bpw5nYtcQg,2211
|
|
11
|
+
vmware_debug-1.6.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
12
|
+
vmware_debug-1.6.1.dist-info/entry_points.txt,sha256=Ye2yiV3UraSCbeb6bfadvGx2ZK2KEbUK4PC9KpaN0ak,96
|
|
13
|
+
vmware_debug-1.6.1.dist-info/RECORD,,
|