vmware-debug 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcp_server/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """stdio MCP server package for vmware-debug."""
mcp_server/server.py ADDED
@@ -0,0 +1,78 @@
1
+ """vmware-debug MCP server entry point.
2
+
3
+ Tools are defined in vmware_debug.mcp.tools (so audit logs see skill=debug).
4
+ This module wires them into a FastMCP server and provides the stdio entry point.
5
+
6
+ Note: signatures here use typing.Optional, never PEP 604 ``X | None`` — FastMCP
7
+ reflects these at registration and ``X | None`` crashes on Python 3.10 + older
8
+ mcp/pydantic (CLAUDE.md pitfall / 踩坑 #33).
9
+ """
10
+
11
+ import sys
12
+ from typing import Optional
13
+
14
+ from mcp.server.fastmcp import FastMCP
15
+
16
+ from vmware_debug.mcp import tools as t
17
+
18
+
19
def build_server() -> FastMCP:
    """Construct and configure the MCP server.

    Creates a ``FastMCP`` instance named ``"vmware-debug"`` and registers two
    tools on it, ``incident_timeline`` and ``list_symptom_categories``. Each
    tool is a thin wrapper that forwards its arguments unchanged to the
    function of the same name in ``vmware_debug.mcp.tools`` (imported as ``t``).

    Returns:
        The configured server. It is not started here; the caller runs it.
    """
    server = FastMCP("vmware-debug")

    # NOTE(review): the tool docstrings below are presumably surfaced to MCP
    # clients as the tool descriptions -- treat their wording as user-facing
    # text and confirm before editing it.
    # Signatures use typing.Optional rather than ``X | None`` on purpose (see
    # the module docstring).
    @server.tool(name="incident_timeline")
    def _incident_timeline_impl(
        events: list[dict],
        bin_seconds: Optional[float] = None,
        z_threshold: float = 2.0,
        top_n: int = 5,
    ) -> dict:
        """[READ] Correlate already-fetched VMware events into one incident view.

        WHEN: after you've pulled events for an incident from the data-source
        skills (vmware-monitor event_list/alarm_list, vmware-aria alerts/anomaly,
        vmware-log-insight log_search/log_aggregate, vmware-nsx) — feed them all
        here to find what correlates and where to look next. This tool does NOT
        fetch anything itself; it has no vCenter/network access.

        INPUT: events = list of event envelopes, each {ts, source, severity,
        entity, text, fields} (ts may be ISO-8601, epoch seconds, or millis;
        severity is normalised). Optional: bin_seconds (time-bin width; auto if
        omitted), z_threshold (spike sensitivity, default 2.0), top_n (max
        hypotheses, default 5).

        RETURNS: {event_count, window, spikes (anomalous time bins), hypotheses
        (ranked root-cause candidates, each with a suggested_check), next_checks
        (concrete ideas for what to investigate next, including which skill/tool
        to run)}.

        GOTCHAS: read-only and stateless — nothing is executed. Remediation is
        routed to vmware-aiops (single fix) or vmware-pilot (multi-step, gated).
        A malformed event raises ValueError naming its index."""
        # Arguments are passed positionally, in the same order as t.incident_timeline.
        return t.incident_timeline(events, bin_seconds, z_threshold, top_n)

    @server.tool(name="list_symptom_categories")
    def _list_symptom_categories_impl() -> list[dict]:
        """[READ] List the symptom categories vmware-debug recognises and, for
        each, example keywords and the suggested next check (which skill/tool to
        run). Takes no parameters. Use this when you don't yet know what to look
        at — it turns "something's wrong" into concrete investigation steps.
        Read-only; no network access."""
        return t.list_symptom_categories()

    return server
64
+
65
+
66
def main() -> None:
    """Run the `vmware-debug-mcp` console script.

    Refuses to start on interpreters older than Python 3.11 (exits with an
    explanatory message), otherwise builds the server and runs it.
    """
    interpreter_supported = sys.version_info >= (3, 11)
    if not interpreter_supported:
        refusal = (
            "vmware-debug-mcp requires Python >= 3.11 (FastMCP schema reflection "
            "is unreliable on 3.10). Reinstall under 3.11+: "
            "uv tool install --python 3.11 vmware-debug"
        )
        # A string argument makes sys.exit print it and exit with a failure status.
        sys.exit(refusal)

    server = build_server()
    server.run()
75
+
76
+
77
+ if __name__ == "__main__":
78
+ main()
@@ -0,0 +1,8 @@
1
+ """vmware-debug — VMware diagnostic brain.
2
+
3
+ Read-only incident triage: correlate events from monitor/aria/log-insight/nsx
4
+ into a unified timeline, detect spikes, rank root-cause hypotheses, and route
5
+ remediation to vmware-aiops / vmware-pilot. Never writes; never executes fixes.
6
+ """
7
+
8
+ __version__ = "1.6.1"
vmware_debug/cli.py ADDED
@@ -0,0 +1,85 @@
1
+ """vmware-debug CLI — read-only incident triage from the terminal.
2
+
3
+ Correlation is local and offline: feed it events you've already collected (a
4
+ JSON file or stdin) and it returns a ranked timeline + next-check ideas. The
5
+ `mcp` subcommand starts the stdio MCP server (entry point that does not touch
6
+ the network, so it works behind corporate TLS proxies — CLAUDE.md pitfall / 踩坑 #25).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import sys
13
+ from pathlib import Path
14
+ from typing import Optional
15
+
16
+ import typer
17
+ from rich.console import Console
18
+ from rich.table import Table
19
+
20
+ from vmware_debug import __version__
21
+ from vmware_debug.mcp.tools import incident_timeline, list_symptom_categories
22
+
23
+ app = typer.Typer(
24
+ add_completion=False,
25
+ help="VMware diagnostic brain — read-only incident triage and root-cause routing.",
26
+ )
27
+ console = Console()
28
+
29
+
30
@app.command()
def version() -> None:
    """Print the installed version."""
    # The docstring above doubles as the CLI help text, so it is kept short.
    banner = f"vmware-debug {__version__}"
    console.print(banner)
34
+
35
+
36
@app.command()
def categories() -> None:
    """List the symptom categories debug recognises and what to check for each."""
    # One (header, extra add_column options) pair per column, in display order.
    column_specs = (
        ("category", {"style": "cyan"}),
        ("example keywords", {}),
        ("suggested check", {"style": "green"}),
    )
    table = Table(title="vmware-debug symptom categories")
    for header, options in column_specs:
        table.add_column(header, **options)

    # Each entry is a dict with category / example_keywords / suggested_check.
    for entry in list_symptom_categories():
        keyword_cell = ", ".join(entry["example_keywords"])
        table.add_row(entry["category"], keyword_cell, entry["suggested_check"])

    console.print(table)
46
+
47
+
48
@app.command()
def triage(
    events_file: Optional[Path] = typer.Option(
        None, "--events", "-e", help="JSON file of event envelopes; reads stdin if omitted."
    ),
    bin_seconds: Optional[float] = typer.Option(None, help="Time-bin width; auto if omitted."),
    top_n: int = typer.Option(5, help="Max hypotheses to return."),
) -> None:
    """Correlate a set of pre-collected events into a ranked incident timeline."""
    # The docstring above is the CLI help text; implementation notes live here.
    #
    # Reads a JSON array of event envelopes (from --events or stdin), runs the
    # correlator and prints the result as JSON. Every failure mode (unreadable
    # file, invalid JSON, non-array payload, un-normalisable event) prints a
    # red message and exits with status 2.

    # Local import: error text below embeds arbitrary event content, which must
    # be escaped before being interpolated into Rich markup -- otherwise a value
    # such as "[/x]" is parsed as a tag and can raise MarkupError.
    from rich.markup import escape

    try:
        if events_file is not None:
            # JSON is UTF-8 by specification; do not depend on the platform
            # default encoding. "utf-8-sig" also tolerates a leading BOM.
            raw = events_file.read_text(encoding="utf-8-sig")
        else:
            raw = sys.stdin.read()
    except (OSError, UnicodeDecodeError) as exc:
        console.print(f"[red]Could not read events:[/red] {escape(str(exc))}")
        raise typer.Exit(code=2) from exc

    try:
        events = json.loads(raw)
    except json.JSONDecodeError as exc:
        console.print(
            f"[red]Invalid JSON:[/red] {escape(str(exc))}. "
            "Provide a JSON array of event envelopes."
        )
        raise typer.Exit(code=2) from exc
    if not isinstance(events, list):
        console.print("[red]Expected a JSON array of event envelopes.[/red]")
        raise typer.Exit(code=2)

    try:
        result = incident_timeline(events, bin_seconds=bin_seconds, top_n=top_n)
    except ValueError as exc:
        console.print(f"[red]Could not correlate events:[/red] {escape(str(exc))}")
        raise typer.Exit(code=2) from exc

    console.print_json(data=result)
74
+
75
+
76
@app.command()
def mcp() -> None:
    """Start the stdio MCP server (no network access; proxy-safe)."""
    # Imported inside the command so the server module is only loaded when
    # this subcommand actually runs.
    from mcp_server import server as _server_module

    _server_module.main()
82
+
83
+
84
+ if __name__ == "__main__":
85
+ app()
@@ -0,0 +1,173 @@
1
+ """The unified event envelope — the contract between vmware-debug and every
2
+ data-source skill (monitor, aria, log-insight, nsx, ...).
3
+
4
+ vmware-debug deliberately has NO runtime dependency on the other skill packages
5
+ (CLAUDE.md pitfalls / 踩坑 #21/#32: no hidden cross-skill coupling, no version lockstep).
6
+ Instead the orchestrating agent fetches events with each skill's own read tools
7
+ and hands them to debug's correlator as plain dicts. This module normalises
8
+ those heterogeneous dicts into one immutable ``Event`` shape so the timeline /
9
+ spike / hypothesis logic can stay source-agnostic and unit-testable.
10
+
11
+ Envelope shape (also documented in references/event-envelope.md):
12
+
13
+ {
14
+ "ts": <ISO8601 string | epoch seconds | epoch millis>,
15
+ "source": "monitor" | "aria" | "loginsight" | "nsx" | ...,
16
+ "severity": "critical" | "error" | "warning" | "info" | "unknown",
17
+ "entity": "vm-web01" | "host-12" | "" ,
18
+ "text": "<human-readable message>",
19
+ "fields": { ... source-specific extras ... }
20
+ }
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ from dataclasses import dataclass, field
26
+ from datetime import datetime, timezone
27
+
28
+ # Canonical severities, ordered by weight (higher = more severe). Used both for
29
+ # normalisation and for hypothesis scoring.
30
+ SEVERITY_WEIGHT: dict[str, int] = {
31
+ "critical": 5,
32
+ "error": 4,
33
+ "warning": 3,
34
+ "info": 1,
35
+ "unknown": 0,
36
+ }
37
+
38
+ # Common vendor spellings mapped onto the canonical set. Lower-cased on lookup.
39
+ _SEVERITY_ALIASES: dict[str, str] = {
40
+ "crit": "critical",
41
+ "critical": "critical",
42
+ "fatal": "critical",
43
+ "alert": "critical",
44
+ "emergency": "critical",
45
+ "err": "error",
46
+ "error": "error",
47
+ "red": "error",
48
+ "warn": "warning",
49
+ "warning": "warning",
50
+ "yellow": "warning",
51
+ "notice": "info",
52
+ "info": "info",
53
+ "information": "info",
54
+ "informational": "info",
55
+ "green": "info",
56
+ "debug": "info",
57
+ }
58
+
59
+
60
@dataclass(frozen=True)
class Event:
    """A single normalised, immutable observation placed on the incident timeline."""

    # Event time as epoch seconds (UTC).
    ts: float
    # Name of the data source that produced the event.
    source: str
    # Severity token after normalisation.
    severity: str
    # Affected object name; may be empty.
    entity: str
    # Human-readable message.
    text: str
    # Source-specific extras; each instance gets its own fresh dict by default.
    fields: dict = field(default_factory=dict)
70
+
71
+
72
def normalize_severity(raw: object) -> str:
    """Translate any severity token into one of the canonical severity names.

    The value is stringified, trimmed and lower-cased before being looked up
    in the alias table. ``None`` and unrecognised tokens map to ``"unknown"``.
    """
    fallback = "unknown"
    if raw is None:
        return fallback
    token = str(raw).strip().lower()
    return _SEVERITY_ALIASES.get(token, fallback)
77
+
78
+
79
# Numeric timestamps below this (epoch seconds for ~1973-03) are implausible for
# VMware incident data and almost certainly a parse error (e.g. a bare year like
# "2020" -> 1970). Rejected loudly rather than landing silently at the epoch.
_MIN_PLAUSIBLE_EPOCH = 10**8


def parse_timestamp(raw: object) -> float:
    """Parse a timestamp into epoch seconds (UTC).

    Accepts ISO-8601 strings (with or without 'Z'; naive values are taken as
    UTC), epoch seconds, or epoch milliseconds (auto-detected by magnitude).
    For strings, ISO parsing is tried before the numeric fallback.

    Args:
        raw: The timestamp as a ``str``, ``int`` or ``float``.

    Returns:
        Epoch seconds as a finite ``float``.

    Raises:
        ValueError: For any value that cannot be turned into a plausible,
            finite epoch: ``bool`` (an ``int`` subclass, excluded explicitly),
            NaN / infinity, integers too large for a float, epochs below
            ``_MIN_PLAUSIBLE_EPOCH``, empty or garbage strings, and any other
            type.
    """
    import math  # local: only needed for the finiteness check below

    if isinstance(raw, bool):
        raise ValueError(f"unparseable timestamp: {raw!r}")

    if isinstance(raw, (int, float)):
        try:
            value = float(raw)
        except OverflowError:
            # Arbitrarily large ints (e.g. from JSON) cannot become a float;
            # report them as the ValueError callers already handle.
            raise ValueError(f"implausible epoch timestamp: {raw!r}") from None
        # NaN and +/-inf slip through both comparisons below (every comparison
        # with NaN is False) and would later break time binning.
        if not math.isfinite(value):
            raise ValueError(f"unparseable timestamp: {raw!r}")
        # 1e11 seconds is roughly year 5138, so anything larger is taken to be
        # milliseconds. 1e11 ms is ~1973-03, i.e. exactly the plausibility
        # floor, so the two thresholds line up.
        if value > 1e11:
            value /= 1000.0
        if value < _MIN_PLAUSIBLE_EPOCH:
            raise ValueError(f"implausible epoch timestamp: {raw!r}")
        return value

    if isinstance(raw, str):
        text = raw.strip()
        if not text:
            raise ValueError("empty timestamp")
        # ISO-8601 first (so "2020-..." is a date, not epoch 2020).
        try:
            dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
        except ValueError:
            dt = None
        if dt is not None:
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            return dt.timestamp()
        # Then a numeric epoch string, subject to the same checks as numbers.
        try:
            number = float(text)
        except ValueError:
            raise ValueError(f"unparseable timestamp: {raw!r}") from None
        return parse_timestamp(number)

    raise ValueError(f"unparseable timestamp: {raw!r}")
119
+
120
+
121
+ def _first(d: dict, *keys: str) -> object:
122
+ """Return the first present, non-None value among ``keys``."""
123
+ for k in keys:
124
+ if k in d and d[k] is not None:
125
+ return d[k]
126
+ return None
127
+
128
+
129
def normalize_event(raw: dict, source: str | None = None) -> Event:
    """Normalise one source-specific event dict into an :class:`Event`.

    Each ``Event`` attribute is filled from the first present, non-``None``
    key among a list of accepted aliases (see the tuples below).

    Args:
        raw: The event dict as delivered by a data-source skill.
        source: If given (and truthy), overrides any source named in ``raw``.

    Returns:
        The normalised event. ``fields`` holds every key of ``raw`` that is not
        one of the recognised aliases. If ``raw`` carries the envelope's own
        ``"fields"`` dict, its entries are merged into that mapping rather than
        being nested one level down as ``fields["fields"]``; on a key clash the
        top-level value wins. A non-dict ``"fields"`` value is kept as-is under
        the ``"fields"`` key.

    Raises:
        ValueError: If no timestamp key is present, or the timestamp cannot be
            parsed.
    """
    # Accepted spellings per attribute, in lookup priority order. Defined once
    # so the lookup lists and the "known" set below cannot drift apart.
    ts_keys = ("ts", "timestamp", "time", "createTime", "startTimeUTC")
    source_keys = ("source", "skill")
    severity_keys = ("severity", "criticality", "level", "status")
    entity_keys = (
        "entity", "entity_name", "resourceName", "vm", "vm_name", "object", "host",
    )
    text_keys = ("text", "message", "msg", "description", "fullFormattedMessage")

    ts_raw = _first(raw, *ts_keys)
    if ts_raw is None:
        raise ValueError(f"event has no timestamp field: {raw!r}")

    src = source or _first(raw, *source_keys) or "unknown"
    sev = normalize_severity(_first(raw, *severity_keys))
    entity = _first(raw, *entity_keys)
    text = _first(raw, *text_keys)

    known = {*ts_keys, *source_keys, *severity_keys, *entity_keys, *text_keys}
    extras = {k: v for k, v in raw.items() if k not in known}

    # The envelope contract delivers source-specific extras in a "fields" dict.
    # Flatten it so a canonical envelope round-trips instead of nesting.
    nested = extras.get("fields")
    if isinstance(nested, dict):
        del extras["fields"]
        extras = {**nested, **extras}

    return Event(
        ts=parse_timestamp(ts_raw),
        source=str(src),
        severity=sev,
        entity=str(entity) if entity is not None else "",
        text=str(text) if text is not None else "",
        fields=extras,
    )
163
+
164
+
165
def normalize_events(raw_events: list[dict], source: str | None = None) -> list[Event]:
    """Normalise every event in ``raw_events``, in order.

    Nothing is skipped: the first event that fails to normalise aborts the
    batch with a ``ValueError`` that names the event's index and chains the
    underlying error.
    """
    normalised: list[Event] = []
    for index, item in enumerate(raw_events):
        try:
            event = normalize_event(item, source)
        except (ValueError, AttributeError, TypeError) as exc:
            raise ValueError(f"event[{index}] could not be normalised: {exc}") from exc
        normalised.append(event)
    return normalised
@@ -0,0 +1,2 @@
1
+ """MCP tool implementations for vmware-debug (kept here so audit logs see
2
+ skill=debug). The stdio server in mcp_server/ wires these into FastMCP."""
@@ -0,0 +1,33 @@
1
+ """vmware-debug MCP tool logic — pure, read-only correlation. No network, no
2
+ writes, no cross-skill imports. The agent fetches events with the other skills'
3
+ read tools and passes them here as plain dicts (the unified event envelope)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Optional
8
+
9
+ from vmware_debug.envelope import normalize_events
10
+ from vmware_debug.ops.timeline import category_routing
11
+ from vmware_debug.ops.timeline import incident_timeline as _incident_timeline
12
+
13
+
14
def incident_timeline(
    events: list[dict],
    bin_seconds: Optional[float] = None,
    z_threshold: float = 2.0,
    top_n: int = 5,
) -> dict:
    """Turn pre-fetched event envelopes into a timeline, spikes and ranked hypotheses.

    ``events`` is a list of event envelopes (see references/event-envelope.md).
    They are normalised first and then handed to the correlation engine with
    the tuning options passed through unchanged.

    Raises ValueError (with the offending index) if an event can't be normalised.
    """
    tuning = {"bin_seconds": bin_seconds, "z_threshold": z_threshold, "top_n": top_n}
    prepared = normalize_events(events)
    return _incident_timeline(prepared, **tuning)
29
+
30
+
31
def list_symptom_categories() -> list[dict]:
    """Return the recognised symptom categories together with what to check for each."""
    routing = category_routing()
    return routing
@@ -0,0 +1 @@
1
+ """Diagnostic operations for vmware-debug."""
@@ -0,0 +1,312 @@
1
+ """Incident correlation engine — the heart of vmware-debug.
2
+
3
+ Pure functions over a list of normalised :class:`~vmware_debug.envelope.Event`.
4
+ No I/O, no network, no cross-skill imports: the orchestrating agent fetches
5
+ events via each data-source skill's read tools and feeds them here. That keeps
6
+ the valuable logic (timeline merge, spike detection, hypothesis ranking, and
7
+ next-check suggestions) self-contained and unit-testable.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from dataclasses import dataclass, field
13
+
14
+ from vmware_debug.envelope import SEVERITY_WEIGHT, Event
15
+
16
+ # A symptom taxonomy: keyword signatures -> (category, which skill/tool to look
17
+ # at next). This is what lets debug "give a valuable idea even when the user
18
+ # doesn't know what to check". Keywords are matched case-insensitively against
19
+ # event text + entity. Order matters only for the human-readable label; scoring
20
+ # counts every category that matches.
21
+ _CATEGORY_SIGNATURES: tuple[tuple[str, tuple[str, ...], str], ...] = (
22
+ (
23
+ "storage",
24
+ ("datastore", "scsi", "latency", "vsan", "lun", "naa.", "apd", "pdl",
25
+ "disk full", "no space", "vmfs", "iscsi"),
26
+ "vmware-storage (datastore/vsan health) + vmware-log-insight (search "
27
+ "the host's vmkernel for scsi/apd events around the spike)",
28
+ ),
29
+ (
30
+ "network",
31
+ ("vmotion", "vnic", "dvswitch", "dvs ", "uplink", "link down", "mtu",
32
+ "firewall", "dfw", "segment", "tier-0", "tier-1", "bgp", "packet drop"),
33
+ "vmware-nsx / vmware-nsx-security (run a traceflow between the affected "
34
+ "endpoints; check DFW rule hits) + vmware-log-insight (network logs)",
35
+ ),
36
+ (
37
+ "compute",
38
+ ("cpu ready", "memory", "balloon", "swap", "contention", "overcommit",
39
+ "numa"),
40
+ "vmware-aria (CPU-ready / memory-contention metrics + anomalies for the "
41
+ "VM and its host) ",
42
+ ),
43
+ (
44
+ "ha_drs",
45
+ ("ha ", "high availability", "drs", "failover", "admission control",
46
+ "host isolation", "heartbeat"),
47
+ "vmware-monitor (cluster + host health, recent HA/DRS events) + "
48
+ "vmware-aiops (cluster state)",
49
+ ),
50
+ (
51
+ "power_lifecycle",
52
+ ("power on", "power off", "failed to start", "boot", "vmx", "ovf",
53
+ "deploy", "clone", "snapshot", "consolidate"),
54
+ "vmware-aiops (VM task status, snapshot tree) + vmware-monitor (the VM's "
55
+ "recent events)",
56
+ ),
57
+ (
58
+ "auth",
59
+ ("login", "authentication", "permission", "denied", "unauthorized",
60
+ "401", "403", "token", "certificate", "tls"),
61
+ "check the service account + credentials in config/.env; verify the "
62
+ "target's certificate/time sync",
63
+ ),
64
+ (
65
+ "platform",
66
+ ("vpxd", "hostd", "service", "restart", "crash", "core dump", "503",
67
+ "not responding", "disconnected"),
68
+ "vmware-monitor (host connection state + service health) + "
69
+ "vmware-log-insight (vpxd/hostd logs around the first error)",
70
+ ),
71
+ )
72
+
73
+
74
@dataclass(frozen=True)
class Bucket:
    """One fixed-width time bin and the tally of events that fell into it."""

    # Bin boundaries, in the same units as Event.ts.
    start: float
    end: float
    # Total number of events in the bin.
    count: int
    # Per-severity and per-source breakdowns of ``count``; each instance gets
    # its own fresh dict by default.
    by_severity: dict[str, int] = field(default_factory=dict)
    by_source: dict[str, int] = field(default_factory=dict)
83
+
84
+
85
@dataclass(frozen=True)
class Spike:
    """A time bin flagged because its event count is anomalously high."""

    # Boundaries of the flagged bin.
    start: float
    end: float
    # Number of events in the bin.
    count: int
    # How many standard deviations the count sits above the series mean.
    zscore: float
93
+
94
+
95
@dataclass(frozen=True)
class Hypothesis:
    """A root-cause candidate: a symptom category, its evidence, and what to check next."""

    # Symptom category name.
    category: str
    # Ranking score; higher ranks first.
    score: float
    # One-line human-readable description of the evidence.
    summary: str
    # Number of events supporting this candidate.
    evidence_count: int
    # Timestamps of the earliest and latest supporting events.
    first_seen: float
    last_seen: float
    # Excerpt of a representative event message.
    sample_text: str
    # Suggested next investigation step.
    suggested_check: str
107
+
108
+
109
def build_timeline(events: list[Event]) -> list[Event]:
    """Return a new list of ``events`` ordered by timestamp.

    The sort is stable, so events sharing a timestamp keep their input order;
    the input list itself is left untouched.
    """
    chronological = list(events)
    chronological.sort(key=lambda item: item.ts)
    return chronological
112
+
113
+
114
def bin_events(events: list[Event], bin_seconds: float) -> list[Bucket]:
    """Group events into consecutive bins of ``bin_seconds`` width.

    The first bin starts at the earliest event's timestamp and enough bins
    are created to reach the latest one; bins without events are kept, with
    a count of zero. An empty ``events`` list yields an empty result.

    Raises:
        ValueError: If ``events`` is non-empty and ``bin_seconds`` is not positive.
    """
    if not events:
        return []
    if bin_seconds <= 0:
        raise ValueError("bin_seconds must be positive")

    timeline = build_timeline(events)
    origin = timeline[0].ts
    last_index = int((timeline[-1].ts - origin) // bin_seconds)

    tallies: list[dict] = [
        {"count": 0, "by_severity": {}, "by_source": {}} for _ in range(last_index + 1)
    ]
    for event in timeline:
        # Clamp so the computed position can never index past the last bin.
        position = min(int((event.ts - origin) // bin_seconds), last_index)
        tally = tallies[position]
        tally["count"] += 1
        severities = tally["by_severity"]
        severities[event.severity] = severities.get(event.severity, 0) + 1
        sources = tally["by_source"]
        sources[event.source] = sources.get(event.source, 0) + 1

    buckets = []
    for position, tally in enumerate(tallies):
        buckets.append(
            Bucket(
                start=origin + position * bin_seconds,
                end=origin + (position + 1) * bin_seconds,
                count=tally["count"],
                by_severity=tally["by_severity"],
                by_source=tally["by_source"],
            )
        )
    return buckets
143
+
144
+
145
def detect_spikes(buckets: list[Bucket], z_threshold: float = 2.0) -> list[Spike]:
    """Return the bins whose count is at least ``z_threshold`` deviations above the mean.

    The mean and (population) standard deviation are taken over the counts of
    all supplied bins. With fewer than 3 bins, or when every bin has the same
    count, there is no usable baseline and nothing is flagged.
    """
    counts = [bucket.count for bucket in buckets]
    total_bins = len(counts)
    if total_bins < 3:
        return []

    mean = sum(counts) / total_bins
    variance = sum((value - mean) ** 2 for value in counts) / total_bins
    stddev = variance**0.5
    if stddev == 0:
        return []

    return [
        Spike(start=bucket.start, end=bucket.end, count=bucket.count, zscore=score)
        for bucket in buckets
        if (score := (bucket.count - mean) / stddev) >= z_threshold
    ]
165
+
166
+
167
def _categorize(text: str, entity: str) -> list[tuple[str, str]]:
    """List the ``(category, suggested_check)`` pairs whose keywords occur in the event.

    Matching is a case-insensitive substring test against the event text and
    entity joined by a space. Results follow the order of the signature table.
    """
    searchable = f"{text} {entity}".lower()
    return [
        (category, suggestion)
        for category, keywords, suggestion in _CATEGORY_SIGNATURES
        if any(keyword in searchable for keyword in keywords)
    ]
175
+
176
+
177
def rank_hypotheses(events: list[Event], top_n: int = 5) -> list[Hypothesis]:
    """Cluster events by symptom category and rank them as root-cause candidates.

    Score = sum of severity weights of the events in the category. Ties are
    broken by recency (a category whose evidence is more recent ranks higher).
    An event matching several categories counts towards each of them. Events
    matching no category are grouped under "uncategorized" so they remain
    visible rather than dropped.

    Args:
        events: Normalised events, in any order.
        top_n: Maximum number of hypotheses to return. Zero or a negative
            value returns an empty list.

    Returns:
        Up to ``top_n`` hypotheses, best first.
    """
    # A negative top_n would make the final slice drop entries from the END of
    # the ranking (e.g. [:-1] silently discards the last one). Treat every
    # non-positive limit the same way as 0: no hypotheses.
    if top_n <= 0:
        return []

    groups: dict[str, dict] = {}
    for e in events:
        cats = _categorize(e.text, e.entity) or [("uncategorized", "")]
        for category, suggestion in cats:
            g = groups.setdefault(
                category,
                {"score": 0.0, "events": [], "suggestion": suggestion},
            )
            g["score"] += SEVERITY_WEIGHT.get(e.severity, 0)
            g["events"].append(e)
            if suggestion:
                g["suggestion"] = suggestion

    hypotheses = []
    for category, g in groups.items():
        evs = build_timeline(g["events"])
        # max() returns the first maximal element, so among equally severe
        # events the earliest one supplies the sample text.
        worst = max(evs, key=lambda e: SEVERITY_WEIGHT.get(e.severity, 0))
        hypotheses.append(
            Hypothesis(
                category=category,
                score=g["score"],
                summary=(
                    f"{len(evs)} {category} event(s); most severe is "
                    f"'{worst.severity}' from {worst.source}"
                ),
                evidence_count=len(evs),
                first_seen=evs[0].ts,
                last_seen=evs[-1].ts,
                sample_text=worst.text[:200],
                suggested_check=g["suggestion"]
                or "no category matched — widen the search window or pull events "
                "from another source (monitor events, aria alerts, log-insight)",
            )
        )
    hypotheses.sort(key=lambda h: (h.score, h.last_seen), reverse=True)
    return hypotheses[:top_n]
221
+
222
+
223
+ def _auto_bin_seconds(events: list[Event]) -> float:
224
+ """Pick a sensible bin width from the event span (~30 bins, min 1s)."""
225
+ ordered = build_timeline(events)
226
+ span = ordered[-1].ts - ordered[0].ts
227
+ if span <= 0:
228
+ return 1.0
229
+ return max(1.0, span / 30.0)
230
+
231
+
232
def incident_timeline(
    events: list[Event],
    *,
    bin_seconds: float | None = None,
    z_threshold: float = 2.0,
    top_n: int = 5,
) -> dict:
    """Run the full correlation and package the outcome as a plain dict.

    The result has the keys ``event_count``, ``window``, ``spikes``,
    ``hypotheses`` and ``next_checks`` and contains only JSON-serialisable
    values, which makes it usable directly as an MCP tool response. When no
    events are supplied the function returns an explicit empty result whose
    ``next_checks`` explains how to collect a starting set.
    """
    if not events:
        return {
            "event_count": 0,
            "window": None,
            "spikes": [],
            "hypotheses": [],
            "next_checks": [
                "No events supplied. Pull a starting set: vmware-monitor "
                "event_list / alarm_list for the affected entity, then "
                "vmware-log-insight log_search around the reported time."
            ],
        }

    timeline = build_timeline(events)
    # ``or`` (not an ``is None`` test): an explicit bin_seconds of 0 also
    # falls back to the automatically chosen width.
    width = bin_seconds or _auto_bin_seconds(timeline)
    spikes = detect_spikes(bin_events(timeline, width), z_threshold=z_threshold)
    ranked = rank_hypotheses(timeline, top_n=top_n)

    # Uncategorised evidence carries no routing advice, so it is left out of
    # next_checks; if nothing else remains, fall back to a generic suggestion.
    next_checks = [h.suggested_check for h in ranked if h.category != "uncategorized"] or [
        "No known symptom pattern matched. Widen the time window, or pull "
        "metrics from vmware-aria (anomalies) and logs from "
        "vmware-log-insight to enrich the timeline."
    ]

    hypothesis_keys = (
        "category",
        "score",
        "summary",
        "evidence_count",
        "first_seen",
        "last_seen",
        "sample_text",
        "suggested_check",
    )
    spike_rows = []
    for spike in spikes:
        spike_rows.append(
            {
                "start": spike.start,
                "end": spike.end,
                "count": spike.count,
                "zscore": round(spike.zscore, 2),
            }
        )

    return {
        "event_count": len(timeline),
        "window": {
            "start": timeline[0].ts,
            "end": timeline[-1].ts,
            "bin_seconds": width,
        },
        "spikes": spike_rows,
        "hypotheses": [{key: getattr(h, key) for key in hypothesis_keys} for h in ranked],
        "next_checks": next_checks,
    }
294
+
295
+
296
# Public accessor over the private signature table, so tests can assert the
# catalogue stays in sync with the routing playbooks.
def known_categories() -> list[str]:
    """List the names of the symptom categories, in signature-table order."""
    names = []
    for signature in _CATEGORY_SIGNATURES:
        names.append(signature[0])
    return names
301
+
302
+
303
def category_routing() -> list[dict]:
    """Describe every symptom category as a dict for the discovery tool.

    Each entry has ``category``, ``example_keywords`` (at most the first six
    keywords of the signature) and ``suggested_check``. This is what answers
    "what can you help me check?", and it gives regression tests a way to keep
    the catalogue in sync with the playbooks.
    """
    catalogue = []
    for name, keywords, suggestion in _CATEGORY_SIGNATURES:
        entry = {
            "category": name,
            "example_keywords": list(keywords[:6]),
            "suggested_check": suggestion,
        }
        catalogue.append(entry)
    return catalogue
@@ -0,0 +1,52 @@
1
+ Metadata-Version: 2.4
2
+ Name: vmware-debug
3
+ Version: 1.6.1
4
+ Summary: VMware diagnostic brain — read-only incident triage, log/event correlation, and root-cause routing across the VMware skill family
5
+ Author-email: Wei Zhou <wei-wz.zhou@broadcom.com>
6
+ License-Expression: MIT
7
+ Keywords: ai-ops,debug,diagnostics,mcp,rca,troubleshooting,vmware,vsphere
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Topic :: System :: Monitoring
12
+ Requires-Python: >=3.10
13
+ Requires-Dist: mcp[cli]<2.0,>=1.10
14
+ Requires-Dist: rich<15.0,>=13.0
15
+ Requires-Dist: typer<1.0,>=0.12
16
+ Requires-Dist: vmware-policy<2.0,>=1.0.0
17
+ Description-Content-Type: text/markdown
18
+
19
+ <!-- mcp-name: io.github.zw008/vmware-debug -->
20
+
21
+ # VMware Debug
22
+
23
+ > ⚠️ **Work in progress** — the core (event correlation engine, MCP tools, CLI)
24
+ > is built and tested; README, `server.json`, full reference docs, and packaging
25
+ > polish are still landing. Not yet published to PyPI.
26
+
27
+ > **Disclaimer**: Community-maintained open-source project, **not affiliated with,
28
+ > endorsed by, or sponsored by VMware, Inc. or Broadcom Inc.** "VMware" and
29
+ > "vSphere" are trademarks of Broadcom. Source is publicly auditable under the MIT
30
+ > license.
31
+
32
+ The diagnostic brain of the VMware skill family. You bring the symptom (an error,
33
+ a log dump, a slow VM); this skill runs a systematic investigation, correlates
34
+ events from the other skills into one timeline, ranks root-cause hypotheses, and
35
+ tells you what to check next. It is **read-only** — it never changes anything and
36
+ never executes fixes. Remediation is routed to `vmware-aiops` (single op) or
37
+ `vmware-pilot` (multi-step, gated), mirroring the `vmware-harden → vmware-pilot`
38
+ advisor/executor split.
39
+
40
+ See [`skills/vmware-debug/SKILL.md`](skills/vmware-debug/SKILL.md) for the full
41
+ methodology, the event-envelope contract, and symptom routing.
42
+
43
+ ## MCP tools
44
+
45
+ | Tool | What |
46
+ |---|---|
47
+ | `incident_timeline` | [READ] Correlate pre-fetched events → timeline + spikes + ranked hypotheses + next-check ideas |
48
+ | `list_symptom_categories` | [READ] List recognised symptom categories + what to check for each |
49
+
50
+ ## License
51
+
52
+ MIT.
@@ -0,0 +1,13 @@
1
+ mcp_server/__init__.py,sha256=QZpQxriDdqDCYaS-HP7McsaPbTPbHwcamIwu-7UxY-E,49
2
+ mcp_server/server.py,sha256=dsQhAD4AR42IYPGtPuadzUTVvEu7Yl-l4OeBGuLSPJw,3188
3
+ vmware_debug/__init__.py,sha256=NG481KQ7ilwOHB36cwloCjUBkfMd0-zAXKKVNGmxMP0,309
4
+ vmware_debug/cli.py,sha256=bObdZFEQ3SVHwxMqKDxcac_9cOcvofOsmyGtItr5Hjg,2818
5
+ vmware_debug/envelope.py,sha256=rPW5xRb8n_vio6q5i3JXy0tvmIZjmNZ84kc_8ToXctg,6368
6
+ vmware_debug/mcp/__init__.py,sha256=6EYkBo6LUZj2gtTLcjCInqUT441nNqai-XU1bO6uWyc,149
7
+ vmware_debug/mcp/tools.py,sha256=RIumf88g3suRGQW_XqCRvufiyiNL04j5OHmeRKPjtS8,1214
8
+ vmware_debug/ops/__init__.py,sha256=Co9u0zZexST7-IOUBOAS5eq5RHkaEeI2FqfYijzhI1g,46
9
+ vmware_debug/ops/timeline.py,sha256=mZNQNTiQxdKGakh2cZAv3jjIzLCCb3OPifjQAGTvnLc,11273
10
+ vmware_debug-1.6.1.dist-info/METADATA,sha256=-nOEynNcGJJhQ7_nMMt-d1MPFvYjP4cf_Bpw5nYtcQg,2211
11
+ vmware_debug-1.6.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
12
+ vmware_debug-1.6.1.dist-info/entry_points.txt,sha256=Ye2yiV3UraSCbeb6bfadvGx2ZK2KEbUK4PC9KpaN0ak,96
13
+ vmware_debug-1.6.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ vmware-debug = vmware_debug.cli:app
3
+ vmware-debug-mcp = mcp_server.server:main