scryml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scry/__init__.py +6 -0
- scry/api/__init__.py +25 -0
- scry/api/forecaster.py +110 -0
- scry/api/main.py +423 -0
- scry/api/predictor.py +251 -0
- scry/api/schemas.py +293 -0
- scry/config/__init__.py +30 -0
- scry/config/auto_discovery.py +192 -0
- scry/config/loader.py +195 -0
- scry/data/__init__.py +65 -0
- scry/data/feature_engineering.py +419 -0
- scry/data/fetcher.py +187 -0
- scry/data/pipeline.py +207 -0
- scry/data/sources/__init__.py +13 -0
- scry/data/sources/base.py +98 -0
- scry/data/sources/http_ingest.py +461 -0
- scry/data/sources/object_store.py +234 -0
- scry/model/__init__.py +82 -0
- scry/model/clustering.py +120 -0
- scry/model/decoders.py +131 -0
- scry/model/drift.py +192 -0
- scry/model/encoders.py +161 -0
- scry/model/evaluate.py +323 -0
- scry/model/export.py +301 -0
- scry/model/forecasting/__init__.py +4 -0
- scry/model/forecasting/accuracy.py +170 -0
- scry/model/forecasting/anomaly_detector.py +115 -0
- scry/model/forecasting/chronos_wrapper.py +125 -0
- scry/model/forecasting/enriched_pipeline.py +151 -0
- scry/model/forecasting/residual_features.py +42 -0
- scry/model/losses.py +173 -0
- scry/model/trainer.py +580 -0
- scry/model/xdec.py +196 -0
- scry/model/xvae.py +160 -0
- scry/utils/__init__.py +4 -0
- scry/utils/config.py +94 -0
- scry/utils/tracing.py +178 -0
- scryml-0.1.0.dist-info/METADATA +132 -0
- scryml-0.1.0.dist-info/RECORD +41 -0
- scryml-0.1.0.dist-info/WHEEL +4 -0
- scryml-0.1.0.dist-info/licenses/LICENSE +201 -0
scry/__init__.py
ADDED
scry/api/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Description: FastAPI prediction API for exposing cluster predictions
|
|
2
|
+
# Description: and recommended remediation actions.
|
|
3
|
+
|
|
4
|
+
"""FastAPI prediction service for Scry."""
|
|
5
|
+
|
|
6
|
+
from scry.api.main import app, create_app
|
|
7
|
+
from scry.api.predictor import Predictor
|
|
8
|
+
from scry.api.schemas import (
|
|
9
|
+
ClusterInfo,
|
|
10
|
+
HealthResponse,
|
|
11
|
+
PredictionRequest,
|
|
12
|
+
PredictionResponse,
|
|
13
|
+
get_cluster_info,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"app",
|
|
18
|
+
"create_app",
|
|
19
|
+
"Predictor",
|
|
20
|
+
"PredictionRequest",
|
|
21
|
+
"PredictionResponse",
|
|
22
|
+
"HealthResponse",
|
|
23
|
+
"ClusterInfo",
|
|
24
|
+
"get_cluster_info",
|
|
25
|
+
]
|
scry/api/forecaster.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# Description: Singleton wrapper around ChronosForecaster for API use.
|
|
2
|
+
# Description: Lazy-loads the Chronos model on first request, reuses across calls.
|
|
3
|
+
|
|
4
|
+
"""Forecaster service for the API."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import torch
|
|
13
|
+
|
|
14
|
+
log = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Forecaster:
    """Singleton forecaster service wrapping ChronosForecaster.

    Lazy-loads the Chronos foundation model on first call.
    Provides a forecast_metrics method that takes multiple metrics
    and returns per-metric, per-horizon forecasts with quantiles.

    The wrapped ChronosForecaster receives ``horizons`` only when it is
    constructed, so this class remembers which horizons the loaded model was
    built with and rebuilds it if ``horizons`` is changed afterwards.

    Args:
        model_id: HuggingFace model ID for Chronos.
        device: Inference device (cpu, cuda).
        horizons: Forecast horizons in timesteps.
        quantile_levels: Quantile levels for prediction intervals.
    """

    def __init__(
        self,
        model_id: str = "amazon/chronos-bolt-tiny",
        device: str = "cpu",
        horizons: list[int] | None = None,
        quantile_levels: list[float] | None = None,
    ) -> None:
        self.model_id = model_id
        self.device = device
        self.horizons = horizons or [15, 60, 240, 1440]
        self.quantile_levels = quantile_levels or [0.1, 0.5, 0.9]
        self._forecaster: Any = None
        # Horizons the wrapped forecaster was constructed with; None until loaded.
        self._loaded_horizons: tuple[int, ...] | None = None

    def _load(self) -> None:
        """Lazy-load the ChronosForecaster, rebuilding it if ``horizons`` changed.

        A no-op when a forecaster is already loaded for the current horizons.
        Rebuilding re-instantiates ChronosForecaster, which may be expensive.
        """
        wanted = tuple(self.horizons)
        if self._forecaster is not None and self._loaded_horizons == wanted:
            return

        # Imported lazily so the module can be imported without the model package.
        from scry.model.forecasting.chronos_wrapper import ChronosForecaster

        log.info(
            "Loading ChronosForecaster model=%s device=%s horizons=%s",
            self.model_id,
            self.device,
            self.horizons,
        )
        self._forecaster = ChronosForecaster(
            model_id=self.model_id,
            device=self.device,
            # Pass a copy so later in-place edits of self.horizons are detected
            # by the comparison above rather than silently shared.
            horizons=list(wanted),
            quantile_levels=self.quantile_levels,
        )
        self._loaded_horizons = wanted
        log.info("ChronosForecaster loaded")

    @property
    def is_loaded(self) -> bool:
        """Whether the Chronos model is loaded."""
        return self._forecaster is not None

    def forecast_metrics(
        self,
        metrics: dict[str, list[float]],
    ) -> list[dict[str, Any]]:
        """Forecast multiple metrics and extract values at configured horizons.

        Args:
            metrics: Dict mapping metric names to historical time series values.

        Returns:
            List of per-metric forecast dicts, each containing:
            - metric_name: str
            - horizons: list of {horizon, median, lower, upper} dicts

            An empty ``metrics`` dict yields an empty list without loading
            the model.
        """
        if not metrics:
            return []

        self._load()
        # Label results with the horizons the loaded model was actually built
        # for, not whatever self.horizons may be rebound to meanwhile.
        horizons = self._loaded_horizons

        metric_names = list(metrics)
        contexts = [
            torch.tensor(values, dtype=torch.float32)
            for values in metrics.values()
        ]

        batch_forecast = self._forecaster.forecast_batch(contexts)
        at_horizons = self._forecaster.extract_at_horizons(batch_forecast)
        median = at_horizons["median"]
        lower = at_horizons["lower"]
        upper = at_horizons["upper"]

        results = []
        for i, name in enumerate(metric_names):
            horizon_results = [
                {
                    "horizon": h,
                    "median": float(np.round(median[i, j], 4)),
                    "lower": float(np.round(lower[i, j], 4)),
                    "upper": float(np.round(upper[i, j], 4)),
                }
                for j, h in enumerate(horizons)
            ]
            results.append({
                "metric_name": name,
                "horizons": horizon_results,
            })

        return results
|
scry/api/main.py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
# Description: FastAPI application for the prediction service.
|
|
2
|
+
# Description: Exposes /health, /predict, /predict/lookup, /clusters, /forecast, /drift, /anomaly, /accuracy.
|
|
3
|
+
|
|
4
|
+
"""FastAPI application for cluster prediction."""
|
|
5
|
+
|
|
6
|
+
import atexit
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import time
|
|
10
|
+
from datetime import datetime, timedelta, timezone
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from fastapi import FastAPI, HTTPException, Query
|
|
14
|
+
|
|
15
|
+
from scry.api.forecaster import Forecaster
|
|
16
|
+
from scry.api.predictor import Predictor
|
|
17
|
+
from scry.api.schemas import (
|
|
18
|
+
ClusterInfo,
|
|
19
|
+
DetailedHealthResponse,
|
|
20
|
+
ForecastRequest,
|
|
21
|
+
ForecastResponse,
|
|
22
|
+
HealthResponse,
|
|
23
|
+
MetricForecast,
|
|
24
|
+
PredictionRequest,
|
|
25
|
+
PredictionResponse,
|
|
26
|
+
get_cluster_info,
|
|
27
|
+
)
|
|
28
|
+
from scry.utils.config import get_config
|
|
29
|
+
from scry.utils.tracing import get_tracer, setup_tracing, shutdown_tracing
|
|
30
|
+
|
|
31
|
+
# Tracer for this module's spans, obtained from scry.utils.tracing.
tracer = get_tracer(__name__)

# Module logger plus process-wide logging defaults (INFO, timestamped lines).
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Version string reported by the API (root, health endpoints, OpenAPI metadata).
VERSION = "0.1.0"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _data_uri() -> str | None:
    """Return the object-store data URI, or None when none is set.

    The ``data_uri`` attribute of the loaded config wins when it is present
    and truthy; otherwise the ``SCRY_DATA_URI`` environment variable is used.
    """
    configured = getattr(get_config(), "data_uri", None)
    if configured:
        return configured
    return os.environ.get("SCRY_DATA_URI")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _datasource_descriptor() -> str | None:
    """Build a short diagnostic label for the active data source.

    Returns ``"object-store: <uri>"`` when a data URI is configured. Otherwise
    returns ``"httpingest: <url>"`` if the HTTP ingest module can be imported,
    and None when it cannot (no data source available).
    """
    uri = _data_uri()
    if not uri:
        # No object store configured: fall back to the optional HTTP adapter,
        # using importability of its module as the availability check.
        try:
            import scry.data.sources.http_ingest  # noqa: F401
        except ImportError:
            return None
        return f"httpingest: {get_config().httpingest_url}"
    return f"object-store: {uri}"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
async def _resource_metrics(resource_id: str, lookback_days: int = 30) -> Any:
    """Fetch recent metrics for a resource through the configured data source.

    Object storage is used when a data URI is configured; otherwise the
    HttpIngest client is used if it can be imported.

    Args:
        resource_id: Resource id or hostname fragment. Matched literally and
            case-insensitively as a substring of the ``resource_id`` and
            ``host_name`` columns.
        lookback_days: Size of the time window, ending now (UTC), to fetch.

    Returns:
        The fetched DataFrame filtered to rows matching the resource (possibly
        empty), or None when no data source is configured.
    """
    from scry.data import DataFetcher

    end = datetime.now(timezone.utc)
    start = end - timedelta(days=lookback_days)
    profile_name = os.environ.get("SCRY_PROFILE")  # None -> features.yaml default

    uri = _data_uri()
    if uri:
        fetcher = DataFetcher.from_object_store(uri)
        df = await fetcher.get_metrics_dataframe(start, end, profile_name)
    else:
        try:
            from scry.data import HttpIngestClient
        except ImportError:
            return None
        async with HttpIngestClient(base_url=get_config().httpingest_url) as client:
            fetcher = DataFetcher.from_http_client(client)
            df = await fetcher.get_metrics_dataframe(start, end, profile_name)

    if df.empty:
        return df

    needle = resource_id.lower()

    def _column_matches(column: str) -> Any:
        # regex=False: the needle comes from a request parameter and must be
        # matched as plain text. With the pandas default (regex=True), ids
        # containing characters such as ".", "(" or "[" would match the wrong
        # rows or raise re.error.
        return (
            df[column]
            .astype(str)
            .str.lower()
            .str.contains(needle, na=False, regex=False)
        )

    mask = _column_matches("resource_id") | _column_matches("host_name")
    return df[mask]
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _split_by_profile(
    df: Any,
    profile_name: str | None = None,
) -> tuple[dict[str, list[float]], dict[str, list[int]]]:
    """Split metric rows into numerical and categorical series for a profile.

    Rows are ordered by ``timestamp`` and grouped by ``metric_name``; each
    group's non-null ``value`` entries become one series. Metrics listed in the
    profile's numerical features are returned as floats, those in its
    categorical features as ints, and all other metrics are dropped.

    Args:
        df: DataFrame with ``timestamp``, ``metric_name`` and ``value`` columns.
        profile_name: Profile to load; passed through to ``get_profile``.

    Returns:
        ``(numerical, categorical)`` dicts keyed by metric name. Both are empty
        when the profile cannot be loaded (FileNotFoundError / ValueError).
    """
    from scry.config.loader import get_profile

    try:
        profile = get_profile(profile_name)
    except (FileNotFoundError, ValueError):
        return {}, {}

    numeric_names = set(profile.numerical_features)
    categorical_names = set(profile.categorical_features)
    numerical: dict[str, list[float]] = {}
    categorical: dict[str, list[int]] = {}

    for metric, rows in df.sort_values("timestamp").groupby("metric_name"):
        series = rows["value"].dropna().tolist()
        # Metrics with no non-null samples are skipped entirely.
        if series and metric in numeric_names:
            numerical[metric] = list(map(float, series))
        elif series and metric in categorical_names:
            categorical[metric] = list(map(int, series))

    return numerical, categorical
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def create_app(model_path: str | None = None) -> FastAPI:
    """Create the FastAPI application.

    Sets up tracing, attempts to load the cluster model, prepares a lazily
    loaded forecaster, and registers all HTTP endpoints. State shared between
    endpoints lives on ``app.state`` (``predictor``, ``model_loaded``,
    ``model_path``, ``model_load_time_ms``, ``started_at``, ``forecaster``).

    A missing model file does not abort startup: the app starts with
    ``model_loaded = False`` and the prediction endpoints answer 503.

    Args:
        model_path: Path to the model file. If None, uses the MODEL_PATH env var.

    Returns:
        Configured FastAPI application.
    """
    app = FastAPI(
        title="Scry Predictive API",
        description="Predicts infrastructure failure states from a stream of metrics",
        version=VERSION,
    )

    # Optional tracing (no-op unless the 'otel' extra is installed and enabled).
    setup_tracing(app)
    atexit.register(shutdown_tracing)

    # Load model
    if model_path is None:
        model_path = os.environ.get("MODEL_PATH", "models/xdec_model.pt")

    app.state.started_at = time.monotonic()
    app.state.model_path = model_path
    app.state.model_load_time_ms = None

    try:
        t0 = time.monotonic()
        predictor = Predictor(model_path=model_path)
        app.state.model_load_time_ms = (time.monotonic() - t0) * 1000.0
        app.state.predictor = predictor
        app.state.model_loaded = True
    except FileNotFoundError:
        # Deliberately non-fatal, but leave a trace so the 503s are explainable.
        logger.warning(
            "Model file not found at %s; prediction endpoints will return 503",
            model_path,
        )
        app.state.predictor = None
        app.state.model_loaded = False

    # Forecaster is lazy-loaded on first /forecast request, independent of X-DEC.
    forecast_model_id = os.environ.get("FORECAST_MODEL_ID", "amazon/chronos-bolt-tiny")
    forecast_device = os.environ.get("FORECAST_DEVICE", "cpu")
    app.state.forecaster = Forecaster(
        model_id=forecast_model_id,
        device=forecast_device,
    )

    def _finite_or_zero(value: Any) -> Any:
        """Map None and NaN to 0.0 so the value can be rounded and JSON-encoded."""
        # `value != value` is true only for NaN, including non-`float` NaN scalars.
        if value is None or value != value:
            return 0.0
        return value

    @app.get("/")
    def root() -> dict:
        """Root endpoint with API info."""
        return {"name": "Scry Predictive API", "version": VERSION, "docs": "/docs"}

    @app.get("/health", response_model=HealthResponse)
    def health() -> HealthResponse:
        """Liveness/readiness check."""
        status = "healthy" if app.state.model_loaded else "unhealthy"
        return HealthResponse(
            status=status,
            model_loaded=app.state.model_loaded,
            version=VERSION,
        )

    @app.get("/health/detailed", response_model=DetailedHealthResponse)
    def health_detailed() -> DetailedHealthResponse:
        """Detailed health check: model metadata, configured data source, uptime."""
        status = "healthy" if app.state.model_loaded else "unhealthy"
        uptime = time.monotonic() - app.state.started_at

        # Version label: derived from the model config, optionally suffixed
        # with (or replaced by) the MODEL_VERSION environment variable.
        model_version = None
        if app.state.predictor is not None:
            cfg = app.state.predictor.config
            model_version = f"xdec-k{cfg['n_clusters']}-d{cfg['latent_dim']}"
        env_version = os.environ.get("MODEL_VERSION")
        if env_version:
            model_version = f"{model_version}-{env_version}" if model_version else env_version

        return DetailedHealthResponse(
            status=status,
            model_loaded=app.state.model_loaded,
            version=VERSION,
            model_load_time_ms=app.state.model_load_time_ms,
            model_version=model_version,
            model_path=app.state.model_path,
            datasource=_datasource_descriptor(),
            chronos_loaded=app.state.forecaster.is_loaded,
            uptime_seconds=round(uptime, 2),
        )

    @app.post("/predict", response_model=PredictionResponse)
    def predict(request: PredictionRequest) -> PredictionResponse:
        """Predict the operational state for metrics supplied in the request body.

        Raises:
            HTTPException: 503 if the model is not loaded.
        """
        if not app.state.model_loaded:
            raise HTTPException(status_code=503, detail="Model not loaded")

        result = app.state.predictor.predict(
            numerical_metrics=request.numerical_metrics,
            categorical_metrics=request.categorical_metrics,
        )

        return PredictionResponse(
            resource_id=request.resource_id,
            cluster_id=result["cluster_id"],
            cluster_name=result["cluster_name"],
            confidence=result["confidence"],
            action=result["action"],
            priority=result["priority"],
        )

    @app.get("/clusters", response_model=list[ClusterInfo])
    def clusters() -> list[ClusterInfo]:
        """Get all operational state (cluster) definitions."""
        return get_cluster_info()

    @app.get("/predict/lookup", response_model=PredictionResponse)
    async def predict_lookup(
        resource_id: str = Query(..., description="Resource id or hostname to look up"),
    ) -> PredictionResponse:
        """Look up a resource's recent metrics through the configured data source and predict.

        Object storage (SCRY_DATA_URI) is the default source; the HttpIngest
        adapter is used when the ``logicmonitor`` extra is installed.

        Raises:
            HTTPException: 503 if the model is not loaded or no source is configured,
                404 if the resource has no usable recent metrics, 502 on a source error.
        """
        with tracer.start_as_current_span("predict_lookup") as span:
            span.set_attribute("resource_id", resource_id)

            if not app.state.model_loaded:
                raise HTTPException(status_code=503, detail="Model not loaded")

            try:
                df = await _resource_metrics(resource_id)
            except Exception as e:
                # Boundary handler: any data-source failure becomes a 502.
                # logger.exception keeps the traceback for diagnosis.
                logger.exception("lookup data fetch failed: %s: %s", type(e).__name__, e)
                raise HTTPException(status_code=502, detail=f"data source error: {e}") from e

            if df is None:
                raise HTTPException(
                    status_code=503,
                    detail=(
                        "No data source configured. Set SCRY_DATA_URI, or install the "
                        "'logicmonitor' extra and set HTTPINGEST_URL."
                    ),
                )
            if df.empty:
                raise HTTPException(
                    status_code=404,
                    detail=f"No recent metrics found for resource '{resource_id}'",
                )

            numerical, categorical = _split_by_profile(df)
            if not numerical:
                raise HTTPException(
                    status_code=404,
                    detail=(
                        f"No usable metrics for resource '{resource_id}' "
                        "under the active profile"
                    ),
                )

            span.set_attribute("numerical_metrics.count", len(numerical))
            span.set_attribute("categorical_metrics.count", len(categorical))
            result = app.state.predictor.predict(
                numerical_metrics=numerical,
                categorical_metrics=categorical,
            )
            return PredictionResponse(
                resource_id=resource_id,
                cluster_id=result["cluster_id"],
                cluster_name=result["cluster_name"],
                confidence=result["confidence"],
                action=result["action"],
                priority=result["priority"],
            )

    @app.post("/forecast", response_model=ForecastResponse)
    def forecast(request: ForecastRequest) -> ForecastResponse:
        """Forecast metric values at the requested horizons using Chronos.

        Independent of the X-DEC cluster model. Requires the ``forecast`` extra.
        """
        forecaster: Forecaster = app.state.forecaster

        # The wrapped model receives its horizons when it is constructed, so
        # rebinding `horizons` on a loaded Forecaster would label results with
        # horizons the model was never asked for. Swap in a fresh instance
        # instead and use it locally, so concurrent requests cannot see each
        # other's horizons. The model is reloaded lazily on the next call.
        # A missing/empty horizon list keeps the current horizons.
        requested = request.horizons
        if requested and list(requested) != list(forecaster.horizons):
            forecaster = Forecaster(
                model_id=forecaster.model_id,
                device=forecaster.device,
                horizons=list(requested),
                quantile_levels=forecaster.quantile_levels,
            )
            app.state.forecaster = forecaster

        try:
            metric_forecasts = forecaster.forecast_metrics(request.metrics)
        except ImportError as exc:
            # Optional dependency missing: report unavailability, not a 500.
            raise HTTPException(
                status_code=503,
                detail="Forecasting is unavailable: the 'forecast' extra is not installed",
            ) from exc

        return ForecastResponse(
            resource_id=request.resource_id,
            forecasts=[MetricForecast(**mf) for mf in metric_forecasts],
            model_id=forecaster.model_id,
        )

    @app.get("/drift")
    def drift_status() -> dict:
        """Get current drift detection status (PSI feature drift, ADWIN prediction drift)."""
        # Nothing in this function installs a detector; without one, report
        # a neutral "no drift" payload.
        if not hasattr(app.state, "drift_detector"):
            return {
                "feature_drift": {"has_drift": False, "message": "No reference data configured"},
                "prediction_drift": {"has_drift": False, "message": "No error history available"},
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }

        detector = app.state.drift_detector
        # NOTE(review): assumes reference_data/current_data/error_stream are set
        # on app.state together with drift_detector - confirm with the installer.
        return detector.get_drift_status(
            app.state.reference_data,
            app.state.current_data,
            app.state.error_stream,
        )

    @app.get("/anomaly")
    def anomaly_status() -> dict:
        """Get current forecast-based anomaly detection status."""
        if not hasattr(app.state, "anomaly_detector"):
            return {
                "is_anomaly": False,
                "anomaly_score": 0.0,
                "violated_metrics": [],
                "severity": "low",
                "metric_count": 0,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }

        detector = app.state.anomaly_detector
        # NOTE(review): assumes last_actuals/last_forecast are set on app.state
        # together with anomaly_detector - confirm with the installer.
        actuals = app.state.last_actuals
        forecast = app.state.last_forecast

        result = detector.detect(actuals, forecast)

        return {
            "is_anomaly": bool(result["is_anomaly"]),
            "anomaly_score": float(result["anomaly_score"]),
            "violated_metrics": result["violated_metrics"],
            "severity": result["severity"],
            "metric_count": len(detector.metric_names),
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }

    @app.get("/accuracy")
    def accuracy_status() -> dict:
        """Get forecast accuracy and cluster stability metrics as flat key=value pairs."""
        start_ms = time.monotonic_ns() // 1_000_000

        if not hasattr(app.state, "accuracy_tracker"):
            # No tracker installed: emit the same flat key set with zeroed values.
            elapsed = (time.monotonic_ns() // 1_000_000) - start_ms
            fallback: dict = {
                f"{metric}{horizon}": 0.0
                for metric in ["Picp", "Mae", "Mase", "Mpiw"]
                for horizon in ["15m", "1h", "4h", "24h"]
            }
            fallback["TransitionRate"] = 0.0
            fallback["ConfidenceStd"] = 0.0
            fallback["DominantClusterPct"] = 0.0
            fallback["ObservationCount"] = 0
            fallback["ApiStatus"] = 1
            fallback["ApiLatencyMs"] = elapsed
            fallback["timestamp"] = datetime.now(timezone.utc).isoformat()
            return fallback

        tracker = app.state.accuracy_tracker
        metrics = tracker.compute_metrics()
        elapsed = (time.monotonic_ns() // 1_000_000) - start_ms

        result: dict = {}
        for metric_key, flat_prefix in [
            ("picp", "Picp"),
            ("mae", "Mae"),
            ("mase", "Mase"),
            ("mpiw", "Mpiw"),
        ]:
            for horizon, per_horizon in metrics["horizons"].items():
                # None/NaN become 0.0 so round() and JSON encoding cannot fail.
                val = _finite_or_zero(per_horizon[metric_key])
                result[f"{flat_prefix}{horizon}"] = round(val, 4)

        stability = metrics["stability"]
        result["TransitionRate"] = round(_finite_or_zero(stability["transition_rate"]), 4)
        result["ConfidenceStd"] = round(_finite_or_zero(stability["confidence_std"]), 4)
        result["DominantClusterPct"] = round(
            _finite_or_zero(stability["dominant_cluster_pct"]), 1
        )
        result["ObservationCount"] = metrics["observation_count"]
        result["ApiStatus"] = 1
        result["ApiLatencyMs"] = elapsed
        result["timestamp"] = metrics["timestamp"]

        return result

    return app
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
# Default app instance for uvicorn
|
|
423
|
+
app = create_app()
|