scryml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scry/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Description: Scry predicts infrastructure failure states from a stream of metrics.
2
+ # Description: Top-level package; exposes the package version.
3
+
4
+ """Scry: predict infrastructure failure states from a stream of metrics."""
5
+
6
# Package version string exposed at the top level of the ``scry`` package.
__version__ = "0.1.0"
scry/api/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ # Description: FastAPI prediction API for exposing cluster predictions
2
+ # Description: and recommended remediation actions.
3
+
4
+ """FastAPI prediction service for Scry."""
5
+
6
+ from scry.api.main import app, create_app
7
+ from scry.api.predictor import Predictor
8
+ from scry.api.schemas import (
9
+ ClusterInfo,
10
+ HealthResponse,
11
+ PredictionRequest,
12
+ PredictionResponse,
13
+ get_cluster_info,
14
+ )
15
+
16
# Public names re-exported by ``scry.api``: the app instance and its factory,
# the predictor, and the request/response schemas with their cluster helper.
__all__ = [
    "app",
    "create_app",
    "Predictor",
    "PredictionRequest",
    "PredictionResponse",
    "HealthResponse",
    "ClusterInfo",
    "get_cluster_info",
]
scry/api/forecaster.py ADDED
@@ -0,0 +1,110 @@
1
+ # Description: Singleton wrapper around ChronosForecaster for API use.
2
+ # Description: Lazy-loads the Chronos model on first request, reuses across calls.
3
+
4
+ """Forecaster service for the API."""
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from typing import Any
10
+
11
+ import numpy as np
12
+ import torch
13
+
14
+ log = logging.getLogger(__name__)
15
+
16
+
17
+ class Forecaster:
18
+ """Singleton forecaster service wrapping ChronosForecaster.
19
+
20
+ Lazy-loads the Chronos foundation model on first call.
21
+ Provides a forecast_metrics method that takes multiple metrics
22
+ and returns per-metric, per-horizon forecasts with quantiles.
23
+
24
+ Args:
25
+ model_id: HuggingFace model ID for Chronos.
26
+ device: Inference device (cpu, cuda).
27
+ horizons: Forecast horizons in timesteps.
28
+ quantile_levels: Quantile levels for prediction intervals.
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ model_id: str = "amazon/chronos-bolt-tiny",
34
+ device: str = "cpu",
35
+ horizons: list[int] | None = None,
36
+ quantile_levels: list[float] | None = None,
37
+ ) -> None:
38
+ self.model_id = model_id
39
+ self.device = device
40
+ self.horizons = horizons or [15, 60, 240, 1440]
41
+ self.quantile_levels = quantile_levels or [0.1, 0.5, 0.9]
42
+ self._forecaster: Any = None
43
+
44
+ def _load(self) -> None:
45
+ """Lazy-load the ChronosForecaster on first use."""
46
+ if self._forecaster is not None:
47
+ return
48
+
49
+ from scry.model.forecasting.chronos_wrapper import ChronosForecaster
50
+
51
+ log.info(
52
+ "Loading ChronosForecaster model=%s device=%s horizons=%s",
53
+ self.model_id,
54
+ self.device,
55
+ self.horizons,
56
+ )
57
+ self._forecaster = ChronosForecaster(
58
+ model_id=self.model_id,
59
+ device=self.device,
60
+ horizons=self.horizons,
61
+ quantile_levels=self.quantile_levels,
62
+ )
63
+ log.info("ChronosForecaster loaded")
64
+
65
+ @property
66
+ def is_loaded(self) -> bool:
67
+ """Whether the Chronos model is loaded."""
68
+ return self._forecaster is not None
69
+
70
+ def forecast_metrics(
71
+ self,
72
+ metrics: dict[str, list[float]],
73
+ ) -> list[dict[str, Any]]:
74
+ """Forecast multiple metrics and extract values at configured horizons.
75
+
76
+ Args:
77
+ metrics: Dict mapping metric names to historical time series values.
78
+
79
+ Returns:
80
+ List of per-metric forecast dicts, each containing:
81
+ - metric_name: str
82
+ - horizons: list of {horizon, median, lower, upper} dicts
83
+ """
84
+ self._load()
85
+
86
+ metric_names = list(metrics.keys())
87
+ contexts = [
88
+ torch.tensor(values, dtype=torch.float32)
89
+ for values in metrics.values()
90
+ ]
91
+
92
+ batch_forecast = self._forecaster.forecast_batch(contexts)
93
+ at_horizons = self._forecaster.extract_at_horizons(batch_forecast)
94
+
95
+ results = []
96
+ for i, name in enumerate(metric_names):
97
+ horizon_results = []
98
+ for j, h in enumerate(self.horizons):
99
+ horizon_results.append({
100
+ "horizon": h,
101
+ "median": float(np.round(at_horizons["median"][i, j], 4)),
102
+ "lower": float(np.round(at_horizons["lower"][i, j], 4)),
103
+ "upper": float(np.round(at_horizons["upper"][i, j], 4)),
104
+ })
105
+ results.append({
106
+ "metric_name": name,
107
+ "horizons": horizon_results,
108
+ })
109
+
110
+ return results
scry/api/main.py ADDED
@@ -0,0 +1,423 @@
1
+ # Description: FastAPI application for the prediction service.
2
+ # Description: Exposes /health, /predict, /predict/lookup, /clusters, /forecast, /drift, /anomaly, /accuracy.
3
+
4
+ """FastAPI application for cluster prediction."""
5
+
6
+ import atexit
7
+ import logging
8
+ import os
9
+ import time
10
+ from datetime import datetime, timedelta, timezone
11
+ from typing import Any
12
+
13
+ from fastapi import FastAPI, HTTPException, Query
14
+
15
+ from scry.api.forecaster import Forecaster
16
+ from scry.api.predictor import Predictor
17
+ from scry.api.schemas import (
18
+ ClusterInfo,
19
+ DetailedHealthResponse,
20
+ ForecastRequest,
21
+ ForecastResponse,
22
+ HealthResponse,
23
+ MetricForecast,
24
+ PredictionRequest,
25
+ PredictionResponse,
26
+ get_cluster_info,
27
+ )
28
+ from scry.utils.config import get_config
29
+ from scry.utils.tracing import get_tracer, setup_tracing, shutdown_tracing
30
+
31
# Tracer for this module, obtained from the project's tracing helper.
tracer = get_tracer(__name__)

# Module logger. Logging is configured here at import time with INFO level and
# a "timestamp - logger name - level - message" line format.
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

# API version reported by the FastAPI app metadata, "/" and the health endpoints.
VERSION = "0.1.0"
41
+
42
+
43
def _data_uri() -> str | None:
    """Return the object-store data URI, or None when none is configured.

    A truthy ``data_uri`` attribute on the loaded config takes precedence;
    otherwise the ``SCRY_DATA_URI`` environment variable is consulted.
    """
    configured = getattr(get_config(), "data_uri", None)
    if configured:
        return configured
    return os.environ.get("SCRY_DATA_URI")
46
+
47
+
48
def _datasource_descriptor() -> str | None:
    """Return a short diagnostic label for the configured data source.

    Object storage (SCRY_DATA_URI) wins when a URI is configured. Otherwise the
    HttpIngest adapter is reported if its module can be imported (it ships with
    the ``logicmonitor`` extra). None means no data source is configured.
    """
    object_store_uri = _data_uri()
    if not object_store_uri:
        # No object store: fall back to HttpIngest only if the adapter is importable.
        try:
            import scry.data.sources.http_ingest  # noqa: F401
        except ImportError:
            return None
        else:
            return f"httpingest: {get_config().httpingest_url}"
    return f"object-store: {object_store_uri}"
62
+
63
+
64
async def _resource_metrics(resource_id: str, lookback_days: int = 30) -> Any:
    """Fetch recent metrics for a resource through the configured data source.

    Object storage is the default; the HttpIngest adapter is used only when no
    data URI is configured and ``HttpIngestClient`` can be imported (the
    ``logicmonitor`` extra).

    Args:
        resource_id: Resource id or hostname fragment. It is matched
            case-insensitively as a literal substring of the ``resource_id``
            and ``host_name`` columns.
        lookback_days: Size of the time window, in days, ending now (UTC).

    Returns:
        The fetched DataFrame filtered to rows matching the resource (possibly
        empty), or None when no data source is configured.
    """
    from scry.data import DataFetcher

    end = datetime.now(timezone.utc)
    start = end - timedelta(days=lookback_days)
    profile_name = os.environ.get("SCRY_PROFILE")  # None -> features.yaml default

    uri = _data_uri()
    if uri:
        fetcher = DataFetcher.from_object_store(uri)
        df = await fetcher.get_metrics_dataframe(start, end, profile_name)
    else:
        try:
            from scry.data import HttpIngestClient
        except ImportError:
            return None
        async with HttpIngestClient(base_url=get_config().httpingest_url) as client:
            fetcher = DataFetcher.from_http_client(client)
            df = await fetcher.get_metrics_dataframe(start, end, profile_name)

    if df.empty:
        return df

    needle = resource_id.lower()
    # regex=False: resource_id comes straight from the request, so it must be
    # matched literally. Interpreted as a pattern, "." would match any character
    # and input such as "host(1" would raise re.error.
    id_match = (
        df["resource_id"].astype(str).str.lower().str.contains(needle, regex=False, na=False)
    )
    host_match = (
        df["host_name"].astype(str).str.lower().str.contains(needle, regex=False, na=False)
    )
    return df[id_match | host_match]
98
+
99
+
100
+ def _split_by_profile(
101
+ df: Any,
102
+ profile_name: str | None = None,
103
+ ) -> tuple[dict[str, list[float]], dict[str, list[int]]]:
104
+ """Group canonical metric rows into numerical/categorical series per the active profile."""
105
+ from scry.config.loader import get_profile
106
+
107
+ try:
108
+ profile = get_profile(profile_name)
109
+ except (FileNotFoundError, ValueError):
110
+ return {}, {}
111
+
112
+ num_set = set(profile.numerical_features)
113
+ cat_set = set(profile.categorical_features)
114
+ numerical: dict[str, list[float]] = {}
115
+ categorical: dict[str, list[int]] = {}
116
+
117
+ ordered = df.sort_values("timestamp")
118
+ for name, group in ordered.groupby("metric_name"):
119
+ values = group["value"].dropna().tolist()
120
+ if not values:
121
+ continue
122
+ if name in num_set:
123
+ numerical[name] = [float(v) for v in values]
124
+ elif name in cat_set:
125
+ categorical[name] = [int(v) for v in values]
126
+ return numerical, categorical
127
+
128
+
129
def _finite_round(value: Any, ndigits: int) -> Any:
    """Round ``value`` to ``ndigits`` places, mapping NaN and +/-inf to 0.0.

    Non-finite floats are not valid JSON, so they are zeroed before being
    placed in a response payload.
    """
    import math  # local import: ``math`` is not among this module's top-level imports

    if not math.isfinite(value):
        return 0.0
    return round(value, ndigits)


def create_app(model_path: str | None = None) -> FastAPI:
    """Create the FastAPI application.

    Sets up optional tracing, tries to load the cluster model, creates the
    lazily loaded forecaster, and registers all routes. A missing model file
    does not abort start-up: the app is created with ``model_loaded`` False,
    the health endpoints report "unhealthy", and the prediction endpoints
    answer 503.

    Args:
        model_path: Path to the model file. If None, uses the MODEL_PATH env var
            (falling back to "models/xdec_model.pt").

    Returns:
        Configured FastAPI application.
    """
    app = FastAPI(
        title="Scry Predictive API",
        description="Predicts infrastructure failure states from a stream of metrics",
        version=VERSION,
    )

    # Optional tracing (no-op unless the 'otel' extra is installed and enabled).
    setup_tracing(app)
    atexit.register(shutdown_tracing)

    # Load model
    if model_path is None:
        model_path = os.environ.get("MODEL_PATH", "models/xdec_model.pt")

    app.state.started_at = time.monotonic()
    app.state.model_path = model_path
    app.state.model_load_time_ms = None

    try:
        t0 = time.monotonic()
        predictor = Predictor(model_path=model_path)
        app.state.model_load_time_ms = (time.monotonic() - t0) * 1000.0
        app.state.predictor = predictor
        app.state.model_loaded = True
    except FileNotFoundError:
        # Start without a model, but say so in the log.
        logger.warning("Model file not found at %s; starting without a model", model_path)
        app.state.predictor = None
        app.state.model_loaded = False

    # Forecaster is lazy-loaded on first /forecast request, independent of X-DEC.
    forecast_model_id = os.environ.get("FORECAST_MODEL_ID", "amazon/chronos-bolt-tiny")
    forecast_device = os.environ.get("FORECAST_DEVICE", "cpu")
    app.state.forecaster = Forecaster(
        model_id=forecast_model_id,
        device=forecast_device,
    )

    @app.get("/")
    def root() -> dict:
        """Root endpoint with API info."""
        return {"name": "Scry Predictive API", "version": VERSION, "docs": "/docs"}

    @app.get("/health", response_model=HealthResponse)
    def health() -> HealthResponse:
        """Liveness/readiness check."""
        status = "healthy" if app.state.model_loaded else "unhealthy"
        return HealthResponse(
            status=status,
            model_loaded=app.state.model_loaded,
            version=VERSION,
        )

    @app.get("/health/detailed", response_model=DetailedHealthResponse)
    def health_detailed() -> DetailedHealthResponse:
        """Detailed health check: model metadata, configured data source, uptime."""
        status = "healthy" if app.state.model_loaded else "unhealthy"
        uptime = time.monotonic() - app.state.started_at

        # Version label: derived from the predictor config when a model is loaded,
        # optionally suffixed with (or replaced by) the MODEL_VERSION env var.
        model_version = None
        if app.state.predictor is not None:
            cfg = app.state.predictor.config
            model_version = f"xdec-k{cfg['n_clusters']}-d{cfg['latent_dim']}"
        env_version = os.environ.get("MODEL_VERSION")
        if env_version:
            model_version = f"{model_version}-{env_version}" if model_version else env_version

        return DetailedHealthResponse(
            status=status,
            model_loaded=app.state.model_loaded,
            version=VERSION,
            model_load_time_ms=app.state.model_load_time_ms,
            model_version=model_version,
            model_path=app.state.model_path,
            datasource=_datasource_descriptor(),
            chronos_loaded=app.state.forecaster.is_loaded,
            uptime_seconds=round(uptime, 2),
        )

    @app.post("/predict", response_model=PredictionResponse)
    def predict(request: PredictionRequest) -> PredictionResponse:
        """Predict the operational state for metrics supplied in the request body.

        Raises:
            HTTPException: 503 if the model is not loaded.
        """
        if not app.state.model_loaded:
            raise HTTPException(status_code=503, detail="Model not loaded")

        result = app.state.predictor.predict(
            numerical_metrics=request.numerical_metrics,
            categorical_metrics=request.categorical_metrics,
        )

        return PredictionResponse(
            resource_id=request.resource_id,
            cluster_id=result["cluster_id"],
            cluster_name=result["cluster_name"],
            confidence=result["confidence"],
            action=result["action"],
            priority=result["priority"],
        )

    @app.get("/clusters", response_model=list[ClusterInfo])
    def clusters() -> list[ClusterInfo]:
        """Get all operational state (cluster) definitions."""
        return get_cluster_info()

    @app.get("/predict/lookup", response_model=PredictionResponse)
    async def predict_lookup(
        resource_id: str = Query(..., description="Resource id or hostname to look up"),
    ) -> PredictionResponse:
        """Look up a resource's recent metrics through the configured data source and predict.

        Object storage (SCRY_DATA_URI) is the default source; the HttpIngest
        adapter is used when the ``logicmonitor`` extra is installed.

        Raises:
            HTTPException: 503 if the model is not loaded or no source is configured,
                404 if the resource has no usable recent metrics, 502 on a source error.
        """
        with tracer.start_as_current_span("predict_lookup") as span:
            span.set_attribute("resource_id", resource_id)

            if not app.state.model_loaded:
                raise HTTPException(status_code=503, detail="Model not loaded")

            try:
                df = await _resource_metrics(resource_id)
            except Exception as e:
                # Boundary handler: log with traceback, then surface as a 502.
                logger.exception("lookup data fetch failed: %s: %s", type(e).__name__, e)
                raise HTTPException(status_code=502, detail=f"data source error: {e}") from e

            if df is None:
                raise HTTPException(
                    status_code=503,
                    detail=(
                        "No data source configured. Set SCRY_DATA_URI, or install the "
                        "'logicmonitor' extra and set HTTPINGEST_URL."
                    ),
                )
            if df.empty:
                raise HTTPException(
                    status_code=404,
                    detail=f"No recent metrics found for resource '{resource_id}'",
                )

            numerical, categorical = _split_by_profile(df)
            if not numerical:
                raise HTTPException(
                    status_code=404,
                    detail=(
                        f"No usable metrics for resource '{resource_id}' "
                        "under the active profile"
                    ),
                )

            span.set_attribute("numerical_metrics.count", len(numerical))
            span.set_attribute("categorical_metrics.count", len(categorical))
            result = app.state.predictor.predict(
                numerical_metrics=numerical,
                categorical_metrics=categorical,
            )
            return PredictionResponse(
                resource_id=resource_id,
                cluster_id=result["cluster_id"],
                cluster_name=result["cluster_name"],
                confidence=result["confidence"],
                action=result["action"],
                priority=result["priority"],
            )

    @app.post("/forecast", response_model=ForecastResponse)
    def forecast(request: ForecastRequest) -> ForecastResponse:
        """Forecast metric values at the requested horizons using Chronos.

        Independent of the X-DEC cluster model. Requires the ``forecast`` extra.
        """
        forecaster: Forecaster = app.state.forecaster

        # NOTE(review): this rebinds ``horizons`` on the app-wide forecaster, so the
        # value persists into later requests; requests with different horizons that
        # overlap in time could observe each other's setting.
        if request.horizons != forecaster.horizons:
            forecaster.horizons = request.horizons

        metric_forecasts = forecaster.forecast_metrics(request.metrics)

        return ForecastResponse(
            resource_id=request.resource_id,
            forecasts=[MetricForecast(**mf) for mf in metric_forecasts],
            model_id=forecaster.model_id,
        )

    @app.get("/drift")
    def drift_status() -> dict:
        """Get current drift detection status (PSI feature drift, ADWIN prediction drift)."""
        # No detector attached to the app state: report a fixed "no drift" payload.
        if not hasattr(app.state, "drift_detector"):
            return {
                "feature_drift": {"has_drift": False, "message": "No reference data configured"},
                "prediction_drift": {"has_drift": False, "message": "No error history available"},
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }

        detector = app.state.drift_detector
        return detector.get_drift_status(
            app.state.reference_data,
            app.state.current_data,
            app.state.error_stream,
        )

    @app.get("/anomaly")
    def anomaly_status() -> dict:
        """Get current forecast-based anomaly detection status."""
        # No detector attached to the app state: report a fixed "no anomaly" payload.
        if not hasattr(app.state, "anomaly_detector"):
            return {
                "is_anomaly": False,
                "anomaly_score": 0.0,
                "violated_metrics": [],
                "severity": "low",
                "metric_count": 0,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }

        detector = app.state.anomaly_detector
        actuals = app.state.last_actuals
        forecast = app.state.last_forecast

        result = detector.detect(actuals, forecast)

        return {
            "is_anomaly": bool(result["is_anomaly"]),
            "anomaly_score": float(result["anomaly_score"]),
            "violated_metrics": result["violated_metrics"],
            "severity": result["severity"],
            "metric_count": len(detector.metric_names),
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }

    @app.get("/accuracy")
    def accuracy_status() -> dict:
        """Get forecast accuracy and cluster stability metrics as flat key=value pairs."""
        start_ms = time.monotonic_ns() // 1_000_000

        # No tracker attached to the app state: return an all-zero payload with the
        # same flat key layout.
        if not hasattr(app.state, "accuracy_tracker"):
            elapsed = (time.monotonic_ns() // 1_000_000) - start_ms
            fallback: dict = {}
            for metric in ["Picp", "Mae", "Mase", "Mpiw"]:
                for horizon in ["15m", "1h", "4h", "24h"]:
                    fallback[f"{metric}{horizon}"] = 0.0
            fallback["TransitionRate"] = 0.0
            fallback["ConfidenceStd"] = 0.0
            fallback["DominantClusterPct"] = 0.0
            fallback["ObservationCount"] = 0
            fallback["ApiStatus"] = 1
            fallback["ApiLatencyMs"] = elapsed
            fallback["timestamp"] = datetime.now(timezone.utc).isoformat()
            return fallback

        tracker = app.state.accuracy_tracker
        metrics = tracker.compute_metrics()
        elapsed = (time.monotonic_ns() // 1_000_000) - start_ms

        result: dict = {}
        for metric_key, flat_prefix in [
            ("picp", "Picp"),
            ("mae", "Mae"),
            ("mase", "Mase"),
            ("mpiw", "Mpiw"),
        ]:
            for horizon, horizon_metrics in metrics["horizons"].items():
                # Non-finite values (NaN, inf) are reported as 0.0.
                result[f"{flat_prefix}{horizon}"] = _finite_round(horizon_metrics[metric_key], 4)

        # Stability metrics get the same non-finite guard as the horizon metrics.
        stability = metrics["stability"]
        result["TransitionRate"] = _finite_round(stability["transition_rate"], 4)
        result["ConfidenceStd"] = _finite_round(stability["confidence_std"], 4)
        result["DominantClusterPct"] = _finite_round(stability["dominant_cluster_pct"], 1)
        result["ObservationCount"] = metrics["observation_count"]
        result["ApiStatus"] = 1
        result["ApiLatencyMs"] = elapsed
        result["timestamp"] = metrics["timestamp"]

        return result

    return app
420
+
421
+
422
# Default app instance for uvicorn.
# Built at import time: importing this module calls create_app(), which attempts
# to load the model from MODEL_PATH (or its default path).
app = create_app()