radhiops 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- radhiops/__init__.py +59 -0
- radhiops/agents/__init__.py +19 -0
- radhiops/agents/base.py +52 -0
- radhiops/agents/cyberdefense.py +134 -0
- radhiops/agents/deployment.py +132 -0
- radhiops/agents/monitor.py +169 -0
- radhiops/agents/repo.py +247 -0
- radhiops/agents/soc.py +53 -0
- radhiops/backend.py +146 -0
- radhiops/cli.py +226 -0
- radhiops/client.py +258 -0
- radhiops/config.py +64 -0
- radhiops/credits.py +91 -0
- radhiops/defense/__init__.py +21 -0
- radhiops/defense/behavior.py +86 -0
- radhiops/defense/engine.py +102 -0
- radhiops/defense/request.py +34 -0
- radhiops/defense/signatures.py +103 -0
- radhiops/deploy/__init__.py +23 -0
- radhiops/deploy/base.py +157 -0
- radhiops/deploy/diagnose.py +119 -0
- radhiops/deploy/netlify.py +80 -0
- radhiops/deploy/railway.py +89 -0
- radhiops/deploy/registry.py +72 -0
- radhiops/deploy/render.py +86 -0
- radhiops/deploy/surge.py +70 -0
- radhiops/deploy/vercel.py +74 -0
- radhiops/exceptions.py +58 -0
- radhiops/monitor/__init__.py +24 -0
- radhiops/monitor/anomaly.py +90 -0
- radhiops/monitor/crash.py +38 -0
- radhiops/monitor/health.py +75 -0
- radhiops/monitor/incident.py +97 -0
- radhiops/monitor/metrics.py +69 -0
- radhiops/orchestrator/__init__.py +9 -0
- radhiops/orchestrator/autonomy.py +30 -0
- radhiops/orchestrator/core.py +314 -0
- radhiops/orchestrator/playbooks.py +83 -0
- radhiops/plans.py +43 -0
- radhiops/providers/__init__.py +20 -0
- radhiops/providers/anthropic_provider.py +51 -0
- radhiops/providers/base.py +92 -0
- radhiops/providers/google_provider.py +40 -0
- radhiops/providers/huggingface_provider.py +50 -0
- radhiops/providers/ollama_provider.py +48 -0
- radhiops/providers/openai_provider.py +42 -0
- radhiops/providers/registry.py +85 -0
- radhiops/repo/__init__.py +8 -0
- radhiops/repo/git.py +149 -0
- radhiops/repo/intents.py +100 -0
- radhiops/security/__init__.py +19 -0
- radhiops/security/secrets.py +199 -0
- radhiops/userconfig.py +58 -0
- radhiops-0.0.1.dist-info/METADATA +220 -0
- radhiops-0.0.1.dist-info/RECORD +57 -0
- radhiops-0.0.1.dist-info/WHEEL +4 -0
- radhiops-0.0.1.dist-info/entry_points.txt +2 -0
radhiops/__init__.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""RadhiOps — BYOE AI Engineering Platform SDK."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__version__ = "0.0.1"
|
|
6
|
+
|
|
7
|
+
from .client import RadhiOps
|
|
8
|
+
from .exceptions import (
|
|
9
|
+
AgentError,
|
|
10
|
+
AuthError,
|
|
11
|
+
InsufficientCreditsError,
|
|
12
|
+
ProviderError,
|
|
13
|
+
RadhiOpsError,
|
|
14
|
+
SubscriptionRequiredError,
|
|
15
|
+
)
|
|
16
|
+
from .plans import PLANS, Plan, upgrade_hint
|
|
17
|
+
from .orchestrator import AutonomyMode, Orchestrator
|
|
18
|
+
from .providers import (
|
|
19
|
+
ChatResult,
|
|
20
|
+
Message,
|
|
21
|
+
ModelProvider,
|
|
22
|
+
available_providers,
|
|
23
|
+
get_provider,
|
|
24
|
+
register_provider,
|
|
25
|
+
)
|
|
26
|
+
from .deploy import (
|
|
27
|
+
Deployment,
|
|
28
|
+
DeployStatus,
|
|
29
|
+
available_deploy_providers,
|
|
30
|
+
get_deploy_provider,
|
|
31
|
+
register_deploy_provider,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"__version__",
|
|
36
|
+
"RadhiOps",
|
|
37
|
+
"RadhiOpsError",
|
|
38
|
+
"AuthError",
|
|
39
|
+
"AgentError",
|
|
40
|
+
"ProviderError",
|
|
41
|
+
"InsufficientCreditsError",
|
|
42
|
+
"SubscriptionRequiredError",
|
|
43
|
+
"PLANS",
|
|
44
|
+
"Plan",
|
|
45
|
+
"upgrade_hint",
|
|
46
|
+
"AutonomyMode",
|
|
47
|
+
"Orchestrator",
|
|
48
|
+
"ChatResult",
|
|
49
|
+
"Message",
|
|
50
|
+
"ModelProvider",
|
|
51
|
+
"available_providers",
|
|
52
|
+
"get_provider",
|
|
53
|
+
"register_provider",
|
|
54
|
+
"Deployment",
|
|
55
|
+
"DeployStatus",
|
|
56
|
+
"available_deploy_providers",
|
|
57
|
+
"get_deploy_provider",
|
|
58
|
+
"register_deploy_provider",
|
|
59
|
+
]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""RadhiOps agents."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import Agent
|
|
6
|
+
from .cyberdefense import CyberDefenseAgent
|
|
7
|
+
from .deployment import DeploymentAgent
|
|
8
|
+
from .monitor import RuntimeMonitor
|
|
9
|
+
from .repo import RepoAgent
|
|
10
|
+
from .soc import RadhiSOC
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"Agent",
|
|
14
|
+
"RadhiSOC",
|
|
15
|
+
"RepoAgent",
|
|
16
|
+
"DeploymentAgent",
|
|
17
|
+
"RuntimeMonitor",
|
|
18
|
+
"CyberDefenseAgent",
|
|
19
|
+
]
|
radhiops/agents/base.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Base class shared by every RadhiOps agent.
|
|
2
|
+
|
|
3
|
+
An agent is given:
|
|
4
|
+
- a reference to the :class:`RadhiOps` client (for credits/auth/telemetry)
|
|
5
|
+
- an optional BYOE model provider (the user's own model)
|
|
6
|
+
|
|
7
|
+
Agents expose high-level capabilities (audit, deploy, monitor, ...). Each
|
|
8
|
+
capability that consumes platform resources reports its credit cost through
|
|
9
|
+
``self._charge(...)`` so usage stays transparent.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from typing import TYPE_CHECKING, Any
|
|
15
|
+
|
|
16
|
+
from ..providers.base import Message, ModelProvider
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from ..client import RadhiOps
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Agent:
    """Common base for RadhiOps agents.

    Holds the owning client (used for credit accounting and as the source of a
    default model) plus an optional model bound to this agent only.
    """

    #: Human-facing name, e.g. "RadhiSOC".
    name: str = "Agent"

    def __init__(self, client: "RadhiOps", model: ModelProvider | None = None) -> None:
        """Store the client and the (optional) agent-specific model."""
        self.client = client
        self._model = model

    @property
    def model(self) -> ModelProvider:
        """Return the agent's own model, else the client's default model.

        Raises:
            ProviderError: if neither the agent nor the client supplies one.
        """
        resolved = self._model or self.client.default_model
        if resolved is not None:
            return resolved

        # Imported lazily, only on the failure path.
        from ..exceptions import ProviderError

        raise ProviderError(
            f"{self.name} needs a model. Pass model=... or set a default "
            "provider/model on the RadhiOps client."
        )

    def _ask(self, system: str, user: str, **opts: Any) -> str:
        """Send one system+user exchange to the bound model; return its text."""
        send = self.model.chat
        conversation = [Message("system", system), Message("user", user)]
        reply = send(conversation, **opts)
        return reply.text

    def _charge(self, credits: int, action: str) -> None:
        """Report ``credits`` spent on ``action`` to the client, tagged with the agent name."""
        label = f"{self.name}:{action}"
        self.client._consume_credits(credits, label)
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Cyber Defense Agent — runtime attack detection & response.
|
|
2
|
+
|
|
3
|
+
Feed inbound requests through ``inspect`` (or wire ``middleware`` into your web
|
|
4
|
+
framework). The engine combines payload signatures (SQLi, XSS, SSRF, traversal,
|
|
5
|
+
command injection, header injection) with behavioral detection (brute force,
|
|
6
|
+
credential stuffing, rate abuse, DDoS) into an allow/challenge/block verdict and
|
|
7
|
+
maintains an auto-blocklist.
|
|
8
|
+
|
|
9
|
+
Escalation routing:
|
|
10
|
+
* Code-level vulns (injection/XSS/...) -> RadhiSOC, to fix the underlying code.
|
|
11
|
+
* Volumetric/behavioral attacks (DDoS/brute force) -> handled here (block).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import time
|
|
17
|
+
from typing import Any, Callable
|
|
18
|
+
|
|
19
|
+
from ..defense.engine import DefenseConfig, DefenseEngine, Verdict
|
|
20
|
+
from ..defense.request import Request
|
|
21
|
+
from ..monitor.incident import Incident, security_incident
|
|
22
|
+
from .base import Agent
|
|
23
|
+
|
|
24
|
+
# Attack types that point at a weakness in the application's own code.
# Incidents containing any of these are escalated to RadhiSOC.
_CODE_LEVEL = {
    "command_injection",
    "header_injection",
    "path_traversal",
    "sql_injection",
    "ssrf",
    "xss",
}
|
|
29
|
+
|
|
30
|
+
# System prompt used by CyberDefenseAgent.explain(); the user message sent
# alongside it is the stringified verdict dict.
_EXPLAIN_SYSTEM = (
    "You are RadhiOps' security analyst. Given a blocked HTTP request and the "
    "detections that triggered it, explain the attack in two sentences and state "
    "what code-level fix prevents it. Be concrete and avoid scaremongering."
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class CyberDefenseAgent(Agent):
    """Screen requests through a ``DefenseEngine`` and raise security incidents.

    Every request is analyzed by the engine. Serious verdicts are turned into a
    security incident, de-duplicated per ``(ip, primary attack type)`` for the
    lifetime of this agent instance. Incidents whose types intersect
    ``_CODE_LEVEL`` are routed to RadhiSOC; all others to this agent.
    """

    name = "CyberDefenseAgent"

    def __init__(
        self,
        client,
        model=None,
        *,
        config: DefenseConfig | None = None,
        now: Callable[[], float] = time.time,
    ) -> None:
        """Create the agent and its defense engine.

        Args:
            client: RadhiOps client, used for credit accounting.
            model: Optional model; within this class only :meth:`explain` uses it.
            config: Configuration handed to ``DefenseEngine``.
            now: Clock callable handed to the engine (defaults to ``time.time``).
        """
        super().__init__(client, model)
        self.engine = DefenseEngine(config, now=now)
        # (ip, attack_type) pairs already reported -- de-dupe per session.
        self._seen: set[tuple[str, str]] = set()
        self._stats = {"analyzed": 0, "challenged": 0, "blocked": 0, "incidents": 0}

    # ----- core ----------------------------------------------------------
    def analyze(self, request: Request | dict[str, Any]) -> Verdict:
        """Return a verdict for one request. Cheap — no credits (high volume).

        A plain dict is converted with ``Request.from_dict``. The per-agent
        counters are updated according to the verdict's action.
        """
        req = request if isinstance(request, Request) else Request.from_dict(request)
        verdict = self.engine.analyze(req)
        self._stats["analyzed"] += 1
        if verdict.action == "challenge":
            self._stats["challenged"] += 1
        elif verdict.action == "block":
            self._stats["blocked"] += 1
        return verdict

    def inspect(self, request: Request | dict[str, Any]) -> tuple[Verdict, Incident | None]:
        """Analyze and, for serious activity, raise a (deduped) incident.

        Returns:
            ``(verdict, incident)`` where ``incident`` is ``None`` when nothing
            new needs to be escalated.
        """
        verdict = self.analyze(request)
        incident = self._maybe_incident(verdict)
        return verdict, incident

    def _maybe_incident(self, verdict: Verdict) -> Incident | None:
        """Build an incident for a serious verdict, at most once per key.

        A verdict is serious when it is blocked or carries any detection or
        behavior of severity "high" or "critical". Creating an incident costs
        one credit.
        """
        # An already-blocklisted IP is handled — don't keep raising incidents.
        if verdict.reason == "ip_blocklisted":
            return None
        serious = verdict.blocked or any(
            d.severity in ("high", "critical") for d in verdict.detections
        ) or any(b.severity in ("high", "critical") for b in verdict.behaviors)
        if not serious:
            return None

        types = {d.type for d in verdict.detections} | {b.type for b in verdict.behaviors}
        # The alphabetically first type is the de-dupe key's attack type.
        primary = min(types, default="security_event")
        key = (verdict.ip, primary)
        if key in self._seen:
            return None

        # Charge BEFORE recording the key: if charging raises, the event must
        # stay reportable instead of being silently de-duplicated forever.
        self._charge(1, "incident")
        self._seen.add(key)
        self._stats["incidents"] += 1

        code_level = bool(types & _CODE_LEVEL)
        return security_incident(
            target=verdict.ip,
            summary=f"{verdict.action.upper()} {verdict.ip}: {', '.join(sorted(types))}.",
            severity="critical" if verdict.blocked else "high",
            escalate_to="RadhiSOC" if code_level else "CyberDefenseAgent",
            detail=verdict.to_dict(),
        )

    # ----- blocklist controls -------------------------------------------
    def block(self, ip: str) -> None:
        """Add ``ip`` to the engine's blocklist."""
        self.engine.block_ip(ip)

    def unblock(self, ip: str) -> None:
        """Remove ``ip`` from the engine's blocklist."""
        self.engine.unblock_ip(ip)

    @property
    def blocklist(self) -> set[str]:
        """The engine's blocklist (returned as-is, not copied here)."""
        return self.engine.blocklist

    def stats(self) -> dict:
        """Return a copy of the counters plus the current blocklist size."""
        return {**self._stats, "blocklist_size": len(self.engine.blocklist)}

    # ----- AI explanation -----------------------------------------------
    def explain(self, verdict: Verdict) -> str:
        """Ask the bound model to explain a verdict (costs two credits).

        Returns a fixed message, without charging, when the verdict carries no
        detections and no behaviors.
        """
        if not verdict.detections and not verdict.behaviors:
            return "No attack detected; nothing to explain."
        self._charge(2, "explain")
        payload = verdict.to_dict()
        return self._ask(_EXPLAIN_SYSTEM, str(payload))

    # ----- framework integration ----------------------------------------
    def middleware(self, on_block: Callable[[Verdict], Any] | None = None):
        """Build a WSGI-ish guard: returns a callable(request_dict) -> Verdict.

        Wire this into your framework's request hook; if the verdict is a block,
        return your 403 response (optionally via ``on_block``).
        """
        def guard(request: dict[str, Any]) -> Verdict:
            verdict, _ = self.inspect(request)
            if verdict.blocked and on_block:
                on_block(verdict)
            return verdict

        return guard
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Deployment Agent — deploy orchestration + log monitoring + failure diagnosis.
|
|
2
|
+
|
|
3
|
+
dep = ops.deploy("vercel", token="...", team_id="...")
|
|
4
|
+
d = dep.trigger() # kick a deployment
|
|
5
|
+
final = dep.watch(d.id) # poll until ready/failed
|
|
6
|
+
if final.status.failed:
|
|
7
|
+
print(dep.diagnose(d.id)) # rule-based + AI explanation
|
|
8
|
+
|
|
9
|
+
BYOE: the developer brings the platform token; RadhiOps never stores it.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import time
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from typing import Any, Callable
|
|
17
|
+
|
|
18
|
+
from ..deploy.base import Deployment, DeployProvider, LogLine
|
|
19
|
+
from ..deploy.diagnose import diagnose_logs
|
|
20
|
+
from ..deploy.registry import get_deploy_provider
|
|
21
|
+
from .base import Agent
|
|
22
|
+
|
|
23
|
+
# System prompt used by DeploymentAgent.diagnose() when the rule-based check
# did not match; the user message sent alongside it is the joined log tail.
_DIAGNOSE_SYSTEM = (
    "You are RadhiOps' deployment SRE. Given build/deploy logs that the "
    "rule-based checker could not classify, identify the most likely root cause "
    "in one or two sentences and give a concrete fix. Be specific; don't guess "
    "wildly. If the logs are inconclusive, say so."
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class DeployDiagnosis:
    """Result of diagnosing one deployment from its logs."""

    deployment_id: str
    status: str
    # Rule-based diagnosis, already converted to a plain dict.
    rule_based: dict
    # Model-written explanation; None when no AI pass was run.
    ai_explanation: str | None = None
    # Stringified trailing log lines the diagnosis was based on.
    log_tail: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, in declaration order."""
        exported = (
            "deployment_id",
            "status",
            "rule_based",
            "ai_explanation",
            "log_tail",
        )
        return {attr: getattr(self, attr) for attr in exported}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class DeploymentAgent(Agent):
    """Trigger, poll and diagnose deployments through a ``DeployProvider``."""

    name = "DeploymentAgent"

    def __init__(
        self,
        client,
        provider: str | DeployProvider,
        model=None,
        *,
        token: str | None = None,
        sleep: Callable[[float], None] = time.sleep,
        **provider_options: Any,
    ) -> None:
        """Bind the agent to a deploy provider.

        ``provider`` is either a ready ``DeployProvider`` instance (used as-is;
        ``token`` and ``provider_options`` are then ignored) or a value that is
        resolved through ``get_deploy_provider``. ``sleep`` is the callable
        used to wait between polls in :meth:`watch`.
        """
        super().__init__(client, model)
        if not isinstance(provider, DeployProvider):
            provider = get_deploy_provider(provider, token=token, **provider_options)
        self.provider = provider
        self._sleep = sleep

    # ----- operations ----------------------------------------------------
    def trigger(self, **opts: Any) -> Deployment:
        """Start a deployment (costs five credits); ``opts`` go to the provider."""
        self._charge(5, "deploy")
        started = self.provider.trigger_deploy(**opts)
        return started

    def status(self, deployment_id: str) -> Deployment:
        """Fetch the current state of a deployment."""
        # read: no charge
        current = self.provider.get_deployment(deployment_id)
        return current

    def logs(self, deployment_id: str, **opts: Any) -> list[LogLine]:
        """Fetch a deployment's log lines; ``opts`` go to the provider."""
        # read: no charge
        lines = self.provider.get_logs(deployment_id, **opts)
        return lines

    def list(self, *, limit: int = 10, **opts: Any) -> list[Deployment]:
        """List deployments via the provider, passing ``limit`` and ``opts``."""
        found = self.provider.list_deployments(limit=limit, **opts)
        return found

    def watch(
        self,
        deployment_id: str,
        *,
        interval: float = 5.0,
        timeout: float = 600.0,
        on_update: Callable[[Deployment], None] | None = None,
    ) -> Deployment:
        """Poll a deployment until it is terminal or the time budget is spent.

        Costs two credits. The budget is tracked as the sum of the ``interval``
        values slept, not wall-clock time. ``on_update`` is called with every
        polled snapshot. On timeout the last snapshot is returned.
        """
        self._charge(2, "watch")
        elapsed = 0.0
        snapshot: Deployment | None = None
        while elapsed <= timeout:
            snapshot = self.provider.get_deployment(deployment_id)
            if on_update:
                on_update(snapshot)
            if snapshot.status.is_terminal:
                return snapshot
            self._sleep(interval)
            elapsed += interval

        if snapshot:
            return snapshot
        # No usable snapshot from the loop: fetch once.
        return self.provider.get_deployment(deployment_id)

    def diagnose(self, deployment_id: str, *, use_ai: bool = True) -> DeployDiagnosis:
        """Diagnose a (usually failed) deployment from its logs.

        Costs three credits. The rule-based check always runs over the full
        logs. An AI explanation over the last 50 lines costs two more credits
        and runs only when ``use_ai`` is set, the rules did not match, this
        agent was given its own model, and there are log lines.
        """
        self._charge(3, "diagnose")
        deployment = self.provider.get_deployment(deployment_id)
        entries = self.provider.get_logs(deployment_id)
        tail = list(map(str, entries[-50:]))

        findings = diagnose_logs(entries)
        explanation: str | None = None
        # NOTE(review): this tests the agent-level model only, not the client
        # default that the `model` property falls back to — confirm intent.
        wants_ai = use_ai and not findings.matched and self._model is not None and tail
        if wants_ai:
            self._charge(2, "diagnose_ai")
            explanation = self._ask(_DIAGNOSE_SYSTEM, "\n".join(tail))

        return DeployDiagnosis(
            deployment_id=deployment_id,
            status=deployment.status.value,
            rule_based=findings.to_dict(),
            ai_explanation=explanation,
            log_tail=tail,
        )

    def deploy_and_watch(self, *, on_update=None, interval: float = 5.0, **opts: Any):
        """Trigger a deploy, watch it to completion, and diagnose it on failure.

        Returns ``(final_deployment, diagnosis)``; ``diagnosis`` is ``None``
        unless the final status reports failure.
        """
        started = self.trigger(**opts)
        final = self.watch(started.id, interval=interval, on_update=on_update)
        if final.status.failed:
            diagnosis = self.diagnose(final.id)
        else:
            diagnosis = None
        return final, diagnosis
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Runtime Monitor — production health, anomalies, crashes, and escalation.
|
|
2
|
+
|
|
3
|
+
mon = ops.monitor()
|
|
4
|
+
mon.add_target("api", "https://myapp.com/health")
|
|
5
|
+
|
|
6
|
+
# one cycle: probe every target, fold into metric windows
|
|
7
|
+
results = mon.poll()
|
|
8
|
+
|
|
9
|
+
# evaluate the rolling windows for anomalies -> incidents
|
|
10
|
+
incidents = mon.evaluate()
|
|
11
|
+
for inc in incidents:
|
|
12
|
+
print(inc.summary, "->", inc.escalate_to)
|
|
13
|
+
|
|
14
|
+
# crash detection from a log stream
|
|
15
|
+
mon.ingest_logs("api", ["Traceback (most recent call last):", "MemoryError"])
|
|
16
|
+
|
|
17
|
+
A monitoring session escalates issues by handing structured Incidents to an
|
|
18
|
+
``on_incident`` callback (and, in the upcoming autonomous loop, straight to the
|
|
19
|
+
Deployment or Cyber-Defense agents).
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import time
|
|
25
|
+
from dataclasses import dataclass, field
|
|
26
|
+
from typing import Any, Callable
|
|
27
|
+
|
|
28
|
+
from ..monitor.anomaly import Thresholds, detect
|
|
29
|
+
from ..monitor.crash import detect_crash
|
|
30
|
+
from ..monitor.health import HealthProbe, HealthResult
|
|
31
|
+
from ..monitor.incident import Incident, crash_incident, from_anomaly
|
|
32
|
+
from ..monitor.metrics import MetricWindow, Sample
|
|
33
|
+
from .base import Agent
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _placeholder_window() -> MetricWindow:
    """Build the stand-in window used when a Target is created without one."""
    return MetricWindow("unset")


@dataclass
class Target:
    """One monitored endpoint together with its rolling metric window."""

    name: str
    url: str
    # Status code, or tuple of codes, passed to the health probe as expect_status.
    expect_status: int | tuple[int, ...] = 200
    # Optional text passed to the health probe as `contains`.
    contains: str | None = None
    # Samples from each poll are added to this window.
    window: MetricWindow = field(default_factory=_placeholder_window)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class RuntimeMonitor(Agent):
    """Probe registered targets, detect anomalies and crashes, raise incidents.

    Incidents are de-duplicated per ``(type, target)`` for the lifetime of this
    instance: once raised, the same key is not raised again.
    """

    name = "RuntimeMonitor"

    def __init__(
        self,
        client,
        model=None,
        *,
        thresholds: Thresholds | None = None,
        window_size: int = 100,
        sleep: Callable[[float], None] = time.sleep,
        transport: Any = None,
        timeout: float = 10.0,
    ) -> None:
        """Create a monitor with no targets.

        Args:
            client: RadhiOps client, used for credit accounting.
            model: Optional model forwarded to the base agent.
            thresholds: Anomaly thresholds; a default ``Thresholds()`` if omitted.
            window_size: ``maxlen`` of each target's ``MetricWindow``.
            sleep: Callable used to wait between rounds in :meth:`watch`.
            transport: Passed through to ``HealthProbe``.
            timeout: Passed through to ``HealthProbe``.
        """
        super().__init__(client, model)
        self.thresholds = thresholds or Thresholds()
        self._window_size = window_size
        self._sleep = sleep
        self._probe = HealthProbe(timeout=timeout, transport=transport)
        self.targets: dict[str, Target] = {}
        # De-dupe repeat incidents within a session by (type, target).
        self._open_incidents: dict[tuple[str, str], Incident] = {}

    # ----- target management --------------------------------------------
    def add_target(
        self,
        name: str,
        url: str,
        *,
        expect_status: int | tuple[int, ...] = 200,
        contains: str | None = None,
    ) -> None:
        """Register a target under ``name`` with a fresh metric window.

        Re-using an existing name replaces that target, including its window.
        """
        self.targets[name] = Target(
            name=name, url=url, expect_status=expect_status, contains=contains,
            window=MetricWindow(name, maxlen=self._window_size),
        )

    # ----- probing -------------------------------------------------------
    def poll(self) -> list[HealthResult]:
        """Probe every target once and fold the result into its window."""
        results: list[HealthResult] = []
        for tgt in self.targets.values():
            res = self._probe.check(
                tgt.name, tgt.url, expect_status=tgt.expect_status, contains=tgt.contains
            )
            tgt.window.add(Sample(ok=res.ok, latency_ms=res.latency_ms, timestamp=res.timestamp))
            results.append(res)
        return results

    # ----- analysis ------------------------------------------------------
    def evaluate(self) -> list[Incident]:
        """Run anomaly detection across all windows and return new incidents.

        One credit is charged per call that yields at least one new incident.
        New incidents are recorded as open only after the charge succeeded.
        """
        pending: dict[tuple[str, str], Incident] = {}
        for tgt in self.targets.values():
            for anomaly in detect(tgt.window, self.thresholds):
                key = (anomaly.type, anomaly.target)
                if key in self._open_incidents or key in pending:
                    continue  # already escalated this session / this batch
                pending[key] = from_anomaly(anomaly)
        if not pending:
            return []

        # Charge BEFORE committing: if charging raises, these incidents must
        # stay reportable instead of being marked open without being returned.
        self._charge(1, "incident")
        self._open_incidents.update(pending)
        return list(pending.values())

    def ingest_logs(self, target: str, logs: list[str], **_: Any) -> Incident | None:
        """Scan a log stream for crash signatures; return an incident if found.

        Returns ``None`` when no crash is detected. If a crash incident is
        already open for ``target``, that incident is returned and nothing is
        charged; otherwise one credit is charged and a new one is recorded.
        Extra keyword arguments are accepted and ignored.
        """
        signal = detect_crash(logs)
        if not signal.detected:
            return None
        key = ("crash", target)
        if key in self._open_incidents:
            return self._open_incidents[key]
        self._charge(1, "crash_detected")
        inc = crash_incident(
            target,
            f"Crash detected on {target}: {signal.category}.",
            detail={"category": signal.category, "evidence": signal.evidence},
        )
        self._open_incidents[key] = inc
        return inc

    def summary(self) -> dict:
        """Return each target's window summary, keyed by target name."""
        return {name: t.window.summary() for name, t in self.targets.items()}

    # ----- session loop --------------------------------------------------
    def watch(
        self,
        *,
        rounds: int = 0,
        interval: float = 30.0,
        timeout: float | None = None,
        on_incident: Callable[[Incident], None] | None = None,
        on_poll: Callable[[list[HealthResult]], None] | None = None,
    ) -> list[Incident]:
        """Run a monitoring loop.

        Stops after ``rounds`` polls (if > 0) or after ``timeout`` seconds.
        Calls ``on_incident`` for every new incident as it's raised.

        Costs two credits per call. ``timeout`` is compared against the sum of
        the ``interval`` values slept, not wall-clock time. With the defaults
        (``rounds=0``, ``timeout=None``) neither stop condition can trigger,
        so the loop only ends if something raises.

        Raises:
            AgentError: if no targets have been added.
        """
        if not self.targets:
            from ..exceptions import AgentError

            raise AgentError("No targets to monitor. Call add_target() first.")
        self._charge(2, "watch")
        all_incidents: list[Incident] = []
        waited = 0.0
        count = 0
        while True:
            results = self.poll()
            if on_poll:
                on_poll(results)
            new = self.evaluate()
            for inc in new:
                all_incidents.append(inc)
                if on_incident:
                    on_incident(inc)
            count += 1
            if rounds and count >= rounds:
                break
            # Checked before sleeping so no wait is spent after the budget is used.
            if timeout is not None and waited >= timeout:
                break
            self._sleep(interval)
            waited += interval
        return all_incidents
|