techrevati-runtime 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- techrevati/__init__.py +0 -0
- techrevati/runtime/__init__.py +177 -0
- techrevati/runtime/agent_events.py +270 -0
- techrevati/runtime/agent_lifecycle.py +224 -0
- techrevati/runtime/circuit_breaker.py +243 -0
- techrevati/runtime/data/pricing.json +10 -0
- techrevati/runtime/guardrails.py +138 -0
- techrevati/runtime/handoffs.py +57 -0
- techrevati/runtime/orchestrator.py +828 -0
- techrevati/runtime/otel.py +190 -0
- techrevati/runtime/permissions.py +139 -0
- techrevati/runtime/policy_engine.py +243 -0
- techrevati/runtime/py.typed +0 -0
- techrevati/runtime/quality_gate.py +65 -0
- techrevati/runtime/retry_policy.py +424 -0
- techrevati/runtime/sinks.py +115 -0
- techrevati/runtime/usage_tracking.py +232 -0
- techrevati_runtime-0.1.0.dist-info/METADATA +212 -0
- techrevati_runtime-0.1.0.dist-info/RECORD +21 -0
- techrevati_runtime-0.1.0.dist-info/WHEEL +4 -0
- techrevati_runtime-0.1.0.dist-info/licenses/LICENSE +21 -0
techrevati/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
techrevati.runtime — Runtime primitives for multi-step LLM agent loops.
|
|
3
|
+
|
|
4
|
+
Reliability, cost tracking, and lifecycle for multi-step agent execution.
|
|
5
|
+
Zero runtime dependencies.
|
|
6
|
+
|
|
7
|
+
>>> from techrevati.runtime import Orchestrator, UsageSnapshot
|
|
8
|
+
>>> from techrevati.runtime import classify_exception, attempt_recovery, RecoveryContext
|
|
9
|
+
>>> from techrevati.runtime import CircuitBreaker, PolicyEngine
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
__version__ = "0.1.0"
|
|
13
|
+
|
|
14
|
+
from techrevati.runtime.agent_events import (
|
|
15
|
+
AgentEvent,
|
|
16
|
+
AgentEventName,
|
|
17
|
+
AgentEventStatus,
|
|
18
|
+
AgentFailureClass,
|
|
19
|
+
)
|
|
20
|
+
from techrevati.runtime.agent_lifecycle import (
|
|
21
|
+
AgentRegistry,
|
|
22
|
+
AgentStatus,
|
|
23
|
+
AgentWorker,
|
|
24
|
+
AgentWorkerEvent,
|
|
25
|
+
InvalidTransitionError,
|
|
26
|
+
)
|
|
27
|
+
from techrevati.runtime.circuit_breaker import (
|
|
28
|
+
AsyncCircuitBreaker,
|
|
29
|
+
CircuitBreaker,
|
|
30
|
+
CircuitOpenError,
|
|
31
|
+
CircuitState,
|
|
32
|
+
)
|
|
33
|
+
from techrevati.runtime.guardrails import (
|
|
34
|
+
AllowAllGuardrail,
|
|
35
|
+
Guardrail,
|
|
36
|
+
GuardrailOutcome,
|
|
37
|
+
GuardrailStage,
|
|
38
|
+
GuardrailViolatedError,
|
|
39
|
+
)
|
|
40
|
+
from techrevati.runtime.handoffs import Handoff
|
|
41
|
+
from techrevati.runtime.orchestrator import (
|
|
42
|
+
AgentSession,
|
|
43
|
+
AsyncOrchestrationSession,
|
|
44
|
+
MaxIterationsExceededError,
|
|
45
|
+
OrchestrationSession,
|
|
46
|
+
Orchestrator,
|
|
47
|
+
PermissionDeniedError,
|
|
48
|
+
TurnTimeoutError,
|
|
49
|
+
)
|
|
50
|
+
from techrevati.runtime.permissions import (
|
|
51
|
+
PermissionEnforcer,
|
|
52
|
+
PermissionMode,
|
|
53
|
+
PermissionOutcome,
|
|
54
|
+
PermissionPolicy,
|
|
55
|
+
RolePermissionConfig,
|
|
56
|
+
)
|
|
57
|
+
from techrevati.runtime.policy_engine import (
|
|
58
|
+
PhaseContext,
|
|
59
|
+
PolicyAction,
|
|
60
|
+
PolicyActionData,
|
|
61
|
+
PolicyCondition,
|
|
62
|
+
PolicyEngine,
|
|
63
|
+
PolicyRule,
|
|
64
|
+
)
|
|
65
|
+
from techrevati.runtime.quality_gate import (
|
|
66
|
+
QualityGate,
|
|
67
|
+
QualityGateOutcome,
|
|
68
|
+
QualityLevel,
|
|
69
|
+
)
|
|
70
|
+
from techrevati.runtime.retry_policy import (
|
|
71
|
+
EscalationPolicy,
|
|
72
|
+
FailureScenario,
|
|
73
|
+
RecoveryContext,
|
|
74
|
+
RecoveryEvent,
|
|
75
|
+
RecoveryRecipe,
|
|
76
|
+
RecoveryResult,
|
|
77
|
+
RecoveryStep,
|
|
78
|
+
aattempt_recovery,
|
|
79
|
+
attempt_recovery,
|
|
80
|
+
backoff_delay,
|
|
81
|
+
classify_exception,
|
|
82
|
+
next_provider,
|
|
83
|
+
recipe_for,
|
|
84
|
+
smaller_context_budget,
|
|
85
|
+
)
|
|
86
|
+
from techrevati.runtime.sinks import (
|
|
87
|
+
DEFAULT_RING_CAPACITY,
|
|
88
|
+
EventSink,
|
|
89
|
+
NoopEventSink,
|
|
90
|
+
NoopUsageSink,
|
|
91
|
+
RingBufferEventSink,
|
|
92
|
+
RingBufferUsageSink,
|
|
93
|
+
UsageSink,
|
|
94
|
+
)
|
|
95
|
+
from techrevati.runtime.usage_tracking import (
|
|
96
|
+
PRICING_TABLE,
|
|
97
|
+
BudgetExceededError,
|
|
98
|
+
ModelPricing,
|
|
99
|
+
UsageSnapshot,
|
|
100
|
+
UsageTracker,
|
|
101
|
+
has_pricing,
|
|
102
|
+
load_pricing_from_file,
|
|
103
|
+
register_pricing,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
__all__ = [
|
|
107
|
+
"AgentEvent",
|
|
108
|
+
"AgentEventName",
|
|
109
|
+
"AgentEventStatus",
|
|
110
|
+
"AgentFailureClass",
|
|
111
|
+
"AgentRegistry",
|
|
112
|
+
"AgentSession",
|
|
113
|
+
"AgentStatus",
|
|
114
|
+
"AgentWorker",
|
|
115
|
+
"AgentWorkerEvent",
|
|
116
|
+
"AllowAllGuardrail",
|
|
117
|
+
"AsyncCircuitBreaker",
|
|
118
|
+
"AsyncOrchestrationSession",
|
|
119
|
+
"BudgetExceededError",
|
|
120
|
+
"CircuitBreaker",
|
|
121
|
+
"CircuitOpenError",
|
|
122
|
+
"CircuitState",
|
|
123
|
+
"DEFAULT_RING_CAPACITY",
|
|
124
|
+
"EscalationPolicy",
|
|
125
|
+
"EventSink",
|
|
126
|
+
"FailureScenario",
|
|
127
|
+
"Guardrail",
|
|
128
|
+
"GuardrailOutcome",
|
|
129
|
+
"GuardrailStage",
|
|
130
|
+
"GuardrailViolatedError",
|
|
131
|
+
"Handoff",
|
|
132
|
+
"InvalidTransitionError",
|
|
133
|
+
"MaxIterationsExceededError",
|
|
134
|
+
"ModelPricing",
|
|
135
|
+
"NoopEventSink",
|
|
136
|
+
"NoopUsageSink",
|
|
137
|
+
"OrchestrationSession",
|
|
138
|
+
"Orchestrator",
|
|
139
|
+
"PermissionDeniedError",
|
|
140
|
+
"PermissionEnforcer",
|
|
141
|
+
"PermissionMode",
|
|
142
|
+
"PermissionOutcome",
|
|
143
|
+
"PermissionPolicy",
|
|
144
|
+
"PhaseContext",
|
|
145
|
+
"PolicyAction",
|
|
146
|
+
"PolicyActionData",
|
|
147
|
+
"PolicyCondition",
|
|
148
|
+
"PolicyEngine",
|
|
149
|
+
"PolicyRule",
|
|
150
|
+
"PRICING_TABLE",
|
|
151
|
+
"QualityGate",
|
|
152
|
+
"QualityGateOutcome",
|
|
153
|
+
"QualityLevel",
|
|
154
|
+
"RecoveryContext",
|
|
155
|
+
"RecoveryEvent",
|
|
156
|
+
"RecoveryRecipe",
|
|
157
|
+
"RecoveryResult",
|
|
158
|
+
"RecoveryStep",
|
|
159
|
+
"RingBufferEventSink",
|
|
160
|
+
"RingBufferUsageSink",
|
|
161
|
+
"RolePermissionConfig",
|
|
162
|
+
"TurnTimeoutError",
|
|
163
|
+
"UsageSink",
|
|
164
|
+
"UsageSnapshot",
|
|
165
|
+
"UsageTracker",
|
|
166
|
+
"__version__",
|
|
167
|
+
"aattempt_recovery",
|
|
168
|
+
"attempt_recovery",
|
|
169
|
+
"backoff_delay",
|
|
170
|
+
"classify_exception",
|
|
171
|
+
"has_pricing",
|
|
172
|
+
"load_pricing_from_file",
|
|
173
|
+
"next_provider",
|
|
174
|
+
"recipe_for",
|
|
175
|
+
"register_pricing",
|
|
176
|
+
"smaller_context_budget",
|
|
177
|
+
]
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agent Events — Typed lifecycle events with a failure taxonomy.
|
|
3
|
+
|
|
4
|
+
Provides a structured event schema for agent execution. Events are
|
|
5
|
+
JSON-serializable and include both an 'event' (full path) and 'type'
|
|
6
|
+
(short tail) field, so consumers can route on either one.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from dataclasses import dataclass, field, replace
|
|
13
|
+
from datetime import UTC, datetime
|
|
14
|
+
from enum import Enum
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AgentEventName(str, Enum):
    """Dotted names for every agent, phase, recovery and hook event.

    Members subclass ``str``, so each one compares equal to its dotted
    value (for example ``"agent.started"``).
    """

    # --- agent lifecycle ---
    AGENT_STARTED = "agent.started"
    AGENT_READY = "agent.ready"
    AGENT_BLOCKED = "agent.blocked"
    AGENT_TOOL_CALLED = "agent.tool_called"
    AGENT_TOOL_COMPLETED = "agent.tool_completed"
    AGENT_COMPLETED = "agent.completed"
    AGENT_FAILED = "agent.failed"

    # --- phase lifecycle ---
    PHASE_STARTED = "phase.started"
    PHASE_COMPLETED = "phase.completed"
    PHASE_GATE_EVALUATED = "phase.gate_evaluated"
    PHASE_GATE_PASSED = "phase.gate_passed"
    PHASE_GATE_FAILED = "phase.gate_failed"

    # --- recovery ---
    RECOVERY_ATTEMPTED = "agent.recovery.attempted"
    RECOVERY_SUCCEEDED = "agent.recovery.succeeded"
    RECOVERY_FAILED = "agent.recovery.failed"
    RECOVERY_ESCALATED = "agent.recovery.escalated"
    RECOVERY_PROVIDER_SWITCHED = "agent.recovery.provider_switched"

    # --- hooks ---
    HOOK_PRE_TOOL = "hook.pre_tool"
    HOOK_POST_TOOL = "hook.post_tool"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class AgentEventStatus(str, Enum):
    """Status value attached to an event at the moment it is emitted.

    Members subclass ``str`` and compare equal to their lowercase value.
    """

    RUNNING = "running"
    READY = "ready"
    BLOCKED = "blocked"
    GREEN = "green"
    RED = "red"
    COMPLETED = "completed"
    FAILED = "failed"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class AgentFailureClass(str, Enum):
    """Closed set of failure categories used to classify agent errors.

    Members subclass ``str`` and compare equal to their snake_case value.
    """

    LLM_TIMEOUT = "llm_timeout"
    LLM_ERROR = "llm_error"
    TOOL_ERROR = "tool_error"
    CONTEXT_OVERFLOW = "context_overflow"
    RATE_LIMIT = "rate_limit"
    DEPENDENCY_FAILED = "dependency_failed"
    MEMORY_CORRUPTION = "memory_corruption"
    VALIDATION_ERROR = "validation_error"
    PROMPT_REJECTION = "prompt_rejection"
    UNKNOWN = "unknown"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _now_iso() -> str:
|
|
74
|
+
return datetime.now(UTC).isoformat()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass(frozen=True)
class AgentEvent:
    """A typed, immutable lifecycle event for agent orchestration.

    Only ``event`` and ``status`` are required. ``emitted_at`` defaults to
    the current UTC time as an ISO 8601 string; every other field defaults
    to ``None`` and is left out of the serialized forms when unset.
    """

    event: AgentEventName
    status: AgentEventStatus
    emitted_at: str = field(default_factory=_now_iso)
    role: str | None = None
    phase: str | None = None
    project_id: int | None = None
    failure_class: AgentFailureClass | None = None
    detail: str | None = None
    data: dict[str, Any] | None = None

    # -- Builder methods (return new instances) --

    def with_failure_class(self, fc: AgentFailureClass) -> AgentEvent:
        """Return a copy of this event with ``failure_class`` set to *fc*."""
        return replace(self, failure_class=fc)

    def with_detail(self, detail: str) -> AgentEvent:
        """Return a copy of this event with ``detail`` replaced."""
        return replace(self, detail=detail)

    def with_data(self, data: dict[str, Any]) -> AgentEvent:
        """Return a copy of this event with ``data`` replaced.

        The dict is stored by reference, not copied.
        """
        return replace(self, data=data)

    def with_project(self, project_id: int) -> AgentEvent:
        """Return a copy of this event with ``project_id`` replaced."""
        return replace(self, project_id=project_id)

    # -- Convenience constructors --

    @classmethod
    def started(cls, role: str, phase: str) -> AgentEvent:
        """Build an ``agent.started`` event with status ``running``."""
        return cls(
            event=AgentEventName.AGENT_STARTED,
            status=AgentEventStatus.RUNNING,
            role=role,
            phase=phase,
        )

    @classmethod
    def completed(cls, role: str, phase: str, detail: str | None = None) -> AgentEvent:
        """Build an ``agent.completed`` event with status ``completed``."""
        return cls(
            event=AgentEventName.AGENT_COMPLETED,
            status=AgentEventStatus.COMPLETED,
            role=role,
            phase=phase,
            detail=detail,
        )

    @classmethod
    def failed(
        cls,
        role: str,
        phase: str,
        failure_class: AgentFailureClass,
        detail: str | None = None,
    ) -> AgentEvent:
        """Build an ``agent.failed`` event carrying its failure class."""
        return cls(
            event=AgentEventName.AGENT_FAILED,
            status=AgentEventStatus.FAILED,
            role=role,
            phase=phase,
            failure_class=failure_class,
            detail=detail,
        )

    @classmethod
    def phase_started(cls, phase: str) -> AgentEvent:
        """Build a ``phase.started`` event with status ``running``."""
        return cls(
            event=AgentEventName.PHASE_STARTED,
            status=AgentEventStatus.RUNNING,
            phase=phase,
        )

    @classmethod
    def gate_passed(cls, phase: str, detail: str | None = None) -> AgentEvent:
        """Build a ``phase.gate_passed`` event with status ``green``."""
        return cls(
            event=AgentEventName.PHASE_GATE_PASSED,
            status=AgentEventStatus.GREEN,
            phase=phase,
            detail=detail,
        )

    @classmethod
    def gate_failed(cls, phase: str, detail: str | None = None) -> AgentEvent:
        """Build a ``phase.gate_failed`` event with status ``red``."""
        return cls(
            event=AgentEventName.PHASE_GATE_FAILED,
            status=AgentEventStatus.RED,
            phase=phase,
            detail=detail,
        )

    @classmethod
    def recovery_attempted(
        cls, role: str, phase: str, detail: str | None = None
    ) -> AgentEvent:
        """Build an ``agent.recovery.attempted`` event with status ``running``."""
        return cls(
            event=AgentEventName.RECOVERY_ATTEMPTED,
            status=AgentEventStatus.RUNNING,
            role=role,
            phase=phase,
            detail=detail,
        )

    # -- Serialization --

    def to_dict(self) -> dict[str, Any]:
        """Return a dict of the event with enums flattened to their values.

        ``event`` holds the full dotted name and ``type`` holds only its
        last dotted segment (kept for backward compatibility). Optional
        fields appear only when they are not ``None``. ``data`` is included
        by reference, not copied.
        """
        d: dict[str, Any] = {
            "event": self.event.value,
            "type": self.event.value.split(".")[-1],  # backward compat
            "status": self.status.value,
            "emitted_at": self.emitted_at,
        }
        if self.role is not None:
            d["role"] = self.role
        if self.phase is not None:
            d["phase"] = self.phase
        if self.project_id is not None:
            d["project_id"] = self.project_id
        if self.failure_class is not None:
            d["failure_class"] = self.failure_class.value
        if self.detail is not None:
            d["detail"] = self.detail
        if self.data is not None:
            d["data"] = self.data
        return d

    def to_json(self) -> str:
        """Return ``to_dict()`` encoded as JSON, keeping non-ASCII text as-is."""
        return json.dumps(self.to_dict(), ensure_ascii=False)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> AgentEvent:
        """Reconstruct an ``AgentEvent`` from a dict such as ``to_dict()`` output.

        Enum fields accept either the enum member or its string value. A
        missing or ``None`` ``emitted_at`` is replaced by the current UTC
        time. The ``type`` key is ignored.

        Raises:
            ValueError: if ``event`` or ``status`` is missing, if any enum
                field has an unsupported type, or if a string does not name
                a member of the corresponding enum.
        """
        event_raw = data.get("event")
        if isinstance(event_raw, AgentEventName):
            event = event_raw
        elif isinstance(event_raw, str):
            event = AgentEventName(event_raw)
        else:
            raise ValueError(f"event field missing or invalid: {event_raw!r}")

        status_raw = data.get("status")
        if isinstance(status_raw, AgentEventStatus):
            status = status_raw
        elif isinstance(status_raw, str):
            status = AgentEventStatus(status_raw)
        else:
            raise ValueError(f"status field missing or invalid: {status_raw!r}")

        failure_class: AgentFailureClass | None = None
        fc_raw = data.get("failure_class")
        if isinstance(fc_raw, AgentFailureClass):
            failure_class = fc_raw
        elif isinstance(fc_raw, str):
            failure_class = AgentFailureClass(fc_raw)
        elif fc_raw is not None:
            # Reject rather than silently drop, matching event/status above.
            raise ValueError(f"failure_class field invalid: {fc_raw!r}")

        # Treat an explicit null like a missing key so emitted_at is never
        # None; the timestamp is only generated when it is actually needed.
        emitted_at = data.get("emitted_at")
        if emitted_at is None:
            emitted_at = _now_iso()

        return cls(
            event=event,
            status=status,
            emitted_at=emitted_at,
            role=data.get("role"),
            phase=data.get("phase"),
            project_id=data.get("project_id"),
            failure_class=failure_class,
            detail=data.get("detail"),
            data=data.get("data"),
        )

    @classmethod
    def from_json(cls, s: str) -> AgentEvent:
        """Reconstruct an ``AgentEvent`` from a JSON string via ``from_dict``."""
        data = json.loads(s)
        return cls.from_dict(data)

    def to_otel_attributes(self) -> dict[str, str | int | float]:
        """Return a flat attribute mapping keyed with ``agent.*`` names.

        Optional fields are included only when they are not ``None``.
        ``data`` is not included.
        """
        attrs: dict[str, str | int | float] = {
            "agent.event": self.event.value,
            "agent.event.status": self.status.value,
            "agent.event.timestamp": self.emitted_at,
        }
        if self.role is not None:
            attrs["agent.role"] = self.role
        if self.phase is not None:
            attrs["agent.phase"] = self.phase
        if self.project_id is not None:
            attrs["agent.project_id"] = self.project_id
        if self.failure_class is not None:
            attrs["agent.failure_class"] = self.failure_class.value
        if self.detail is not None:
            attrs["agent.detail"] = self.detail
        return attrs
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agent Lifecycle — Agent state machine with event log.
|
|
3
|
+
|
|
4
|
+
Explicit agent lifecycle states with validated transitions. Every
|
|
5
|
+
state change is recorded as an event with timestamp and detail.
|
|
6
|
+
AgentRegistry provides thread-safe concurrent access.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import threading
|
|
13
|
+
import uuid
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from datetime import UTC, datetime
|
|
16
|
+
from enum import Enum
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
# Module logger with a NullHandler attached, the standard library's
# recommended setup for library code.
logger = logging.getLogger("techrevati.runtime.lifecycle")
logger.addHandler(logging.NullHandler())
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AgentStatus(str, Enum):
    """States an agent worker can be in over its lifetime.

    Members subclass ``str`` and compare equal to their lowercase value.
    """

    IDLE = "idle"
    INITIALIZING = "initializing"
    WAITING_FOR_INPUT = "waiting_for_input"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Allowed lifecycle moves, keyed by the current state. Every non-terminal
# state may move to FAILED or to CANCELLED (caller-driven cancellation,
# asyncio.CancelledError, timeout, etc.); terminal states allow no moves.
_VALID_TRANSITIONS: dict[AgentStatus, set[AgentStatus]] = {
    AgentStatus.IDLE: {
        AgentStatus.INITIALIZING,
        AgentStatus.FAILED,
        AgentStatus.CANCELLED,
    },
    AgentStatus.INITIALIZING: {
        AgentStatus.WAITING_FOR_INPUT,
        AgentStatus.RUNNING,
        AgentStatus.FAILED,
        AgentStatus.CANCELLED,
    },
    AgentStatus.WAITING_FOR_INPUT: {
        AgentStatus.RUNNING,
        AgentStatus.FAILED,
        AgentStatus.CANCELLED,
    },
    AgentStatus.RUNNING: {
        AgentStatus.WAITING_FOR_INPUT,
        AgentStatus.COMPLETED,
        AgentStatus.FAILED,
        AgentStatus.CANCELLED,
    },
    # Terminal states: nothing may follow them.
    AgentStatus.COMPLETED: set(),
    AgentStatus.FAILED: set(),
    AgentStatus.CANCELLED: set(),
}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class InvalidTransitionError(Exception):
    """Raised when an invalid state transition is attempted.

    Attributes:
        current: the state the worker was in.
        target: the state that was requested and rejected.
    """

    def __init__(self, current: AgentStatus, target: AgentStatus) -> None:
        self.current = current
        self.target = target
        super().__init__(f"Invalid transition: {current.value} → {target.value}")

    def __reduce__(self) -> tuple[Any, ...]:
        """Support pickling and copying of this exception.

        ``args`` holds only the formatted message, so the default exception
        reduce would call the two-argument constructor with one argument
        and fail. Rebuild from ``current`` and ``target`` instead, and carry
        the instance ``__dict__`` along as state.
        """
        return (type(self), (self.current, self.target), dict(vars(self)))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _now_iso() -> str:
|
|
76
|
+
return datetime.now(UTC).isoformat()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass(frozen=True)
class AgentWorkerEvent:
    """Frozen record describing one state transition of a worker."""

    seq: int
    kind: str
    status: str
    detail: str | None
    timestamp: str

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a plain dict.

        ``detail`` is included only when it is truthy, so both ``None`` and
        an empty string are left out.
        """
        payload: dict[str, Any] = dict(
            seq=self.seq,
            kind=self.kind,
            status=self.status,
            timestamp=self.timestamp,
        )
        if not self.detail:
            return payload
        payload["detail"] = self.detail
        return payload
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@dataclass
class AgentWorker:
    """Mutable record of one agent's state plus its transition history."""

    worker_id: str
    role: str
    phase: str
    project_id: int | None = None
    status: AgentStatus = AgentStatus.IDLE
    events: list[AgentWorkerEvent] = field(default_factory=list)
    retry_count: int = 0
    last_error: dict[str, Any] | None = None
    provider_used: str | None = None
    created_at: str = field(default_factory=_now_iso)
    updated_at: str = field(default_factory=_now_iso)

    def transition(
        self, new_status: AgentStatus, detail: str | None = None
    ) -> AgentWorkerEvent:
        """Move the worker to *new_status* and log the change.

        Appends an ``AgentWorkerEvent`` to ``events``, updates ``status``
        and ``updated_at``, and records ``last_error`` when the new state
        is FAILED and a non-empty *detail* was given.

        Returns:
            The event that was appended.

        Raises:
            InvalidTransitionError: if *new_status* is not allowed from the
                current state.
        """
        allowed = _VALID_TRANSITIONS.get(self.status, set())
        if new_status not in allowed:
            raise InvalidTransitionError(self.status, new_status)

        stamp = _now_iso()
        record = AgentWorkerEvent(
            seq=len(self.events) + 1,  # sequence numbers start at 1
            kind=new_status.value,
            status=new_status.value,
            detail=detail,
            timestamp=stamp,
        )
        self.events.append(record)
        self.status = new_status
        self.updated_at = stamp

        if detail and new_status == AgentStatus.FAILED:
            self.last_error = {"message": detail, "timestamp": stamp}

        return record

    @property
    def is_terminal(self) -> bool:
        """True when the worker is COMPLETED, FAILED or CANCELLED."""
        terminal_states = (
            AgentStatus.COMPLETED,
            AgentStatus.FAILED,
            AgentStatus.CANCELLED,
        )
        return self.status in terminal_states

    def to_dict(self) -> dict[str, Any]:
        """Return the worker as a dict, with each event rendered via ``to_dict``."""
        history = [entry.to_dict() for entry in self.events]
        return {
            "worker_id": self.worker_id,
            "role": self.role,
            "phase": self.phase,
            "project_id": self.project_id,
            "status": self.status.value,
            "retry_count": self.retry_count,
            "last_error": self.last_error,
            "provider_used": self.provider_used,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "events": history,
        }
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class AgentRegistry:
    """Registry of ``AgentWorker`` objects keyed by worker id.

    Every access to the internal mapping is made while holding a
    ``threading.Lock``. Lookups return the stored worker objects
    themselves, not copies.
    """

    def __init__(self) -> None:
        self._workers: dict[str, AgentWorker] = {}
        self._lock = threading.Lock()

    def create(
        self,
        role: str,
        phase: str,
        project_id: int | None = None,
    ) -> AgentWorker:
        """Create, register and return a new worker.

        The id has the form ``<role>-<phase>-<8 hex chars>``, where the
        suffix comes from a random UUID.
        """
        suffix = uuid.uuid4().hex[:8]
        new_id = f"{role}-{phase}-{suffix}"
        created = AgentWorker(
            worker_id=new_id,
            role=role,
            phase=phase,
            project_id=project_id,
        )
        with self._lock:
            self._workers[new_id] = created
        return created

    def get(self, worker_id: str) -> AgentWorker | None:
        """Return the worker registered under *worker_id*, or ``None``."""
        with self._lock:
            return self._workers.get(worker_id)

    def transition(
        self,
        worker_id: str,
        status: AgentStatus,
        detail: str | None = None,
    ) -> AgentWorker:
        """Transition a registered worker while holding the registry lock.

        Returns:
            The worker after the transition.

        Raises:
            KeyError: if no worker is registered under *worker_id*.
        """
        with self._lock:
            if worker_id not in self._workers:
                raise KeyError(f"Worker not found: {worker_id}")
            target = self._workers[worker_id]
            target.transition(status, detail)
            return target

    def list_active(self) -> list[AgentWorker]:
        """Return every registered worker that is not in a terminal state."""
        with self._lock:
            return [
                candidate
                for candidate in self._workers.values()
                if not candidate.is_terminal
            ]

    def get_by_role_phase(self, role: str, phase: str) -> AgentWorker | None:
        """Return the first non-terminal worker matching *role* and *phase*.

        Returns ``None`` when there is no such worker.
        """
        with self._lock:
            matches = (
                candidate
                for candidate in self._workers.values()
                if candidate.role == role
                and candidate.phase == phase
                and not candidate.is_terminal
            )
            return next(matches, None)

    def get_by_project(self, project_id: int) -> list[AgentWorker]:
        """Return all workers for *project_id*, terminal ones included."""
        with self._lock:
            return [
                candidate
                for candidate in self._workers.values()
                if candidate.project_id == project_id
            ]

    def clear(self) -> None:
        """Clear all workers (for testing)."""
        with self._lock:
            self._workers.clear()
|