empathy-framework 4.9.1-py3-none-any.whl → 5.0.1-py3-none-any.whl
- {empathy_framework-4.9.1.dist-info → empathy_framework-5.0.1.dist-info}/METADATA +1 -1
- {empathy_framework-4.9.1.dist-info → empathy_framework-5.0.1.dist-info}/RECORD +47 -26
- empathy_os/__init__.py +1 -1
- empathy_os/cache/hash_only.py +6 -3
- empathy_os/cache/hybrid.py +6 -3
- empathy_os/cli_legacy.py +27 -1
- empathy_os/cli_minimal.py +512 -15
- empathy_os/cli_router.py +145 -113
- empathy_os/cli_unified.py +25 -0
- empathy_os/dashboard/__init__.py +42 -0
- empathy_os/dashboard/app.py +512 -0
- empathy_os/dashboard/simple_server.py +403 -0
- empathy_os/dashboard/standalone_server.py +536 -0
- empathy_os/memory/__init__.py +19 -5
- empathy_os/memory/short_term.py +4 -70
- empathy_os/memory/types.py +2 -2
- empathy_os/models/__init__.py +3 -0
- empathy_os/models/adaptive_routing.py +437 -0
- empathy_os/models/registry.py +4 -4
- empathy_os/socratic/ab_testing.py +1 -1
- empathy_os/telemetry/__init__.py +29 -1
- empathy_os/telemetry/agent_coordination.py +478 -0
- empathy_os/telemetry/agent_tracking.py +350 -0
- empathy_os/telemetry/approval_gates.py +563 -0
- empathy_os/telemetry/event_streaming.py +405 -0
- empathy_os/telemetry/feedback_loop.py +557 -0
- empathy_os/vscode_bridge 2.py +173 -0
- empathy_os/workflows/__init__.py +4 -4
- empathy_os/workflows/base.py +495 -43
- empathy_os/workflows/history.py +3 -5
- empathy_os/workflows/output.py +410 -0
- empathy_os/workflows/progress.py +324 -22
- empathy_os/workflows/progressive/README 2.md +454 -0
- empathy_os/workflows/progressive/__init__ 2.py +92 -0
- empathy_os/workflows/progressive/cli 2.py +242 -0
- empathy_os/workflows/progressive/core 2.py +488 -0
- empathy_os/workflows/progressive/orchestrator 2.py +701 -0
- empathy_os/workflows/progressive/reports 2.py +528 -0
- empathy_os/workflows/progressive/telemetry 2.py +280 -0
- empathy_os/workflows/progressive/test_gen 2.py +514 -0
- empathy_os/workflows/progressive/workflow 2.py +628 -0
- empathy_os/workflows/routing.py +5 -0
- empathy_os/workflows/security_audit.py +189 -0
- {empathy_framework-4.9.1.dist-info → empathy_framework-5.0.1.dist-info}/WHEEL +0 -0
- {empathy_framework-4.9.1.dist-info → empathy_framework-5.0.1.dist-info}/entry_points.txt +0 -0
- {empathy_framework-4.9.1.dist-info → empathy_framework-5.0.1.dist-info}/licenses/LICENSE +0 -0
- {empathy_framework-4.9.1.dist-info → empathy_framework-5.0.1.dist-info}/top_level.txt +0 -0
empathy_os/memory/types.py
CHANGED
@@ -42,13 +42,13 @@ class TTLStrategy(Enum):
     Per EMPATHY_PHILOSOPHY.md Section 9.3:
     - Working results: 1 hour
     - Staged patterns: 24 hours
-    - Coordination signals: 5 minutes
+    - Coordination signals: 5 minutes (REMOVED in v5.0 - see CoordinationSignals)
     - Conflict context: Until resolution
     """

     WORKING_RESULTS = 3600  # 1 hour
     STAGED_PATTERNS = 86400  # 24 hours
-    COORDINATION
+    # COORDINATION removed in v5.0 - use CoordinationSignals with custom TTLs
     CONFLICT_CONTEXT = 604800  # 7 days (fallback for unresolved)
     SESSION = 1800  # 30 minutes
     STREAM_ENTRY = 86400 * 7  # 7 days for audit stream entries
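Code that relied on the fixed 5-minute COORDINATION TTL must move to the new CoordinationSignals class exported from empathy_os.telemetry. A minimal migration sketch; CoordinationSignals' constructor and method names are not shown in this diff, so send_signal and its parameters below are assumptions, not the package's confirmed API:

    from empathy_os.telemetry import CoordinationSignals

    signals = CoordinationSignals()  # hypothetical no-arg construction
    # v4.x: ttl = TTLStrategy.COORDINATION (fixed 300 s) - removed in v5.0
    # v5.x: each signal carries its own TTL (hypothetical API below)
    signals.send_signal(
        name="agent-sync",           # assumed parameter
        payload={"status": "done"},  # assumed parameter
        ttl_seconds=300,             # the old 5-minute default, now explicit
    )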
empathy_os/models/__init__.py
CHANGED
@@ -9,6 +9,7 @@ Copyright 2025 Smart-AI-Memory
 Licensed under Fair Source License 0.9
 """

+from .adaptive_routing import AdaptiveModelRouter, ModelPerformance
 from .empathy_executor import EmpathyLLMExecutor
 from .executor import ExecutionContext, LLMExecutor, LLMResponse, MockLLMExecutor
 from .fallback import (
@@ -87,6 +88,7 @@ __all__ = [
     "MODEL_REGISTRY",
     "PREMIUM_TASKS",
     "TASK_TIER_MAP",
+    "AdaptiveModelRouter",
     "AgentAssignmentRecord",
     "CircuitBreaker",
     "CircuitBreakerState",
@@ -106,6 +108,7 @@ __all__ = [
     "LLMResponse",
     "MockLLMExecutor",
     "ModelInfo",
+    "ModelPerformance",
     "ModelProvider",
     # Registry exports
     "ModelRegistry",
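With these two exports added, the router is importable from the package root rather than its private module path. A quick sketch, mirroring the UsageTracker.get_instance() call shown in the new module's own docstring below:

    from empathy_os.models import AdaptiveModelRouter, ModelPerformance
    from empathy_os.telemetry import UsageTracker

    router = AdaptiveModelRouter(UsageTracker.get_instance())
    model_id = router.get_best_model(workflow="code-review", stage="analysis")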
empathy_os/models/adaptive_routing.py
ADDED
@@ -0,0 +1,437 @@
+"""Adaptive Model Routing based on historical telemetry.
+
+This module implements Pattern 3 from AGENT_COORDINATION_ARCHITECTURE.md:
+Using telemetry history to learn which models work best for each workflow/stage.
+
+Key Features:
+- Analyzes historical performance per model/workflow/stage
+- Recommends best model based on success rate and cost
+- Auto-detects when tier upgrades are needed (>20% failure rate)
+- Respects cost and latency constraints
+
+Copyright 2025 Smart-AI-Memory
+Licensed under Fair Source License 0.9
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# Lazy import to avoid circular dependency
+_model_registry = None
+
+
+def _get_registry():
+    """Get ModelRegistry instance (lazy load to avoid circular import)."""
+    global _model_registry
+    if _model_registry is None:
+        from .registry import MODEL_REGISTRY
+        _model_registry = MODEL_REGISTRY
+    return _model_registry
+
+
+@dataclass
+class ModelPerformance:
+    """Performance metrics for a model on a specific task.
+
+    Attributes:
+        model_id: Model identifier (e.g., "claude-sonnet-4.5")
+        tier: Model tier (CHEAP, CAPABLE, PREMIUM)
+        success_rate: Percentage of successful calls (0.0 - 1.0)
+        avg_latency_ms: Average response time in milliseconds
+        avg_cost: Average cost per call in USD
+        sample_size: Number of calls analyzed
+        recent_failures: Number of failures in last 20 calls
+    """
+
+    model_id: str
+    tier: str
+    success_rate: float
+    avg_latency_ms: float
+    avg_cost: float
+    sample_size: int
+    recent_failures: int = 0
+
+    @property
+    def quality_score(self) -> float:
+        """Calculate quality score for ranking models.
+
+        Score prioritizes:
+        1. Success rate (most important)
+        2. Cost (secondary)
+
+        Returns:
+            Quality score (higher is better)
+        """
+        # Success rate contributes 100 points max
+        # Lower cost adds bonus points
+        return (self.success_rate * 100) - (self.avg_cost * 10)
+
+
+class AdaptiveModelRouter:
+    """Route tasks to models based on historical telemetry performance.
+
+    Uses telemetry data to learn which models work best for each workflow/stage
+    combination. Automatically recommends tier upgrades when failure rates are high.
+
+    Example:
+        >>> from empathy_os.telemetry import UsageTracker
+        >>> router = AdaptiveModelRouter(UsageTracker.get_instance())
+        >>>
+        >>> # Get best model for this workflow stage
+        >>> model = router.get_best_model(
+        ...     workflow="code-review",
+        ...     stage="analysis",
+        ...     max_cost=0.01
+        ... )
+        >>> print(f"Using {model}")
+        Using claude-3-5-haiku-20241022
+        >>>
+        >>> # Check if we should upgrade tier
+        >>> should_upgrade, reason = router.recommend_tier_upgrade(
+        ...     workflow="code-review",
+        ...     stage="analysis"
+        ... )
+        >>> if should_upgrade:
+        ...     print(f"⚠️ {reason}")
+        ⚠️ High failure rate: 25.0% in last 20 calls
+    """
+
+    # Minimum sample size for making routing decisions
+    MIN_SAMPLE_SIZE = 10
+
+    # Failure rate threshold for tier upgrade recommendation
+    FAILURE_RATE_THRESHOLD = 0.2  # 20%
+
+    # Recent window size for failure detection
+    RECENT_WINDOW_SIZE = 20
+
+    def __init__(self, telemetry: Any):
+        """Initialize adaptive router.
+
+        Args:
+            telemetry: UsageTracker instance for telemetry data access
+        """
+        self.telemetry = telemetry
+
+    def _get_default_model(self, tier: str = "CHEAP") -> str:
+        """Get default Anthropic model for a tier from registry.
+
+        This dynamically fetches the current Anthropic model for each tier,
+        so when new models are released (e.g., Claude 5), they're automatically used.
+
+        Args:
+            tier: Tier name (CHEAP, CAPABLE, or PREMIUM)
+
+        Returns:
+            Model ID from registry (e.g., "claude-3-5-haiku-20241022")
+        """
+        registry = _get_registry()
+
+        # Get Anthropic model for this tier
+        tier_lower = tier.lower()
+        if tier_lower in registry.get("anthropic", {}):
+            return registry["anthropic"][tier_lower].id
+
+        # Fallback to known models if registry lookup fails
+        fallbacks = {
+            "cheap": "claude-3-5-haiku-20241022",
+            "capable": "claude-sonnet-4-5",
+            "premium": "claude-opus-4-5-20251101",
+        }
+        return fallbacks.get(tier_lower, "claude-3-5-haiku-20241022")
+
+    def get_best_model(
+        self,
+        workflow: str,
+        stage: str,
+        max_cost: float | None = None,
+        max_latency_ms: int | None = None,
+        min_success_rate: float = 0.8,
+    ) -> str:
+        """Get best model for workflow/stage based on historical performance.
+
+        Analyzes recent telemetry to find the model with the best quality score
+        (success rate + cost efficiency) that meets the specified constraints.
+
+        Args:
+            workflow: Workflow name (e.g., "code-review", "bug-predict")
+            stage: Stage name (e.g., "analysis", "synthesis")
+            max_cost: Maximum acceptable cost per call (USD)
+            max_latency_ms: Maximum acceptable latency (milliseconds)
+            min_success_rate: Minimum acceptable success rate (0.0 - 1.0)
+
+        Returns:
+            Model ID to use (e.g., "claude-3-5-haiku-20241022")
+
+        Example:
+            >>> model = router.get_best_model(
+            ...     workflow="code-review",
+            ...     stage="analysis",
+            ...     max_cost=0.01,
+            ...     min_success_rate=0.9
+            ... )
+            >>> print(model)
+            claude-3-5-haiku-20241022
+        """
+        # Get performance data for all models on this workflow/stage
+        performances = self._analyze_model_performance(workflow, stage)
+
+        if not performances:
+            # No historical data, use default Anthropic cheap model from registry
+            default_model = self._get_default_model("CHEAP")
+            logger.info(
+                "adaptive_routing_no_history",
+                workflow=workflow,
+                stage=stage,
+                fallback=default_model,
+            )
+            return default_model
+
+        # Filter by constraints
+        candidates = []
+        for perf in performances:
+            # Skip if insufficient data
+            if perf.sample_size < self.MIN_SAMPLE_SIZE:
+                continue
+
+            # Skip if doesn't meet minimum success rate
+            if perf.success_rate < min_success_rate:
+                continue
+
+            # Skip if exceeds cost constraint
+            if max_cost is not None and perf.avg_cost > max_cost:
+                continue
+
+            # Skip if exceeds latency constraint
+            if max_latency_ms is not None and perf.avg_latency_ms > max_latency_ms:
+                continue
+
+            candidates.append(perf)
+
+        if not candidates:
+            # All models filtered out, fall back to default Anthropic model
+            default_model = self._get_default_model("CHEAP")
+            logger.warning(
+                "adaptive_routing_no_candidates",
+                workflow=workflow,
+                stage=stage,
+                constraints={"max_cost": max_cost, "max_latency_ms": max_latency_ms},
+                fallback=default_model,
+            )
+            return default_model
+
+        # Sort by quality score (success rate + cost efficiency)
+        candidates.sort(key=lambda p: p.quality_score, reverse=True)
+        best = candidates[0]
+
+        logger.info(
+            "adaptive_routing_selected",
+            workflow=workflow,
+            stage=stage,
+            model=best.model_id,
+            success_rate=f"{best.success_rate:.1%}",
+            avg_cost=f"${best.avg_cost:.4f}",
+            sample_size=best.sample_size,
+        )
+
+        return best.model_id
+
+    def recommend_tier_upgrade(
+        self, workflow: str, stage: str
+    ) -> tuple[bool, str]:
+        """Check if tier should be upgraded based on failure rate.
+
+        Analyzes recent telemetry (last 20 calls) for this workflow/stage.
+        If failure rate exceeds threshold (20%), recommends tier upgrade.
+
+        Args:
+            workflow: Workflow name
+            stage: Stage name
+
+        Returns:
+            Tuple of (should_upgrade: bool, reason: str)
+
+        Example:
+            >>> should_upgrade, reason = router.recommend_tier_upgrade(
+            ...     workflow="code-review",
+            ...     stage="analysis"
+            ... )
+            >>> if should_upgrade:
+            ...     print(f"⚠️ Upgrading tier: {reason}")
+            ⚠️ Upgrading tier: High failure rate: 25.0% in last 20 calls
+        """
+        # Get recent entries for this workflow/stage
+        entries = self._get_workflow_stage_entries(workflow, stage, days=7)
+
+        if len(entries) < self.MIN_SAMPLE_SIZE:
+            return False, f"Insufficient data ({len(entries)} calls, need {self.MIN_SAMPLE_SIZE})"
+
+        # Analyze recent window (last 20 calls)
+        recent = entries[-self.RECENT_WINDOW_SIZE :]
+        failures = sum(1 for e in recent if not e.get("success", True))
+        failure_rate = failures / len(recent)
+
+        if failure_rate > self.FAILURE_RATE_THRESHOLD:
+            return (
+                True,
+                f"High failure rate: {failure_rate:.1%} ({failures}/{len(recent)} failed in recent calls)",
+            )
+
+        return False, f"Performance acceptable: {failure_rate:.1%} failure rate"
+
+    def get_routing_stats(
+        self, workflow: str, stage: str | None = None, days: int = 7
+    ) -> dict[str, Any]:
+        """Get routing statistics for a workflow (or specific stage).
+
+        Args:
+            workflow: Workflow name
+            stage: Optional stage name (None for all stages)
+            days: Number of days to analyze
+
+        Returns:
+            Dictionary with routing statistics:
+            - models_used: List of models used
+            - performance_by_model: Performance metrics per model
+            - total_calls: Total number of calls
+            - avg_cost: Average cost per call
+            - avg_success_rate: Average success rate
+
+        Example:
+            >>> stats = router.get_routing_stats("code-review", days=7)
+            >>> print(f"Models used: {stats['models_used']}")
+            Models used: ['claude-haiku-3.5', 'claude-sonnet-4.5']
+            >>> print(f"Average cost: ${stats['avg_cost']:.4f}")
+            Average cost: $0.0023
+        """
+        entries = self._get_workflow_stage_entries(workflow, stage, days=days)
+
+        if not entries:
+            return {
+                "models_used": [],
+                "performance_by_model": {},
+                "total_calls": 0,
+                "avg_cost": 0.0,
+                "avg_success_rate": 0.0,
+            }
+
+        # Calculate stats
+        models_used = list({e["model"] for e in entries})
+        total_calls = len(entries)
+        total_cost = sum(e.get("cost", 0.0) for e in entries)
+        successes = sum(1 for e in entries if e.get("success", True))
+
+        # Per-model performance
+        performance_by_model = {}
+        for model in models_used:
+            model_entries = [e for e in entries if e["model"] == model]
+            model_successes = sum(1 for e in model_entries if e.get("success", True))
+
+            performance_by_model[model] = {
+                "calls": len(model_entries),
+                "success_rate": model_successes / len(model_entries),
+                "avg_cost": sum(e.get("cost", 0.0) for e in model_entries)
+                / len(model_entries),
+                "avg_latency_ms": sum(e.get("duration_ms", 0) for e in model_entries)
+                / len(model_entries),
+            }
+
+        return {
+            "workflow": workflow,
+            "stage": stage or "all",
+            "days_analyzed": days,
+            "models_used": models_used,
+            "performance_by_model": performance_by_model,
+            "total_calls": total_calls,
+            "avg_cost": total_cost / total_calls if total_calls > 0 else 0.0,
+            "avg_success_rate": successes / total_calls if total_calls > 0 else 0.0,
+        }
+
+    def _analyze_model_performance(
+        self, workflow: str, stage: str, days: int = 7
+    ) -> list[ModelPerformance]:
+        """Analyze performance of all models for this workflow/stage.
+
+        Args:
+            workflow: Workflow name
+            stage: Stage name
+            days: Number of days to analyze
+
+        Returns:
+            List of ModelPerformance objects, one per model
+        """
+        entries = self._get_workflow_stage_entries(workflow, stage, days=days)
+
+        if not entries:
+            return []
+
+        # Group by model
+        by_model: dict[str, list[dict]] = {}
+        for entry in entries:
+            model = entry["model"]
+            if model not in by_model:
+                by_model[model] = []
+            by_model[model].append(entry)
+
+        # Calculate performance metrics per model
+        performances = []
+        for model, model_entries in by_model.items():
+            total = len(model_entries)
+            successes = sum(1 for e in model_entries if e.get("success", True))
+            success_rate = successes / total
+
+            avg_latency = (
+                sum(e.get("duration_ms", 0) for e in model_entries) / total
+            )
+            avg_cost = sum(e.get("cost", 0.0) for e in model_entries) / total
+
+            # Analyze recent failures (last 20 calls)
+            recent = model_entries[-self.RECENT_WINDOW_SIZE :]
+            recent_failures = sum(1 for e in recent if not e.get("success", True))
+
+            performances.append(
+                ModelPerformance(
+                    model_id=model,
+                    tier=model_entries[0].get("tier", "unknown"),
+                    success_rate=success_rate,
+                    avg_latency_ms=avg_latency,
+                    avg_cost=avg_cost,
+                    sample_size=total,
+                    recent_failures=recent_failures,
+                )
+            )
+
+        return performances
+
+    def _get_workflow_stage_entries(
+        self, workflow: str, stage: str | None, days: int
+    ) -> list[dict[str, Any]]:
+        """Get telemetry entries for a workflow/stage.
+
+        Args:
+            workflow: Workflow name
+            stage: Stage name (None for all stages)
+            days: Number of days to retrieve
+
+        Returns:
+            List of telemetry entries
+        """
+        # Get recent entries from telemetry tracker
+        all_entries = self.telemetry.get_recent_entries(limit=10000, days=days)
+
+        # Filter to this workflow
+        workflow_entries = [e for e in all_entries if e.get("workflow") == workflow]
+
+        # Filter to this stage if specified
+        if stage is not None:
+            workflow_entries = [e for e in workflow_entries if e.get("stage") == stage]
+
+        return workflow_entries
+
+
+__all__ = ["AdaptiveModelRouter", "ModelPerformance"]
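The router treats telemetry entries as plain dicts fetched via self.telemetry.get_recent_entries(limit=10000, days=days); the only keys it reads are workflow, stage, model, tier, success, cost, and duration_ms, with success defaulting to True and cost/duration_ms to zero when absent. A representative entry, with illustrative values:

    entry = {
        "workflow": "code-review",
        "stage": "analysis",
        "model": "claude-3-5-haiku-20241022",
        "tier": "cheap",       # falls back to "unknown" if missing
        "success": True,       # treated as True when absent
        "cost": 0.0021,        # USD per call; defaults to 0.0
        "duration_ms": 840,    # defaults to 0
    }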
empathy_os/models/registry.py
CHANGED
@@ -209,11 +209,11 @@ class ModelRegistry:
         """Build tier and model ID caches for O(1) lookups."""
         # Cache for get_models_by_tier (tier -> list[ModelInfo])
         self._tier_cache: dict[str, list[ModelInfo]] = {}
-        for
-        self._tier_cache[
-        provider_models[
+        for tier in ModelTier:
+            self._tier_cache[tier.value] = [
+                provider_models[tier.value]
                 for provider_models in self._registry.values()
-        if
+                if tier.value in provider_models
             ]

         # Cache for get_model_by_id (model_id -> ModelInfo)
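The removed 4.9.1 lines are truncated mid-statement (whether in the shipped wheel or in this diff's rendering); 5.0.1 carries the complete loop and list comprehension, which builds one list of ModelInfo per tier across all providers. A usage sketch of the lookups the caches serve; the code comments name get_models_by_tier and get_model_by_id as the consumers, but their exact signatures are not shown in this diff, so the string arguments below are assumptions:

    registry = ModelRegistry()
    cheap = registry.get_models_by_tier("cheap")                    # O(1) via _tier_cache
    haiku = registry.get_model_by_id("claude-3-5-haiku-20241022")   # O(1) via the ID cache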
empathy_os/telemetry/__init__.py
CHANGED
@@ -2,10 +2,38 @@

 Privacy-first, local-only usage tracking to measure actual cost savings.

+Includes:
+- UsageTracker: Track LLM usage and costs
+- HeartbeatCoordinator: Monitor agent liveness via TTL heartbeats
+- CoordinationSignals: Inter-agent communication via TTL signals
+- EventStreamer: Real-time event streaming via Redis Streams
+- ApprovalGate: Human approval gates for workflow control
+- FeedbackLoop: Agent-to-LLM quality feedback for adaptive routing
+
 Copyright 2025 Smart-AI-Memory
 Licensed under Fair Source License 0.9
 """

+from .agent_coordination import CoordinationSignal, CoordinationSignals
+from .agent_tracking import AgentHeartbeat, HeartbeatCoordinator
+from .approval_gates import ApprovalGate, ApprovalRequest, ApprovalResponse
+from .event_streaming import EventStreamer, StreamEvent
+from .feedback_loop import FeedbackEntry, FeedbackLoop, QualityStats, TierRecommendation
 from .usage_tracker import UsageTracker

-__all__ = [
+__all__ = [
+    "UsageTracker",
+    "HeartbeatCoordinator",
+    "AgentHeartbeat",
+    "CoordinationSignals",
+    "CoordinationSignal",
+    "EventStreamer",
+    "StreamEvent",
+    "ApprovalGate",
+    "ApprovalRequest",
+    "ApprovalResponse",
+    "FeedbackLoop",
+    "FeedbackEntry",
+    "QualityStats",
+    "TierRecommendation",
+]