empathy-framework 5.0.1__py3-none-any.whl → 5.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {empathy_framework-5.0.1.dist-info → empathy_framework-5.1.0.dist-info}/METADATA +311 -150
- {empathy_framework-5.0.1.dist-info → empathy_framework-5.1.0.dist-info}/RECORD +60 -33
- empathy_framework-5.1.0.dist-info/licenses/LICENSE +201 -0
- empathy_framework-5.1.0.dist-info/licenses/LICENSE_CHANGE_ANNOUNCEMENT.md +101 -0
- empathy_llm_toolkit/providers.py +175 -35
- empathy_llm_toolkit/utils/tokens.py +150 -30
- empathy_os/__init__.py +1 -1
- empathy_os/cli/commands/batch.py +256 -0
- empathy_os/cli/commands/cache.py +248 -0
- empathy_os/cli/commands/inspect.py +1 -2
- empathy_os/cli/commands/metrics.py +1 -1
- empathy_os/cli/commands/routing.py +285 -0
- empathy_os/cli/commands/workflow.py +2 -1
- empathy_os/cli/parsers/__init__.py +6 -0
- empathy_os/cli/parsers/batch.py +118 -0
- empathy_os/cli/parsers/cache 2.py +65 -0
- empathy_os/cli/parsers/cache.py +65 -0
- empathy_os/cli/parsers/routing.py +110 -0
- empathy_os/cli_minimal.py +3 -3
- empathy_os/cli_router 2.py +416 -0
- empathy_os/dashboard/__init__.py +1 -2
- empathy_os/dashboard/app 2.py +512 -0
- empathy_os/dashboard/app.py +1 -1
- empathy_os/dashboard/simple_server 2.py +403 -0
- empathy_os/dashboard/standalone_server 2.py +536 -0
- empathy_os/dashboard/standalone_server.py +22 -11
- empathy_os/memory/types 2.py +441 -0
- empathy_os/metrics/collector.py +31 -0
- empathy_os/models/__init__.py +19 -0
- empathy_os/models/adaptive_routing 2.py +437 -0
- empathy_os/models/auth_cli.py +444 -0
- empathy_os/models/auth_strategy.py +450 -0
- empathy_os/models/token_estimator.py +21 -13
- empathy_os/project_index/scanner_parallel 2.py +291 -0
- empathy_os/telemetry/agent_coordination 2.py +478 -0
- empathy_os/telemetry/agent_coordination.py +14 -16
- empathy_os/telemetry/agent_tracking 2.py +350 -0
- empathy_os/telemetry/agent_tracking.py +18 -20
- empathy_os/telemetry/approval_gates 2.py +563 -0
- empathy_os/telemetry/approval_gates.py +27 -39
- empathy_os/telemetry/event_streaming 2.py +405 -0
- empathy_os/telemetry/event_streaming.py +22 -22
- empathy_os/telemetry/feedback_loop 2.py +557 -0
- empathy_os/telemetry/feedback_loop.py +14 -17
- empathy_os/workflows/__init__.py +8 -0
- empathy_os/workflows/autonomous_test_gen.py +569 -0
- empathy_os/workflows/batch_processing.py +56 -10
- empathy_os/workflows/bug_predict.py +45 -0
- empathy_os/workflows/code_review.py +92 -22
- empathy_os/workflows/document_gen.py +594 -62
- empathy_os/workflows/llm_base.py +363 -0
- empathy_os/workflows/perf_audit.py +69 -0
- empathy_os/workflows/release_prep.py +54 -0
- empathy_os/workflows/security_audit.py +154 -79
- empathy_os/workflows/test_gen.py +60 -0
- empathy_os/workflows/test_gen_behavioral.py +477 -0
- empathy_os/workflows/test_gen_parallel.py +341 -0
- empathy_framework-5.0.1.dist-info/licenses/LICENSE +0 -139
- {empathy_framework-5.0.1.dist-info → empathy_framework-5.1.0.dist-info}/WHEEL +0 -0
- {empathy_framework-5.0.1.dist-info → empathy_framework-5.1.0.dist-info}/entry_points.txt +0 -0
- {empathy_framework-5.0.1.dist-info → empathy_framework-5.1.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,557 @@
|
|
|
1
|
+
"""Agent-to-LLM Feedback Loop for Quality-Based Learning.
|
|
2
|
+
|
|
3
|
+
Pattern 6 from Agent Coordination Architecture - Collect quality ratings
|
|
4
|
+
on LLM responses and use feedback to inform routing decisions.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
# Record feedback after LLM response
|
|
8
|
+
feedback = FeedbackLoop()
|
|
9
|
+
feedback.record_feedback(
|
|
10
|
+
workflow_name="code-review",
|
|
11
|
+
stage_name="analysis",
|
|
12
|
+
tier=ModelTier.CHEAP,
|
|
13
|
+
quality_score=0.8,
|
|
14
|
+
metadata={
|
|
15
|
+
"response_length": 500,
|
|
16
|
+
"tokens": 150,
|
|
17
|
+
"latency_ms": 1200
|
|
18
|
+
}
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Get tier recommendation based on historical performance
|
|
22
|
+
recommendation = feedback.recommend_tier(
|
|
23
|
+
workflow_name="code-review",
|
|
24
|
+
stage_name="analysis"
|
|
25
|
+
)
|
|
26
|
+
if recommendation.recommended_tier == ModelTier.CAPABLE:
|
|
27
|
+
print(f"Upgrade to CAPABLE tier (confidence: {recommendation.confidence})")
|
|
28
|
+
|
|
29
|
+
# Get quality stats for analysis
|
|
30
|
+
stats = feedback.get_quality_stats(
|
|
31
|
+
workflow_name="code-review",
|
|
32
|
+
stage_name="analysis"
|
|
33
|
+
)
|
|
34
|
+
print(f"Average quality: {stats.avg_quality}")
|
|
35
|
+
|
|
36
|
+
Copyright 2025 Smart-AI-Memory
|
|
37
|
+
Licensed under Fair Source License 0.9
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
import logging
|
|
43
|
+
from dataclasses import dataclass, field
|
|
44
|
+
from datetime import datetime, timedelta
|
|
45
|
+
from enum import Enum
|
|
46
|
+
from typing import Any
|
|
47
|
+
from uuid import uuid4
|
|
48
|
+
|
|
49
|
+
logger = logging.getLogger(__name__)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class ModelTier(str, Enum):
    """Model tier enum matching workflows.base.ModelTier.

    Defined locally (as a str subclass) so feedback records serialize to the
    same plain tier strings the routing layer uses, without importing the
    workflows package.
    """

    # Ordered from lowest to highest cost/capability.
    CHEAP = "cheap"
    CAPABLE = "capable"
    PREMIUM = "premium"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class FeedbackEntry:
    """Quality feedback for an LLM response.

    One quality rating attached to a single workflow-stage execution.
    """

    feedback_id: str
    workflow_name: str
    stage_name: str
    tier: str  # ModelTier value ("cheap" / "capable" / "premium")
    quality_score: float  # 0.0 (bad) to 1.0 (excellent)
    timestamp: datetime
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        The timestamp is emitted as an ISO-8601 string when it is a
        ``datetime``; any other value is passed through unchanged.
        """
        ts = self.timestamp
        if isinstance(ts, datetime):
            ts = ts.isoformat()
        return {
            "feedback_id": self.feedback_id,
            "workflow_name": self.workflow_name,
            "stage_name": self.stage_name,
            "tier": self.tier,
            "quality_score": self.quality_score,
            "timestamp": ts,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FeedbackEntry:
        """Create an entry from a dictionary produced by ``to_dict``.

        ISO-8601 timestamp strings are parsed back into ``datetime``;
        a missing/unparseable-typed timestamp falls back to "now" (UTC).
        """
        raw_ts = data.get("timestamp")
        if isinstance(raw_ts, datetime):
            ts = raw_ts
        elif isinstance(raw_ts, str):
            ts = datetime.fromisoformat(raw_ts)
        else:
            ts = datetime.utcnow()

        return cls(
            feedback_id=data["feedback_id"],
            workflow_name=data["workflow_name"],
            stage_name=data["stage_name"],
            tier=data["tier"],
            quality_score=data["quality_score"],
            timestamp=ts,
            metadata=data.get("metadata", {}),
        )
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@dataclass
class QualityStats:
    """Aggregated quality statistics for a workflow stage.

    Field order matters: callers may construct this positionally.
    """

    workflow_name: str
    stage_name: str
    tier: str  # ModelTier value, or "all" when stats span every tier
    avg_quality: float  # mean of quality scores in the sample window
    min_quality: float
    max_quality: float
    sample_count: int  # number of feedback entries aggregated
    recent_trend: float  # -1.0 (declining) to 1.0 (improving)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@dataclass
class TierRecommendation:
    """Tier recommendation derived from quality feedback.

    Field order matters: callers may construct this positionally.
    """

    current_tier: str  # tier currently in use (ModelTier value or "unknown")
    recommended_tier: str  # tier the feedback loop suggests
    confidence: float  # 0.0 (low) to 1.0 (high)
    reason: str  # human-readable explanation of the decision
    stats: dict[str, QualityStats]  # per-tier stats backing the decision
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class FeedbackLoop:
    """Agent-to-LLM feedback loop for quality-based learning.

    Collects quality ratings on LLM responses and uses feedback to:
    - Recommend tier upgrades/downgrades
    - Track quality trends over time
    - Identify underperforming stages
    - Optimize routing based on historical performance

    Attributes:
        FEEDBACK_TTL: Feedback entry TTL in seconds (7 days)
        MIN_SAMPLES: Minimum samples required for a recommendation (10)
        QUALITY_THRESHOLD: Quality threshold for tier upgrade (0.7)
    """

    FEEDBACK_TTL = 604800  # 7 days (60*60*24*7)
    MIN_SAMPLES = 10  # Minimum samples for recommendation
    QUALITY_THRESHOLD = 0.7  # Quality below this triggers upgrade recommendation

    def __init__(self, memory=None):
        """Initialize feedback loop.

        Args:
            memory: Memory instance for storing feedback. When None, tries
                to borrow the backend from the global UsageTracker; if that
                also fails, all operations become no-ops.
        """
        self.memory = memory

        if self.memory is None:
            try:
                from empathy_os.telemetry import UsageTracker

                tracker = UsageTracker.get_instance()
                if hasattr(tracker, "_memory"):
                    self.memory = tracker._memory
            except (ImportError, AttributeError):
                # Best-effort fallback only; absence is handled below.
                pass

        if self.memory is None:
            logger.warning("No memory backend available for feedback loop")

    def record_feedback(
        self,
        workflow_name: str,
        stage_name: str,
        tier: str | ModelTier,
        quality_score: float,
        metadata: dict[str, Any] | None = None,
    ) -> str:
        """Record quality feedback for a workflow stage execution.

        Args:
            workflow_name: Name of workflow
            stage_name: Name of stage within workflow
            tier: Model tier used (CHEAP, CAPABLE, PREMIUM)
            quality_score: Quality rating 0.0-1.0 (0=bad, 1=excellent)
            metadata: Optional metadata (tokens, latency, etc.)

        Returns:
            Feedback ID if stored, empty string otherwise

        Example:
            >>> feedback = FeedbackLoop()
            >>> feedback.record_feedback(
            ...     workflow_name="code-review",
            ...     stage_name="analysis",
            ...     tier=ModelTier.CHEAP,
            ...     quality_score=0.85,
            ...     metadata={"tokens": 150, "latency_ms": 1200}
            ... )
        """
        if not self.memory:
            logger.debug("Cannot record feedback: no memory backend")
            return ""

        # Validate quality score; out-of-range ratings are dropped, not clamped.
        if not 0.0 <= quality_score <= 1.0:
            logger.warning(f"Invalid quality score: {quality_score} (must be 0.0-1.0)")
            return ""

        # Convert tier to string if ModelTier enum
        if isinstance(tier, ModelTier):
            tier = tier.value

        feedback_id = f"feedback_{uuid4().hex[:8]}"

        entry = FeedbackEntry(
            feedback_id=feedback_id,
            workflow_name=workflow_name,
            stage_name=stage_name,
            tier=tier,
            quality_score=quality_score,
            timestamp=datetime.utcnow(),
            metadata=metadata or {},
        )

        # Store feedback under a hierarchical key so later prefix scans can
        # filter by workflow, stage, and tier.
        # Key format: feedback:{workflow}:{stage}:{tier}:{id}
        key = f"feedback:{workflow_name}:{stage_name}:{tier}:{feedback_id}"

        try:
            if hasattr(self.memory, "stash"):
                self.memory.stash(
                    key=key, data=entry.to_dict(), credentials=None, ttl_seconds=self.FEEDBACK_TTL
                )
            elif hasattr(self.memory, "_redis"):
                import json

                self.memory._redis.setex(key, self.FEEDBACK_TTL, json.dumps(entry.to_dict()))
            else:
                logger.warning("Cannot store feedback: unsupported memory type")
                return ""
        except Exception as e:
            # Feedback is best-effort telemetry: never let a storage failure
            # propagate into the calling workflow.
            logger.error(f"Failed to store feedback: {e}")
            return ""

        logger.debug(
            f"Recorded feedback: {workflow_name}/{stage_name} tier={tier} quality={quality_score:.2f}"
        )
        return feedback_id

    def get_feedback_history(
        self, workflow_name: str, stage_name: str, tier: str | ModelTier | None = None, limit: int = 100
    ) -> list[FeedbackEntry]:
        """Get feedback history for a workflow stage.

        Args:
            workflow_name: Name of workflow
            stage_name: Name of stage
            tier: Optional filter by tier
            limit: Maximum number of entries to return

        Returns:
            List of feedback entries (newest first)
        """
        if not self.memory or not hasattr(self.memory, "_redis"):
            return []

        # Convert tier to string if ModelTier enum
        if isinstance(tier, ModelTier):
            tier = tier.value

        try:
            # Build search pattern
            if tier:
                pattern = f"feedback:{workflow_name}:{stage_name}:{tier}:*"
            else:
                pattern = f"feedback:{workflow_name}:{stage_name}:*"

            # NOTE(review): KEYS blocks Redis while scanning the keyspace;
            # consider scan_iter() if feedback volume grows large.
            keys = self.memory._redis.keys(pattern)

            entries = []
            for key in keys:
                if isinstance(key, bytes):
                    key = key.decode("utf-8")

                # Retrieve entry
                data = self._retrieve_feedback(key)
                if data:
                    entries.append(FeedbackEntry.from_dict(data))

            # Sort ALL retrieved entries before truncating. The previous
            # implementation broke out of the retrieval loop after `limit`
            # keys, but Redis KEYS returns keys in unspecified order, so
            # that returned an arbitrary subset rather than the newest
            # `limit` entries the docstring promises.
            entries.sort(key=lambda e: e.timestamp, reverse=True)

            return entries[:limit]
        except Exception as e:
            logger.error(f"Failed to get feedback history: {e}")
            return []

    def _retrieve_feedback(self, key: str) -> dict[str, Any] | None:
        """Retrieve a single feedback entry dict from memory, or None."""
        if not self.memory:
            return None

        try:
            if hasattr(self.memory, "retrieve"):
                return self.memory.retrieve(key, credentials=None)
            elif hasattr(self.memory, "_redis"):
                import json

                data = self.memory._redis.get(key)
                if data:
                    if isinstance(data, bytes):
                        data = data.decode("utf-8")
                    return json.loads(data)
            return None
        except Exception as e:
            # Missing/expired keys are expected; log quietly and move on.
            logger.debug(f"Failed to retrieve feedback: {e}")
            return None

    def get_quality_stats(
        self, workflow_name: str, stage_name: str, tier: str | ModelTier | None = None
    ) -> QualityStats | None:
        """Get quality statistics for a workflow stage.

        Args:
            workflow_name: Name of workflow
            stage_name: Name of stage
            tier: Optional filter by tier

        Returns:
            Quality statistics or None if insufficient data
        """
        history = self.get_feedback_history(workflow_name, stage_name, tier=tier)

        if not history:
            return None

        # Calculate statistics
        quality_scores = [entry.quality_score for entry in history]

        avg_quality = sum(quality_scores) / len(quality_scores)
        min_quality = min(quality_scores)
        max_quality = max(quality_scores)

        # Calculate trend: history is newest-first, so the first half is the
        # recent window and the second half is the older baseline.
        if len(history) >= 4:
            recent = quality_scores[: len(quality_scores) // 2]
            older = quality_scores[len(quality_scores) // 2 :]
            recent_avg = sum(recent) / len(recent)
            older_avg = sum(older) / len(older)
            # Normalized difference; the 0.1 floor avoids division blow-up
            # when the older average is near zero.
            recent_trend = (recent_avg - older_avg) / max(older_avg, 0.1)
        else:
            recent_trend = 0.0

        tier_str = tier.value if isinstance(tier, ModelTier) else (tier or "all")

        return QualityStats(
            workflow_name=workflow_name,
            stage_name=stage_name,
            tier=tier_str,
            avg_quality=avg_quality,
            min_quality=min_quality,
            max_quality=max_quality,
            sample_count=len(history),
            recent_trend=recent_trend,
        )

    def recommend_tier(
        self, workflow_name: str, stage_name: str, current_tier: str | ModelTier | None = None
    ) -> TierRecommendation:
        """Recommend optimal tier based on quality feedback.

        Analyzes historical quality data and recommends:
        - Downgrade if current tier consistently delivers high quality (cost optimization)
        - Upgrade if current tier delivers poor quality (quality optimization)
        - Keep current if quality is acceptable

        Args:
            workflow_name: Name of workflow
            stage_name: Name of stage
            current_tier: Current tier in use (if known)

        Returns:
            Tier recommendation with confidence and reasoning
        """
        # Convert tier to string if ModelTier enum
        if isinstance(current_tier, ModelTier):
            current_tier = current_tier.value

        # Get stats for all tiers
        stats_by_tier = {}
        for tier in ["cheap", "capable", "premium"]:
            stats = self.get_quality_stats(workflow_name, stage_name, tier=tier)
            if stats:
                stats_by_tier[tier] = stats

        # No data - default recommendation
        if not stats_by_tier:
            return TierRecommendation(
                current_tier=current_tier or "unknown",
                recommended_tier=current_tier or "cheap",
                confidence=0.0,
                reason="No feedback data available",
                stats={},
            )

        # Determine current tier if not provided
        if not current_tier:
            # Use the tier of the most recent feedback entry
            all_history = self.get_feedback_history(workflow_name, stage_name, tier=None, limit=1)
            if all_history:
                current_tier = all_history[0].tier
            else:
                current_tier = "cheap"

        current_stats = stats_by_tier.get(current_tier)

        # Insufficient data for current tier
        if not current_stats or current_stats.sample_count < self.MIN_SAMPLES:
            return TierRecommendation(
                current_tier=current_tier,
                recommended_tier=current_tier,
                confidence=0.0,
                reason=f"Insufficient data (need {self.MIN_SAMPLES} samples, have {current_stats.sample_count if current_stats else 0})",
                stats=stats_by_tier,
            )

        # Analyze quality. Confidence ramps linearly with sample count,
        # saturating at 2x MIN_SAMPLES.
        avg_quality = current_stats.avg_quality
        confidence = min(current_stats.sample_count / (self.MIN_SAMPLES * 2), 1.0)

        # Decision logic
        if avg_quality < self.QUALITY_THRESHOLD:
            # Poor quality - recommend upgrade one tier
            if current_tier == "cheap":
                recommended = "capable"
                reason = f"Low quality ({avg_quality:.2f}) - upgrade for better results"
            elif current_tier == "capable":
                recommended = "premium"
                reason = f"Low quality ({avg_quality:.2f}) - upgrade to premium tier"
            else:  # premium - nowhere left to upgrade to
                recommended = "premium"
                reason = f"Already using premium tier (quality: {avg_quality:.2f})"
                confidence = 1.0
        elif avg_quality > 0.9 and current_tier != "cheap":
            # Excellent quality - consider downgrade for cost optimization,
            # but only when the lower tier has itself demonstrated quality.
            if current_tier == "premium":
                capable_stats = stats_by_tier.get("capable")
                if capable_stats and capable_stats.avg_quality > 0.85:
                    recommended = "capable"
                    reason = f"Excellent quality ({avg_quality:.2f}) - downgrade to save cost"
                else:
                    recommended = "premium"
                    reason = f"Excellent quality ({avg_quality:.2f}) - keep premium for consistency"
            elif current_tier == "capable":
                cheap_stats = stats_by_tier.get("cheap")
                if cheap_stats and cheap_stats.avg_quality > 0.85:
                    recommended = "cheap"
                    reason = f"Excellent quality ({avg_quality:.2f}) - downgrade to save cost"
                else:
                    recommended = "capable"
                    reason = f"Excellent quality ({avg_quality:.2f}) - keep capable tier"
            else:
                recommended = current_tier
                reason = f"Excellent quality ({avg_quality:.2f}) - maintain current tier"
        else:
            # Acceptable quality - keep current tier
            recommended = current_tier
            reason = f"Acceptable quality ({avg_quality:.2f}) - maintain current tier"

        return TierRecommendation(
            current_tier=current_tier,
            recommended_tier=recommended,
            confidence=confidence,
            reason=reason,
            stats=stats_by_tier,
        )

    def get_underperforming_stages(
        self, workflow_name: str, quality_threshold: float = 0.7
    ) -> list[tuple[str, QualityStats]]:
        """Get workflow stages with poor quality scores.

        Args:
            workflow_name: Name of workflow
            quality_threshold: Threshold below which stage is considered underperforming

        Returns:
            List of (stage_name, stats) tuples for underperforming stages,
            sorted worst-first
        """
        if not self.memory or not hasattr(self.memory, "_redis"):
            return []

        try:
            # Find all feedback keys for this workflow
            pattern = f"feedback:{workflow_name}:*"
            keys = self.memory._redis.keys(pattern)

            # Extract unique stages
            stages = set()
            for key in keys:
                if isinstance(key, bytes):
                    key = key.decode("utf-8")
                # Parse key: feedback:{workflow}:{stage}:{tier}:{id}
                # NOTE(review): this split assumes workflow/stage names never
                # contain ":" - confirm against callers.
                parts = key.split(":")
                if len(parts) >= 4:
                    stages.add(parts[2])

            # Get stats for each stage
            underperforming = []
            for stage_name in stages:
                stats = self.get_quality_stats(workflow_name, stage_name)
                if stats and stats.avg_quality < quality_threshold:
                    underperforming.append((stage_name, stats))

            # Sort by quality (worst first)
            underperforming.sort(key=lambda x: x[1].avg_quality)

            return underperforming
        except Exception as e:
            logger.error(f"Failed to get underperforming stages: {e}")
            return []

    def clear_feedback(self, workflow_name: str, stage_name: str | None = None) -> int:
        """Clear feedback history for a workflow or stage.

        Args:
            workflow_name: Name of workflow
            stage_name: Optional stage name (clears all stages if None)

        Returns:
            Number of feedback entries cleared
        """
        if not self.memory or not hasattr(self.memory, "_redis"):
            return 0

        try:
            if stage_name:
                pattern = f"feedback:{workflow_name}:{stage_name}:*"
            else:
                pattern = f"feedback:{workflow_name}:*"

            keys = self.memory._redis.keys(pattern)
            if not keys:
                return 0

            deleted = self.memory._redis.delete(*keys)
            return deleted
        except Exception as e:
            logger.error(f"Failed to clear feedback: {e}")
            return 0
|
|
@@ -41,7 +41,7 @@ from __future__ import annotations
|
|
|
41
41
|
|
|
42
42
|
import logging
|
|
43
43
|
from dataclasses import dataclass, field
|
|
44
|
-
from datetime import datetime
|
|
44
|
+
from datetime import datetime
|
|
45
45
|
from enum import Enum
|
|
46
46
|
from typing import Any
|
|
47
47
|
from uuid import uuid4
|
|
@@ -229,16 +229,13 @@ class FeedbackLoop:
|
|
|
229
229
|
key = f"feedback:{workflow_name}:{stage_name}:{tier}:{feedback_id}"
|
|
230
230
|
|
|
231
231
|
try:
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
key=key, data=entry.to_dict(), credentials=None, ttl_seconds=self.FEEDBACK_TTL
|
|
235
|
-
)
|
|
236
|
-
elif hasattr(self.memory, "_redis"):
|
|
232
|
+
# Use direct Redis access for custom TTL
|
|
233
|
+
if hasattr(self.memory, "_client") and self.memory._client:
|
|
237
234
|
import json
|
|
238
235
|
|
|
239
|
-
self.memory.
|
|
236
|
+
self.memory._client.setex(key, self.FEEDBACK_TTL, json.dumps(entry.to_dict()))
|
|
240
237
|
else:
|
|
241
|
-
logger.warning("Cannot store feedback:
|
|
238
|
+
logger.warning("Cannot store feedback: no Redis backend available")
|
|
242
239
|
return ""
|
|
243
240
|
except Exception as e:
|
|
244
241
|
logger.error(f"Failed to store feedback: {e}")
|
|
@@ -263,7 +260,7 @@ class FeedbackLoop:
|
|
|
263
260
|
Returns:
|
|
264
261
|
List of feedback entries (newest first)
|
|
265
262
|
"""
|
|
266
|
-
if not self.memory or not hasattr(self.memory, "
|
|
263
|
+
if not self.memory or not hasattr(self.memory, "_client"):
|
|
267
264
|
return []
|
|
268
265
|
|
|
269
266
|
# Convert tier to string if ModelTier enum
|
|
@@ -277,7 +274,7 @@ class FeedbackLoop:
|
|
|
277
274
|
else:
|
|
278
275
|
pattern = f"feedback:{workflow_name}:{stage_name}:*"
|
|
279
276
|
|
|
280
|
-
keys = self.memory.
|
|
277
|
+
keys = self.memory._client.keys(pattern)
|
|
281
278
|
|
|
282
279
|
entries = []
|
|
283
280
|
for key in keys:
|
|
@@ -308,10 +305,10 @@ class FeedbackLoop:
|
|
|
308
305
|
try:
|
|
309
306
|
if hasattr(self.memory, "retrieve"):
|
|
310
307
|
return self.memory.retrieve(key, credentials=None)
|
|
311
|
-
elif hasattr(self.memory, "
|
|
308
|
+
elif hasattr(self.memory, "_client"):
|
|
312
309
|
import json
|
|
313
310
|
|
|
314
|
-
data = self.memory.
|
|
311
|
+
data = self.memory._client.get(key)
|
|
315
312
|
if data:
|
|
316
313
|
if isinstance(data, bytes):
|
|
317
314
|
data = data.decode("utf-8")
|
|
@@ -494,13 +491,13 @@ class FeedbackLoop:
|
|
|
494
491
|
Returns:
|
|
495
492
|
List of (stage_name, stats) tuples for underperforming stages
|
|
496
493
|
"""
|
|
497
|
-
if not self.memory or not hasattr(self.memory, "
|
|
494
|
+
if not self.memory or not hasattr(self.memory, "_client"):
|
|
498
495
|
return []
|
|
499
496
|
|
|
500
497
|
try:
|
|
501
498
|
# Find all feedback keys for this workflow
|
|
502
499
|
pattern = f"feedback:{workflow_name}:*"
|
|
503
|
-
keys = self.memory.
|
|
500
|
+
keys = self.memory._client.keys(pattern)
|
|
504
501
|
|
|
505
502
|
# Extract unique stages
|
|
506
503
|
stages = set()
|
|
@@ -537,7 +534,7 @@ class FeedbackLoop:
|
|
|
537
534
|
Returns:
|
|
538
535
|
Number of feedback entries cleared
|
|
539
536
|
"""
|
|
540
|
-
if not self.memory or not hasattr(self.memory, "
|
|
537
|
+
if not self.memory or not hasattr(self.memory, "_client"):
|
|
541
538
|
return 0
|
|
542
539
|
|
|
543
540
|
try:
|
|
@@ -546,11 +543,11 @@ class FeedbackLoop:
|
|
|
546
543
|
else:
|
|
547
544
|
pattern = f"feedback:{workflow_name}:*"
|
|
548
545
|
|
|
549
|
-
keys = self.memory.
|
|
546
|
+
keys = self.memory._client.keys(pattern)
|
|
550
547
|
if not keys:
|
|
551
548
|
return 0
|
|
552
549
|
|
|
553
|
-
deleted = self.memory.
|
|
550
|
+
deleted = self.memory._client.delete(*keys)
|
|
554
551
|
return deleted
|
|
555
552
|
except Exception as e:
|
|
556
553
|
logger.error(f"Failed to clear feedback: {e}")
|
empathy_os/workflows/__init__.py
CHANGED
|
@@ -66,6 +66,8 @@ if TYPE_CHECKING:
|
|
|
66
66
|
from .test5 import Test5Workflow
|
|
67
67
|
from .test_coverage_boost_crew import TestCoverageBoostCrew, TestCoverageBoostCrewResult
|
|
68
68
|
from .test_gen import TestGenerationWorkflow
|
|
69
|
+
from .test_gen_behavioral import BehavioralTestGenerationWorkflow
|
|
70
|
+
from .test_gen_parallel import ParallelTestGenerationWorkflow
|
|
69
71
|
from .xml_enhanced_crew import XMLAgent, XMLTask
|
|
70
72
|
|
|
71
73
|
# Only import base module eagerly (small, needed for type checks)
|
|
@@ -136,6 +138,8 @@ _LAZY_WORKFLOW_IMPORTS: dict[str, tuple[str, str]] = {
|
|
|
136
138
|
"TestCoverageBoostCrew": (".test_coverage_boost_crew", "TestCoverageBoostCrew"),
|
|
137
139
|
"TestCoverageBoostCrewResult": (".test_coverage_boost_crew", "TestCoverageBoostCrewResult"),
|
|
138
140
|
"TestGenerationWorkflow": (".test_gen", "TestGenerationWorkflow"),
|
|
141
|
+
"BehavioralTestGenerationWorkflow": (".test_gen_behavioral", "BehavioralTestGenerationWorkflow"),
|
|
142
|
+
"ParallelTestGenerationWorkflow": (".test_gen_parallel", "ParallelTestGenerationWorkflow"),
|
|
139
143
|
"XMLAgent": (".xml_enhanced_crew", "XMLAgent"),
|
|
140
144
|
"XMLTask": (".xml_enhanced_crew", "XMLTask"),
|
|
141
145
|
"parse_xml_response": (".xml_enhanced_crew", "parse_xml_response"),
|
|
@@ -213,6 +217,8 @@ _DEFAULT_WORKFLOW_NAMES: dict[str, str] = {
|
|
|
213
217
|
"perf-audit": "PerformanceAuditWorkflow",
|
|
214
218
|
# Generation workflows
|
|
215
219
|
"test-gen": "TestGenerationWorkflow",
|
|
220
|
+
"test-gen-behavioral": "BehavioralTestGenerationWorkflow",
|
|
221
|
+
"test-gen-parallel": "ParallelTestGenerationWorkflow",
|
|
216
222
|
"refactor-plan": "RefactorPlanWorkflow",
|
|
217
223
|
# Operational workflows
|
|
218
224
|
"dependency-check": "DependencyCheckWorkflow",
|
|
@@ -484,6 +490,8 @@ __all__ = [
|
|
|
484
490
|
"SecureReleaseResult",
|
|
485
491
|
"SecurityAuditWorkflow",
|
|
486
492
|
"TestGenerationWorkflow",
|
|
493
|
+
"BehavioralTestGenerationWorkflow",
|
|
494
|
+
"ParallelTestGenerationWorkflow",
|
|
487
495
|
# Configuration
|
|
488
496
|
"WorkflowConfig",
|
|
489
497
|
"WorkflowResult",
|