empathy-framework 5.2.1__py3-none-any.whl → 5.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49):
  1. {empathy_framework-5.2.1.dist-info → empathy_framework-5.3.0.dist-info}/METADATA +28 -4
  2. {empathy_framework-5.2.1.dist-info → empathy_framework-5.3.0.dist-info}/RECORD +27 -49
  3. empathy_os/__init__.py +1 -1
  4. empathy_os/cache/hybrid.py +5 -1
  5. empathy_os/cli/commands/batch.py +8 -0
  6. empathy_os/cli/commands/profiling.py +4 -0
  7. empathy_os/cli/commands/workflow.py +8 -4
  8. empathy_os/config.py +15 -2
  9. empathy_os/dashboard/simple_server.py +62 -30
  10. empathy_os/memory/long_term.py +5 -5
  11. empathy_os/memory/mixins/backend_init_mixin.py +6 -1
  12. empathy_os/memory/mixins/capabilities_mixin.py +12 -3
  13. empathy_os/memory/short_term.py +54 -12
  14. empathy_os/memory/simple_storage.py +3 -3
  15. empathy_os/memory/types.py +8 -3
  16. empathy_os/telemetry/agent_coordination.py +2 -3
  17. empathy_os/telemetry/agent_tracking.py +26 -7
  18. empathy_os/telemetry/approval_gates.py +18 -24
  19. empathy_os/telemetry/event_streaming.py +7 -3
  20. empathy_os/telemetry/feedback_loop.py +28 -15
  21. empathy_os/workflows/output.py +4 -1
  22. empathy_os/workflows/progress.py +8 -2
  23. empathy_os/cli/parsers/cache 2.py +0 -65
  24. empathy_os/cli_router 2.py +0 -416
  25. empathy_os/dashboard/app 2.py +0 -512
  26. empathy_os/dashboard/simple_server 2.py +0 -403
  27. empathy_os/dashboard/standalone_server 2.py +0 -536
  28. empathy_os/models/adaptive_routing 2.py +0 -437
  29. empathy_os/project_index/scanner_parallel 2.py +0 -291
  30. empathy_os/telemetry/agent_coordination 2.py +0 -478
  31. empathy_os/telemetry/agent_tracking 2.py +0 -350
  32. empathy_os/telemetry/approval_gates 2.py +0 -563
  33. empathy_os/telemetry/event_streaming 2.py +0 -405
  34. empathy_os/telemetry/feedback_loop 2.py +0 -557
  35. empathy_os/vscode_bridge 2.py +0 -173
  36. empathy_os/workflows/document_gen.py +0 -29
  37. empathy_os/workflows/progressive/__init__ 2.py +0 -92
  38. empathy_os/workflows/progressive/cli 2.py +0 -242
  39. empathy_os/workflows/progressive/core 2.py +0 -488
  40. empathy_os/workflows/progressive/orchestrator 2.py +0 -701
  41. empathy_os/workflows/progressive/reports 2.py +0 -528
  42. empathy_os/workflows/progressive/telemetry 2.py +0 -280
  43. empathy_os/workflows/progressive/test_gen 2.py +0 -514
  44. empathy_os/workflows/progressive/workflow 2.py +0 -628
  45. {empathy_framework-5.2.1.dist-info → empathy_framework-5.3.0.dist-info}/WHEEL +0 -0
  46. {empathy_framework-5.2.1.dist-info → empathy_framework-5.3.0.dist-info}/entry_points.txt +0 -0
  47. {empathy_framework-5.2.1.dist-info → empathy_framework-5.3.0.dist-info}/licenses/LICENSE +0 -0
  48. {empathy_framework-5.2.1.dist-info → empathy_framework-5.3.0.dist-info}/licenses/LICENSE_CHANGE_ANNOUNCEMENT.md +0 -0
  49. {empathy_framework-5.2.1.dist-info → empathy_framework-5.3.0.dist-info}/top_level.txt +0 -0
@@ -1,557 +0,0 @@
1
- """Agent-to-LLM Feedback Loop for Quality-Based Learning.
2
-
3
- Pattern 6 from Agent Coordination Architecture - Collect quality ratings
4
- on LLM responses and use feedback to inform routing decisions.
5
-
6
- Usage:
7
- # Record feedback after LLM response
8
- feedback = FeedbackLoop()
9
- feedback.record_feedback(
10
- workflow_name="code-review",
11
- stage_name="analysis",
12
- tier=ModelTier.CHEAP,
13
- quality_score=0.8,
14
- metadata={
15
- "response_length": 500,
16
- "tokens": 150,
17
- "latency_ms": 1200
18
- }
19
- )
20
-
21
- # Get tier recommendation based on historical performance
22
- recommendation = feedback.recommend_tier(
23
- workflow_name="code-review",
24
- stage_name="analysis"
25
- )
26
- if recommendation.recommended_tier == ModelTier.CAPABLE:
27
- print(f"Upgrade to CAPABLE tier (confidence: {recommendation.confidence})")
28
-
29
- # Get quality stats for analysis
30
- stats = feedback.get_quality_stats(
31
- workflow_name="code-review",
32
- stage_name="analysis"
33
- )
34
- print(f"Average quality: {stats.avg_quality}")
35
-
36
- Copyright 2025 Smart-AI-Memory
37
- Licensed under Fair Source License 0.9
38
- """
39
-
40
- from __future__ import annotations
41
-
42
- import logging
43
- from dataclasses import dataclass, field
44
- from datetime import datetime, timedelta
45
- from enum import Enum
46
- from typing import Any
47
- from uuid import uuid4
48
-
49
- logger = logging.getLogger(__name__)
50
-
51
-
52
class ModelTier(str, Enum):
    """Model pricing tier; string values mirror ``workflows.base.ModelTier``."""

    CHEAP = "cheap"
    CAPABLE = "capable"
    PREMIUM = "premium"
58
-
59
-
60
@dataclass
class FeedbackEntry:
    """A single quality rating for one workflow-stage execution.

    Attributes:
        feedback_id: Unique identifier for this rating.
        workflow_name: Workflow the rated stage belongs to.
        stage_name: Stage within the workflow.
        tier: Model tier used (a ``ModelTier`` value string).
        quality_score: Rating from 0.0 (bad) to 1.0 (excellent).
        timestamp: When the feedback was recorded.
        metadata: Optional extras (tokens, latency, etc.).
    """

    feedback_id: str
    workflow_name: str
    stage_name: str
    tier: str  # ModelTier value
    quality_score: float  # 0.0 (bad) to 1.0 (excellent)
    timestamp: datetime
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict; the timestamp becomes an ISO-8601 string."""
        ts = self.timestamp
        if isinstance(ts, datetime):
            ts = ts.isoformat()
        return {
            "feedback_id": self.feedback_id,
            "workflow_name": self.workflow_name,
            "stage_name": self.stage_name,
            "tier": self.tier,
            "quality_score": self.quality_score,
            "timestamp": ts,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FeedbackEntry:
        """Rebuild an entry from :meth:`to_dict` output.

        A string timestamp is parsed from ISO-8601; a missing or unusable
        timestamp falls back to the current UTC time.
        """
        raw_ts = data.get("timestamp")
        if isinstance(raw_ts, str):
            ts = datetime.fromisoformat(raw_ts)
        elif isinstance(raw_ts, datetime):
            ts = raw_ts
        else:
            ts = datetime.utcnow()

        return cls(
            feedback_id=data["feedback_id"],
            workflow_name=data["workflow_name"],
            stage_name=data["stage_name"],
            tier=data["tier"],
            quality_score=data["quality_score"],
            timestamp=ts,
            metadata=data.get("metadata", {}),
        )
105
-
106
-
107
@dataclass
class QualityStats:
    """Aggregated quality statistics for one workflow stage."""

    workflow_name: str
    stage_name: str
    tier: str  # ModelTier value, or "all" when not filtered by tier
    avg_quality: float  # mean of recorded quality scores
    min_quality: float
    max_quality: float
    sample_count: int  # number of feedback entries behind these stats
    recent_trend: float  # -1.0 (declining) .. 1.0 (improving)
119
-
120
-
121
@dataclass
class TierRecommendation:
    """Outcome of analyzing quality feedback: which tier to use next."""

    current_tier: str
    recommended_tier: str
    confidence: float  # 0.0 (low) .. 1.0 (high)
    reason: str  # human-readable explanation of the recommendation
    stats: dict[str, QualityStats]  # per-tier stats backing the decision
130
-
131
-
132
class FeedbackLoop:
    """Agent-to-LLM feedback loop for quality-based learning.

    Collects quality ratings on LLM responses and uses feedback to:
    - Recommend tier upgrades/downgrades
    - Track quality trends over time
    - Identify underperforming stages
    - Optimize routing based on historical performance

    Attributes:
        FEEDBACK_TTL: Feedback entry TTL (7 days)
        MIN_SAMPLES: Minimum samples for recommendation (10)
        QUALITY_THRESHOLD: Quality threshold for tier upgrade (0.7)
    """

    FEEDBACK_TTL = 604800  # 7 days (60*60*24*7)
    MIN_SAMPLES = 10  # Minimum samples for recommendation
    QUALITY_THRESHOLD = 0.7  # Quality below this triggers upgrade recommendation

    def __init__(self, memory=None):
        """Initialize feedback loop.

        Args:
            memory: Memory instance for storing feedback. When omitted, the
                singleton UsageTracker's memory backend is borrowed if present.
        """
        self.memory = memory

        if self.memory is None:
            # Best-effort fallback: reuse the telemetry tracker's backend.
            try:
                from empathy_os.telemetry import UsageTracker

                tracker = UsageTracker.get_instance()
                if hasattr(tracker, "_memory"):
                    self.memory = tracker._memory
            except (ImportError, AttributeError):
                pass

        if self.memory is None:
            logger.warning("No memory backend available for feedback loop")

    def record_feedback(
        self,
        workflow_name: str,
        stage_name: str,
        tier: str | ModelTier,
        quality_score: float,
        metadata: dict[str, Any] | None = None,
    ) -> str:
        """Record quality feedback for a workflow stage execution.

        Args:
            workflow_name: Name of workflow
            stage_name: Name of stage within workflow
            tier: Model tier used (CHEAP, CAPABLE, PREMIUM)
            quality_score: Quality rating 0.0-1.0 (0=bad, 1=excellent)
            metadata: Optional metadata (tokens, latency, etc.)

        Returns:
            Feedback ID if stored, empty string otherwise

        Example:
            >>> feedback = FeedbackLoop()
            >>> feedback.record_feedback(
            ...     workflow_name="code-review",
            ...     stage_name="analysis",
            ...     tier=ModelTier.CHEAP,
            ...     quality_score=0.85,
            ...     metadata={"tokens": 150, "latency_ms": 1200}
            ... )
        """
        if not self.memory:
            logger.debug("Cannot record feedback: no memory backend")
            return ""

        # Reject out-of-range scores rather than storing garbage.
        if not 0.0 <= quality_score <= 1.0:
            logger.warning(f"Invalid quality score: {quality_score} (must be 0.0-1.0)")
            return ""

        if isinstance(tier, ModelTier):
            tier = tier.value

        feedback_id = f"feedback_{uuid4().hex[:8]}"

        entry = FeedbackEntry(
            feedback_id=feedback_id,
            workflow_name=workflow_name,
            stage_name=stage_name,
            tier=tier,
            quality_score=quality_score,
            timestamp=datetime.utcnow(),
            metadata=metadata or {},
        )

        # Key format: feedback:{workflow}:{stage}:{tier}:{id}
        key = f"feedback:{workflow_name}:{stage_name}:{tier}:{feedback_id}"

        try:
            if hasattr(self.memory, "stash"):
                self.memory.stash(
                    key=key, data=entry.to_dict(), credentials=None, ttl_seconds=self.FEEDBACK_TTL
                )
            elif hasattr(self.memory, "_redis"):
                import json

                self.memory._redis.setex(key, self.FEEDBACK_TTL, json.dumps(entry.to_dict()))
            else:
                logger.warning("Cannot store feedback: unsupported memory type")
                return ""
        except Exception as e:
            logger.error(f"Failed to store feedback: {e}")
            return ""

        logger.debug(
            f"Recorded feedback: {workflow_name}/{stage_name} tier={tier} quality={quality_score:.2f}"
        )
        return feedback_id

    def _scan_keys(self, pattern: str) -> list:
        """Return Redis keys matching *pattern*.

        Prefers the non-blocking SCAN command (``scan_iter``) over ``KEYS``,
        which blocks the Redis server for O(n) of the keyspace; falls back to
        ``keys()`` for clients/fakes that lack ``scan_iter``.
        """
        redis_client = self.memory._redis
        scan = getattr(redis_client, "scan_iter", None)
        if scan is not None:
            return list(scan(match=pattern))
        return redis_client.keys(pattern)

    def get_feedback_history(
        self, workflow_name: str, stage_name: str, tier: str | ModelTier | None = None, limit: int = 100
    ) -> list[FeedbackEntry]:
        """Get feedback history for a workflow stage.

        Args:
            workflow_name: Name of workflow
            stage_name: Name of stage
            tier: Optional filter by tier
            limit: Maximum number of entries to return

        Returns:
            List of feedback entries (newest first)
        """
        if not self.memory or not hasattr(self.memory, "_redis"):
            return []

        if isinstance(tier, ModelTier):
            tier = tier.value

        try:
            if tier:
                pattern = f"feedback:{workflow_name}:{stage_name}:{tier}:*"
            else:
                pattern = f"feedback:{workflow_name}:{stage_name}:*"

            keys = self._scan_keys(pattern)

            # Collect ALL matching entries before sorting. Truncating while
            # scanning (as before) returned an arbitrary subset, because Redis
            # key order is unspecified — "newest first" was not guaranteed.
            entries = []
            for key in keys:
                if isinstance(key, bytes):
                    key = key.decode("utf-8")

                data = self._retrieve_feedback(key)
                if data:
                    entries.append(FeedbackEntry.from_dict(data))

            # Sort by timestamp (newest first), then apply the limit.
            entries.sort(key=lambda e: e.timestamp, reverse=True)

            return entries[:limit]
        except Exception as e:
            logger.error(f"Failed to get feedback history: {e}")
            return []

    def _retrieve_feedback(self, key: str) -> dict[str, Any] | None:
        """Retrieve one feedback entry from memory, or None on any failure."""
        if not self.memory:
            return None

        try:
            if hasattr(self.memory, "retrieve"):
                return self.memory.retrieve(key, credentials=None)
            elif hasattr(self.memory, "_redis"):
                import json

                data = self.memory._redis.get(key)
                if data:
                    if isinstance(data, bytes):
                        data = data.decode("utf-8")
                    return json.loads(data)
            return None
        except Exception as e:
            logger.debug(f"Failed to retrieve feedback: {e}")
            return None

    def get_quality_stats(
        self, workflow_name: str, stage_name: str, tier: str | ModelTier | None = None
    ) -> QualityStats | None:
        """Get quality statistics for a workflow stage.

        Args:
            workflow_name: Name of workflow
            stage_name: Name of stage
            tier: Optional filter by tier

        Returns:
            Quality statistics or None if insufficient data
        """
        history = self.get_feedback_history(workflow_name, stage_name, tier=tier)

        if not history:
            return None

        quality_scores = [entry.quality_score for entry in history]

        avg_quality = sum(quality_scores) / len(quality_scores)
        min_quality = min(quality_scores)
        max_quality = max(quality_scores)

        # Trend: history is newest-first, so the first half is "recent".
        if len(history) >= 4:
            recent = quality_scores[: len(quality_scores) // 2]
            older = quality_scores[len(quality_scores) // 2 :]
            recent_avg = sum(recent) / len(recent)
            older_avg = sum(older) / len(older)
            # Normalized difference; denominator floored at 0.1 to avoid blow-up.
            recent_trend = (recent_avg - older_avg) / max(older_avg, 0.1)
        else:
            recent_trend = 0.0

        tier_str = tier.value if isinstance(tier, ModelTier) else (tier or "all")

        return QualityStats(
            workflow_name=workflow_name,
            stage_name=stage_name,
            tier=tier_str,
            avg_quality=avg_quality,
            min_quality=min_quality,
            max_quality=max_quality,
            sample_count=len(history),
            recent_trend=recent_trend,
        )

    def recommend_tier(
        self, workflow_name: str, stage_name: str, current_tier: str | ModelTier | None = None
    ) -> TierRecommendation:
        """Recommend optimal tier based on quality feedback.

        Analyzes historical quality data and recommends:
        - Downgrade if current tier consistently delivers high quality (cost optimization)
        - Upgrade if current tier delivers poor quality (quality optimization)
        - Keep current if quality is acceptable

        Args:
            workflow_name: Name of workflow
            stage_name: Name of stage
            current_tier: Current tier in use (if known)

        Returns:
            Tier recommendation with confidence and reasoning
        """
        if isinstance(current_tier, ModelTier):
            current_tier = current_tier.value

        # Gather stats for every tier that has feedback.
        stats_by_tier = {}
        for tier in ["cheap", "capable", "premium"]:
            stats = self.get_quality_stats(workflow_name, stage_name, tier=tier)
            if stats:
                stats_by_tier[tier] = stats

        # No data at all - neutral default recommendation.
        if not stats_by_tier:
            return TierRecommendation(
                current_tier=current_tier or "unknown",
                recommended_tier=current_tier or "cheap",
                confidence=0.0,
                reason="No feedback data available",
                stats={},
            )

        # Infer current tier from the most recent feedback entry if unknown.
        if not current_tier:
            all_history = self.get_feedback_history(workflow_name, stage_name, tier=None, limit=1)
            if all_history:
                current_tier = all_history[0].tier
            else:
                current_tier = "cheap"

        current_stats = stats_by_tier.get(current_tier)

        # Not enough samples for a trustworthy recommendation.
        if not current_stats or current_stats.sample_count < self.MIN_SAMPLES:
            return TierRecommendation(
                current_tier=current_tier,
                recommended_tier=current_tier,
                confidence=0.0,
                reason=f"Insufficient data (need {self.MIN_SAMPLES} samples, have {current_stats.sample_count if current_stats else 0})",
                stats=stats_by_tier,
            )

        avg_quality = current_stats.avg_quality
        # Confidence grows with sample size, saturating at 2*MIN_SAMPLES.
        confidence = min(current_stats.sample_count / (self.MIN_SAMPLES * 2), 1.0)

        if avg_quality < self.QUALITY_THRESHOLD:
            # Poor quality - recommend upgrade.
            if current_tier == "cheap":
                recommended = "capable"
                reason = f"Low quality ({avg_quality:.2f}) - upgrade for better results"
            elif current_tier == "capable":
                recommended = "premium"
                reason = f"Low quality ({avg_quality:.2f}) - upgrade to premium tier"
            else:  # premium - nowhere to go; full confidence in staying put
                recommended = "premium"
                reason = f"Already using premium tier (quality: {avg_quality:.2f})"
                confidence = 1.0
        elif avg_quality > 0.9 and current_tier != "cheap":
            # Excellent quality - consider downgrade for cost optimization,
            # but only if the next tier down has proven quality (> 0.85).
            if current_tier == "premium":
                capable_stats = stats_by_tier.get("capable")
                if capable_stats and capable_stats.avg_quality > 0.85:
                    recommended = "capable"
                    reason = f"Excellent quality ({avg_quality:.2f}) - downgrade to save cost"
                else:
                    recommended = "premium"
                    reason = f"Excellent quality ({avg_quality:.2f}) - keep premium for consistency"
            elif current_tier == "capable":
                cheap_stats = stats_by_tier.get("cheap")
                if cheap_stats and cheap_stats.avg_quality > 0.85:
                    recommended = "cheap"
                    reason = f"Excellent quality ({avg_quality:.2f}) - downgrade to save cost"
                else:
                    recommended = "capable"
                    reason = f"Excellent quality ({avg_quality:.2f}) - keep capable tier"
            else:
                recommended = current_tier
                reason = f"Excellent quality ({avg_quality:.2f}) - maintain current tier"
        else:
            # Acceptable quality - keep current tier.
            recommended = current_tier
            reason = f"Acceptable quality ({avg_quality:.2f}) - maintain current tier"

        return TierRecommendation(
            current_tier=current_tier,
            recommended_tier=recommended,
            confidence=confidence,
            reason=reason,
            stats=stats_by_tier,
        )

    def get_underperforming_stages(
        self, workflow_name: str, quality_threshold: float = 0.7
    ) -> list[tuple[str, QualityStats]]:
        """Get workflow stages with poor quality scores.

        Args:
            workflow_name: Name of workflow
            quality_threshold: Threshold below which stage is considered underperforming

        Returns:
            List of (stage_name, stats) tuples for underperforming stages,
            worst quality first
        """
        if not self.memory or not hasattr(self.memory, "_redis"):
            return []

        try:
            pattern = f"feedback:{workflow_name}:*"
            keys = self._scan_keys(pattern)

            # Extract unique stage names from key format
            # feedback:{workflow}:{stage}:{tier}:{id}.
            # NOTE(review): a ":" inside workflow/stage names would break this
            # split-based parse - assumed not to occur; confirm with callers.
            stages = set()
            for key in keys:
                if isinstance(key, bytes):
                    key = key.decode("utf-8")
                parts = key.split(":")
                if len(parts) >= 4:
                    stages.add(parts[2])

            underperforming = []
            for stage_name in stages:
                stats = self.get_quality_stats(workflow_name, stage_name)
                if stats and stats.avg_quality < quality_threshold:
                    underperforming.append((stage_name, stats))

            # Worst quality first.
            underperforming.sort(key=lambda x: x[1].avg_quality)

            return underperforming
        except Exception as e:
            logger.error(f"Failed to get underperforming stages: {e}")
            return []

    def clear_feedback(self, workflow_name: str, stage_name: str | None = None) -> int:
        """Clear feedback history for a workflow or stage.

        Args:
            workflow_name: Name of workflow
            stage_name: Optional stage name (clears all stages if None)

        Returns:
            Number of feedback entries cleared
        """
        if not self.memory or not hasattr(self.memory, "_redis"):
            return 0

        try:
            if stage_name:
                pattern = f"feedback:{workflow_name}:{stage_name}:*"
            else:
                pattern = f"feedback:{workflow_name}:*"

            keys = self._scan_keys(pattern)
            if not keys:
                return 0

            return self.memory._redis.delete(*keys)
        except Exception as e:
            logger.error(f"Failed to clear feedback: {e}")
            return 0