empathy-framework 5.0.3__py3-none-any.whl → 5.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {empathy_framework-5.0.3.dist-info → empathy_framework-5.1.1.dist-info}/METADATA +259 -142
  2. {empathy_framework-5.0.3.dist-info → empathy_framework-5.1.1.dist-info}/RECORD +58 -28
  3. empathy_framework-5.1.1.dist-info/licenses/LICENSE +201 -0
  4. empathy_framework-5.1.1.dist-info/licenses/LICENSE_CHANGE_ANNOUNCEMENT.md +101 -0
  5. empathy_os/__init__.py +1 -1
  6. empathy_os/cli/commands/batch.py +5 -5
  7. empathy_os/cli/commands/routing.py +1 -1
  8. empathy_os/cli/commands/workflow.py +2 -1
  9. empathy_os/cli/parsers/cache 2.py +65 -0
  10. empathy_os/cli_minimal.py +3 -3
  11. empathy_os/cli_router 2.py +416 -0
  12. empathy_os/cli_router.py +12 -0
  13. empathy_os/dashboard/__init__.py +1 -2
  14. empathy_os/dashboard/app 2.py +512 -0
  15. empathy_os/dashboard/app.py +1 -1
  16. empathy_os/dashboard/simple_server 2.py +403 -0
  17. empathy_os/dashboard/standalone_server 2.py +536 -0
  18. empathy_os/memory/types 2.py +441 -0
  19. empathy_os/meta_workflows/intent_detector.py +71 -0
  20. empathy_os/models/__init__.py +19 -0
  21. empathy_os/models/adaptive_routing 2.py +437 -0
  22. empathy_os/models/auth_cli.py +444 -0
  23. empathy_os/models/auth_strategy.py +450 -0
  24. empathy_os/project_index/scanner_parallel 2.py +291 -0
  25. empathy_os/telemetry/agent_coordination 2.py +478 -0
  26. empathy_os/telemetry/agent_coordination.py +3 -3
  27. empathy_os/telemetry/agent_tracking 2.py +350 -0
  28. empathy_os/telemetry/agent_tracking.py +1 -2
  29. empathy_os/telemetry/approval_gates 2.py +563 -0
  30. empathy_os/telemetry/event_streaming 2.py +405 -0
  31. empathy_os/telemetry/event_streaming.py +3 -3
  32. empathy_os/telemetry/feedback_loop 2.py +557 -0
  33. empathy_os/telemetry/feedback_loop.py +1 -1
  34. empathy_os/vscode_bridge 2.py +173 -0
  35. empathy_os/workflows/__init__.py +8 -0
  36. empathy_os/workflows/autonomous_test_gen.py +569 -0
  37. empathy_os/workflows/bug_predict.py +45 -0
  38. empathy_os/workflows/code_review.py +92 -22
  39. empathy_os/workflows/document_gen.py +594 -62
  40. empathy_os/workflows/llm_base.py +363 -0
  41. empathy_os/workflows/perf_audit.py +69 -0
  42. empathy_os/workflows/progressive/README 2.md +454 -0
  43. empathy_os/workflows/progressive/__init__ 2.py +92 -0
  44. empathy_os/workflows/progressive/cli 2.py +242 -0
  45. empathy_os/workflows/progressive/core 2.py +488 -0
  46. empathy_os/workflows/progressive/orchestrator 2.py +701 -0
  47. empathy_os/workflows/progressive/reports 2.py +528 -0
  48. empathy_os/workflows/progressive/telemetry 2.py +280 -0
  49. empathy_os/workflows/progressive/test_gen 2.py +514 -0
  50. empathy_os/workflows/progressive/workflow 2.py +628 -0
  51. empathy_os/workflows/release_prep.py +54 -0
  52. empathy_os/workflows/security_audit.py +154 -79
  53. empathy_os/workflows/test_gen.py +60 -0
  54. empathy_os/workflows/test_gen_behavioral.py +477 -0
  55. empathy_os/workflows/test_gen_parallel.py +341 -0
  56. empathy_framework-5.0.3.dist-info/licenses/LICENSE +0 -139
  57. {empathy_framework-5.0.3.dist-info → empathy_framework-5.1.1.dist-info}/WHEEL +0 -0
  58. {empathy_framework-5.0.3.dist-info → empathy_framework-5.1.1.dist-info}/entry_points.txt +0 -0
  59. {empathy_framework-5.0.3.dist-info → empathy_framework-5.1.1.dist-info}/top_level.txt +0 -0
empathy_os/workflows/progressive/core 2.py (new file)
@@ -0,0 +1,488 @@
+"""Core data structures for progressive tier escalation.
+
+This module defines the fundamental data structures used throughout the
+progressive escalation system, including failure analysis, quality metrics,
+tier results, and configuration.
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import Any
+
+
+class Tier(Enum):
+    """Model tier levels for progressive escalation.
+
+    Attributes:
+        CHEAP: Low-cost models (e.g., gpt-4o-mini, claude-3-haiku)
+        CAPABLE: Mid-tier models (e.g., claude-3-5-sonnet, gpt-4o)
+        PREMIUM: High-end models (e.g., claude-opus-4, o1)
+    """
+    CHEAP = "cheap"
+    CAPABLE = "capable"
+    PREMIUM = "premium"
+
+    def __lt__(self, other: "Tier") -> bool:
+        """Compare tiers for ordering (CHEAP < CAPABLE < PREMIUM)."""
+        order = {Tier.CHEAP: 0, Tier.CAPABLE: 1, Tier.PREMIUM: 2}
+        return order[self] < order[other]
+
+
+@dataclass
+class FailureAnalysis:
+    """Multi-signal failure detection and quality analysis.
+
+    Combines multiple signals to provide robust failure detection:
+    1. Syntax errors in generated code
+    2. Execution failures (test pass rate)
+    3. Quality metrics (coverage, assertion depth)
+    4. LLM confidence signals
+
+    The composite quality score (CQS) provides an objective measure
+    that combines all signals with appropriate weighting.
+
+    Attributes:
+        syntax_errors: List of syntax errors found in generated code
+        test_failures: List of test execution failures
+        test_pass_rate: Percentage of tests that passed (0.0-1.0)
+        coverage_percent: Code coverage percentage (0.0-100.0)
+        assertion_depth: Average number of assertions per test
+        confidence_score: LLM confidence level (0.0-1.0)
+        llm_uncertainty_signals: Uncertainty phrases detected in LLM response
+
+    Example:
+        >>> analysis = FailureAnalysis(
+        ...     test_pass_rate=0.85,
+        ...     coverage_percent=78.0,
+        ...     assertion_depth=5.2,
+        ...     confidence_score=0.92
+        ... )
+        >>> analysis.calculate_quality_score()
+        77.7
+        >>> analysis.should_escalate
+        False
+    """
+
+    syntax_errors: list[SyntaxError] = field(default_factory=list)
+    test_failures: list[dict[str, Any]] = field(default_factory=list)
+    test_pass_rate: float = 0.0
+    coverage_percent: float = 0.0
+    assertion_depth: float = 0.0
+    confidence_score: float = 0.0
+    llm_uncertainty_signals: list[str] = field(default_factory=list)
+
+    def calculate_quality_score(self) -> float:
+        """Calculate composite quality score (CQS) from 0-100.
+
+        Formula:
+            CQS = (
+                0.40 × test_pass_rate +
+                0.25 × code_coverage +
+                0.20 × assertion_quality +
+                0.15 × llm_confidence
+            ) × syntax_error_penalty
+
+        Weights:
+            - Test pass rate: 40% (most important - functionality must work)
+            - Code coverage: 25% (thoroughness matters)
+            - Assertion quality: 20% (test depth is important)
+            - LLM confidence: 15% (signals potential brittleness)
+
+        Penalties:
+            - Syntax errors: 50% penalty (halves the score)
+
+        Returns:
+            Quality score from 0.0 (worst) to 100.0 (perfect)
+
+        Example:
+            >>> analysis = FailureAnalysis(
+            ...     test_pass_rate=0.90,
+            ...     coverage_percent=85.0,
+            ...     assertion_depth=6.0,
+            ...     confidence_score=0.95
+            ... )
+            >>> analysis.calculate_quality_score()
+            83.5
+        """
+        # Component scores (convert to 0-100 scale)
+        pass_rate_score = self.test_pass_rate * 100
+        coverage_score = self.coverage_percent
+
+        # Assertion quality: cap at 100% (10 assertions = 100%)
+        assertion_quality_score = min(self.assertion_depth * 10, 100)
+
+        confidence_score_scaled = self.confidence_score * 100
+
+        # Weighted composite
+        cqs = (
+            0.40 * pass_rate_score +
+            0.25 * coverage_score +
+            0.20 * assertion_quality_score +
+            0.15 * confidence_score_scaled
+        )
+
+        # Apply syntax error penalty
+        if len(self.syntax_errors) > 0:
+            cqs *= 0.5  # Halve score for any syntax errors
+
+        return min(cqs, 100.0)
+
+    @property
+    def should_escalate(self) -> bool:
+        """Determine if this result should trigger escalation.
+
+        Multi-criteria decision based on:
+        - Low CQS (<70)
+        - Multiple syntax errors (>3)
+        - Low test pass rate (<70%)
+        - Low coverage (<60%)
+
+        Returns:
+            True if escalation is recommended, False otherwise
+
+        Example:
+            >>> analysis = FailureAnalysis(test_pass_rate=0.50)
+            >>> analysis.should_escalate
+            True
+        """
+        cqs = self.calculate_quality_score()
+        return (
+            cqs < 70 or
+            len(self.syntax_errors) > 3 or
+            self.test_pass_rate < 0.7 or
+            self.coverage_percent < 60
+        )
+
+    @property
+    def failure_severity(self) -> str:
+        """Determine severity level of failures.
+
+        Returns:
+            "CRITICAL": Severe failures, consider skipping to Premium
+            "HIGH": Significant failures, escalate to next tier
+            "MODERATE": Minor failures, retry at current tier
+            "LOW": Acceptable quality, no escalation needed
+
+        Example:
+            >>> analysis = FailureAnalysis(test_pass_rate=0.25)
+            >>> analysis.failure_severity
+            'CRITICAL'
+        """
+        cqs = self.calculate_quality_score()
+
+        if len(self.syntax_errors) > 5 or self.test_pass_rate < 0.3:
+            return "CRITICAL"
+        elif cqs < 70 or self.test_pass_rate < 0.5:
+            return "HIGH"
+        elif cqs < 80 or self.test_pass_rate < 0.7:
+            return "MODERATE"
+        else:
+            return "LOW"
+
+
+@dataclass
+class TierResult:
+    """Results from a single tier execution attempt.
+
+    Captures all information about a tier's execution including
+    generated artifacts, quality analysis, cost, and escalation decision.
+
+    Attributes:
+        tier: Which tier executed (CHEAP, CAPABLE, or PREMIUM)
+        model: Specific model used (e.g., "gpt-4o-mini")
+        attempt: Attempt number at this tier (1-based)
+        timestamp: When this execution occurred
+        generated_items: Generated artifacts (tests, code, etc.)
+        failure_analysis: Quality and failure analysis
+        cost: Cost in USD for this execution
+        duration: Execution time in seconds
+        escalated: Whether this result triggered escalation
+        escalation_reason: Human-readable reason for escalation
+
+    Example:
+        >>> result = TierResult(
+        ...     tier=Tier.CHEAP,
+        ...     model="gpt-4o-mini",
+        ...     attempt=1,
+        ...     timestamp=datetime.now(),
+        ...     generated_items=[{"code": "test_foo()"}],
+        ...     failure_analysis=FailureAnalysis(test_pass_rate=0.65),
+        ...     cost=0.15,
+        ...     duration=12.5
+        ... )
+        >>> result.quality_score
+        26.0
+    """
+
+    tier: Tier
+    model: str
+    attempt: int
+    timestamp: datetime
+
+    # Generated artifacts
+    generated_items: list[dict[str, Any]] = field(default_factory=list)
+
+    # Analysis
+    failure_analysis: FailureAnalysis = field(default_factory=FailureAnalysis)
+    cost: float = 0.0
+    duration: float = 0.0
+    tokens_used: dict[str, int] = field(default_factory=dict)
+
+    # Decision
+    escalated: bool = False
+    escalation_reason: str = ""
+
+    @property
+    def quality_score(self) -> float:
+        """Get composite quality score for this tier result.
+
+        Returns:
+            CQS from 0.0 to 100.0
+        """
+        return self.failure_analysis.calculate_quality_score()
+
+    @property
+    def success_count(self) -> int:
+        """Count of successfully generated items (CQS >= 80).
+
+        Returns:
+            Number of items meeting quality threshold
+        """
+        return sum(
+            1 for item in self.generated_items
+            if item.get("quality_score", 0) >= 80
+        )
+
+    @property
+    def success_rate(self) -> float:
+        """Percentage of items successfully generated.
+
+        Returns:
+            Success rate from 0.0 to 1.0
+        """
+        if not self.generated_items:
+            return 0.0
+        return self.success_count / len(self.generated_items)
+
+
+@dataclass
+class ProgressiveWorkflowResult:
+    """Complete results from a progressive workflow execution.
+
+    Captures the full progression history across all tiers, including
+    costs, quality metrics, and escalation decisions.
+
+    Attributes:
+        workflow_name: Name of the workflow (e.g., "test-gen")
+        task_id: Unique identifier for this execution
+        tier_results: Chronological list of tier execution results
+        final_result: The last tier result (may be successful or failed)
+        total_cost: Total cost in USD across all tiers
+        total_duration: Total execution time in seconds
+        success: Whether the workflow completed successfully
+
+    Example:
+        >>> result = ProgressiveWorkflowResult(
+        ...     workflow_name="test-gen",
+        ...     task_id="test-gen-20260117-143022",
+        ...     tier_results=[cheap_result, capable_result],
+        ...     final_result=capable_result,
+        ...     total_cost=0.75,
+        ...     total_duration=45.2,
+        ...     success=True
+        ... )
+        >>> print(result.generate_report())
+        🎯 PROGRESSIVE ESCALATION REPORT
+        ...
+    """
+
+    workflow_name: str
+    task_id: str
+    tier_results: list[TierResult]
+
+    final_result: TierResult
+    total_cost: float
+    total_duration: float
+    success: bool
+
+    def generate_report(self) -> str:
+        """Generate human-readable progression report.
+
+        Creates a detailed report showing:
+        - Tier-by-tier breakdown
+        - Quality scores and success rates
+        - Cost analysis and savings
+        - Escalation decisions
+
+        Returns:
+            Formatted report string
+        """
+        # Implementation will be in reports.py module
+        from empathy_os.workflows.progressive.reports import generate_progression_report
+        return generate_progression_report(self)
+
+    def save_to_disk(self, storage_path: str) -> None:
+        """Save detailed results to disk.
+
+        Creates a directory with:
+        - summary.json: High-level metrics
+        - tier_N_<tier_name>.json: Detailed tier results
+        - report.txt: Human-readable report
+
+        Args:
+            storage_path: Base path for saving results
+        """
+        from empathy_os.workflows.progressive.reports import save_results_to_disk
+        save_results_to_disk(self, storage_path)
+
+    @property
+    def cost_savings(self) -> float:
+        """Calculate cost savings vs running all items at Premium tier.
+
+        Returns:
+            Dollar amount saved by using progressive escalation
+        """
+        # Estimate what it would cost if all items were Premium
+        total_items = sum(len(r.generated_items) for r in self.tier_results)
+
+        # Assume Premium costs ~$0.05 per item (conservative estimate)
+        all_premium_cost = total_items * 0.05
+
+        savings = all_premium_cost - self.total_cost
+        return max(savings, 0.0)
+
+    @property
+    def cost_savings_percent(self) -> float:
+        """Calculate percentage of cost saved.
+
+        Returns:
+            Savings percentage (0-100)
+        """
+        total_items = sum(len(r.generated_items) for r in self.tier_results)
+        all_premium_cost = total_items * 0.05
+
+        if all_premium_cost == 0:
+            return 0.0
+
+        return (self.cost_savings / all_premium_cost) * 100
+
+
+@dataclass
+class EscalationConfig:
+    """Configuration for progressive tier escalation.
+
+    Controls all aspects of the escalation system including retry logic,
+    thresholds, cost management, and storage.
+
+    Attributes:
+        enabled: Whether progressive escalation is active
+        tiers: Ordered list of tiers to use (default: all three)
+
+        Retry configuration:
+            cheap_min_attempts: Minimum attempts at cheap tier
+            cheap_max_attempts: Maximum attempts at cheap tier
+            capable_min_attempts: Minimum attempts at capable tier
+            capable_max_attempts: Maximum attempts at capable tier
+            premium_max_attempts: Maximum attempts at premium tier
+
+        Thresholds (Cheap → Capable):
+            cheap_to_capable_failure_rate: Max failure rate before escalation
+            cheap_to_capable_min_cqs: Min quality score to avoid escalation
+            cheap_to_capable_max_syntax_errors: Max syntax errors allowed
+
+        Thresholds (Capable → Premium):
+            capable_to_premium_failure_rate: Max failure rate before escalation
+            capable_to_premium_min_cqs: Min quality score to avoid escalation
+            capable_to_premium_max_syntax_errors: Max syntax errors allowed
+
+        Stagnation detection:
+            improvement_threshold: Min CQS improvement to avoid stagnation (%)
+            consecutive_stagnation_limit: Consecutive stagnations before escalation
+
+        Cost management:
+            max_cost: Maximum total cost in USD
+            auto_approve_under: Auto-approve escalations under this cost
+            warn_on_budget_exceeded: Print warning if budget exceeded
+            abort_on_budget_exceeded: Abort execution if budget exceeded
+
+        Storage:
+            save_tier_results: Whether to save tier results to disk
+            storage_path: Directory for saving results
+
+    Example:
+        >>> config = EscalationConfig(
+        ...     enabled=True,
+        ...     max_cost=10.00,
+        ...     auto_approve_under=5.00,
+        ...     cheap_min_attempts=2,
+        ...     capable_max_attempts=6
+        ... )
+    """
+
+    # Global settings
+    enabled: bool = False
+    tiers: list[Tier] = field(default_factory=lambda: [Tier.CHEAP, Tier.CAPABLE, Tier.PREMIUM])
+
+    # Retry configuration
+    cheap_min_attempts: int = 2
+    cheap_max_attempts: int = 3
+    capable_min_attempts: int = 2
+    capable_max_attempts: int = 6
+    premium_max_attempts: int = 1
+
+    # Thresholds: Cheap → Capable
+    cheap_to_capable_failure_rate: float = 0.30
+    cheap_to_capable_min_cqs: float = 70.0
+    cheap_to_capable_max_syntax_errors: int = 3
+
+    # Thresholds: Capable → Premium
+    capable_to_premium_failure_rate: float = 0.20
+    capable_to_premium_min_cqs: float = 80.0
+    capable_to_premium_max_syntax_errors: int = 1
+
+    # Stagnation detection
+    improvement_threshold: float = 5.0  # 5% CQS improvement required
+    consecutive_stagnation_limit: int = 2
+
+    # Cost management
+    max_cost: float = 5.00
+    auto_approve_under: float | None = None
+    warn_on_budget_exceeded: bool = True
+    abort_on_budget_exceeded: bool = False
+
+    # Storage
+    save_tier_results: bool = True
+    storage_path: str = ".empathy/progressive_runs"
+
+    def get_max_attempts(self, tier: Tier) -> int:
+        """Get maximum attempts for a specific tier.
+
+        Args:
+            tier: The tier to query
+
+        Returns:
+            Maximum number of attempts allowed
+        """
+        if tier == Tier.CHEAP:
+            return self.cheap_max_attempts
+        elif tier == Tier.CAPABLE:
+            return self.capable_max_attempts
+        else:  # PREMIUM
+            return self.premium_max_attempts
+
+    def get_min_attempts(self, tier: Tier) -> int:
+        """Get minimum attempts for a specific tier.
+
+        Args:
+            tier: The tier to query
+
+        Returns:
+            Minimum number of attempts required
+        """
+        if tier == Tier.CHEAP:
+            return self.cheap_min_attempts
+        elif tier == Tier.CAPABLE:
+            return self.capable_min_attempts
+        else:  # PREMIUM
+            return 1  # Premium always gets exactly 1 attempt
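
For orientation, here is a minimal sketch of how the data structures in the hunk above combine in one cheap-tier evaluation. It is illustrative only and not part of the package diff: the import path empathy_os.workflows.progressive.core is an assumption (the wheel ships this module as "core 2.py"), and the input numbers are invented.

# Illustrative sketch, not part of the diff above. Assumes the classes shown
# are importable as empathy_os.workflows.progressive.core (unconfirmed here,
# since the wheel ships the file as "core 2.py").
from empathy_os.workflows.progressive.core import EscalationConfig, FailureAnalysis, Tier

# Quality signals from a hypothetical cheap-tier test-generation attempt.
analysis = FailureAnalysis(
    test_pass_rate=0.65,    # 65% of generated tests passed
    coverage_percent=55.0,  # below the 60% escalation floor
    assertion_depth=3.0,    # ~3 assertions per test
    confidence_score=0.70,
)

# CQS = 0.40*65 + 0.25*55 + 0.20*30 + 0.15*70 = 56.25 (no syntax-error penalty)
print(analysis.calculate_quality_score())   # ~56.25
print(analysis.should_escalate)             # True (CQS < 70 and coverage < 60)
print(analysis.failure_severity)            # HIGH

# Default retry budget: up to 3 attempts at the cheap tier before moving up.
config = EscalationConfig(enabled=True)
print(config.get_max_attempts(Tier.CHEAP))  # 3

The decision logic that consumes these signals is not shown in this hunk; it lives in the other progressive modules added in this release (orchestrator 2.py, reports 2.py) listed in the file table above.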