empathy-framework 5.0.3__py3-none-any.whl → 5.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {empathy_framework-5.0.3.dist-info → empathy_framework-5.1.1.dist-info}/METADATA +259 -142
  2. {empathy_framework-5.0.3.dist-info → empathy_framework-5.1.1.dist-info}/RECORD +58 -28
  3. empathy_framework-5.1.1.dist-info/licenses/LICENSE +201 -0
  4. empathy_framework-5.1.1.dist-info/licenses/LICENSE_CHANGE_ANNOUNCEMENT.md +101 -0
  5. empathy_os/__init__.py +1 -1
  6. empathy_os/cli/commands/batch.py +5 -5
  7. empathy_os/cli/commands/routing.py +1 -1
  8. empathy_os/cli/commands/workflow.py +2 -1
  9. empathy_os/cli/parsers/cache 2.py +65 -0
  10. empathy_os/cli_minimal.py +3 -3
  11. empathy_os/cli_router 2.py +416 -0
  12. empathy_os/cli_router.py +12 -0
  13. empathy_os/dashboard/__init__.py +1 -2
  14. empathy_os/dashboard/app 2.py +512 -0
  15. empathy_os/dashboard/app.py +1 -1
  16. empathy_os/dashboard/simple_server 2.py +403 -0
  17. empathy_os/dashboard/standalone_server 2.py +536 -0
  18. empathy_os/memory/types 2.py +441 -0
  19. empathy_os/meta_workflows/intent_detector.py +71 -0
  20. empathy_os/models/__init__.py +19 -0
  21. empathy_os/models/adaptive_routing 2.py +437 -0
  22. empathy_os/models/auth_cli.py +444 -0
  23. empathy_os/models/auth_strategy.py +450 -0
  24. empathy_os/project_index/scanner_parallel 2.py +291 -0
  25. empathy_os/telemetry/agent_coordination 2.py +478 -0
  26. empathy_os/telemetry/agent_coordination.py +3 -3
  27. empathy_os/telemetry/agent_tracking 2.py +350 -0
  28. empathy_os/telemetry/agent_tracking.py +1 -2
  29. empathy_os/telemetry/approval_gates 2.py +563 -0
  30. empathy_os/telemetry/event_streaming 2.py +405 -0
  31. empathy_os/telemetry/event_streaming.py +3 -3
  32. empathy_os/telemetry/feedback_loop 2.py +557 -0
  33. empathy_os/telemetry/feedback_loop.py +1 -1
  34. empathy_os/vscode_bridge 2.py +173 -0
  35. empathy_os/workflows/__init__.py +8 -0
  36. empathy_os/workflows/autonomous_test_gen.py +569 -0
  37. empathy_os/workflows/bug_predict.py +45 -0
  38. empathy_os/workflows/code_review.py +92 -22
  39. empathy_os/workflows/document_gen.py +594 -62
  40. empathy_os/workflows/llm_base.py +363 -0
  41. empathy_os/workflows/perf_audit.py +69 -0
  42. empathy_os/workflows/progressive/README 2.md +454 -0
  43. empathy_os/workflows/progressive/__init__ 2.py +92 -0
  44. empathy_os/workflows/progressive/cli 2.py +242 -0
  45. empathy_os/workflows/progressive/core 2.py +488 -0
  46. empathy_os/workflows/progressive/orchestrator 2.py +701 -0
  47. empathy_os/workflows/progressive/reports 2.py +528 -0
  48. empathy_os/workflows/progressive/telemetry 2.py +280 -0
  49. empathy_os/workflows/progressive/test_gen 2.py +514 -0
  50. empathy_os/workflows/progressive/workflow 2.py +628 -0
  51. empathy_os/workflows/release_prep.py +54 -0
  52. empathy_os/workflows/security_audit.py +154 -79
  53. empathy_os/workflows/test_gen.py +60 -0
  54. empathy_os/workflows/test_gen_behavioral.py +477 -0
  55. empathy_os/workflows/test_gen_parallel.py +341 -0
  56. empathy_framework-5.0.3.dist-info/licenses/LICENSE +0 -139
  57. {empathy_framework-5.0.3.dist-info → empathy_framework-5.1.1.dist-info}/WHEEL +0 -0
  58. {empathy_framework-5.0.3.dist-info → empathy_framework-5.1.1.dist-info}/entry_points.txt +0 -0
  59. {empathy_framework-5.0.3.dist-info → empathy_framework-5.1.1.dist-info}/top_level.txt +0 -0
empathy_os/workflows/progressive/core 2.py (new file)
@@ -0,0 +1,488 @@
+"""Core data structures for progressive tier escalation.
+
+This module defines the fundamental data structures used throughout the
+progressive escalation system, including failure analysis, quality metrics,
+tier results, and configuration.
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import Any
+
+
+class Tier(Enum):
+    """Model tier levels for progressive escalation.
+
+    Attributes:
+        CHEAP: Low-cost models (e.g., gpt-4o-mini, claude-3-haiku)
+        CAPABLE: Mid-tier models (e.g., claude-3-5-sonnet, gpt-4o)
+        PREMIUM: High-end models (e.g., claude-opus-4, o1)
+    """
+    CHEAP = "cheap"
+    CAPABLE = "capable"
+    PREMIUM = "premium"
+
+    def __lt__(self, other: "Tier") -> bool:
+        """Compare tiers for ordering (CHEAP < CAPABLE < PREMIUM)."""
+        order = {Tier.CHEAP: 0, Tier.CAPABLE: 1, Tier.PREMIUM: 2}
+        return order[self] < order[other]
+
+
+@dataclass
+class FailureAnalysis:
+    """Multi-signal failure detection and quality analysis.
+
+    Combines multiple signals to provide robust failure detection:
+    1. Syntax errors in generated code
+    2. Execution failures (test pass rate)
+    3. Quality metrics (coverage, assertion depth)
+    4. LLM confidence signals
+
+    The composite quality score (CQS) provides an objective measure
+    that combines all signals with appropriate weighting.
+
+    Attributes:
+        syntax_errors: List of syntax errors found in generated code
+        test_failures: List of test execution failures
+        test_pass_rate: Percentage of tests that passed (0.0-1.0)
+        coverage_percent: Code coverage percentage (0.0-100.0)
+        assertion_depth: Average number of assertions per test
+        confidence_score: LLM confidence level (0.0-1.0)
+        llm_uncertainty_signals: Uncertainty phrases detected in LLM response
+
+    Example:
+        >>> analysis = FailureAnalysis(
+        ...     test_pass_rate=0.85,
+        ...     coverage_percent=78.0,
+        ...     assertion_depth=5.2,
+        ...     confidence_score=0.92
+        ... )
+        >>> analysis.calculate_quality_score()
+        77.7
+        >>> analysis.should_escalate
+        False
+    """
+
+    syntax_errors: list[SyntaxError] = field(default_factory=list)
+    test_failures: list[dict[str, Any]] = field(default_factory=list)
+    test_pass_rate: float = 0.0
+    coverage_percent: float = 0.0
+    assertion_depth: float = 0.0
+    confidence_score: float = 0.0
+    llm_uncertainty_signals: list[str] = field(default_factory=list)
+
+    def calculate_quality_score(self) -> float:
+        """Calculate composite quality score (CQS) from 0-100.
+
+        Formula:
+            CQS = (
+                0.40 × test_pass_rate +
+                0.25 × code_coverage +
+                0.20 × assertion_quality +
+                0.15 × llm_confidence
+            ) × syntax_error_penalty
+
+        Weights:
+            - Test pass rate: 40% (most important - functionality must work)
+            - Code coverage: 25% (thoroughness matters)
+            - Assertion quality: 20% (test depth is important)
+            - LLM confidence: 15% (signals potential brittleness)
+
+        Penalties:
+            - Syntax errors: 50% penalty (halves the score)
+
+        Returns:
+            Quality score from 0.0 (worst) to 100.0 (perfect)
+
+        Example:
+            >>> analysis = FailureAnalysis(
+            ...     test_pass_rate=0.90,
+            ...     coverage_percent=85.0,
+            ...     assertion_depth=6.0,
+            ...     confidence_score=0.95
+            ... )
+            >>> analysis.calculate_quality_score()
+            83.5
+        """
+        # Component scores (convert to 0-100 scale)
+        pass_rate_score = self.test_pass_rate * 100
+        coverage_score = self.coverage_percent
+
+        # Assertion quality: cap at 100% (10 assertions = 100%)
+        assertion_quality_score = min(self.assertion_depth * 10, 100)
+
+        confidence_score_scaled = self.confidence_score * 100
+
+        # Weighted composite
+        cqs = (
+            0.40 * pass_rate_score +
+            0.25 * coverage_score +
+            0.20 * assertion_quality_score +
+            0.15 * confidence_score_scaled
+        )
+
+        # Apply syntax error penalty
+        if len(self.syntax_errors) > 0:
+            cqs *= 0.5  # Halve score for any syntax errors
+
+        return min(cqs, 100.0)
+
+    @property
+    def should_escalate(self) -> bool:
+        """Determine if this result should trigger escalation.
+
+        Multi-criteria decision based on:
+        - Low CQS (<70)
+        - Multiple syntax errors (>3)
+        - Low test pass rate (<70%)
+        - Low coverage (<60%)
+
+        Returns:
+            True if escalation is recommended, False otherwise
+
+        Example:
+            >>> analysis = FailureAnalysis(test_pass_rate=0.50)
+            >>> analysis.should_escalate
+            True
+        """
+        cqs = self.calculate_quality_score()
+        return (
+            cqs < 70 or
+            len(self.syntax_errors) > 3 or
+            self.test_pass_rate < 0.7 or
+            self.coverage_percent < 60
+        )
+
+    @property
+    def failure_severity(self) -> str:
+        """Determine severity level of failures.
+
+        Returns:
+            "CRITICAL": Severe failures, consider skipping to Premium
+            "HIGH": Significant failures, escalate to next tier
+            "MODERATE": Minor failures, retry at current tier
+            "LOW": Acceptable quality, no escalation needed
+
+        Example:
+            >>> analysis = FailureAnalysis(test_pass_rate=0.25)
+            >>> analysis.failure_severity
+            'CRITICAL'
+        """
+        cqs = self.calculate_quality_score()
+
+        if len(self.syntax_errors) > 5 or self.test_pass_rate < 0.3:
+            return "CRITICAL"
+        elif cqs < 70 or self.test_pass_rate < 0.5:
+            return "HIGH"
+        elif cqs < 80 or self.test_pass_rate < 0.7:
+            return "MODERATE"
+        else:
+            return "LOW"
+
+
+@dataclass
+class TierResult:
+    """Results from a single tier execution attempt.
+
+    Captures all information about a tier's execution including
+    generated artifacts, quality analysis, cost, and escalation decision.
+
+    Attributes:
+        tier: Which tier executed (CHEAP, CAPABLE, or PREMIUM)
+        model: Specific model used (e.g., "gpt-4o-mini")
+        attempt: Attempt number at this tier (1-based)
+        timestamp: When this execution occurred
+        generated_items: Generated artifacts (tests, code, etc.)
+        failure_analysis: Quality and failure analysis
+        cost: Cost in USD for this execution
+        duration: Execution time in seconds
+        escalated: Whether this result triggered escalation
+        escalation_reason: Human-readable reason for escalation
+
+    Example:
+        >>> result = TierResult(
+        ...     tier=Tier.CHEAP,
+        ...     model="gpt-4o-mini",
+        ...     attempt=1,
+        ...     timestamp=datetime.now(),
+        ...     generated_items=[{"code": "test_foo()"}],
+        ...     failure_analysis=FailureAnalysis(test_pass_rate=0.65),
+        ...     cost=0.15,
+        ...     duration=12.5
+        ... )
+        >>> result.quality_score
+        26.0
+    """
+
+    tier: Tier
+    model: str
+    attempt: int
+    timestamp: datetime
+
+    # Generated artifacts
+    generated_items: list[dict[str, Any]] = field(default_factory=list)
+
+    # Analysis
+    failure_analysis: FailureAnalysis = field(default_factory=FailureAnalysis)
+    cost: float = 0.0
+    duration: float = 0.0
+    tokens_used: dict[str, int] = field(default_factory=dict)
+
+    # Decision
+    escalated: bool = False
+    escalation_reason: str = ""
+
+    @property
+    def quality_score(self) -> float:
+        """Get composite quality score for this tier result.
+
+        Returns:
+            CQS from 0.0 to 100.0
+        """
+        return self.failure_analysis.calculate_quality_score()
+
+    @property
+    def success_count(self) -> int:
+        """Count of successfully generated items (CQS >= 80).
+
+        Returns:
+            Number of items meeting quality threshold
+        """
+        return sum(
+            1 for item in self.generated_items
+            if item.get("quality_score", 0) >= 80
+        )
+
+    @property
+    def success_rate(self) -> float:
+        """Percentage of items successfully generated.
+
+        Returns:
+            Success rate from 0.0 to 1.0
+        """
+        if not self.generated_items:
+            return 0.0
+        return self.success_count / len(self.generated_items)
+
+
+@dataclass
+class ProgressiveWorkflowResult:
+    """Complete results from a progressive workflow execution.
+
+    Captures the full progression history across all tiers, including
+    costs, quality metrics, and escalation decisions.
+
+    Attributes:
+        workflow_name: Name of the workflow (e.g., "test-gen")
+        task_id: Unique identifier for this execution
+        tier_results: Chronological list of tier execution results
+        final_result: The last tier result (may be successful or failed)
+        total_cost: Total cost in USD across all tiers
+        total_duration: Total execution time in seconds
+        success: Whether the workflow completed successfully
+
+    Example:
+        >>> result = ProgressiveWorkflowResult(
+        ...     workflow_name="test-gen",
+        ...     task_id="test-gen-20260117-143022",
+        ...     tier_results=[cheap_result, capable_result],
+        ...     final_result=capable_result,
+        ...     total_cost=0.75,
+        ...     total_duration=45.2,
+        ...     success=True
+        ... )
+        >>> print(result.generate_report())
+        🎯 PROGRESSIVE ESCALATION REPORT
+        ...
+    """
+
+    workflow_name: str
+    task_id: str
+    tier_results: list[TierResult]
+
+    final_result: TierResult
+    total_cost: float
+    total_duration: float
+    success: bool
+
+    def generate_report(self) -> str:
+        """Generate human-readable progression report.
+
+        Creates a detailed report showing:
+        - Tier-by-tier breakdown
+        - Quality scores and success rates
+        - Cost analysis and savings
+        - Escalation decisions
+
+        Returns:
+            Formatted report string
+        """
+        # Implementation will be in reports.py module
+        from empathy_os.workflows.progressive.reports import generate_progression_report
+        return generate_progression_report(self)
+
+    def save_to_disk(self, storage_path: str) -> None:
+        """Save detailed results to disk.
+
+        Creates a directory with:
+        - summary.json: High-level metrics
+        - tier_N_<tier_name>.json: Detailed tier results
+        - report.txt: Human-readable report
+
+        Args:
+            storage_path: Base path for saving results
+        """
+        from empathy_os.workflows.progressive.reports import save_results_to_disk
+        save_results_to_disk(self, storage_path)
+
+    @property
+    def cost_savings(self) -> float:
+        """Calculate cost savings vs running all items at Premium tier.
+
+        Returns:
+            Dollar amount saved by using progressive escalation
+        """
+        # Estimate what it would cost if all items were Premium
+        total_items = sum(len(r.generated_items) for r in self.tier_results)
+
+        # Assume Premium costs ~$0.05 per item (conservative estimate)
+        all_premium_cost = total_items * 0.05
+
+        savings = all_premium_cost - self.total_cost
+        return max(savings, 0.0)
+
+    @property
+    def cost_savings_percent(self) -> float:
+        """Calculate percentage of cost saved.
+
+        Returns:
+            Savings percentage (0-100)
+        """
+        total_items = sum(len(r.generated_items) for r in self.tier_results)
+        all_premium_cost = total_items * 0.05
+
+        if all_premium_cost == 0:
+            return 0.0
+
+        return (self.cost_savings / all_premium_cost) * 100
+
+
+@dataclass
+class EscalationConfig:
+    """Configuration for progressive tier escalation.
+
+    Controls all aspects of the escalation system including retry logic,
+    thresholds, cost management, and storage.
+
+    Attributes:
+        enabled: Whether progressive escalation is active
+        tiers: Ordered list of tiers to use (default: all three)
+
+        Retry configuration:
+            cheap_min_attempts: Minimum attempts at cheap tier
+            cheap_max_attempts: Maximum attempts at cheap tier
+            capable_min_attempts: Minimum attempts at capable tier
+            capable_max_attempts: Maximum attempts at capable tier
+            premium_max_attempts: Maximum attempts at premium tier
+
+        Thresholds (Cheap → Capable):
+            cheap_to_capable_failure_rate: Max failure rate before escalation
+            cheap_to_capable_min_cqs: Min quality score to avoid escalation
+            cheap_to_capable_max_syntax_errors: Max syntax errors allowed
+
+        Thresholds (Capable → Premium):
+            capable_to_premium_failure_rate: Max failure rate before escalation
+            capable_to_premium_min_cqs: Min quality score to avoid escalation
+            capable_to_premium_max_syntax_errors: Max syntax errors allowed
+
+        Stagnation detection:
+            improvement_threshold: Min CQS improvement to avoid stagnation (%)
+            consecutive_stagnation_limit: Consecutive stagnations before escalation
+
+        Cost management:
+            max_cost: Maximum total cost in USD
+            auto_approve_under: Auto-approve escalations under this cost
+            warn_on_budget_exceeded: Print warning if budget exceeded
+            abort_on_budget_exceeded: Abort execution if budget exceeded
+
+        Storage:
+            save_tier_results: Whether to save tier results to disk
+            storage_path: Directory for saving results
+
+    Example:
+        >>> config = EscalationConfig(
+        ...     enabled=True,
+        ...     max_cost=10.00,
+        ...     auto_approve_under=5.00,
+        ...     cheap_min_attempts=2,
+        ...     capable_max_attempts=6
+        ... )
+    """
+
+    # Global settings
+    enabled: bool = False
+    tiers: list[Tier] = field(default_factory=lambda: [Tier.CHEAP, Tier.CAPABLE, Tier.PREMIUM])
+
+    # Retry configuration
+    cheap_min_attempts: int = 2
+    cheap_max_attempts: int = 3
+    capable_min_attempts: int = 2
+    capable_max_attempts: int = 6
+    premium_max_attempts: int = 1
+
+    # Thresholds: Cheap → Capable
+    cheap_to_capable_failure_rate: float = 0.30
+    cheap_to_capable_min_cqs: float = 70.0
+    cheap_to_capable_max_syntax_errors: int = 3
+
+    # Thresholds: Capable → Premium
+    capable_to_premium_failure_rate: float = 0.20
+    capable_to_premium_min_cqs: float = 80.0
+    capable_to_premium_max_syntax_errors: int = 1
+
+    # Stagnation detection
+    improvement_threshold: float = 5.0  # 5% CQS improvement required
+    consecutive_stagnation_limit: int = 2
+
+    # Cost management
+    max_cost: float = 5.00
+    auto_approve_under: float | None = None
+    warn_on_budget_exceeded: bool = True
+    abort_on_budget_exceeded: bool = False
+
+    # Storage
+    save_tier_results: bool = True
+    storage_path: str = ".empathy/progressive_runs"
+
+    def get_max_attempts(self, tier: Tier) -> int:
+        """Get maximum attempts for a specific tier.
+
+        Args:
+            tier: The tier to query
+
+        Returns:
+            Maximum number of attempts allowed
+        """
+        if tier == Tier.CHEAP:
+            return self.cheap_max_attempts
+        elif tier == Tier.CAPABLE:
+            return self.capable_max_attempts
+        else:  # PREMIUM
+            return self.premium_max_attempts
+
+    def get_min_attempts(self, tier: Tier) -> int:
+        """Get minimum attempts for a specific tier.
+
+        Args:
+            tier: The tier to query
+
+        Returns:
+            Minimum number of attempts required
+        """
+        if tier == Tier.CHEAP:
+            return self.cheap_min_attempts
+        elif tier == Tier.CAPABLE:
+            return self.capable_min_attempts
+        else:  # PREMIUM
+            return 1  # Premium always gets exactly 1 attempt
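
For orientation, here is a minimal sketch of how the data structures in the hunk above combine in one cheap-tier evaluation. It is illustrative only and not part of the package diff: the import path empathy_os.workflows.progressive.core is an assumption (the wheel ships this module as "core 2.py"), and the input numbers are invented.

# Illustrative sketch, not part of the diff above. Assumes the classes shown
# are importable as empathy_os.workflows.progressive.core (unconfirmed here,
# since the wheel ships the file as "core 2.py").
from empathy_os.workflows.progressive.core import EscalationConfig, FailureAnalysis, Tier

# Quality signals from a hypothetical cheap-tier test-generation attempt.
analysis = FailureAnalysis(
    test_pass_rate=0.65,    # 65% of generated tests passed
    coverage_percent=55.0,  # below the 60% escalation floor
    assertion_depth=3.0,    # ~3 assertions per test
    confidence_score=0.70,
)

# CQS = 0.40*65 + 0.25*55 + 0.20*30 + 0.15*70 = 56.25 (no syntax-error penalty)
print(analysis.calculate_quality_score())   # ~56.25
print(analysis.should_escalate)             # True (CQS < 70 and coverage < 60)
print(analysis.failure_severity)            # HIGH

# Default retry budget: up to 3 attempts at the cheap tier before moving up.
config = EscalationConfig(enabled=True)
print(config.get_max_attempts(Tier.CHEAP))  # 3

The decision logic that consumes these signals is not shown in this hunk; it lives in the other progressive modules added in this release (orchestrator 2.py, reports 2.py) listed in the file table above.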