npm - genoma-evolution - Versions diffs - 1.0.0 - Mend

genoma-evolution 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (445)

package/tests/collectors/test_hermes_collector.py ADDED

@@ -0,0 +1,167 @@
+"""Tests for HermesCollector — Phase 2 verification."""
+import pytest
+from backend.promethean.models import TraceRecord, CanonicalRun
+from backend.collectors.hermes_collector import HermesCollector
+@pytest.fixture
+def collector():
+    """Fixture for HermesCollector."""
+    return HermesCollector()  # built with no constructor arguments
+@pytest.fixture
+def sample_trace():
+    """Sample TraceRecord for testing."""
+    return TraceRecord(
+        agent="hermes",
+        agent_version="2.1.143",
+        timestamp="2026-05-19T14:32:00Z",
+        task="Implement user authentication",
+        outcome="success",  # success case, paired with error_signature=None
+        error_signature=None,
+        context={"skill_name": "auth-middleware"},  # compared verbatim in test_context_preservation
+        trace_id="hermes-001"  # the tests expect this value as CanonicalRun.run_id
+    )
+@pytest.fixture
+def failed_trace():
+    """TraceRecord with error."""
+    return TraceRecord(
+        agent="hermes",
+        agent_version="2.1.143",
+        timestamp="2026-05-19T14:32:00Z",
+        task="Test task",
+        outcome="failure",
+        error_signature="TypeError: Cannot read property 'foo' of undefined",  # expected under errors[0]["signature"]
+        resolution="Rolled back changes",  # expected on canonical.resolution
+        trace_id="hermes-002"  # no context argument is passed for this record
+    )
+class TestHermesCollectorBasic:
+    """Basic HermesCollector functionality tests."""
+    def test_collector_instantiation(self, collector):
+        """Collector should instantiate without error."""
+        assert collector is not None
+        assert collector.VERSION == "0.1.0"  # pins the exact version string
+        assert collector.AGENT_NAME == "hermes"
+    def test_collect_from_trace_success(self, collector, sample_trace):
+        """Convert successful TraceRecord to CanonicalRun."""
+        canonical = collector.collect_from_trace(sample_trace)
+        assert isinstance(canonical, CanonicalRun)
+        assert canonical.run_id == "hermes-001"  # equals the fixture's trace_id
+        assert canonical.agent_name == "hermes"  # equals the fixture's agent
+        assert canonical.collector == "hermes-trace-ingestor"  # this string does not appear in the fixture
+        assert canonical.outcome == "success"
+        assert canonical.task_name == "Implement user authentication"  # equals the fixture's task
+        assert canonical.started_at == "2026-05-19T14:32:00Z"  # equals the fixture's timestamp
+        assert canonical.provider == "hermes"  # same string as the fixture's agent; mapping source not shown here
+    def test_collect_from_trace_with_error(self, collector, failed_trace):
+        """Convert failed TraceRecord with error_signature."""
+        canonical = collector.collect_from_trace(failed_trace)
+        assert canonical.outcome == "failure"
+        assert len(canonical.errors) == 1  # exactly one error entry expected
+        assert canonical.errors[0]["signature"] == "TypeError: Cannot read property 'foo' of undefined"
+        assert canonical.resolution == "Rolled back changes"
+    def test_collect_from_trace_no_error(self, collector, sample_trace):
+        """TraceRecord without error_signature should have empty errors list."""
+        canonical = collector.collect_from_trace(sample_trace)
+        assert canonical.errors == []  # the fixture sets error_signature=None
+    def test_collect_batch(self, collector, sample_trace, failed_trace):
+        """Batch conversion should work."""
+        traces = [sample_trace, failed_trace]
+        canonicals = collector.collect_batch(traces)
+        assert len(canonicals) == 2
+        assert canonicals[0].run_id == "hermes-001"  # results are checked in input order
+        assert canonicals[1].run_id == "hermes-002"
+        assert canonicals[0].outcome == "success"
+        assert canonicals[1].outcome == "failure"
+class TestCanonicalRunSerialization:
+    """CanonicalRun serialization tests."""
+    def test_to_dict(self, collector, sample_trace):
+        """CanonicalRun.to_dict() should produce valid dict."""
+        canonical = collector.collect_from_trace(sample_trace)
+        data = canonical.to_dict()
+        assert isinstance(data, dict)
+        assert data["run_id"] == "hermes-001"
+        assert data["agent_name"] == "hermes"
+        assert "collector" in data  # presence only; the value is not checked
+        assert "started_at" in data  # presence only; the value is not checked
+    def test_to_json(self, collector, sample_trace):
+        """CanonicalRun.to_json() should produce valid JSON."""
+        import json  # imported locally inside this test
+        canonical = collector.collect_from_trace(sample_trace)
+        json_str = canonical.to_json()
+        # Should be valid JSON
+        parsed = json.loads(json_str)  # fails the test if the string is not parseable JSON
+        assert parsed["run_id"] == "hermes-001"
+        assert parsed["agent_name"] == "hermes"
+    def test_from_dict_roundtrip(self, collector, sample_trace):
+        """CanonicalRun should round-trip through dict."""
+        original = collector.collect_from_trace(sample_trace)
+        data = original.to_dict()
+        restored = CanonicalRun.from_dict(data)  # rebuild from the dict form
+        assert restored.run_id == original.run_id  # only these four fields are compared
+        assert restored.agent_name == original.agent_name
+        assert restored.outcome == original.outcome
+        assert restored.task_name == original.task_name
+    def test_context_preservation(self, collector, sample_trace):
+        """TraceRecord context should be preserved in CanonicalRun."""
+        canonical = collector.collect_from_trace(sample_trace)
+        assert canonical.context == {"skill_name": "auth-middleware"}  # same dict the fixture passes as context
+    def test_minimal_trace_conversion(self, collector):
+        """Minimal TraceRecord (no optional fields) should convert."""
+        minimal = TraceRecord(
+            agent="hermes",
+            agent_version="2.0.0",
+            timestamp="2026-05-19T10:00:00Z",
+            task="Minimal task",
+            outcome="unknown",  # expected unchanged on the converted run
+            trace_id="minimal-001"
+        )
+        canonical = collector.collect_from_trace(minimal)
+        assert canonical.run_id == "minimal-001"
+        assert canonical.outcome == "unknown"
+        assert canonical.errors == []  # no error_signature was given
+        assert canonical.context == {}  # no context was given
+        assert canonical.resolution is None  # no resolution was given
+class TestCanonicalRunRequiredFields:
+    """Test that required fields are always present."""
+    def test_required_fields_present(self, collector, sample_trace):
+        """All required fields must be present and non-None."""
+        canonical = collector.collect_from_trace(sample_trace)
+        required = {"run_id", "agent_name", "collector", "started_at", "task_name", "outcome"}  # a set: iteration order is not fixed
+        for field in required:
+            value = getattr(canonical, field)  # raises AttributeError if the attribute is missing
+            assert value is not None, f"Required field '{field}' is None"
+            assert len(str(value)) > 0, f"Required field '{field}' is empty string"  # str() so non-string values can be checked too

package/tests/eval/test_engine.py ADDED

@@ -0,0 +1,234 @@
+"""Tests for evaluation engine — Phase 5 verification."""
+import tempfile
+from pathlib import Path
+import pytest
+from backend.eval.engine import EvaluationEngine
+from backend.promethean.models import CanonicalRun, RunMetrics
+from backend.storage import RunStore
+@pytest.fixture
+def temp_db():
+    """Temporary database for testing."""
+    with tempfile.TemporaryDirectory() as tmpdir:  # directory is removed when the fixture is torn down
+        yield Path(tmpdir) / "test.db"  # yields a path only; no file is created here
+@pytest.fixture
+def store(temp_db):
+    """RunStore instance with temp database."""
+    return RunStore(db_path=temp_db)  # temp_db is a Path inside a temporary directory
+@pytest.fixture
+def engine(store):
+    """EvaluationEngine instance."""
+    return EvaluationEngine(store=store)  # no scorers argument is passed
+@pytest.fixture
+def sample_run(store):
+    """Sample run stored in database."""
+    run = CanonicalRun(
+        run_id="run-001",
+        agent_name="hermes",
+        collector="hermes-trace-ingestor",
+        started_at="2026-05-19T10:00:00Z",
+        task_name="Task",
+        outcome="success",
+        metrics=RunMetrics(input_tokens=1000, output_tokens=500, tool_call_count=2),
+        errors=[],  # explicit empty error list
+    )
+    store.upsert_run(run)  # written to the store before the run is returned
+    return run
+@pytest.fixture
+def failed_run(store):
+    """Failed run stored in database."""
+    run = CanonicalRun(
+        run_id="run-002",
+        agent_name="hermes",
+        collector="hermes-trace-ingestor",
+        started_at="2026-05-19T11:00:00Z",
+        task_name="Task",
+        outcome="failure",
+        metrics=RunMetrics(input_tokens=2000, output_tokens=800, tool_call_count=0),
+        errors=[{"signature": "Error", "message": "Failed"}],  # a single error entry
+    )
+    store.upsert_run(run)  # written to the store before the run is returned
+    return run
+class TestEvaluationEngine:
+    """EvaluationEngine tests."""
+    def test_engine_instantiation(self, engine):
+        """Engine should instantiate with default scorers."""
+        assert engine is not None
+        assert len(engine.scorers) == 5  # 5 default scorers
+    def test_evaluate_single_run(self, engine, sample_run):
+        """evaluate() should return scores from applicable scorers."""
+        scores = engine.evaluate(sample_run)
+        assert len(scores) > 0  # At least some scorers apply
+        assert all(hasattr(s, "score") for s in scores)
+        assert all(hasattr(s, "passed") for s in scores)
+        assert all(0.0 <= s.score <= 1.0 for s in scores)  # every score must lie in [0, 1]
+    def test_evaluate_batch(self, engine, sample_run, failed_run):
+        """evaluate_batch() should process multiple runs."""
+        runs = [sample_run, failed_run]
+        result = engine.evaluate_batch(runs)  # the result is indexed by string keys below
+        assert result["total"] == 2
+        assert result["evaluated"] == 2
+        assert result["errors"] == 0
+    def test_get_aggregate_score(self, engine, sample_run):
+        """get_aggregate_score() should return weighted average."""
+        agg = engine.get_aggregate_score(sample_run)
+        assert 0.0 <= agg <= 1.0
+        # Success run should score > 0.5
+        assert agg > 0.5
+    def test_aggregate_score_consistency(self, engine, sample_run):
+        """Aggregate score should be consistent."""
+        agg1 = engine.get_aggregate_score(sample_run)
+        agg2 = engine.get_aggregate_score(sample_run)
+        assert agg1 == agg2  # exact equality, no tolerance
+    def test_detect_regression_improvement(self, engine, store):
+        """detect_regression() should detect improvement."""
+        baseline = CanonicalRun(
+            run_id="baseline",
+            agent_name="hermes",
+            collector="hermes-trace-ingestor",
+            started_at="2026-05-19T10:00:00Z",
+            task_name="Task",
+            outcome="partial",  # evolved run uses "success"; the test expects delta > 0
+        )
+        evolved = CanonicalRun(
+            run_id="evolved",
+            agent_name="hermes",
+            collector="hermes-trace-ingestor",
+            started_at="2026-05-19T11:00:00Z",
+            task_name="Task",
+            outcome="success",
+        )
+        store.upsert_run(baseline)  # detect_regression below receives only the run ids
+        store.upsert_run(evolved)
+        result = engine.detect_regression("baseline", "evolved", threshold=0.05)
+        assert result["improvement"] is True
+        assert result["regression"] is False
+        assert result["delta"] > 0
+    def test_detect_regression_failure(self, engine, store):
+        """detect_regression() should detect regression."""
+        baseline = CanonicalRun(
+            run_id="baseline",
+            agent_name="hermes",
+            collector="hermes-trace-ingestor",
+            started_at="2026-05-19T10:00:00Z",
+            task_name="Task",
+            outcome="success",
+        )
+        evolved = CanonicalRun(
+            run_id="evolved",
+            agent_name="hermes",
+            collector="hermes-trace-ingestor",
+            started_at="2026-05-19T11:00:00Z",
+            task_name="Task",
+            outcome="failure",  # baseline uses "success"; the test expects delta < 0
+        )
+        store.upsert_run(baseline)  # detect_regression below receives only the run ids
+        store.upsert_run(evolved)
+        result = engine.detect_regression("baseline", "evolved", threshold=0.05)
+        assert result["regression"] is True
+        assert result["improvement"] is False
+        assert result["delta"] < 0
+    def test_detect_regression_neutral(self, engine, store):
+        """detect_regression() should detect neutral change."""
+        baseline = CanonicalRun(
+            run_id="baseline",
+            agent_name="hermes",
+            collector="hermes-trace-ingestor",
+            started_at="2026-05-19T10:00:00Z",
+            task_name="Task",
+            outcome="success",
+        )
+        evolved = CanonicalRun(
+            run_id="evolved",
+            agent_name="hermes",
+            collector="hermes-trace-ingestor",
+            started_at="2026-05-19T11:00:00Z",
+            task_name="Task",
+            outcome="success",  # same outcome as the baseline run
+        )
+        store.upsert_run(baseline)  # detect_regression below receives only the run ids
+        store.upsert_run(evolved)
+        result = engine.detect_regression("baseline", "evolved", threshold=0.05)
+        assert result["neutral"] is True
+        assert result["improvement"] is False
+        assert result["regression"] is False
+    def test_detect_regression_missing_run(self, engine):
+        """detect_regression() should handle missing runs."""
+        result = engine.detect_regression("nonexistent-1", "nonexistent-2")  # no runs are stored in this test
+        assert "error" in result  # the test expects a returned result carrying an error key
+        assert result["baseline_found"] is False
+        assert result["evolved_found"] is False
+class TestEngineWithCustomScorers:
+    """Test engine with custom scorer configurations."""
+    def test_custom_scorers(self, store):
+        """Engine should work with custom scorer list."""
+        from backend.eval.scorers import OutcomeScorer  # imported locally inside this test
+        custom_scorers = [OutcomeScorer()]
+        engine = EvaluationEngine(store=store, scorers=custom_scorers)  # explicit scorer list
+        run = CanonicalRun(
+            run_id="test",
+            agent_name="hermes",
+            collector="hermes-trace-ingestor",
+            started_at="2026-05-19T10:00:00Z",
+            task_name="Task",
+            outcome="success",
+        )
+        scores = engine.evaluate(run)  # this run is never upserted into the store
+        assert len(scores) == 1  # one scorer configured, one score expected
+        assert scores[0].scorer == "outcome"
+    def test_empty_scorers(self, store):
+        """Engine with empty scorers should return empty list."""
+        engine = EvaluationEngine(store=store, scorers=[])  # explicit empty scorer list
+        run = CanonicalRun(
+            run_id="test",
+            agent_name="hermes",
+            collector="hermes-trace-ingestor",
+            started_at="2026-05-19T10:00:00Z",
+            task_name="Task",
+            outcome="success",
+        )
+        scores = engine.evaluate(run)  # this run is never upserted into the store
+        assert len(scores) == 0

package/tests/eval/test_scorers.py ADDED

@@ -0,0 +1,249 @@
+"""Tests for evaluation scorers — Phase 5 verification."""
+import pytest
+from backend.eval.scorers import (
+    OutcomeScorer,
+    ToolEfficiencyScorer,
+    TokenCostScorer,
+    ErrorRecoveryScorer,
+)
+from backend.promethean.models import CanonicalRun, ToolCallRecord, RunMetrics
+@pytest.fixture
+def success_run():
+    """Successful run with no errors."""
+    return CanonicalRun(
+        run_id="run-success",
+        agent_name="hermes",
+        collector="hermes-trace-ingestor",
+        started_at="2026-05-19T10:00:00Z",
+        task_name="Task",
+        outcome="success",  # no tool_calls or metrics arguments are passed
+        errors=[],
+    )
+@pytest.fixture
+def failure_run():
+    """Failed run with error."""
+    return CanonicalRun(
+        run_id="run-failure",
+        agent_name="hermes",
+        collector="hermes-trace-ingestor",
+        started_at="2026-05-19T10:00:00Z",
+        task_name="Task",
+        outcome="failure",
+        errors=[{"signature": "TypeError", "message": "Type error"}],  # a single error entry
+    )
+@pytest.fixture
+def run_with_tools():
+    """Run with tool calls."""
+    return CanonicalRun(
+        run_id="run-tools",
+        agent_name="claude-code",
+        collector="claude-code-session-collector",
+        started_at="2026-05-19T10:00:00Z",
+        task_name="Task",
+        outcome="success",
+        tool_calls=[
+            ToolCallRecord(id="1", name="Read"),
+            ToolCallRecord(id="2", name="Read"),  # repeats the name "Read"
+            ToolCallRecord(id="3", name="Edit"),  # three calls in total, two distinct names
+        ],
+    )
+@pytest.fixture
+def run_with_metrics():
+    """Run with token metrics."""
+    return CanonicalRun(
+        run_id="run-metrics",
+        agent_name="hermes",
+        collector="hermes-trace-ingestor",
+        started_at="2026-05-19T10:00:00Z",
+        task_name="Task",
+        outcome="success",
+        metrics=RunMetrics(
+            input_tokens=5000,  # test_high_tokens overwrites this value
+            output_tokens=2000,  # test_high_tokens overwrites this value
+            cache_tokens=500,
+            tool_call_count=0,  # no tool_calls list is passed either
+        ),
+    )
+class TestOutcomeScorer:
+    """OutcomeScorer tests."""
+    def test_success_outcome(self, success_run):
+        """Success outcome should score 1.0."""
+        scorer = OutcomeScorer()
+        score = scorer.score(success_run)
+        assert score.score == 1.0
+        assert score.passed is True
+        assert score.scorer == "outcome"  # name recorded on the returned score
+    def test_failure_outcome(self, failure_run):
+        """Failure outcome should score 0.0."""
+        scorer = OutcomeScorer()
+        score = scorer.score(failure_run)
+        assert score.score == 0.0
+        assert score.passed is False
+    def test_partial_outcome(self, success_run):
+        """Partial outcome should score 0.5."""
+        success_run.outcome = "partial"  # changes the fixture instance in place
+        scorer = OutcomeScorer()
+        score = scorer.score(success_run)
+        assert score.score == 0.5
+        assert score.passed is False  # a 0.5 score is expected not to pass
+    def test_unknown_outcome(self, success_run):
+        """Unknown outcome should score 0.3."""
+        success_run.outcome = "unknown"  # changes the fixture instance in place
+        scorer = OutcomeScorer()
+        score = scorer.score(success_run)
+        assert score.score == 0.3  # exact float comparison
+        assert score.passed is False
+    def test_applies_to_all(self, success_run):
+        """OutcomeScorer applies to all runs."""
+        scorer = OutcomeScorer()
+        assert scorer.applies_to(success_run) is True  # only this one run is checked
+class TestToolEfficiencyScorer:
+    """ToolEfficiencyScorer tests."""
+    def test_no_tools(self, success_run):
+        """No tools = 1.0 score (efficient)."""
+        scorer = ToolEfficiencyScorer()
+        score = scorer.score(success_run)  # the fixture passes no tool_calls
+        assert score.score == 1.0
+        assert score.passed is True
+    def test_efficient_tools(self, run_with_tools):
+        """2 unique / 3 total = 0.67 ratio (pass)."""
+        scorer = ToolEfficiencyScorer()
+        score = scorer.score(run_with_tools)
+        assert score.score > 0.3  # Passes threshold
+        assert score.passed is True
+        assert score.details["unique_tools"] == 2  # "Read" and "Edit"
+        assert score.details["total_calls"] == 3  # three ToolCallRecord entries in the fixture
+    def test_applies_only_with_tools(self, success_run, run_with_tools):
+        """ToolEfficiencyScorer applies only with tool calls."""
+        scorer = ToolEfficiencyScorer()
+        assert scorer.applies_to(success_run) is False  # the fixture passes no tool_calls
+        assert scorer.applies_to(run_with_tools) is True
+class TestTokenCostScorer:
+    """TokenCostScorer tests."""
+    def test_low_tokens(self, run_with_metrics):
+        """Low token count scores high."""
+        scorer = TokenCostScorer()
+        score = scorer.score(run_with_metrics)  # fixture values: 5000 input, 2000 output, 500 cache
+        assert score.score > 0.5
+        assert score.passed is True
+    def test_high_tokens(self, run_with_metrics):
+        """High token count scores low."""
+        run_with_metrics.metrics.input_tokens = 40000  # overwrites the fixture value in place
+        run_with_metrics.metrics.output_tokens = 15000  # overwrites the fixture value in place
+        scorer = TokenCostScorer()
+        score = scorer.score(run_with_metrics)
+        assert score.score < 0.3
+        assert score.passed is False
+    def test_no_metrics(self, success_run):
+        """No metrics = 1.0 score."""
+        scorer = TokenCostScorer()
+        score = scorer.score(success_run)  # the fixture passes no metrics
+        assert score.score == 1.0
+        assert score.passed is True
+    def test_applies_only_with_metrics(self, success_run, run_with_metrics):
+        """TokenCostScorer applies only with metrics."""
+        scorer = TokenCostScorer()
+        assert scorer.applies_to(success_run) is False  # the fixture passes no metrics
+        assert scorer.applies_to(run_with_metrics) is True
+class TestErrorRecoveryScorer:
+    """ErrorRecoveryScorer tests."""
+    def test_success_no_errors(self, success_run):
+        """Success with no errors = 1.0."""
+        scorer = ErrorRecoveryScorer()
+        score = scorer.score(success_run)
+        assert score.score == 1.0
+        assert score.passed is True
+    def test_success_with_errors(self, success_run):
+        """Success with errors = 0.8."""
+        success_run.errors = [{"signature": "Warning"}]  # outcome stays "success"
+        scorer = ErrorRecoveryScorer()
+        score = scorer.score(success_run)
+        assert score.score == 0.8  # exact float comparison
+        assert score.passed is True
+    def test_failure(self, failure_run):
+        """Failure = 0.0."""
+        scorer = ErrorRecoveryScorer()
+        score = scorer.score(failure_run)  # fixture: outcome "failure" with one error entry
+        assert score.score == 0.0
+        assert score.passed is False
+    def test_applies_to_all(self, success_run):
+        """ErrorRecoveryScorer applies to all runs."""
+        scorer = ErrorRecoveryScorer()
+        assert scorer.applies_to(success_run) is True  # only this one run is checked
+class TestScorerDeterminism:
+    """Verify scorers are deterministic."""
+    def test_outcome_scorer_determinism(self, success_run):
+        """Same run should score identically."""
+        scorer = OutcomeScorer()
+        score1 = scorer.score(success_run)
+        score2 = scorer.score(success_run)  # second call with the same scorer and run
+        assert score1.score == score2.score  # only score and passed are compared
+        assert score1.passed == score2.passed
+    def test_tool_scorer_determinism(self, run_with_tools):
+        """Same run should score identically."""
+        scorer = ToolEfficiencyScorer()
+        score1 = scorer.score(run_with_tools)
+        score2 = scorer.score(run_with_tools)  # second call with the same scorer and run
+        assert score1.score == score2.score  # only score and passed are compared
+        assert score1.passed == score2.passed
+    def test_token_scorer_determinism(self, run_with_metrics):
+        """Same run should score identically."""
+        scorer = TokenCostScorer()
+        score1 = scorer.score(run_with_metrics)
+        score2 = scorer.score(run_with_metrics)  # second call with the same scorer and run
+        assert score1.score == score2.score  # only score and passed are compared
+        assert score1.passed == score2.passed

package/tests/storage/__init__.py ADDED

File without changes