npm - genoma-evolution - Versions diffs - 1.0.0 - Mend

genoma-evolution 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (445) hide show

package/backend/eval/engine.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""Evaluation engine orchestrating multiple scorers."""
+from typing import Optional
+from backend.eval.scorers import (
+    EvalScore,
+    OutcomeScorer,
+    ToolEfficiencyScorer,
+    TokenCostScorer,
+    ErrorRecoveryScorer,
+    DeltaScorer,
+)
+from backend.promethean.models import CanonicalRun
+from backend.storage import RunStore
+class EvaluationEngine:
+    """Evaluate canonical runs using composable scorers."""
+    DEFAULT_SCORERS = [
+        OutcomeScorer(),
+        ToolEfficiencyScorer(),
+        TokenCostScorer(),
+        ErrorRecoveryScorer(),
+        DeltaScorer(),
+    ]
+    def __init__(self, store: Optional[RunStore] = None, scorers: Optional[list] = None):
+        self.store = store or RunStore()
+        self.scorers = scorers if scorers is not None else self.DEFAULT_SCORERS
+    def evaluate(self, run: CanonicalRun) -> list[EvalScore]:
+        """Run applicable scorers on a run. Return list of scores."""
+        scores = []
+        for scorer in self.scorers:
+            if scorer.applies_to(run):
+                score = scorer.score(run)
+                if score:  # DeltaScorer may return None
+                    scores.append(score)
+                    # Optionally save to storage
+                    if self.store and hasattr(run, "run_id"):
+                        try:
+                            self._save_score(run.run_id, score)
+                        except Exception:
+                            pass
+        return scores
+    def evaluate_batch(self, runs: list[CanonicalRun]) -> dict:
+        """Evaluate batch of runs. Return {total, evaluated, errors}."""
+        result = {"total": len(runs), "evaluated": 0, "errors": 0}
+        for run in runs:
+            try:
+                self.evaluate(run)
+                result["evaluated"] += 1
+            except Exception:
+                result["errors"] += 1
+        return result
+    def get_aggregate_score(self, run: CanonicalRun) -> float:
+        """Weighted average of all applicable scores."""
+        scores = self.evaluate(run)
+        if not scores:
+            return 0.5  # Default if no scorers apply
+        # Equal weighting for now; can be customized per scorer
+        total = sum(s.score for s in scores)
+        return total / len(scores)
+    def detect_regression(
+        self, baseline_run_id: str, evolved_run_id: str, threshold: float = 0.05
+    ) -> dict:
+        """Compare baseline and evolved runs. Return {delta, regression, improvement, neutral}."""
+        baseline = self.store.get_run(baseline_run_id)
+        evolved = self.store.get_run(evolved_run_id)
+        if not baseline or not evolved:
+            return {
+                "error": "One or both runs not found",
+                "baseline_found": baseline is not None,
+                "evolved_found": evolved is not None,
+            }
+        baseline_score = self.get_aggregate_score(baseline)
+        evolved_score = self.get_aggregate_score(evolved)
+        delta = evolved_score - baseline_score
+        return {
+            "baseline_run_id": baseline_run_id,
+            "evolved_run_id": evolved_run_id,
+            "baseline_score": round(baseline_score, 3),
+            "evolved_score": round(evolved_score, 3),
+            "delta": round(delta, 3),
+            "threshold": threshold,
+            "regression": delta < -threshold,
+            "improvement": delta > threshold,
+            "neutral": abs(delta) <= threshold,
+        }
+    def _save_score(self, run_id: str, score: EvalScore):
+        """Save evaluation score to storage (optional)."""
+        conn = self.store.connect()
+        try:
+            conn.execute(
+                """INSERT INTO eval_scores (run_id, scorer, score, passed, details)
+                   VALUES (?, ?, ?, ?, ?)""",
+                (
+                    run_id,
+                    score.scorer,
+                    score.score,
+                    1 if score.passed else 0,
+                    str(score.details),
+                ),
+            )
+            conn.commit()
+        finally:
+            self.store.close(conn)

package/backend/eval/scorers.py ADDED Viewed

@@ -0,0 +1,201 @@
+"""Evaluation scorers for CanonicalRun instances."""
+from dataclasses import dataclass, field
+from typing import Optional
+from backend.promethean.models import CanonicalRun
+@dataclass
+class EvalScore:
+    """Result of applying a scorer to a run."""
+    scorer: str  # the producing scorer's ``name`` attribute
+    score: float  # 0.0 to 1.0
+    passed: bool  # scorer-specific pass/fail verdict
+    details: dict = field(default_factory=dict)  # supporting data; a fresh dict per instance
+class OutcomeScorer:
+    """Score based on run outcome."""
+    name = "outcome"
+    def score(self, run: CanonicalRun) -> EvalScore:
+        """Map outcome to score: success=1.0, partial=0.5, failure=0.0, unknown=0.3."""
+        outcome_map = {
+            "success": 1.0,
+            "partial": 0.5,
+            "failure": 0.0,
+            "unknown": 0.3,
+        }
+        score = outcome_map.get(run.outcome, 0.3)
+        return EvalScore(
+            scorer=self.name,
+            score=score,
+            passed=score > 0.5,
+            details={"outcome": run.outcome},
+        )
+    def applies_to(self, run: CanonicalRun) -> bool:
+        """Applies to all runs."""
+        return True
+class ToolEfficiencyScorer:
+    """Score based on tool call efficiency (unique_tools / total_calls)."""
+    name = "tool_efficiency"
+    def score(self, run: CanonicalRun) -> EvalScore:
+        """Calculate tool efficiency ratio. Pass if > 0.3."""
+        if not run.tool_calls:
+            return EvalScore(
+                scorer=self.name,
+                score=1.0,
+                passed=True,
+                details={"reason": "no_tools_used"},
+            )
+        total = len(run.tool_calls)
+        unique = len(set(tc.name for tc in run.tool_calls))
+        ratio = unique / total if total > 0 else 0.0
+        return EvalScore(
+            scorer=self.name,
+            score=ratio,
+            passed=ratio > 0.3,
+            details={
+                "unique_tools": unique,
+                "total_calls": total,
+                "efficiency_ratio": round(ratio, 2),
+            },
+        )
+    def applies_to(self, run: CanonicalRun) -> bool:
+        """Applies to runs with tool calls."""
+        return len(run.tool_calls) > 0
+class TokenCostScorer:
+    """Score based on token usage: lower is better (up to 50k tokens)."""
+    name = "token_cost"
+    def score(self, run: CanonicalRun) -> EvalScore:
+        """Score = max(0, 1 - tokens/50000). Pass if < 50k."""
+        if not run.metrics or not run.metrics.input_tokens:
+            return EvalScore(
+                scorer=self.name,
+                score=1.0,
+                passed=True,
+                details={"reason": "no_metrics"},
+            )
+        total_tokens = (run.metrics.input_tokens or 0) + (run.metrics.output_tokens or 0)
+        threshold = 50000
+        score = max(0.0, 1.0 - (total_tokens / threshold))
+        return EvalScore(
+            scorer=self.name,
+            score=score,
+            passed=total_tokens < threshold,
+            details={
+                "input_tokens": run.metrics.input_tokens,
+                "output_tokens": run.metrics.output_tokens,
+                "total_tokens": total_tokens,
+                "threshold": threshold,
+            },
+        )
+    def applies_to(self, run: CanonicalRun) -> bool:
+        """Applies to runs with metrics."""
+        return run.metrics is not None
+class ErrorRecoveryScorer:
+    """Score based on error handling: success with no errors = 1.0, success with errors = 0.8, failure = 0.0."""
+    name = "error_recovery"
+    def score(self, run: CanonicalRun) -> EvalScore:
+        """Score based on outcome and error presence."""
+        if run.outcome == "success":
+            if not run.errors:
+                score = 1.0
+            else:
+                score = 0.8
+            passed = True
+        elif run.outcome == "partial":
+            score = 0.5
+            passed = False
+        else:  # failure or unknown
+            score = 0.0
+            passed = False
+        return EvalScore(
+            scorer=self.name,
+            score=score,
+            passed=passed,
+            details={
+                "outcome": run.outcome,
+                "error_count": len(run.errors),
+                "has_errors": len(run.errors) > 0,
+            },
+        )
+    def applies_to(self, run: CanonicalRun) -> bool:
+        """Applies to all runs."""
+        return True
+class DeltaScorer:
+    """Score based on delta validation (Hermes-specific).
+    Only applies to Hermes runs with context.skill_name.
+    Integrates with existing DeltaValidator from promethean module.
+    """
+    name = "delta"
+    def score(self, run: CanonicalRun) -> Optional[EvalScore]:
+        """Run delta validation if applicable. Return None if not applicable."""
+        if not self.applies_to(run):
+            return None
+        # Try to import DeltaValidator
+        try:
+            from backend.promethean.delta_validator import get_validator
+            validator = get_validator()
+            skill_name = run.context.get("skill_name")
+            # Get baseline from context or infer
+            baseline = run.context.get("baseline_version")
+            if not baseline:
+                baseline = "unknown"
+            # Run validation
+            result = validator.validate(skill_name, baseline=baseline)
+            # Map validation result to score
+            passed = result.get("passed", False)
+            score = 1.0 if passed else 0.0
+            return EvalScore(
+                scorer=self.name,
+                score=score,
+                passed=passed,
+                details=result,
+            )
+        except Exception:
+            # DeltaValidator not available or error occurred
+            return None
+    def applies_to(self, run: CanonicalRun) -> bool:
+        """Applies to Hermes runs with skill_name in context."""
+        return (
+            run.agent_name == "hermes"
+            and run.context
+            and "skill_name" in run.context
+        )

package/backend/generate_dataset.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""
+Dataset Generator — Generates synthetic validation examples for skills.
+Usage:
+    python generate_dataset.py --skill code-review --count 10 --output ~/.hermes/datasets/code-review/train.jsonl
+"""
+import argparse
+import json
+import sys
+from pathlib import Path
+from openai import OpenAI
+OLLAMA_BASE = "http://localhost:11434/v1"  # base URL passed to the OpenAI client (local endpoint)
+OLLAMA_MODEL = "gemma4:31b-cloud"  # model name sent with every completion request
+client = OpenAI(base_url=OLLAMA_BASE, api_key="ollama")  # created at import time; api_key presumably a placeholder — TODO confirm
+SKILL_PROMPTS = {  # skill name -> prompt template; "{count}" is filled via str.format, "{{ }}" are literal braces
+    "code-review": """Para la skill 'code-review', generá exactamente {count} ejemplos de revisión de código.
+Cada ejemplo debe ser un objeto JSON con:
+- "input": código con bugs/problemas (Python, TypeScript, o Go)
+- "expected": la revisión esperada (bugs específicos que deberían detectarse)
+Cubrí estos casos:
+1. Python: SQL injection, type hints faltantes, async mal usado, raw dicts en vez de Pydantic
+2. TypeScript/React: useEffect sin deps, componentes no memoizados, keys faltantes en maps
+3. Go: error handling ignorado, goroutines sin sync, nil pointer dereference
+4. General: dead code, secretos hardcodeados, logs con PII
+IMPORTANTE: El expected NO debe ser el código corregido, sino una LISTA de bugs que deben detectarse.
+OUTPUT: Array JSON de objetos {{"input": "...", "expected": "..."}}. Sin markdown fences.
+Generá EXACTAMENTE {count} ejemplos. Comenzá YA:""",
+}
+def generate_dataset(skill: str, count: int, output_path: str):
+    prompt = SKILL_PROMPTS.get(skill, SKILL_PROMPTS["code-review"]).format(count=count)
+    print(f"🎯 Generating {count} examples for '{skill}'...", flush=True)
+    resp = client.chat.completions.create(
+        model=OLLAMA_MODEL,
+        messages=[
+            {"role": "system", "content": "Sos un experto en code review. Generás datasets de validación en JSON. Output: solo JSON válido, sin explicaciones."},
+            {"role": "user", "content": prompt}
+        ],
+        temperature=0.7,
+        max_tokens=4000,
+    )
+    raw = resp.choices[0].message.content or ""
+    raw = raw.strip()
+    if raw.startswith("```"):
+        raw = raw.split("```")[1]
+        if raw.startswith("json"):
+            raw = raw[4:]
+    raw = raw.strip()
+    try:
+        examples = json.loads(raw)
+    except json.JSONDecodeError:
+        print(f"❌ Failed to parse JSON. Raw output:", file=sys.stderr)
+        print(raw[:500], file=sys.stderr)
+        sys.exit(1)
+    if not isinstance(examples, list):
+        examples = [examples]
+    # Write to file
+    out = Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with open(out, "w") as f:
+        for ex in examples:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+    print(f"✅ Saved {len(examples)} examples to {out}")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Generate synthetic validation dataset")
+    parser.add_argument("--skill", required=True, help="Skill name")
+    parser.add_argument("--count", type=int, default=10, help="Number of examples")
+    parser.add_argument("--output", required=True, help="Output .jsonl file")
+    args = parser.parse_args()
+    generate_dataset(args.skill, args.count, args.output)

package/backend/job_tracker.py ADDED Viewed

@@ -0,0 +1,232 @@
+"""Evolution job tracking — persistent state for active and completed jobs."""
+import json
+import re
+import uuid
+import asyncio
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from dataclasses import dataclass, field, asdict
+from typing import Optional
+LOG_DIR = Path.home() / ".hermes" / "evolution-logs"  # directory holding one "<job id>.json" file per job
+class JobStatus(str, Enum):
+    """States an evolution job can be in; each member is also a ``str`` equal to its value."""
+    QUEUED = "queued"  # initial state of a newly created job
+    LOADING_SKILL = "loading_skill"
+    BUILDING_DATASET = "building_dataset"
+    VALIDATING = "validating"
+    CONFIGURING = "configuring"
+    OPTIMIZING = "optimizing"
+    EVALUATING = "evaluating"
+    SAVING = "saving"
+    COMPLETED = "completed"  # terminal: finished successfully
+    FAILED = "failed"  # terminal: finished with an error
+# Working phases in execution order (for progress calculation).
+# QUEUED, COMPLETED and FAILED are deliberately absent: they are not phases.
+PHASE_ORDER = [
+    JobStatus.LOADING_SKILL,
+    JobStatus.BUILDING_DATASET,
+    JobStatus.VALIDATING,
+    JobStatus.CONFIGURING,
+    JobStatus.OPTIMIZING,
+    JobStatus.EVALUATING,
+    JobStatus.SAVING,
+]
+@dataclass
+class EvolutionJob:
+    id: str
+    skill_name: str
+    status: JobStatus = JobStatus.QUEUED
+    iterations: int = 3
+    current_iteration: int = 0
+    pid: Optional[int] = None
+    started_at: str = ""
+    completed_at: str = ""
+    baseline_score: Optional[float] = None
+    current_best_score: Optional[float] = None
+    evolved_score: Optional[float] = None
+    improvement: Optional[float] = None
+    error: str = ""
+    logs: list[str] = field(default_factory=list)
+    def to_dict(self) -> dict:
+        d = asdict(self)
+        d["status"] = self.status.value
+        return d
+    @property
+    def progress(self) -> float:
+        """0.0 to 1.0 progress estimate."""
+        if self.status == JobStatus.COMPLETED:
+            return 1.0
+        if self.status == JobStatus.FAILED:
+            return 0.0
+        if self.status == JobStatus.QUEUED:
+            return 0.0
+        # Phase-based progress
+        phase_idx = 0
+        try:
+            phase_idx = PHASE_ORDER.index(self.status)
+        except ValueError:
+            pass
+        phase_weight = 1.0 / len(PHASE_ORDER)
+        # Within OPTIMIZING phase, use iteration progress
+        if self.status == JobStatus.OPTIMIZING and self.iterations > 0:
+            iter_progress = self.current_iteration / self.iterations
+            base = phase_idx * phase_weight
+            return base + (iter_progress * phase_weight)
+        return phase_idx * phase_weight
+    def add_log(self, message: str):
+        timestamp = datetime.now().isoformat()[:19]
+        self.logs.append(f"[{timestamp}] {message}")
+        # Keep last 500 lines
+        if len(self.logs) > 500:
+            self.logs = self.logs[-500:]
+    def save_log(self):
+        LOG_DIR.mkdir(parents=True, exist_ok=True)
+        log_file = LOG_DIR / f"{self.id}.json"
+        log_file.write_text(json.dumps(self.to_dict(), indent=2))
+class JobTracker:
+    def __init__(self):
+        self._jobs: dict[str, EvolutionJob] = {}
+        self._processes: dict[str, asyncio.subprocess.Process] = {}
+        LOG_DIR.mkdir(parents=True, exist_ok=True)
+    def create_job(self, skill_name: str, iterations: int) -> EvolutionJob:
+        job_id = f"{skill_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:6]}"
+        job = EvolutionJob(
+            id=job_id,
+            skill_name=skill_name,
+            iterations=iterations,
+            started_at=datetime.now().isoformat(),
+        )
+        self._jobs[job_id] = job
+        return job
+    def get_job(self, job_id: str) -> Optional[EvolutionJob]:
+        return self._jobs.get(job_id)
+    def get_active_jobs(self) -> list[EvolutionJob]:
+        return [
+            j for j in self._jobs.values()
+            if j.status not in (JobStatus.COMPLETED, JobStatus.FAILED)
+        ]
+    def get_all_jobs(self, limit: int = 50) -> list[EvolutionJob]:
+        sorted_jobs = sorted(
+            self._jobs.values(),
+            key=lambda j: j.started_at,
+            reverse=True,
+        )
+        return sorted_jobs[:limit]
+    def set_process(self, job_id: str, process: asyncio.subprocess.Process):
+        self._processes[job_id] = process
+    def get_process(self, job_id: str) -> Optional[asyncio.subprocess.Process]:
+        return self._processes.get(job_id)
+    def cleanup_process(self, job_id: str):
+        self._processes.pop(job_id, None)
+    # ── Log parsing ─────────────────────────────────────────────────
+    # Patterns to detect phase changes from Rich output
+    PHASE_PATTERNS = [
+        (r"Loaded:.*\.md", JobStatus.LOADING_SKILL),
+        (r"Building evaluation dataset", JobStatus.BUILDING_DATASET),
+        (r"Generated.*synthetic examples", JobStatus.BUILDING_DATASET),
+        (r"Mined.*examples from session", JobStatus.BUILDING_DATASET),
+        (r"Loaded golden dataset", JobStatus.BUILDING_DATASET),
+        (r"Validating baseline constraints", JobStatus.VALIDATING),
+        (r"Configuring optimizer", JobStatus.CONFIGURING),
+        (r"Running GEPA optimization", JobStatus.OPTIMIZING),
+        (r"Running MIPROv2", JobStatus.OPTIMIZING),
+        (r"Optimization completed", JobStatus.OPTIMIZING),
+        (r"Validating evolved skill", JobStatus.EVALUATING),
+        (r"Evaluating on holdout", JobStatus.EVALUATING),
+        (r"Evolution Results", JobStatus.SAVING),
+        (r"Saved.*to output", JobStatus.SAVING),
+        (r"✓ Skill evolved", JobStatus.COMPLETED),
+    ]
+    # Pattern to extract scores
+    SCORE_PATTERN = re.compile(r"baseline[=_:\s]+(\d+\.?\d*)|evolved[=_:\s]+(\d+\.?\d*)|score[=_:\s]+(\d+\.?\d*)|improvement[=_:\s]+([+-]?\d+\.?\d*)", re.IGNORECASE)
+    # Pattern to extract iteration number
+    ITER_PATTERN = re.compile(r"iteration[=:\s]+(\d+)|eval[=:\s#]+(\d+)/(\d+)", re.IGNORECASE)
+    def parse_line(self, job: EvolutionJob, line: str):
+        """Parse a log line and update job state."""
+        # Strip ANSI codes
+        clean = re.sub(r"\x1b\[[0-9;]*m", "", line).strip()
+        if not clean:
+            return
+        job.add_log(clean)
+        # Check phase transitions
+        for pattern, phase in self.PHASE_PATTERNS:
+            if re.search(pattern, clean, re.IGNORECASE):
+                if phase != job.status:
+                    job.status = phase
+        # Check for optimization iteration progress
+        iter_match = self.ITER_PATTERN.search(clean)
+        if iter_match:
+            num = iter_match.group(1) or iter_match.group(2)
+            if num:
+                job.current_iteration = int(num)
+            total = iter_match.group(3)
+            if total:
+                job.iterations = int(total)
+        # Check for scores
+        score_match = self.SCORE_PATTERN.search(clean)
+        if score_match:
+            val = score_match.group(1) or score_match.group(2) or score_match.group(3) or score_match.group(4)
+            if val:
+                try:
+                    score = float(val)
+                    if "baseline" in clean.lower():
+                        job.baseline_score = score
+                    elif "evolved" in clean.lower() or "best" in clean.lower():
+                        job.current_best_score = score
+                    elif "improvement" in clean.lower():
+                        job.improvement = score
+                except ValueError:
+                    pass
+        # Check for completion
+        if "evolved_skill.md" in clean and "Saved" in clean:
+            job.status = JobStatus.COMPLETED
+            job.completed_at = datetime.now().isoformat()
+        # Check for failure
+        if "FAILED" in clean or "not found" in clean.lower():
+            if "✗" in clean or "Error" in clean or "error" in clean:
+                job.error = clean
+                job.status = JobStatus.FAILED
+                job.completed_at = datetime.now().isoformat()
+        # Persist periodically
+        if len(job.logs) % 10 == 0:
+            job.save_log()
+# Global tracker instance, created at import time (its constructor creates LOG_DIR if missing)
+tracker = JobTracker()