contextforge_eval-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- context_forge/__init__.py +95 -0
- context_forge/core/__init__.py +55 -0
- context_forge/core/trace.py +369 -0
- context_forge/core/types.py +121 -0
- context_forge/evaluation.py +267 -0
- context_forge/exceptions.py +56 -0
- context_forge/graders/__init__.py +44 -0
- context_forge/graders/base.py +264 -0
- context_forge/graders/deterministic/__init__.py +11 -0
- context_forge/graders/deterministic/memory_corruption.py +130 -0
- context_forge/graders/hybrid.py +190 -0
- context_forge/graders/judges/__init__.py +11 -0
- context_forge/graders/judges/backends/__init__.py +9 -0
- context_forge/graders/judges/backends/ollama.py +173 -0
- context_forge/graders/judges/base.py +158 -0
- context_forge/graders/judges/memory_hygiene_judge.py +332 -0
- context_forge/graders/judges/models.py +113 -0
- context_forge/harness/__init__.py +43 -0
- context_forge/harness/user_simulator/__init__.py +70 -0
- context_forge/harness/user_simulator/adapters/__init__.py +13 -0
- context_forge/harness/user_simulator/adapters/base.py +67 -0
- context_forge/harness/user_simulator/adapters/crewai.py +100 -0
- context_forge/harness/user_simulator/adapters/langgraph.py +157 -0
- context_forge/harness/user_simulator/adapters/pydanticai.py +105 -0
- context_forge/harness/user_simulator/llm/__init__.py +5 -0
- context_forge/harness/user_simulator/llm/ollama.py +119 -0
- context_forge/harness/user_simulator/models.py +103 -0
- context_forge/harness/user_simulator/persona.py +154 -0
- context_forge/harness/user_simulator/runner.py +342 -0
- context_forge/harness/user_simulator/scenario.py +95 -0
- context_forge/harness/user_simulator/simulator.py +307 -0
- context_forge/instrumentation/__init__.py +23 -0
- context_forge/instrumentation/base.py +307 -0
- context_forge/instrumentation/instrumentors/__init__.py +17 -0
- context_forge/instrumentation/instrumentors/langchain.py +671 -0
- context_forge/instrumentation/instrumentors/langgraph.py +534 -0
- context_forge/instrumentation/tracer.py +588 -0
- context_forge/py.typed +0 -0
- contextforge_eval-0.1.0.dist-info/METADATA +420 -0
- contextforge_eval-0.1.0.dist-info/RECORD +43 -0
- contextforge_eval-0.1.0.dist-info/WHEEL +5 -0
- contextforge_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- contextforge_eval-0.1.0.dist-info/top_level.txt +1 -0
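
Before the per-file contents below, a minimal orientation sketch (not part of the package) assembled only from APIs visible in this diff: it assumes a TraceRun named `trace` has already been captured by the instrumentation layer, and reuses the OllamaBackend construction shown in the MemoryHygieneJudge docstring.

# Illustrative sketch, not package code: grade a previously captured trace
# with the LLM judge shipped in this wheel. `trace` is assumed to exist.
from context_forge.graders.judges.backends import OllamaBackend
from context_forge.graders.judges.memory_hygiene_judge import MemoryHygieneJudge

judge = MemoryHygieneJudge(backend=OllamaBackend(model="llama3.2"))
result = judge.grade(trace)

print(result.passed, result.score)
print(result.metadata["llm"]["model_id"])  # reproducibility metadata added by grade()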
context_forge/graders/judges/memory_hygiene_judge.py
@@ -0,0 +1,332 @@
"""Memory Hygiene Judge - LLM-based semantic evaluation.

Layer 2 of the hybrid Memory Hygiene Grader. Uses an LLM to evaluate:
- Did the user provide new facts about themselves?
- Do those facts contradict stored memory?
- Was memory appropriately updated?

These semantic checks require natural language understanding.

Uses Ollama's structured output feature for reliable JSON parsing.
"""

import json
import logging
from typing import Any

from pydantic import ValidationError

from context_forge.core.trace import (
    MemoryReadStep,
    MemoryWriteStep,
    TraceRun,
    UserInputStep,
)
from context_forge.graders.base import Evidence, GraderResult, Severity
from context_forge.graders.judges.base import LLMBackend, LLMJudge
from context_forge.graders.judges.models import MemoryHygieneEvaluation

logger = logging.getLogger(__name__)


MEMORY_HYGIENE_PROMPT_TEMPLATE = '''You are evaluating an AI agent's memory management.

## Task
Analyze whether the agent correctly identified and saved ONLY facts the user explicitly stated.

## Current Memory (what the agent knew at session start)
{memory_state}

## User Messages (what the user said during the session)
{user_messages}

## Memory Changes (field-level differences, old_value -> new_value)
{memory_writes}

## Step-by-Step Evaluation

### Step 1: List user-stated facts
Read the user messages carefully. What concrete facts did the user explicitly state about themselves?
- Only include facts the user ACTUALLY said
- Example: If user says "I have a Tesla Model 3", that's a user fact
- Example: If user says "When should I charge?", that contains NO facts about themselves

### Step 2: Check each memory change
For EACH field in "Memory Changes", ask:
- Did the user explicitly state this information? If yes → correct save
- Did the user NOT mention this at all? If the agent invented it → HALLUCINATION
- Was existing correct data deleted or overwritten incorrectly? → data loss

### Step 3: Check for missed facts
For each user-stated fact from Step 1, was it saved to memory? If not → missed fact

## What IS a hallucination (flag these!)
- Agent saves "user plans to buy solar" but user never mentioned solar → HALLUCINATION
- Agent saves "user prefers morning charging" but user never stated a preference → HALLUCINATION
- Agent saves ANY new semantic content that the user did not explicitly state → HALLUCINATION

## What is NOT a hallucination (ignore these)
- Timestamp/metadata changes (updated_at, created_at, IDs)
- Preserving existing data that was already in memory
- Reformatting user's words (e.g., "12kW" saved as "12000W")

## Critical Rule
If the agent writes NEW information to memory that the user did NOT say, that is a hallucination.
The agent should ONLY save facts the user explicitly stated.

Evaluate the memory management and provide your assessment.'''


class MemoryHygieneJudge(LLMJudge):
    """LLM-based judge for memory hygiene semantic evaluation.

    Evaluates whether the agent correctly identified user facts and
    updated memory appropriately. Catches issues that rule-based
    checks cannot detect:
    - User stated new fact but it wasn't saved
    - Agent saved something user didn't say (hallucination)
    - Contradictions between user statements and memory updates

    Uses Pydantic models for structured output validation.

    Usage:
        from context_forge.graders.judges.backends import OllamaBackend

        judge = MemoryHygieneJudge(backend=OllamaBackend(model="llama3.2"))
        result = judge.grade(trace)

        if not result.passed:
            print("Issues found:")
            for evidence in result.errors:
                print(f"  - {evidence.description}")
    """

    name = "memory_hygiene_judge"
    required_step_types = ["user_input"]

    def _build_prompt(self, trace: TraceRun) -> str:
        """Build the evaluation prompt from trace data.

        Extracts user inputs, memory reads, and memory writes from
        the trace and formats them for LLM evaluation.
        """
        # Extract relevant steps
        user_inputs = [s for s in trace.steps if isinstance(s, UserInputStep)]
        memory_reads = [s for s in trace.steps if isinstance(s, MemoryReadStep)]
        memory_writes = [s for s in trace.steps if isinstance(s, MemoryWriteStep)]

        # Format memory state (from reads)
        if memory_reads:
            memory_state = self._format_memory_state(memory_reads)
        else:
            memory_state = "No memory was read at session start."

        # Format user messages
        if user_inputs:
            user_messages = self._format_user_messages(user_inputs)
        else:
            user_messages = "No user messages in this session."

        # Format memory writes
        if memory_writes:
            memory_writes_text = self._format_memory_writes(memory_writes)
        else:
            memory_writes_text = "No memory updates were made."

        # Build prompt
        prompt = MEMORY_HYGIENE_PROMPT_TEMPLATE.format(
            memory_state=memory_state,
            user_messages=user_messages,
            memory_writes=memory_writes_text,
        )
        return prompt

    def _format_memory_state(self, memory_reads: list[MemoryReadStep]) -> str:
        """Format memory read results for the prompt."""
        parts = []
        for i, read in enumerate(memory_reads, 1):
            if read.results:
                # Pretty print the results
                results_str = json.dumps(read.results, indent=2, default=str)
                parts.append(f"Read {i}:\n{results_str}")
            else:
                parts.append(f"Read {i}: (empty)")
        return "\n\n".join(parts)

    def _format_user_messages(self, user_inputs: list[UserInputStep]) -> str:
        """Format user input messages for the prompt."""
        parts = []
        for i, inp in enumerate(user_inputs, 1):
            parts.append(f"Message {i}: {inp.content}")
        return "\n".join(parts)

    def _format_memory_writes(self, memory_writes: list[MemoryWriteStep]) -> str:
        """Format memory writes for the prompt."""
        parts = []
        for i, write in enumerate(memory_writes, 1):
            if write.changes:
                changes_str = "\n".join(
                    f"  - {c.path}: {c.old_value} -> {c.new_value}"
                    for c in write.changes
                )
                parts.append(f"Write {i} (to {write.namespace}):\n{changes_str}")
            else:
                parts.append(f"Write {i}: {write.data}")
        return "\n\n".join(parts)

    def grade(self, trace: TraceRun) -> GraderResult:
        """Evaluate a trace using structured LLM output.

        Overrides base class to use complete_structured for reliable parsing.

        Args:
            trace: The trace to evaluate

        Returns:
            GraderResult with LLM evaluation
        """
        prompt = self._build_prompt(trace)

        try:
            # Use structured output - Ollama enforces the schema
            evaluation = self.backend.complete_structured(
                prompt=prompt,
                response_model=MemoryHygieneEvaluation,
                temperature=self.temperature,
            )

            # Convert to GraderResult
            evidence = self._evaluation_to_evidence(evaluation)
            result = GraderResult(
                grader_name=self.name,
                passed=evaluation.passed,
                score=evaluation.score,
                evidence=evidence,
            )

            # Add reproducibility metadata
            result.metadata = {
                "llm": {
                    "model_id": self.backend.model_id,
                    "temperature": self.temperature,
                    "prompt": prompt,
                }
            }

            return result

        except (ValidationError, ValueError) as e:
            logger.warning(f"Structured output failed: {e}")

            # Fallback: return a warning result
            return GraderResult(
                grader_name=self.name,
                passed=True,  # Don't fail just because of LLM error
                score=0.5,
                evidence=[
                    Evidence(
                        check_name="llm_error",
                        description=f"LLM evaluation failed: {e}",
                        severity=Severity.WARN,
                    )
                ],
                metadata={
                    "llm": {
                        "model_id": self.backend.model_id,
                        "temperature": self.temperature,
                        "prompt": prompt,
                        "error": str(e),
                    }
                },
            )

    def _parse_response(self, response: str, trace: TraceRun) -> GraderResult:
        """Parse LLM response (not used with structured output).

        This method is kept for compatibility but the grade() method
        uses complete_structured() instead.
        """
        # This shouldn't be called when using structured output
        raise NotImplementedError(
            "MemoryHygieneJudge uses structured output via grade() method"
        )

    def _evaluation_to_evidence(
        self, evaluation: MemoryHygieneEvaluation
    ) -> list[Evidence]:
        """Convert a validated evaluation to evidence items."""
        evidence: list[Evidence] = []

        # Missed facts (ERROR)
        for item in evaluation.facts_missed:
            evidence.append(
                Evidence(
                    check_name="missed_fact",
                    description=f"User stated '{item.fact}' but it was not saved",
                    severity=Severity.ERROR,
                    details={
                        "fact": item.fact,
                        "should_have_updated": item.should_have_updated,
                    },
                )
            )

        # Hallucinations (ERROR)
        for item in evaluation.hallucinations:
            evidence.append(
                Evidence(
                    check_name="hallucination",
                    description=f"Agent saved '{item.saved}' which user did not state",
                    severity=Severity.ERROR,
                    details={
                        "saved": item.saved,
                        "reason": item.reason,
                    },
                )
            )

        # Data loss (ERROR)
        for item in evaluation.data_incorrectly_lost:
            evidence.append(
                Evidence(
                    check_name="incorrect_data_loss",
                    description=f"Field '{item.field}' was incorrectly overwritten",
                    severity=Severity.ERROR,
                    details={
                        "field": item.field,
                        "old_value": item.old_value,
                        "reason": item.reason,
                    },
                )
            )

        # Correctly saved facts (INFO - positive feedback)
        for item in evaluation.facts_correctly_saved:
            evidence.append(
                Evidence(
                    check_name="correct_save",
                    description=f"Correctly saved: '{item.fact}'",
                    severity=Severity.INFO,
                    details={
                        "fact": item.fact,
                        "saved_as": item.saved_as,
                    },
                )
            )

        # Summary (INFO)
        evidence.append(
            Evidence(
                check_name="llm_summary",
                description=evaluation.summary,
                severity=Severity.INFO,
                details={
                    "user_facts_count": len(evaluation.user_facts_stated),
                    "correctly_saved_count": len(evaluation.facts_correctly_saved),
                    "missed_count": len(evaluation.facts_missed),
                    "hallucinations_count": len(evaluation.hallucinations),
                },
            )
        )

        return evidence
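
As a hedged illustration of how a caller might consume the judge's output above, using only the GraderResult, Evidence, and Severity fields that appear in this file, the evidence list can be filtered by severity:

# Illustrative sketch, not package code: summarize a GraderResult returned by
# MemoryHygieneJudge.grade(). Assumes `result` is such a GraderResult.
from context_forge.graders.base import Severity

errors = [e for e in result.evidence if e.severity == Severity.ERROR]
if errors:
    print(f"{result.grader_name} flagged {len(errors)} issue(s), score={result.score:.2f}")
    for e in errors:
        print(f"  [{e.check_name}] {e.description}")
else:
    print(f"{result.grader_name} passed, score={result.score:.2f}")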
context_forge/graders/judges/models.py
@@ -0,0 +1,113 @@
"""Pydantic models for LLM judge responses.

Using Pydantic models for LLM output provides:
- Structured validation of responses
- Clear schema documentation
- Better error messages when parsing fails
- Type safety throughout the codebase
"""

from typing import Optional

from pydantic import BaseModel, Field


class UserFact(BaseModel):
    """A fact the user stated about themselves."""

    fact: str = Field(description="Description of what the user stated")
    topic: str = Field(description="Category: equipment, schedule, preference, household, location")


class CorrectSave(BaseModel):
    """A fact that was correctly saved to memory."""

    fact: str = Field(description="What the user stated")
    saved_as: str = Field(description="How it was saved to memory")


class MissedFact(BaseModel):
    """A fact the user stated but was not saved."""

    fact: str = Field(description="What the user stated")
    should_have_updated: str = Field(description="Which memory field should have been updated")


class Hallucination(BaseModel):
    """Something saved to memory that the user did not state."""

    saved: str = Field(description="What was incorrectly saved")
    reason: str = Field(description="Why this is considered a hallucination")


class DataLoss(BaseModel):
    """Correct data that was incorrectly lost or overwritten."""

    field: str = Field(description="Which field was affected")
    old_value: str = Field(description="The value that was lost")
    reason: str = Field(description="Why this loss was incorrect")


class MemoryHygieneEvaluation(BaseModel):
    """Complete evaluation result from the Memory Hygiene Judge.

    This model defines the expected structure of the LLM's response.
    The LLM is prompted to return JSON matching this schema.
    """

    user_facts_stated: list[UserFact] = Field(
        default_factory=list,
        description="Facts the user stated about themselves during the session",
    )
    facts_correctly_saved: list[CorrectSave] = Field(
        default_factory=list,
        description="Facts that were correctly identified and saved",
    )
    facts_missed: list[MissedFact] = Field(
        default_factory=list,
        description="Facts the user stated but were not saved to memory",
    )
    hallucinations: list[Hallucination] = Field(
        default_factory=list,
        description="Things saved to memory that the user did not actually state",
    )
    data_incorrectly_lost: list[DataLoss] = Field(
        default_factory=list,
        description="Correct data that was incorrectly overwritten or deleted",
    )
    summary: str = Field(
        description="One sentence summary of memory management quality"
    )
    score: float = Field(
        ge=0.0,
        le=1.0,
        description="Quality score from 0.0 (worst) to 1.0 (best)",
    )
    passed: bool = Field(
        description="Whether the memory management passed evaluation",
    )

    @classmethod
    def get_json_schema_prompt(cls) -> str:
        """Get a prompt-friendly description of the expected JSON schema."""
        # Note: Using single braces - this string is NOT passed through .format()
        return """{
  "user_facts_stated": [
    {"fact": "description of fact", "topic": "equipment|schedule|preference|household|location"}
  ],
  "facts_correctly_saved": [
    {"fact": "what user stated", "saved_as": "how it was saved"}
  ],
  "facts_missed": [
    {"fact": "what user stated", "should_have_updated": "which memory field"}
  ],
  "hallucinations": [
    {"saved": "what was incorrectly saved", "reason": "why this is wrong"}
  ],
  "data_incorrectly_lost": [
    {"field": "which field", "old_value": "what was lost", "reason": "why this was wrong"}
  ],
  "summary": "One sentence summary",
  "score": 0.0 to 1.0,
  "passed": true or false
}"""
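
For reference, a hand-constructed MemoryHygieneEvaluation (for example, to unit-test _evaluation_to_evidence in the judge above) looks like the following sketch; the field values are invented purely for illustration.

# Illustrative sketch, not package code: build an evaluation by hand, e.g. to
# unit-test MemoryHygieneJudge._evaluation_to_evidence(). Values are invented.
from context_forge.graders.judges.models import (
    Hallucination,
    MemoryHygieneEvaluation,
    UserFact,
)

evaluation = MemoryHygieneEvaluation(
    user_facts_stated=[UserFact(fact="User has a Tesla Model 3", topic="equipment")],
    hallucinations=[
        Hallucination(saved="user plans to buy solar", reason="The user never mentioned solar")
    ],
    summary="One hallucinated fact was written to memory.",
    score=0.4,
    passed=False,
)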
context_forge/harness/__init__.py
@@ -0,0 +1,43 @@
"""ContextForge harness module for evaluation and simulation."""

from context_forge.harness.user_simulator import (
    AgentAdapter,
    BatchSimulationRunner,
    CrewAIAdapter,
    GenerativeScenario,
    Goal,
    LangGraphAdapter,
    LLMUserSimulator,
    Persona,
    PydanticAIAdapter,
    ScriptedScenario,
    ScriptedUserSimulator,
    SimulationResult,
    SimulationRunner,
    SimulationState,
    UserSimulator,
)

__all__ = [
    # Runner
    "SimulationRunner",
    "BatchSimulationRunner",
    # State
    "SimulationState",
    "SimulationResult",
    # Personas & Scenarios
    "Persona",
    "Goal",
    "ScriptedScenario",
    "GenerativeScenario",
    # Protocols
    "UserSimulator",
    "AgentAdapter",
    # Simulators
    "LLMUserSimulator",
    "ScriptedUserSimulator",
    # Adapters
    "LangGraphAdapter",
    "CrewAIAdapter",
    "PydanticAIAdapter",
]
context_forge/harness/user_simulator/__init__.py
@@ -0,0 +1,70 @@
"""User simulator module for generating multi-turn conversations with agents."""

from .adapters import (
    AgentAdapter,
    CrewAIAdapter,
    LangGraphAdapter,
    PydanticAIAdapter,
)
from .llm import OllamaClient, OllamaConfig
from .models import (
    ConversationRole,
    SimulationResult,
    SimulationState,
    SimulationTurn,
)
from .persona import (
    Behavior,
    CommunicationStyle,
    Goal,
    Persona,
    TechnicalLevel,
)
from .runner import BatchSimulationRunner, SimulationRunner
from .scenario import (
    GenerativeScenario,
    Scenario,
    ScriptedScenario,
    ScriptedTurn,
    TerminationCondition,
)
from .simulator import (
    LLMUserSimulator,
    ScriptedUserSimulator,
    UserSimulator,
)

__all__ = [
    # Models
    "SimulationState",
    "SimulationResult",
    "SimulationTurn",
    "ConversationRole",
    # Personas
    "Persona",
    "Behavior",
    "Goal",
    "CommunicationStyle",
    "TechnicalLevel",
    # Scenarios
    "Scenario",
    "ScriptedScenario",
    "GenerativeScenario",
    "ScriptedTurn",
    "TerminationCondition",
    # Simulators
    "UserSimulator",
    "LLMUserSimulator",
    "ScriptedUserSimulator",
    # Adapters
    "AgentAdapter",
    "LangGraphAdapter",
    "CrewAIAdapter",
    "PydanticAIAdapter",
    # Runner
    "SimulationRunner",
    "BatchSimulationRunner",
    # LLM
    "OllamaClient",
    "OllamaConfig",
]
context_forge/harness/user_simulator/adapters/__init__.py
@@ -0,0 +1,13 @@
"""Agent adapters for different frameworks."""

from .base import AgentAdapter
from .crewai import CrewAIAdapter
from .langgraph import LangGraphAdapter
from .pydanticai import PydanticAIAdapter

__all__ = [
    "AgentAdapter",
    "LangGraphAdapter",
    "CrewAIAdapter",
    "PydanticAIAdapter",
]
context_forge/harness/user_simulator/adapters/base.py
@@ -0,0 +1,67 @@
"""Base protocol for agent adapters."""

from typing import Any, Protocol, runtime_checkable

from langchain_core.messages import BaseMessage

from ..models import SimulationState


@runtime_checkable
class AgentAdapter(Protocol):
    """Protocol for adapting different agent frameworks to the simulation harness.

    Each adapter wraps a framework-specific agent and provides a uniform
    interface for:
    - Invoking the agent with user messages
    - Extracting responses in BaseMessage format
    - Managing agent state between turns
    """

    @property
    def framework(self) -> str:
        """Return the framework name (e.g., 'langgraph', 'crewai', 'pydanticai')."""
        ...

    @property
    def agent_name(self) -> str:
        """Return the agent's name/identifier."""
        ...

    async def invoke(
        self,
        message: BaseMessage,
        state: SimulationState,
    ) -> BaseMessage:
        """Invoke the agent with a user message and return the response.

        Args:
            message: User's input message (HumanMessage)
            state: Current simulation state for context

        Returns:
            Agent's response as AIMessage
        """
        ...

    async def initialize(
        self,
        config: dict[str, Any] | None = None,
    ) -> None:
        """Initialize the agent before simulation starts.

        Called once per simulation run. Use for setup that should
        happen before the first turn.
        """
        ...

    async def cleanup(self) -> None:
        """Clean up agent resources after simulation ends."""
        ...

    def get_state(self) -> dict[str, Any]:
        """Get the current internal state of the agent.

        Used for trace capture and debugging.
        """
        ...