contextforge-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. context_forge/__init__.py +95 -0
  2. context_forge/core/__init__.py +55 -0
  3. context_forge/core/trace.py +369 -0
  4. context_forge/core/types.py +121 -0
  5. context_forge/evaluation.py +267 -0
  6. context_forge/exceptions.py +56 -0
  7. context_forge/graders/__init__.py +44 -0
  8. context_forge/graders/base.py +264 -0
  9. context_forge/graders/deterministic/__init__.py +11 -0
  10. context_forge/graders/deterministic/memory_corruption.py +130 -0
  11. context_forge/graders/hybrid.py +190 -0
  12. context_forge/graders/judges/__init__.py +11 -0
  13. context_forge/graders/judges/backends/__init__.py +9 -0
  14. context_forge/graders/judges/backends/ollama.py +173 -0
  15. context_forge/graders/judges/base.py +158 -0
  16. context_forge/graders/judges/memory_hygiene_judge.py +332 -0
  17. context_forge/graders/judges/models.py +113 -0
  18. context_forge/harness/__init__.py +43 -0
  19. context_forge/harness/user_simulator/__init__.py +70 -0
  20. context_forge/harness/user_simulator/adapters/__init__.py +13 -0
  21. context_forge/harness/user_simulator/adapters/base.py +67 -0
  22. context_forge/harness/user_simulator/adapters/crewai.py +100 -0
  23. context_forge/harness/user_simulator/adapters/langgraph.py +157 -0
  24. context_forge/harness/user_simulator/adapters/pydanticai.py +105 -0
  25. context_forge/harness/user_simulator/llm/__init__.py +5 -0
  26. context_forge/harness/user_simulator/llm/ollama.py +119 -0
  27. context_forge/harness/user_simulator/models.py +103 -0
  28. context_forge/harness/user_simulator/persona.py +154 -0
  29. context_forge/harness/user_simulator/runner.py +342 -0
  30. context_forge/harness/user_simulator/scenario.py +95 -0
  31. context_forge/harness/user_simulator/simulator.py +307 -0
  32. context_forge/instrumentation/__init__.py +23 -0
  33. context_forge/instrumentation/base.py +307 -0
  34. context_forge/instrumentation/instrumentors/__init__.py +17 -0
  35. context_forge/instrumentation/instrumentors/langchain.py +671 -0
  36. context_forge/instrumentation/instrumentors/langgraph.py +534 -0
  37. context_forge/instrumentation/tracer.py +588 -0
  38. context_forge/py.typed +0 -0
  39. contextforge_eval-0.1.0.dist-info/METADATA +420 -0
  40. contextforge_eval-0.1.0.dist-info/RECORD +43 -0
  41. contextforge_eval-0.1.0.dist-info/WHEEL +5 -0
  42. contextforge_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  43. contextforge_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,154 @@
1
+ """Persona and behavior definitions for user simulation."""
2
+
3
+ from enum import Enum
4
+ from typing import Any
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
class CommunicationStyle(str, Enum):
    """How the persona communicates.

    Subclasses ``str`` so members compare/serialize as plain strings
    (useful for JSON and pydantic model dumps).
    """

    CONCISE = "concise"      # brief, to-the-point messages
    VERBOSE = "verbose"      # detailed messages with extra context
    CASUAL = "casual"        # informal, conversational language
    FORMAL = "formal"        # professional, polished language
    CONFUSED = "confused"    # often asks for clarification / expresses uncertainty
    IMPATIENT = "impatient"  # expresses urgency, wants quick answers
18
+
19
+
20
class TechnicalLevel(str, Enum):
    """Technical sophistication of the persona.

    Subclasses ``str`` so members compare/serialize as plain strings.
    """

    NOVICE = "novice"              # avoids jargon, asks for simpler explanations
    INTERMEDIATE = "intermediate"  # comfortable with basic domain terminology
    EXPERT = "expert"              # uses technical terms, challenges vague answers
26
+
27
+
28
class Behavior(BaseModel):
    """Behavioral traits that influence response generation.

    Consumed by ``Persona.to_system_prompt`` (communication_style and
    technical_level) and, presumably, by the LLM simulator for the
    remaining knobs.
    """

    communication_style: CommunicationStyle = CommunicationStyle.CASUAL
    technical_level: TechnicalLevel = TechnicalLevel.INTERMEDIATE
    # 1–10 scale; NOTE(review): direction (higher = more patient?) is not
    # visible in this module — confirm against the simulator.
    patience_level: int = Field(default=5, ge=1, le=10)

    # Response patterns
    asks_followup_questions: bool = True
    provides_context_upfront: bool = True
    corrects_misunderstandings: bool = True

    # Conversation dynamics
    # Probability in [0, 1] that the simulated user drifts off-topic.
    topic_drift_probability: float = Field(default=0.1, ge=0, le=1)
    # NOTE(review): semantics not visible here — presumably the number of
    # unclear agent turns tolerated before asking for clarification;
    # confirm against the simulator implementation.
    clarification_threshold: int = Field(default=2)
43
+
44
+
45
class Goal(BaseModel):
    """A specific goal the persona wants to achieve."""

    # Human-readable goal statement; also serves as the lookup key for
    # Persona.mark_goal_achieved().
    description: str
    # How an evaluator can tell the goal was met.
    success_criteria: str
    # 1–5 scale; NOTE(review): ordering direction is not visible in this
    # module — confirm where priorities are consumed.
    priority: int = Field(default=1, ge=1, le=5)
    # Flipped by Persona.mark_goal_achieved(); achieved goals are excluded
    # from the generated system prompt.
    is_achieved: bool = False
    # Free-form extra data for domain-specific use.
    metadata: dict[str, Any] = Field(default_factory=dict)
53
+
54
+
55
class Persona(BaseModel):
    """Complete persona definition for user simulation.

    A persona bundles the identity, context, behavioral traits, and goals
    of a simulated user. ``to_system_prompt`` flattens all of this into a
    single system prompt that steers LLM-based response generation.
    """

    persona_id: str
    name: str
    description: str = ""

    # Context that shapes responses
    background: str = ""
    situation: str = ""

    # Behavioral configuration
    behavior: Behavior = Field(default_factory=Behavior)

    # Goals for this conversation
    goals: list[Goal] = Field(default_factory=list)

    # Domain-specific context
    context: dict[str, Any] = Field(default_factory=dict)

    # Example phrases this persona might use
    example_phrases: list[str] = Field(default_factory=list)

    def to_system_prompt(self) -> str:
        """Generate the system prompt used for LLM-based response generation.

        Only goals that are not yet achieved are included; if there are
        none, a generic fallback goal line is emitted instead.
        """
        # Instruction text per communication style.
        style_instructions = {
            CommunicationStyle.CONCISE: "Keep responses brief and to the point.",
            CommunicationStyle.VERBOSE: "Provide detailed responses with context.",
            CommunicationStyle.CASUAL: "Use informal, conversational language.",
            CommunicationStyle.FORMAL: "Use professional, polished language.",
            CommunicationStyle.CONFUSED: "Often ask for clarification or express uncertainty.",
            CommunicationStyle.IMPATIENT: "Express urgency, want quick answers.",
        }

        # Instruction text per technical sophistication level.
        tech_instructions = {
            TechnicalLevel.NOVICE: "Avoid technical jargon. Ask for simpler explanations.",
            TechnicalLevel.INTERMEDIATE: "Comfortable with basic domain terminology.",
            TechnicalLevel.EXPERT: "Use technical terms confidently. Challenge vague answers.",
        }

        sections = [f"You are simulating a user named {self.name}."]

        if self.background:
            sections.append(f"\nBackground: {self.background}")
        if self.situation:
            sections.append(f"Current Situation: {self.situation}")

        sections.append(
            f"\nCommunication Style: {style_instructions[self.behavior.communication_style]}"
        )
        sections.append(f"Technical Level: {tech_instructions[self.behavior.technical_level]}")

        pending = [f"- {goal.description}" for goal in self.goals if not goal.is_achieved]
        if pending:
            goals_str = "\n".join(pending)
            sections.append(f"\nYour goals for this conversation:\n{goals_str}")
        else:
            sections.append("\nYour goal: Have a productive conversation")

        if self.context:
            context_str = ", ".join(f"{k}: {v}" for k, v in self.context.items())
            sections.append(f"\nAdditional context: {context_str}")

        if self.example_phrases:
            # Cap at three phrases to keep the prompt compact.
            phrases_str = ", ".join(f'"{p}"' for p in self.example_phrases[:3])
            sections.append(f"\nExample phrases you might use: {phrases_str}")

        sections.append(
            "\n\nRespond as this user would, staying in character. "
            "Generate only the user's message, not the agent's response."
        )

        return "\n".join(sections)

    def mark_goal_achieved(self, goal_description: str) -> bool:
        """Mark the first goal whose description matches as achieved.

        Returns True if a matching goal was found, False otherwise.
        """
        match = next(
            (g for g in self.goals if g.description == goal_description),
            None,
        )
        if match is None:
            return False
        match.is_achieved = True
        return True

    def get_pending_goals(self) -> list[Goal]:
        """Return the goals that have not yet been achieved."""
        return [goal for goal in self.goals if not goal.is_achieved]

    def reset_goals(self) -> None:
        """Mark every goal as not achieved (e.g. before a fresh run)."""
        for goal in self.goals:
            goal.is_achieved = False
@@ -0,0 +1,342 @@
1
+ """Simulation runner for orchestrating user-agent conversations."""
2
+
3
+ import asyncio
4
+ import json
5
+ import uuid
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from typing import Any, Callable, Optional, Union
9
+
10
+ from langchain_core.messages import HumanMessage
11
+
12
+ from .adapters.base import AgentAdapter
13
+ from .models import (
14
+ ConversationRole,
15
+ SimulationResult,
16
+ SimulationState,
17
+ SimulationTurn,
18
+ )
19
+ from .persona import Persona
20
+ from .scenario import GenerativeScenario, Scenario, ScriptedScenario
21
+ from .simulator import LLMUserSimulator, ScriptedUserSimulator, UserSimulator
22
+
23
+
24
class SimulationRunner:
    """Orchestrates simulation runs between user simulator and agent adapter.

    Handles the conversation loop, trace capture integration, and
    termination conditions.

    Example usage:
        from context_forge.harness import SimulationRunner, LangGraphAdapter

        adapter = LangGraphAdapter(graph=my_graph, ...)
        scenario = GenerativeScenario(...)

        runner = SimulationRunner(
            adapter=adapter,
            trace_output_dir="./traces",
        )

        result = await runner.run(scenario)
    """

    def __init__(
        self,
        adapter: AgentAdapter,
        trace_output_dir: Optional[Union[str, Path]] = None,
        default_max_turns: int = 20,
    ):
        """Initialize the simulation runner.

        Args:
            adapter: Framework adapter for agent invocation
            trace_output_dir: Directory for trace files (created on demand)
            default_max_turns: Fallback turn limit used only if a scenario
                does not define ``max_turns``
        """
        self._adapter = adapter
        self._trace_output_dir = Path(trace_output_dir) if trace_output_dir else None
        self._default_max_turns = default_max_turns

    async def run(
        self,
        scenario: Scenario,
        config: Optional[dict[str, Any]] = None,
    ) -> SimulationResult:
        """Run a complete simulation.

        Args:
            scenario: Scenario definition (scripted or generative)
            config: Additional configuration for adapter/simulator

        Returns:
            SimulationResult with conversation history and metrics. On
            failure, ``success=False`` with the error message; the partial
            conversation is kept on the state, and no trace file is written.
        """
        simulation_id = str(uuid.uuid4())

        # Create user simulator based on scenario type
        simulator = self._create_simulator(scenario)

        # Initialize state. Fall back to the runner-level default if a
        # scenario type ever omits max_turns (both current scenario types
        # define it, so this is behavior-preserving today).
        state = SimulationState(
            simulation_id=simulation_id,
            scenario_id=scenario.scenario_id,
            persona_id=scenario.persona.persona_id,
            max_turns=getattr(scenario, "max_turns", self._default_max_turns),
        )

        # Initialize adapter and (if it supports it) the simulator
        await self._adapter.initialize(config)
        if hasattr(simulator, "initialize"):
            await simulator.initialize()

        try:
            # Run conversation loop
            await self._run_conversation_loop(state, simulator, scenario)

            # Mark success
            state.status = "completed"
            state.ended_at = datetime.now()

            # Calculate metrics
            metrics = self._calculate_metrics(state)

            # Save trace if configured
            trace_path = None
            if self._trace_output_dir:
                trace_path = await self._save_trace(state)

            return SimulationResult(
                simulation_id=simulation_id,
                state=state,
                trace_path=str(trace_path) if trace_path else None,
                metrics=metrics,
                success=True,
            )

        except Exception as e:
            # Best-effort failure capture: keep the partial conversation on
            # the state so callers can inspect how far the run got.
            state.status = "failed"
            state.ended_at = datetime.now()
            state.termination_reason = str(e)

            return SimulationResult(
                simulation_id=simulation_id,
                state=state,
                success=False,
                error=str(e),
            )

        finally:
            await self._adapter.cleanup()
            if hasattr(simulator, "cleanup"):
                await simulator.cleanup()

    async def _run_conversation_loop(
        self,
        state: SimulationState,
        simulator: UserSimulator,
        scenario: Scenario,
    ) -> None:
        """Execute the main conversation loop.

        Turn numbering: the user message and the agent reply of the same
        exchange share a turn_number; the opening exchange is turn 0.
        """
        # Get initial message (scripted: first scripted turn; generative:
        # the configured initial_message)
        initial_message_text = scenario.get_initial_message()
        initial_message = HumanMessage(content=initial_message_text)

        # Add initial user turn
        state.turns.append(SimulationTurn(
            turn_number=0,
            role=ConversationRole.USER,
            message=initial_message,
        ))

        # Invoke agent with initial message
        agent_response = await self._adapter.invoke(initial_message, state)

        state.turns.append(SimulationTurn(
            turn_number=0,
            role=ConversationRole.AGENT,
            message=agent_response,
        ))

        state.current_turn = 1

        # Main loop
        while state.current_turn < state.max_turns:
            # Check termination
            should_stop, reason = await simulator.should_terminate(state)
            if should_stop:
                state.termination_reason = reason
                break

            # Generate user response
            try:
                user_message = await simulator.generate_response(agent_response, state)
            except (StopIteration, StopAsyncIteration) as e:
                # Script-exhaustion signal from the simulator. Note that a
                # StopIteration raised inside an ``async def`` escapes as
                # RuntimeError (PEP 479 semantics), so async simulators
                # should raise StopAsyncIteration; both are accepted here
                # for compatibility with synchronous signalling.
                state.termination_reason = str(e)
                break

            state.turns.append(SimulationTurn(
                turn_number=state.current_turn,
                role=ConversationRole.USER,
                message=user_message,
            ))

            # Invoke agent
            agent_response = await self._adapter.invoke(user_message, state)

            state.turns.append(SimulationTurn(
                turn_number=state.current_turn,
                role=ConversationRole.AGENT,
                message=agent_response,
            ))

            # Update agent state snapshot after each full exchange
            state.agent_state = self._adapter.get_state()

            state.current_turn += 1

    def _create_simulator(self, scenario: Scenario) -> UserSimulator:
        """Create the appropriate simulator for the scenario type.

        Scripted scenarios may optionally fall back to an LLM simulator
        once the script is exhausted (``fallback == "generative"``).
        """
        if isinstance(scenario, ScriptedScenario):
            llm_fallback = None
            if scenario.fallback == "generative":
                llm_fallback = LLMUserSimulator(scenario.persona)
            return ScriptedUserSimulator(scenario, llm_fallback)
        else:
            return LLMUserSimulator(scenario.persona)

    def _calculate_metrics(self, state: SimulationState) -> dict[str, Any]:
        """Calculate summary metrics for a finished (or failed) simulation.

        Average message lengths divide by ``max(count, 1)`` so an empty
        conversation yields 0.0 instead of raising ZeroDivisionError.
        """
        user_turns = [t for t in state.turns if t.role == ConversationRole.USER]
        agent_turns = [t for t in state.turns if t.role == ConversationRole.AGENT]

        duration = 0.0
        if state.ended_at and state.started_at:
            duration = (state.ended_at - state.started_at).total_seconds()

        return {
            "total_turns": len(state.turns),
            "user_turns": len(user_turns),
            "agent_turns": len(agent_turns),
            "avg_user_message_length": (
                sum(len(t.message.content) for t in user_turns) / max(len(user_turns), 1)
            ),
            "avg_agent_message_length": (
                sum(len(t.message.content) for t in agent_turns) / max(len(agent_turns), 1)
            ),
            "duration_seconds": duration,
            "termination_reason": state.termination_reason,
        }

    async def _save_trace(self, state: SimulationState) -> Path:
        """Save the simulation state as a JSON trace file.

        Returns:
            Path of the written trace file.

        Raises:
            ValueError: if no trace output directory was configured.
        """
        if not self._trace_output_dir:
            raise ValueError("No trace output directory configured")

        self._trace_output_dir.mkdir(parents=True, exist_ok=True)

        trace_file = self._trace_output_dir / f"simulation_{state.simulation_id}.json"

        # Convert to JSON-serializable format
        result = SimulationResult(
            simulation_id=state.simulation_id,
            state=state,
            success=True,
        )
        trace_data = result.to_dict()

        # default=str stringifies anything json can't encode natively
        # (datetimes, message objects, ...).
        with open(trace_file, "w") as f:
            json.dump(trace_data, f, indent=2, default=str)

        return trace_file
252
+
253
+
254
class BatchSimulationRunner:
    """Run multiple simulations with different scenarios/configurations.

    Useful for evaluation runs across multiple test cases.

    Example usage:
        def adapter_factory():
            return LangGraphAdapter(graph=build_graph(), ...)

        runner = BatchSimulationRunner(
            adapter_factory=adapter_factory,
            trace_output_dir="./traces",
            parallel=True,
        )

        results = await runner.run_all(scenarios)
    """

    def __init__(
        self,
        adapter_factory: Callable[[], AgentAdapter],
        trace_output_dir: Optional[Union[str, Path]] = None,
        parallel: bool = False,
        max_parallel: int = 4,
    ):
        """Initialize batch simulation runner.

        Args:
            adapter_factory: Factory function to create adapters
            trace_output_dir: Directory for trace files
            parallel: Whether to run simulations in parallel
            max_parallel: Maximum concurrent simulations
        """
        self._adapter_factory = adapter_factory
        self._trace_output_dir = Path(trace_output_dir) if trace_output_dir else None
        self._parallel = parallel
        self._max_parallel = max_parallel

    async def run_all(
        self,
        scenarios: list[Scenario],
    ) -> list[SimulationResult]:
        """Run all scenarios and collect results.

        Args:
            scenarios: List of scenarios to run

        Returns:
            List of simulation results, in scenario order
        """
        if self._parallel:
            return await self._run_parallel(scenarios)
        return await self._run_sequential(scenarios)

    async def _execute_one(self, scenario: Scenario) -> SimulationResult:
        """Run a single scenario on a fresh adapter from the factory."""
        runner = SimulationRunner(
            adapter=self._adapter_factory(),
            trace_output_dir=self._trace_output_dir,
        )
        return await runner.run(scenario)

    async def _run_sequential(
        self,
        scenarios: list[Scenario],
    ) -> list[SimulationResult]:
        """Run scenarios one at a time, in order."""
        return [await self._execute_one(scenario) for scenario in scenarios]

    async def _run_parallel(
        self,
        scenarios: list[Scenario],
    ) -> list[SimulationResult]:
        """Run scenarios concurrently, bounded by ``max_parallel``."""
        gate = asyncio.Semaphore(self._max_parallel)

        async def bounded(scenario: Scenario) -> SimulationResult:
            # The semaphore caps how many simulations run at once.
            async with gate:
                return await self._execute_one(scenario)

        return await asyncio.gather(*(bounded(s) for s in scenarios))
@@ -0,0 +1,95 @@
1
+ """Scenario definitions for user simulation."""
2
+
3
+ from typing import Any, Literal, Optional, Union
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from .persona import Persona
8
+
9
+
10
class TerminationCondition(BaseModel):
    """Condition that can end a simulation."""

    # Discriminator selecting how ``value`` should be interpreted.
    condition_type: Literal["max_turns", "goal_achieved", "keyword", "custom"]
    # Payload for the condition. NOTE(review): its interpretation (turn
    # count, goal identifier, keyword list, callable, ...) is evaluated
    # outside this module — confirm against the simulator implementation.
    value: Any
    # Optional human-readable explanation of the condition.
    description: str = ""
16
+
17
+
18
class ScriptedTurn(BaseModel):
    """A pre-defined turn in a scripted scenario."""

    # Position matched by ScriptedScenario.get_turn_message().
    turn_number: int
    # The exact user message to send for this turn.
    user_message: str
    # NOTE(review): presumably keywords expected in the agent's reply; the
    # checking logic is not in this module — confirm.
    expected_keywords: list[str] = Field(default_factory=list)
    # NOTE(review): presumably permits paraphrasing of user_message by an
    # LLM fallback — confirm against the simulator.
    allow_variation: bool = False
25
+
26
+
27
class ScriptedScenario(BaseModel):
    """A scenario with pre-defined user messages.

    Useful for regression testing and specific edge case validation.
    """

    scenario_id: str
    name: str
    description: str = ""
    persona: Persona

    # Pre-defined conversation script
    turns: list[ScriptedTurn]

    # What to do after script exhausted
    fallback: Literal["loop", "generative", "terminate"] = "terminate"

    # Termination conditions
    max_turns: int = Field(default=50)
    termination_conditions: list[TerminationCondition] = Field(default_factory=list)

    def get_turn_message(self, turn_number: int) -> Optional[str]:
        """Return the scripted message for *turn_number*, or None if absent."""
        # First turn whose number matches wins, mirroring script order.
        return next(
            (turn.user_message for turn in self.turns if turn.turn_number == turn_number),
            None,
        )

    def get_initial_message(self) -> str:
        """Return the first user message.

        Raises:
            ValueError: if the script contains no turns.
        """
        if not self.turns:
            raise ValueError("Scripted scenario has no turns defined")
        return self.turns[0].user_message
60
+
61
+
62
class GenerativeScenario(BaseModel):
    """A scenario where user responses are LLM-generated.

    The persona and goals guide response generation. More flexible
    than scripted scenarios for exploratory testing.
    """

    scenario_id: str
    name: str
    description: str = ""
    persona: Persona

    # Initial user message to start conversation
    initial_message: str

    # Constraints on response generation
    max_turns: int = Field(default=20)
    termination_conditions: list[TerminationCondition] = Field(default_factory=list)

    # Response generation parameters. NOTE(review): presumably forwarded to
    # the LLM simulator; the 0–2 temperature bound mirrors common
    # chat-completion APIs — confirm against the simulator.
    temperature: float = Field(default=0.7, ge=0, le=2)
    max_response_tokens: int = Field(default=500)

    # Topic boundaries. NOTE(review): enforcement happens outside this
    # module — confirm against the simulator implementation.
    allowed_topics: list[str] = Field(default_factory=list)
    forbidden_topics: list[str] = Field(default_factory=list)

    def get_initial_message(self) -> str:
        """Get the initial user message."""
        # Unlike ScriptedScenario, the opening message is a single
        # configured string rather than the first scripted turn.
        return self.initial_message
92
+
93
+
94
# Union type accepted by the simulation runner. Both members expose
# scenario_id, persona, max_turns, termination_conditions, and
# get_initial_message().
Scenario = Union[ScriptedScenario, GenerativeScenario]