contextforge-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- context_forge/__init__.py +95 -0
- context_forge/core/__init__.py +55 -0
- context_forge/core/trace.py +369 -0
- context_forge/core/types.py +121 -0
- context_forge/evaluation.py +267 -0
- context_forge/exceptions.py +56 -0
- context_forge/graders/__init__.py +44 -0
- context_forge/graders/base.py +264 -0
- context_forge/graders/deterministic/__init__.py +11 -0
- context_forge/graders/deterministic/memory_corruption.py +130 -0
- context_forge/graders/hybrid.py +190 -0
- context_forge/graders/judges/__init__.py +11 -0
- context_forge/graders/judges/backends/__init__.py +9 -0
- context_forge/graders/judges/backends/ollama.py +173 -0
- context_forge/graders/judges/base.py +158 -0
- context_forge/graders/judges/memory_hygiene_judge.py +332 -0
- context_forge/graders/judges/models.py +113 -0
- context_forge/harness/__init__.py +43 -0
- context_forge/harness/user_simulator/__init__.py +70 -0
- context_forge/harness/user_simulator/adapters/__init__.py +13 -0
- context_forge/harness/user_simulator/adapters/base.py +67 -0
- context_forge/harness/user_simulator/adapters/crewai.py +100 -0
- context_forge/harness/user_simulator/adapters/langgraph.py +157 -0
- context_forge/harness/user_simulator/adapters/pydanticai.py +105 -0
- context_forge/harness/user_simulator/llm/__init__.py +5 -0
- context_forge/harness/user_simulator/llm/ollama.py +119 -0
- context_forge/harness/user_simulator/models.py +103 -0
- context_forge/harness/user_simulator/persona.py +154 -0
- context_forge/harness/user_simulator/runner.py +342 -0
- context_forge/harness/user_simulator/scenario.py +95 -0
- context_forge/harness/user_simulator/simulator.py +307 -0
- context_forge/instrumentation/__init__.py +23 -0
- context_forge/instrumentation/base.py +307 -0
- context_forge/instrumentation/instrumentors/__init__.py +17 -0
- context_forge/instrumentation/instrumentors/langchain.py +671 -0
- context_forge/instrumentation/instrumentors/langgraph.py +534 -0
- context_forge/instrumentation/tracer.py +588 -0
- context_forge/py.typed +0 -0
- contextforge_eval-0.1.0.dist-info/METADATA +420 -0
- contextforge_eval-0.1.0.dist-info/RECORD +43 -0
- contextforge_eval-0.1.0.dist-info/WHEEL +5 -0
- contextforge_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- contextforge_eval-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
"""User simulator implementations."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Protocol, runtime_checkable
|
|
4
|
+
|
|
5
|
+
from langchain_core.messages import BaseMessage, HumanMessage
|
|
6
|
+
|
|
7
|
+
from .llm.ollama import OllamaClient, OllamaConfig
|
|
8
|
+
from .models import ConversationRole, SimulationState
|
|
9
|
+
from .persona import Persona
|
|
10
|
+
from .scenario import ScriptedScenario
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@runtime_checkable
class UserSimulator(Protocol):
    """Protocol for simulating user behavior in agent conversations.

    Implementations can be:
    - LLM-based (using Ollama to generate contextual responses)
    - Scripted (returning pre-defined responses)
    - Hybrid (following a script with LLM fallback)

    NOTE: because this is ``runtime_checkable``, ``isinstance`` checks only
    verify that the listed attributes/methods exist — not their signatures.
    """

    @property
    def persona(self) -> Persona:
        """Get the persona driving this simulator."""
        ...

    async def generate_response(
        self,
        agent_message: BaseMessage,
        state: SimulationState,
    ) -> BaseMessage:
        """Generate the next user message in response to agent output.

        Args:
            agent_message: The agent's most recent message
            state: Current simulation state including conversation history

        Returns:
            A HumanMessage representing the simulated user's response
        """
        ...

    async def should_terminate(
        self,
        state: SimulationState,
    ) -> tuple[bool, Optional[str]]:
        """Determine if the conversation should end.

        Args:
            state: Current simulation state

        Returns:
            Tuple of (should_terminate, reason) — reason is None when
            the conversation should continue.
        """
        ...

    def reset(self) -> None:
        """Reset simulator state for a new conversation."""
        ...
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class LLMUserSimulator:
    """User simulator powered by Ollama LLM.

    Generates contextually appropriate user responses based on
    persona, goals, and conversation history.

    Example usage:
        persona = Persona(
            persona_id="test_user",
            name="Sarah",
            background="Homeowner with solar panels",
            goals=[Goal(description="Get EV charging advice", ...)],
        )

        simulator = LLMUserSimulator(persona)
        await simulator.initialize()

        response = await simulator.generate_response(agent_message, state)
    """

    def __init__(
        self,
        persona: Persona,
        ollama_config: Optional[OllamaConfig] = None,
        check_goals: bool = True,
    ):
        """Initialize the LLM user simulator.

        Args:
            persona: Persona to simulate
            ollama_config: Configuration for Ollama (defaults to OllamaConfig())
            check_goals: Whether to check goal achievement for termination
        """
        self._persona = persona
        self._ollama_config = ollama_config or OllamaConfig()
        self._check_goals = check_goals
        self._client: Optional[OllamaClient] = None
        self._initialized = False

    @property
    def persona(self) -> Persona:
        """The persona driving this simulator."""
        return self._persona

    async def initialize(self) -> None:
        """Initialize the Ollama client. Safe to call repeatedly (idempotent)."""
        if self._initialized:
            return
        self._client = OllamaClient(self._ollama_config)
        # Enter the client's async context manually; cleanup() mirrors this
        # with __aexit__ so the pair behaves like an `async with` block.
        await self._client.__aenter__()
        self._initialized = True

    async def cleanup(self) -> None:
        """Release the Ollama client and reset initialization state."""
        if self._client:
            await self._client.__aexit__(None, None, None)
            self._client = None
        self._initialized = False

    async def generate_response(
        self,
        agent_message: BaseMessage,
        state: SimulationState,
    ) -> BaseMessage:
        """Generate a user response using the LLM.

        Lazily initializes the client on first use.

        Args:
            agent_message: The agent's most recent message
            state: Current simulation state including conversation history

        Returns:
            A HumanMessage with the simulated user's reply
        """
        if not self._client or not self._initialized:
            await self.initialize()

        # Build conversation context
        history = self._format_history(state)

        prompt = f"""Based on the conversation history below, generate the next message from the user's perspective.

Conversation History:
{history}

Agent's last message: {agent_message.content}

Generate only the user's response (no labels or prefixes). Stay in character.
Keep your response focused and concise (1-3 sentences typically)."""

        system_prompt = self._persona.to_system_prompt()
        response = await self._client.generate(prompt, system=system_prompt)

        # Clean up response
        cleaned = response.strip()
        # Remove any accidental role prefixes the model may have emitted
        # despite the "no labels" instruction.
        for prefix in ["User:", "user:", "Human:", "human:", "Me:", "me:"]:
            if cleaned.startswith(prefix):
                cleaned = cleaned[len(prefix):].strip()

        return HumanMessage(content=cleaned)

    def _format_history(self, state: SimulationState) -> str:
        """Format conversation history for the prompt.

        Only the last 10 turns are included to bound prompt size.
        """
        lines = []
        # Include last 10 turns for context
        for turn in state.turns[-10:]:
            role = "User" if turn.role == ConversationRole.USER else "Agent"
            lines.append(f"{role}: {turn.message.content}")
        return "\n".join(lines) or "(No history yet)"

    async def should_terminate(
        self,
        state: SimulationState,
    ) -> tuple[bool, Optional[str]]:
        """Determine if conversation should end.

        Returns:
            (True, reason) when the turn budget is exhausted or all goals
            are achieved; (False, None) otherwise.
        """
        # Check max turns
        if state.current_turn >= state.max_turns:
            return True, "max_turns_reached"

        # Check goal achievement (use LLM to evaluate)
        if self._check_goals and self._persona.goals:
            achieved = await self._check_goals_achieved(state)
            if achieved:
                return True, "goals_achieved"

        return False, None

    async def _check_goals_achieved(self, state: SimulationState) -> bool:
        """Use LLM to check if goals have been achieved.

        Returns False (conservative) when the client is not yet initialized.
        """
        if not self._client or not self._initialized:
            return False

        pending_goals = self._persona.get_pending_goals()
        if not pending_goals:
            return True

        goals_str = "\n".join(
            f"- {g.description}: {g.success_criteria}"
            for g in pending_goals
        )

        history = self._format_history(state)

        prompt = f"""Based on this conversation, have the user's goals been achieved?

Goals:
{goals_str}

Conversation:
{history}

Answer with ONLY 'yes' or 'no'."""

        response = await self._client.generate(prompt)
        # BUGFIX: the previous strict equality (== "yes") rejected common
        # LLM phrasings such as "Yes." or '"yes"', so goal-based termination
        # effectively never triggered. Normalize surrounding punctuation and
        # accept any answer that begins with "yes".
        answer = response.strip().lower().strip("\"'`.,! ")
        return answer.startswith("yes")

    def reset(self) -> None:
        """Reset persona goal states for a fresh conversation."""
        self._persona.reset_goals()
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class ScriptedUserSimulator:
    """User simulator that follows a pre-defined script.

    Falls back to LLM generation if script is exhausted
    and fallback mode is 'generative'.

    Example usage:
        scenario = ScriptedScenario(
            scenario_id="test",
            name="Test scenario",
            persona=persona,
            turns=[
                ScriptedTurn(turn_number=0, user_message="Hello"),
                ScriptedTurn(turn_number=1, user_message="What time should I charge?"),
            ],
        )

        simulator = ScriptedUserSimulator(scenario)
    """

    def __init__(
        self,
        scenario: ScriptedScenario,
        llm_fallback: Optional[LLMUserSimulator] = None,
    ):
        """Initialize scripted user simulator.

        Args:
            scenario: Scripted scenario with predefined turns
            llm_fallback: Optional LLM simulator for fallback generation
        """
        self._scenario = scenario
        self._llm_fallback = llm_fallback

    @property
    def persona(self) -> Persona:
        """The persona attached to the scripted scenario."""
        return self._scenario.persona

    async def initialize(self) -> None:
        """Initialize fallback simulator if present."""
        if self._llm_fallback:
            await self._llm_fallback.initialize()

    async def cleanup(self) -> None:
        """Clean up fallback simulator if present."""
        if self._llm_fallback:
            await self._llm_fallback.cleanup()

    async def generate_response(
        self,
        agent_message: BaseMessage,
        state: SimulationState,
    ) -> BaseMessage:
        """Return scripted response or fall back to LLM.

        Raises:
            RuntimeError: If the script is exhausted in 'terminate' mode,
                or 'loop' mode has no turns to replay.
            ValueError: If the scenario's fallback mode is unrecognized
                (or 'generative' without an llm_fallback).
        """
        scripted = self._scenario.get_turn_message(state.current_turn)

        if scripted:
            return HumanMessage(content=scripted)

        # Script exhausted
        if self._scenario.fallback == "terminate":
            # BUGFIX: previously raised StopIteration, which PEP 479
            # semantics convert into an opaque
            # RuntimeError("coroutine raised StopIteration") when it
            # escapes a coroutine — callers could never catch
            # StopIteration. Raise RuntimeError explicitly so callers see
            # the same catchable type with a meaningful message.
            raise RuntimeError("Script exhausted")
        elif self._scenario.fallback == "loop":
            # Restart from beginning
            if self._scenario.turns:
                turn_in_script = state.current_turn % len(self._scenario.turns)
                scripted = self._scenario.turns[turn_in_script].user_message
                return HumanMessage(content=scripted)
            raise RuntimeError("No turns in script")
        elif self._scenario.fallback == "generative" and self._llm_fallback:
            return await self._llm_fallback.generate_response(agent_message, state)

        raise ValueError(f"Invalid fallback mode: {self._scenario.fallback}")

    async def should_terminate(
        self,
        state: SimulationState,
    ) -> tuple[bool, Optional[str]]:
        """Check termination conditions.

        Terminates on max-turns, or on script exhaustion when the
        scenario's fallback mode is 'terminate'.
        """
        if state.current_turn >= self._scenario.max_turns:
            return True, "max_turns_reached"

        # Check if script is exhausted in terminate mode
        if self._scenario.fallback == "terminate":
            scripted = self._scenario.get_turn_message(state.current_turn)
            if scripted is None:
                return True, "script_exhausted"

        return False, None

    def reset(self) -> None:
        """Reset persona goal state for a new conversation."""
        self._scenario.persona.reset_goals()
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Instrumentation module for ContextForge.
|
|
2
|
+
|
|
3
|
+
This module provides multiple levels of trace capture:
|
|
4
|
+
- Level 2: Auto-instrumentation via Instrumentor().instrument()
|
|
5
|
+
- Level 3: Callback handlers for per-call control
|
|
6
|
+
- Level 4: Explicit Tracer API for custom agents
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from context_forge.instrumentation.base import (
|
|
10
|
+
BaseInstrumentor,
|
|
11
|
+
RedactionConfig,
|
|
12
|
+
)
|
|
13
|
+
from context_forge.instrumentation.instrumentors.langchain import LangChainInstrumentor
|
|
14
|
+
from context_forge.instrumentation.instrumentors.langgraph import LangGraphInstrumentor
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
# Base classes
|
|
18
|
+
"BaseInstrumentor",
|
|
19
|
+
"RedactionConfig",
|
|
20
|
+
# Framework instrumentors
|
|
21
|
+
"LangChainInstrumentor",
|
|
22
|
+
"LangGraphInstrumentor",
|
|
23
|
+
]
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
"""Base instrumentation classes for ContextForge.
|
|
2
|
+
|
|
3
|
+
This module implements:
|
|
4
|
+
- T033: RedactionConfig model
|
|
5
|
+
- T034: BaseInstrumentor abstract class
|
|
6
|
+
- T035: instrument() and uninstrument() methods
|
|
7
|
+
- T036: get_traces() method
|
|
8
|
+
- T037: Context manager protocol
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
import uuid
|
|
13
|
+
from abc import ABC, abstractmethod
|
|
14
|
+
from datetime import datetime, timezone
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, Optional, Pattern
|
|
17
|
+
|
|
18
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
19
|
+
|
|
20
|
+
from context_forge.core.trace import TraceRun
|
|
21
|
+
from context_forge.core.types import AgentInfo
|
|
22
|
+
from context_forge.exceptions import (
|
|
23
|
+
InstrumentorAlreadyActiveError,
|
|
24
|
+
InstrumentorNotActiveError,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class RedactionConfig(BaseModel):
    """Configuration for PII/secret redaction in traces.

    Allows users to specify patterns and field names that should
    be redacted from trace output.

    Attributes:
        patterns: Regex patterns to match and redact
        field_names: Field names to always redact (matched case-insensitively
            as substrings of the field name)
        replacement: String to replace redacted content
        enabled: Whether redaction is active
    """

    # Compiled regex objects are not pydantic-native types.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    patterns: list[Pattern[str]] = Field(default_factory=list)
    field_names: list[str] = Field(
        default_factory=lambda: ["password", "api_key", "secret", "token", "authorization"]
    )
    replacement: str = "[REDACTED]"
    enabled: bool = True

    @classmethod
    def default(cls) -> "RedactionConfig":
        """Create default redaction config with common patterns."""
        return cls(
            patterns=[
                # BUGFIX: the TLD class was previously [A-Z|a-z], where '|'
                # is a literal pipe inside a character class — not
                # alternation. Corrected to [A-Za-z].
                re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),  # Email
                re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),  # SSN
                re.compile(r"\b\d{16}\b"),  # Credit card (simple; no separator support)
            ],
            field_names=["password", "api_key", "secret", "token", "authorization", "bearer"],
        )

    def redact(self, value: str) -> str:
        """Apply redaction to a string value.

        Args:
            value: String to potentially redact

        Returns:
            Redacted string if patterns match, original otherwise
            (also returned unchanged when redaction is disabled or the
            value is empty).
        """
        if not self.enabled or not value:
            return value

        result = value
        for pattern in self.patterns:
            result = pattern.sub(self.replacement, result)
        return result

    def should_redact_field(self, field_name: str) -> bool:
        """Check if a field name should be redacted.

        A field matches when any configured name appears (case-insensitively)
        as a substring of it, e.g. "my_api_key" matches "api_key".

        Args:
            field_name: Name of the field to check

        Returns:
            True if the field should be redacted
        """
        if not self.enabled:
            return False
        field_lower = field_name.lower()
        return any(name.lower() in field_lower for name in self.field_names)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class BaseInstrumentor(ABC):
    """Abstract base class for framework instrumentors.

    Provides the common interface for auto-instrumentation of
    agent frameworks. Subclasses implement framework-specific
    hooks.

    Usage:
        instrumentor = LangChainInstrumentor()
        instrumentor.instrument()
        # ... run agent code ...
        traces = instrumentor.get_traces()
        instrumentor.uninstrument()

    Or with context manager:
        with LangChainInstrumentor() as instrumentor:
            # ... run agent code ...
            traces = instrumentor.get_traces()
    """

    def __init__(
        self,
        agent_name: str = "default",
        agent_version: Optional[str] = None,
        output_path: Optional[str | Path] = None,
        redaction_config: Optional[RedactionConfig] = None,
    ):
        """Initialize the instrumentor.

        Args:
            agent_name: Name to assign to traced agent
            agent_version: Version string for the agent
            output_path: Directory to save traces (optional)
            redaction_config: PII redaction configuration
        """
        self._agent_name = agent_name
        self._agent_version = agent_version
        self._output_path = Path(output_path) if output_path else None
        self._redaction_config = redaction_config or RedactionConfig()
        self._is_active = False
        self._traces: list[TraceRun] = []
        self._current_trace: Optional[TraceRun] = None

    @property
    def is_active(self) -> bool:
        """Whether instrumentation is currently active."""
        return self._is_active

    @property
    @abstractmethod
    def framework(self) -> str:
        """Return the framework name (e.g., 'langchain', 'crewai')."""
        pass

    @property
    @abstractmethod
    def framework_version(self) -> Optional[str]:
        """Return the framework version if available."""
        pass

    def instrument(self) -> "BaseInstrumentor":
        """Activate instrumentation.

        Installs hooks into the framework to capture trace events.

        Returns:
            Self for method chaining

        Raises:
            InstrumentorAlreadyActiveError: If already instrumented
        """
        if self._is_active:
            raise InstrumentorAlreadyActiveError(
                f"{self.__class__.__name__} is already active"
            )

        self._install_hooks()
        self._is_active = True
        return self

    def uninstrument(self) -> None:
        """Deactivate instrumentation.

        Removes hooks and finalizes any active traces.

        Raises:
            InstrumentorNotActiveError: If not currently instrumented
        """
        if not self._is_active:
            raise InstrumentorNotActiveError(
                f"{self.__class__.__name__} is not active"
            )

        self._finalize_current_trace()
        self._remove_hooks()
        self._is_active = False

    @abstractmethod
    def _install_hooks(self) -> None:
        """Install framework-specific hooks.

        Subclasses must implement this to add callbacks/patches
        to the target framework.
        """
        pass

    @abstractmethod
    def _remove_hooks(self) -> None:
        """Remove framework-specific hooks.

        Subclasses must implement this to clean up any installed
        callbacks/patches.
        """
        pass

    def get_traces(self) -> list[TraceRun]:
        """Get all captured traces.

        Returns:
            List of TraceRun objects captured during instrumentation,
            including the in-progress trace if one is active.
        """
        # Include current trace if active
        traces = list(self._traces)
        if self._current_trace is not None:
            traces.append(self._current_trace)
        return traces

    def clear_traces(self) -> None:
        """Clear all captured traces, including any in-progress trace."""
        self._traces.clear()
        self._current_trace = None

    def _start_trace(self, task_description: Optional[str] = None) -> TraceRun:
        """Start a new trace, finalizing any trace already in progress.

        Args:
            task_description: Optional description of the task

        Returns:
            The new TraceRun object
        """
        self._finalize_current_trace()

        # Imported here to avoid a circular import at module load time.
        from context_forge.core.types import TaskInfo

        agent_info = AgentInfo(
            name=self._agent_name,
            version=self._agent_version,
            framework=self.framework,
            framework_version=self.framework_version,
        )

        task_info = None
        if task_description:
            task_info = TaskInfo(description=task_description)

        self._current_trace = TraceRun(
            run_id=str(uuid.uuid4()),
            started_at=datetime.now(timezone.utc),
            agent_info=agent_info,
            task_info=task_info,
        )
        return self._current_trace

    def _finalize_current_trace(self) -> None:
        """Finalize the current trace and add to completed traces.

        No-op when no trace is in progress.
        """
        if self._current_trace is not None:
            self._current_trace.ended_at = datetime.now(timezone.utc)
            self._traces.append(self._current_trace)

            # Save to file if output path configured
            if self._output_path:
                self._save_trace(self._current_trace)

            self._current_trace = None

    def _save_trace(self, trace: TraceRun) -> Path:
        """Save a trace to the output directory.

        Args:
            trace: The trace to save

        Returns:
            Path to the saved file

        Raises:
            ValueError: If no output path was configured
        """
        if self._output_path is None:
            raise ValueError("No output path configured")

        self._output_path.mkdir(parents=True, exist_ok=True)
        filename = f"trace-{trace.run_id}.json"
        filepath = self._output_path / filename

        # BUGFIX: write JSON as UTF-8 explicitly; the platform-default
        # encoding (e.g. cp1252 on Windows) can fail or corrupt non-ASCII
        # trace content.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(trace.to_json(indent=2))

        return filepath

    def _get_current_trace(self) -> TraceRun:
        """Get or create current trace.

        Returns:
            The current active TraceRun
        """
        if self._current_trace is None:
            self._start_trace()
        return self._current_trace

    def __enter__(self) -> "BaseInstrumentor":
        """Enter context manager, activating instrumentation."""
        return self.instrument()

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Exit context manager, deactivating instrumentation.

        ROBUSTNESS FIX: only uninstrument when still active, so a manual
        uninstrument() inside the `with` body (or a failed instrument during
        exception unwinding) no longer raises a masking
        InstrumentorNotActiveError.
        """
        if self._is_active:
            self.uninstrument()
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Framework-specific instrumentors for ContextForge.
|
|
2
|
+
|
|
3
|
+
Each instrumentor provides one-line auto-instrumentation for
|
|
4
|
+
a specific agent framework.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from context_forge.instrumentation.instrumentors.langchain import (
|
|
8
|
+
ContextForgeCallbackHandler,
|
|
9
|
+
LangChainInstrumentor,
|
|
10
|
+
)
|
|
11
|
+
from context_forge.instrumentation.instrumentors.langgraph import LangGraphInstrumentor
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"LangChainInstrumentor",
|
|
15
|
+
"LangGraphInstrumentor",
|
|
16
|
+
"ContextForgeCallbackHandler",
|
|
17
|
+
]
|