multi-model-debate 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. multi_model_debate/__init__.py +4 -0
  2. multi_model_debate/__main__.py +6 -0
  3. multi_model_debate/cli.py +290 -0
  4. multi_model_debate/config.py +271 -0
  5. multi_model_debate/exceptions.py +83 -0
  6. multi_model_debate/models/__init__.py +71 -0
  7. multi_model_debate/models/claude.py +168 -0
  8. multi_model_debate/models/cli_wrapper.py +233 -0
  9. multi_model_debate/models/gemini.py +66 -0
  10. multi_model_debate/models/openai.py +66 -0
  11. multi_model_debate/models/protocols.py +35 -0
  12. multi_model_debate/orchestrator.py +465 -0
  13. multi_model_debate/phases/__init__.py +22 -0
  14. multi_model_debate/phases/base.py +236 -0
  15. multi_model_debate/phases/baseline.py +117 -0
  16. multi_model_debate/phases/debate.py +154 -0
  17. multi_model_debate/phases/defense.py +186 -0
  18. multi_model_debate/phases/final_position.py +307 -0
  19. multi_model_debate/phases/judge.py +177 -0
  20. multi_model_debate/phases/synthesis.py +162 -0
  21. multi_model_debate/pre_debate.py +83 -0
  22. multi_model_debate/prompts/arbiter_prompt.md.j2 +24 -0
  23. multi_model_debate/prompts/arbiter_summary.md.j2 +102 -0
  24. multi_model_debate/prompts/baseline_critique.md.j2 +5 -0
  25. multi_model_debate/prompts/critic_1_lens.md.j2 +52 -0
  26. multi_model_debate/prompts/critic_2_lens.md.j2 +52 -0
  27. multi_model_debate/prompts/debate_round.md.j2 +14 -0
  28. multi_model_debate/prompts/defense_initial.md.j2 +9 -0
  29. multi_model_debate/prompts/defense_round.md.j2 +8 -0
  30. multi_model_debate/prompts/judge.md.j2 +34 -0
  31. multi_model_debate/prompts/judge_prompt.md.j2 +13 -0
  32. multi_model_debate/prompts/strategist_proxy_lens.md.j2 +33 -0
  33. multi_model_debate/prompts/synthesis_prompt.md.j2 +16 -0
  34. multi_model_debate/prompts/synthesis_template.md.j2 +44 -0
  35. multi_model_debate/prompts/winner_response.md.j2 +17 -0
  36. multi_model_debate/response_parser.py +268 -0
  37. multi_model_debate/roles.py +163 -0
  38. multi_model_debate/storage/__init__.py +17 -0
  39. multi_model_debate/storage/run.py +509 -0
  40. multi_model_debate-1.0.1.dist-info/METADATA +572 -0
  41. multi_model_debate-1.0.1.dist-info/RECORD +44 -0
  42. multi_model_debate-1.0.1.dist-info/WHEEL +4 -0
  43. multi_model_debate-1.0.1.dist-info/entry_points.txt +2 -0
  44. multi_model_debate-1.0.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,268 @@
1
+ """Structured response parser for model outputs.
2
+
3
+ This module provides JSON parsing for model responses, replacing the
4
+ legacy magic string detection ("NO NEW ISSUES") with structured output.
5
+
6
+ See REQUIREMENTS_V2.md Section 6 for rationale.
7
+
8
+ Schema Versioning:
9
+ - Version 1.0: Current format with schema_version field
10
+ - Version 0.9: Legacy format without schema_version (backwards compat)
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import logging
17
+ import re
18
+ from dataclasses import dataclass, field
19
+ from typing import Any
20
+
21
+ # Current expected schema version
22
+ CURRENT_SCHEMA_VERSION = "1.0"
23
+ # Default version for responses without schema_version (backwards compat)
24
+ LEGACY_SCHEMA_VERSION = "0.9"
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class ResponseParseError(Exception):
    """Raised when a model response cannot be parsed into structured data."""
    # The docstring alone is a valid class body; the redundant `pass`
    # statement was removed.
33
+
34
+
35
@dataclass
class Issue:
    """A structured issue extracted from a model response."""

    id: str
    severity: str
    title: str
    claim: str = ""
    evidence: str = ""
    recommendation: str = ""
    failure_mode: str = ""  # populated by the GPT lens
    assumption_at_risk: str = ""  # populated by the Gemini lens

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Issue:
        """Build an Issue from a raw dictionary, filling sensible defaults."""
        # Single source of truth for per-field fallbacks; unknown keys in
        # `data` are ignored, missing keys take the fallback value.
        fallbacks = {
            "id": "UNKNOWN",
            "severity": "MEDIUM",
            "title": "Untitled Issue",
            "claim": "",
            "evidence": "",
            "recommendation": "",
            "failure_mode": "",
            "assumption_at_risk": "",
        }
        return cls(**{name: data.get(name, default) for name, default in fallbacks.items()})
61
+
62
+
63
@dataclass
class ParsedResponse:
    """A parsed model response with structured data.

    Attributes:
        has_new_issues: Whether the model reported any new issues.
        issues: Structured issues extracted from the response.
        summary: Free-text summary supplied by the model, if any.
        raw_response: The unmodified response text, kept for auditing.
        schema_version: Version of the response schema; defaults to the
            legacy version for responses that predate versioning.
    """

    has_new_issues: bool
    issues: list[Issue] = field(default_factory=list)
    summary: str = ""
    raw_response: str = ""
    schema_version: str = LEGACY_SCHEMA_VERSION

    def issue_count(self) -> int:
        """Return the number of issues."""
        return len(self.issues)
76
+
77
+
78
def extract_json_block(response: str) -> str | None:
    """Extract JSON from a ```json code block.

    Args:
        response: The raw response text.

    Returns:
        The JSON string if found, None otherwise.
    """
    # DOTALL lets the payload span multiple lines; lazy match stops at the
    # first closing fence.
    fenced = re.search(r"```json\s*(.*?)\s*```", response, re.DOTALL)
    return fenced.group(1).strip() if fenced else None
93
+
94
+
95
def parse_json_response(response: str) -> dict[str, Any]:
    """Parse JSON from a model response.

    Tries multiple strategies:
    1. Extract from ```json code block
    2. Parse entire response as JSON
    3. Find JSON object anywhere in response

    Args:
        response: The raw response text.

    Returns:
        Parsed JSON as a dictionary.

    Raises:
        ResponseParseError: If no JSON *object* can be parsed.
    """
    # Strategy 1: Extract from ```json block
    json_block = extract_json_block(response)
    if json_block:
        try:
            result = json.loads(json_block)
            # BUG FIX: previously any JSON value found in the code block
            # (e.g. a list) was returned, violating the dict[str, Any]
            # contract. Non-dict payloads now fall through to the other
            # strategies, matching the checks they already perform.
            if isinstance(result, dict):
                return result
        except json.JSONDecodeError:
            # Continue to other strategies
            pass

    # Strategy 2: Try parsing the entire response as JSON
    try:
        result = json.loads(response.strip())
        if isinstance(result, dict):
            return result
    except json.JSONDecodeError:
        pass

    # Strategy 3: Find a JSON object anywhere in the response.
    # The pattern matches {...} with at most one level of nesting.
    json_pattern = r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}"
    for candidate in re.findall(json_pattern, response, re.DOTALL):
        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Only accept objects that carry the expected marker field.
        if isinstance(data, dict) and "has_new_issues" in data:
            return data

    raise ResponseParseError(
        f"Could not parse JSON from response. Response starts with: {response[:200]}..."
    )
146
+
147
+
148
def parse_response(response: str) -> ParsedResponse:
    """Parse a model response into structured data.

    Handles both new JSON format and legacy "NO NEW ISSUES" format
    for backwards compatibility.

    Args:
        response: The raw response text.

    Returns:
        ParsedResponse with structured data.
    """
    # Legacy "NO NEW ISSUES" responses short-circuit the JSON machinery.
    if is_legacy_no_issues(response):
        return ParsedResponse(
            has_new_issues=False,
            issues=[],
            summary="No new issues identified.",
            raw_response=response,
            schema_version=LEGACY_SCHEMA_VERSION,
        )

    # Narrow try: only parse_json_response raises ResponseParseError.
    try:
        data = parse_json_response(response)
    except ResponseParseError:
        # Fallback: assume there are issues if we can't parse.
        # This maintains the prior behavior where any substantial response
        # was treated as containing issues.
        return ParsedResponse(
            has_new_issues=True,
            issues=[],
            summary="",
            raw_response=response,
            schema_version=LEGACY_SCHEMA_VERSION,
        )

    # Warn on missing or unexpected schema versions, but keep parsing.
    schema_version = data.get("schema_version", LEGACY_SCHEMA_VERSION)
    if schema_version == LEGACY_SCHEMA_VERSION:
        logger.warning(
            "Response missing schema_version field; assuming version %s. "
            "Update prompts to include schema_version for better compatibility.",
            LEGACY_SCHEMA_VERSION,
        )
    elif schema_version != CURRENT_SCHEMA_VERSION:
        logger.warning(
            "Response has unexpected schema_version '%s' (expected '%s'). "
            "Parsing may produce unexpected results.",
            schema_version,
            CURRENT_SCHEMA_VERSION,
        )

    return ParsedResponse(
        has_new_issues=data.get("has_new_issues", True),
        issues=[Issue.from_dict(raw) for raw in data.get("issues", [])],
        summary=data.get("summary", ""),
        raw_response=response,
        schema_version=schema_version,
    )
208
+
209
+
210
def is_legacy_no_issues(response: str) -> bool:
    """Check if response uses legacy "NO NEW ISSUES" format.

    This provides backwards compatibility with older prompts and
    responses that haven't been updated to JSON format.

    Args:
        response: The raw response text.

    Returns:
        True if this is a legacy no-issues response.
    """
    # Case-insensitive containment check via uppercasing the response.
    marker = "NO NEW ISSUES"
    return marker in response.upper()
223
+
224
+
225
def has_new_issues(response: str) -> bool:
    """Check if a response indicates new issues were found.

    This is the main entry point for checking if debate should continue.
    Works with both JSON format and legacy "NO NEW ISSUES" format.

    Args:
        response: The raw response text (or parsed JSON string).

    Returns:
        True if the response contains new issues.
    """
    # Delegate all format handling to parse_response.
    return parse_response(response).has_new_issues
239
+
240
+
241
def is_valid_response(response: str, min_length: int = 100) -> bool:
    """Check if a response is valid for processing.

    A response is valid if:
    - It's a proper JSON response (regardless of length)
    - It uses legacy "NO NEW ISSUES" format
    - It meets minimum length requirements

    Args:
        response: The raw response text.
        min_length: Minimum length for non-JSON responses.

    Returns:
        True if the response is valid.
    """
    # Reject None/empty/whitespace-only input up front.
    if not (response and response.strip()):
        return False

    # A ```json block makes the response valid regardless of length.
    if extract_json_block(response) is not None:
        return True

    # Legacy "NO NEW ISSUES" responses are always valid too.
    if is_legacy_no_issues(response):
        return True

    # Otherwise require a minimum amount of content.
    return len(response) >= min_length
@@ -0,0 +1,163 @@
1
+ """Dynamic role assignment for adversarial debates.
2
+
3
+ Assigns Strategist, Critics, and Judge based on who initiated the debate.
4
+
5
+ DESIGN DECISION: Judge = Strategist's model family (isolated instance)
6
+
7
+ The Judge evaluates CRITICS, not the Strategist's plan.
8
+ Judge reads Critic A vs Critic B arguments and picks winner.
9
+ Since Judge is different family from both Critics, no bias.
10
+
11
+ See REQUIREMENTS_V2.md for full rationale and evidence:
12
+ - "Prefer a judge from different provider to reduce shared biases" (Evidently AI, 2026)
13
+ - GPT-4 achieves 80% human agreement as judge (LabelYourData, 2026)
14
+ - Bias is toward "own writing style" - Judge isn't reading own family's writing
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import os
20
+ import warnings
21
+ from dataclasses import dataclass
22
+ from typing import TYPE_CHECKING
23
+
24
+ from multi_model_debate.exceptions import InsufficientCriticsError
25
+
26
+ if TYPE_CHECKING:
27
+ from multi_model_debate.config import Config
28
+
29
+
30
+ # Environment variable for explicit strategist override
31
+ ENV_STRATEGIST = "ADVERSARIAL_CRITIQUE_STRATEGIST"
32
+
33
+
34
@dataclass
class RoleAssignment:
    """Assignment of roles for a debate.

    Attributes:
        strategist: The model family running this session (defends the plan).
        critics: Model families that critique the plan (all except strategist).
        judge: Model family that picks the winner. Defaults to the
            strategist's family run as an isolated instance; since the judge
            scores the critics' arguments rather than the strategist's plan,
            sharing the strategist's family does not bias the verdict.
    """

    strategist: str
    critics: list[str]
    judge: str
+
48
+
49
def detect_strategist_family(config: Config) -> str:
    """Detect which model family is running this session.

    Detection priority:
    1. Config override (``roles.strategist``)
    2. Environment variable (``ADVERSARIAL_CRITIQUE_STRATEGIST``)
    3. Default to "claude" (most common use case with Claude Code)

    Args:
        config: Configuration with optional strategist override.

    Returns:
        Model family name (e.g., "claude", "gemini", "codex").
    """
    # An explicit config override wins outright.
    configured = config.roles.strategist
    if configured:
        return configured

    # Next, honor the environment variable (normalized to lowercase).
    from_env = os.environ.get(ENV_STRATEGIST)
    if from_env:
        return from_env.lower()

    # Fall back to claude, the usual host when running from Claude Code.
    return "claude"
74
+
75
+
76
def assign_roles(config: Config) -> RoleAssignment:
    """Assign roles for a debate based on config.

    Supports two modes:
    - Explicit: `config.roles.critics` is set - use explicit critic list
    - Legacy: `config.roles.critics` is None - derive from models.available

    Args:
        config: Configuration with available models and role settings.

    Returns:
        RoleAssignment with strategist, critics, and judge.

    Raises:
        ValueError: If strategist is not in available models (legacy mode).
        InsufficientCriticsError: If too few critics remain for the mode.
    """
    strategist = detect_strategist_family(config)

    if config.roles.critics is None:
        # Legacy mode: every available model except the strategist critiques.
        available = config.models.available
        if strategist not in available:
            raise ValueError(
                f"Strategist model '{strategist}' not in available models: {available}. "
                f"Add it to [models].available or change the strategist."
            )
        critics = [model for model in available if model != strategist]
        # NOTE(review): legacy mode accepts a single critic while explicit
        # mode requires two - confirm this asymmetry is intentional.
        if len(critics) < 1:
            raise InsufficientCriticsError(strategist=strategist, available=available)
        # Judge defaults to the strategist (same family, isolated instance).
        return RoleAssignment(strategist=strategist, critics=critics, judge=strategist)

    # Explicit mode: copy the configured list so config is never mutated.
    critics = list(config.roles.critics)
    if strategist in critics:
        warnings.warn(
            f"Strategist '{strategist}' found in critics list, removing automatically",
            UserWarning,
            stacklevel=2,
        )
        critics = [candidate for candidate in critics if candidate != strategist]

    # A debate needs at least two opposing critics.
    if len(critics) < 2:
        raise InsufficientCriticsError(strategist=strategist, available=critics)

    # Judge: explicit override, otherwise the strategist's family.
    judge = config.roles.judge if config.roles.judge else strategist
    return RoleAssignment(strategist=strategist, critics=critics, judge=judge)
141
+
142
+
143
def get_critic_pair(roles: RoleAssignment) -> tuple[str, str]:
    """Get the first two critics for debate.

    In a 3-model setup (default), this returns the two non-strategist models.
    For example, if strategist is "claude", returns ("codex", "gemini").

    Args:
        roles: The role assignment.

    Returns:
        Tuple of (critic_a, critic_b) model names.

    Raises:
        ValueError: If fewer than 2 critics available.
    """
    critics = roles.critics
    if len(critics) < 2:
        raise ValueError(
            f"Need at least 2 critics for debate, got {len(critics)}: {critics}"
        )
    # Any extra critics beyond the first two are ignored here.
    critic_a, critic_b = critics[0], critics[1]
    return (critic_a, critic_b)
@@ -0,0 +1,17 @@
1
+ """Storage utilities for run management and artifacts."""
2
+
3
+ from multi_model_debate.storage.run import (
4
+ RunContext,
5
+ create_run,
6
+ find_latest_incomplete_run,
7
+ load_run,
8
+ verify_game_plan_integrity,
9
+ )
10
+
11
+ __all__ = [
12
+ "RunContext",
13
+ "create_run",
14
+ "find_latest_incomplete_run",
15
+ "load_run",
16
+ "verify_game_plan_integrity",
17
+ ]