multi-model-debate 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. multi_model_debate/__init__.py +4 -0
  2. multi_model_debate/__main__.py +6 -0
  3. multi_model_debate/cli.py +290 -0
  4. multi_model_debate/config.py +271 -0
  5. multi_model_debate/exceptions.py +83 -0
  6. multi_model_debate/models/__init__.py +71 -0
  7. multi_model_debate/models/claude.py +168 -0
  8. multi_model_debate/models/cli_wrapper.py +233 -0
  9. multi_model_debate/models/gemini.py +66 -0
  10. multi_model_debate/models/openai.py +66 -0
  11. multi_model_debate/models/protocols.py +35 -0
  12. multi_model_debate/orchestrator.py +465 -0
  13. multi_model_debate/phases/__init__.py +22 -0
  14. multi_model_debate/phases/base.py +236 -0
  15. multi_model_debate/phases/baseline.py +117 -0
  16. multi_model_debate/phases/debate.py +154 -0
  17. multi_model_debate/phases/defense.py +186 -0
  18. multi_model_debate/phases/final_position.py +307 -0
  19. multi_model_debate/phases/judge.py +177 -0
  20. multi_model_debate/phases/synthesis.py +162 -0
  21. multi_model_debate/pre_debate.py +83 -0
  22. multi_model_debate/prompts/arbiter_prompt.md.j2 +24 -0
  23. multi_model_debate/prompts/arbiter_summary.md.j2 +102 -0
  24. multi_model_debate/prompts/baseline_critique.md.j2 +5 -0
  25. multi_model_debate/prompts/critic_1_lens.md.j2 +52 -0
  26. multi_model_debate/prompts/critic_2_lens.md.j2 +52 -0
  27. multi_model_debate/prompts/debate_round.md.j2 +14 -0
  28. multi_model_debate/prompts/defense_initial.md.j2 +9 -0
  29. multi_model_debate/prompts/defense_round.md.j2 +8 -0
  30. multi_model_debate/prompts/judge.md.j2 +34 -0
  31. multi_model_debate/prompts/judge_prompt.md.j2 +13 -0
  32. multi_model_debate/prompts/strategist_proxy_lens.md.j2 +33 -0
  33. multi_model_debate/prompts/synthesis_prompt.md.j2 +16 -0
  34. multi_model_debate/prompts/synthesis_template.md.j2 +44 -0
  35. multi_model_debate/prompts/winner_response.md.j2 +17 -0
  36. multi_model_debate/response_parser.py +268 -0
  37. multi_model_debate/roles.py +163 -0
  38. multi_model_debate/storage/__init__.py +17 -0
  39. multi_model_debate/storage/run.py +509 -0
  40. multi_model_debate-1.0.1.dist-info/METADATA +572 -0
  41. multi_model_debate-1.0.1.dist-info/RECORD +44 -0
  42. multi_model_debate-1.0.1.dist-info/WHEEL +4 -0
  43. multi_model_debate-1.0.1.dist-info/entry_points.txt +2 -0
  44. multi_model_debate-1.0.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,268 @@
1
+ """Structured response parser for model outputs.
2
+
3
+ This module provides JSON parsing for model responses, replacing the
4
+ legacy magic string detection ("NO NEW ISSUES") with structured output.
5
+
6
+ See REQUIREMENTS_V2.md Section 6 for rationale.
7
+
8
+ Schema Versioning:
9
+ - Version 1.0: Current format with schema_version field
10
+ - Version 0.9: Legacy format without schema_version (backwards compat)
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import logging
17
+ import re
18
+ from dataclasses import dataclass, field
19
+ from typing import Any
20
+
21
+ # Current expected schema version
22
+ CURRENT_SCHEMA_VERSION = "1.0"
23
+ # Default version for responses without schema_version (backwards compat)
24
+ LEGACY_SCHEMA_VERSION = "0.9"
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class ResponseParseError(Exception):
    """Raised when a model response cannot be parsed into structured data."""
    # The docstring alone is a valid class body; the redundant `pass`
    # statement was removed.
33
+
34
+
35
@dataclass
class Issue:
    """A structured issue extracted from a model response."""

    id: str
    severity: str
    title: str
    claim: str = ""
    evidence: str = ""
    recommendation: str = ""
    failure_mode: str = ""  # populated by the GPT lens
    assumption_at_risk: str = ""  # populated by the Gemini lens

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Issue:
        """Build an Issue from a raw dictionary, filling sensible defaults."""
        # Single source of truth for per-field fallbacks; unknown keys in
        # `data` are ignored, missing keys take the fallback value.
        fallbacks = {
            "id": "UNKNOWN",
            "severity": "MEDIUM",
            "title": "Untitled Issue",
            "claim": "",
            "evidence": "",
            "recommendation": "",
            "failure_mode": "",
            "assumption_at_risk": "",
        }
        return cls(**{name: data.get(name, default) for name, default in fallbacks.items()})
61
+
62
+
63
@dataclass
class ParsedResponse:
    """A parsed model response with structured data.

    Attributes:
        has_new_issues: Whether the model reported any new issues.
        issues: Structured issues extracted from the response.
        summary: Free-text summary supplied by the model, if any.
        raw_response: The unmodified response text, kept for auditing.
        schema_version: Version of the response schema; defaults to the
            legacy version for responses that predate versioning.
    """

    has_new_issues: bool
    issues: list[Issue] = field(default_factory=list)
    summary: str = ""
    raw_response: str = ""
    schema_version: str = LEGACY_SCHEMA_VERSION

    def issue_count(self) -> int:
        """Return the number of issues."""
        return len(self.issues)
76
+
77
+
78
def extract_json_block(response: str) -> str | None:
    """Extract JSON from a ```json code block.

    Args:
        response: The raw response text.

    Returns:
        The JSON string if found, None otherwise.
    """
    # DOTALL lets the payload span multiple lines; lazy match stops at the
    # first closing fence.
    fenced = re.search(r"```json\s*(.*?)\s*```", response, re.DOTALL)
    return fenced.group(1).strip() if fenced else None
93
+
94
+
95
def parse_json_response(response: str) -> dict[str, Any]:
    """Parse JSON from a model response.

    Tries multiple strategies:
    1. Extract from ```json code block
    2. Parse entire response as JSON
    3. Find JSON object anywhere in response

    Args:
        response: The raw response text.

    Returns:
        Parsed JSON as a dictionary.

    Raises:
        ResponseParseError: If no JSON *object* can be parsed.
    """
    # Strategy 1: Extract from ```json block
    json_block = extract_json_block(response)
    if json_block:
        try:
            result = json.loads(json_block)
            # BUG FIX: previously any JSON value found in the code block
            # (e.g. a list) was returned, violating the dict[str, Any]
            # contract. Non-dict payloads now fall through to the other
            # strategies, matching the checks they already perform.
            if isinstance(result, dict):
                return result
        except json.JSONDecodeError:
            # Continue to other strategies
            pass

    # Strategy 2: Try parsing the entire response as JSON
    try:
        result = json.loads(response.strip())
        if isinstance(result, dict):
            return result
    except json.JSONDecodeError:
        pass

    # Strategy 3: Find a JSON object anywhere in the response.
    # The pattern matches {...} with at most one level of nesting.
    json_pattern = r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}"
    for candidate in re.findall(json_pattern, response, re.DOTALL):
        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Only accept objects that carry the expected marker field.
        if isinstance(data, dict) and "has_new_issues" in data:
            return data

    raise ResponseParseError(
        f"Could not parse JSON from response. Response starts with: {response[:200]}..."
    )
146
+
147
+
148
def parse_response(response: str) -> ParsedResponse:
    """Parse a model response into structured data.

    Handles both new JSON format and legacy "NO NEW ISSUES" format
    for backwards compatibility.

    Args:
        response: The raw response text.

    Returns:
        ParsedResponse with structured data.
    """
    # Legacy "NO NEW ISSUES" responses short-circuit the JSON machinery.
    if is_legacy_no_issues(response):
        return ParsedResponse(
            has_new_issues=False,
            issues=[],
            summary="No new issues identified.",
            raw_response=response,
            schema_version=LEGACY_SCHEMA_VERSION,
        )

    # Narrow try: only parse_json_response raises ResponseParseError.
    try:
        data = parse_json_response(response)
    except ResponseParseError:
        # Fallback: assume there are issues if we can't parse.
        # This maintains the prior behavior where any substantial response
        # was treated as containing issues.
        return ParsedResponse(
            has_new_issues=True,
            issues=[],
            summary="",
            raw_response=response,
            schema_version=LEGACY_SCHEMA_VERSION,
        )

    # Warn on missing or unexpected schema versions, but keep parsing.
    schema_version = data.get("schema_version", LEGACY_SCHEMA_VERSION)
    if schema_version == LEGACY_SCHEMA_VERSION:
        logger.warning(
            "Response missing schema_version field; assuming version %s. "
            "Update prompts to include schema_version for better compatibility.",
            LEGACY_SCHEMA_VERSION,
        )
    elif schema_version != CURRENT_SCHEMA_VERSION:
        logger.warning(
            "Response has unexpected schema_version '%s' (expected '%s'). "
            "Parsing may produce unexpected results.",
            schema_version,
            CURRENT_SCHEMA_VERSION,
        )

    return ParsedResponse(
        has_new_issues=data.get("has_new_issues", True),
        issues=[Issue.from_dict(raw) for raw in data.get("issues", [])],
        summary=data.get("summary", ""),
        raw_response=response,
        schema_version=schema_version,
    )
208
+
209
+
210
def is_legacy_no_issues(response: str) -> bool:
    """Check if response uses legacy "NO NEW ISSUES" format.

    This provides backwards compatibility with older prompts and
    responses that haven't been updated to JSON format.

    Args:
        response: The raw response text.

    Returns:
        True if this is a legacy no-issues response.
    """
    # Case-insensitive containment check via uppercasing the response.
    marker = "NO NEW ISSUES"
    return marker in response.upper()
223
+
224
+
225
def has_new_issues(response: str) -> bool:
    """Check if a response indicates new issues were found.

    This is the main entry point for checking if debate should continue.
    Works with both JSON format and legacy "NO NEW ISSUES" format.

    Args:
        response: The raw response text (or parsed JSON string).

    Returns:
        True if the response contains new issues.
    """
    # Delegate all format handling to parse_response.
    return parse_response(response).has_new_issues
239
+
240
+
241
def is_valid_response(response: str, min_length: int = 100) -> bool:
    """Check if a response is valid for processing.

    A response is valid if:
    - It's a proper JSON response (regardless of length)
    - It uses legacy "NO NEW ISSUES" format
    - It meets minimum length requirements

    Args:
        response: The raw response text.
        min_length: Minimum length for non-JSON responses.

    Returns:
        True if the response is valid.
    """
    # Reject None/empty/whitespace-only input up front.
    if not (response and response.strip()):
        return False

    # A ```json block makes the response valid regardless of length.
    if extract_json_block(response) is not None:
        return True

    # Legacy "NO NEW ISSUES" responses are always valid too.
    if is_legacy_no_issues(response):
        return True

    # Otherwise require a minimum amount of content.
    return len(response) >= min_length
@@ -0,0 +1,163 @@
1
+ """Dynamic role assignment for adversarial debates.
2
+
3
+ Assigns Strategist, Critics, and Judge based on who initiated the debate.
4
+
5
+ DESIGN DECISION: Judge = Strategist's model family (isolated instance)
6
+
7
+ The Judge evaluates CRITICS, not the Strategist's plan.
8
+ Judge reads Critic A vs Critic B arguments and picks winner.
9
+ Since Judge is different family from both Critics, no bias.
10
+
11
+ See REQUIREMENTS_V2.md for full rationale and evidence:
12
+ - "Prefer a judge from different provider to reduce shared biases" (Evidently AI, 2026)
13
+ - GPT-4 achieves 80% human agreement as judge (LabelYourData, 2026)
14
+ - Bias is toward "own writing style" - Judge isn't reading own family's writing
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import os
20
+ import warnings
21
+ from dataclasses import dataclass
22
+ from typing import TYPE_CHECKING
23
+
24
+ from multi_model_debate.exceptions import InsufficientCriticsError
25
+
26
+ if TYPE_CHECKING:
27
+ from multi_model_debate.config import Config
28
+
29
+
30
+ # Environment variable for explicit strategist override
31
+ ENV_STRATEGIST = "ADVERSARIAL_CRITIQUE_STRATEGIST"
32
+
33
+
34
@dataclass
class RoleAssignment:
    """Assignment of roles for a debate.

    Attributes:
        strategist: The model family running this session (defends the plan).
        critics: Model families that critique the plan (all except strategist).
        judge: Model family that picks the winner. Defaults to the
            strategist's family run as an isolated instance; since the judge
            scores the critics' arguments rather than the strategist's plan,
            sharing the strategist's family does not bias the verdict.
    """

    strategist: str
    critics: list[str]
    judge: str
+
48
+
49
def detect_strategist_family(config: Config) -> str:
    """Detect which model family is running this session.

    Detection priority:
    1. Config override (``roles.strategist``)
    2. Environment variable (``ADVERSARIAL_CRITIQUE_STRATEGIST``)
    3. Default to "claude" (most common use case with Claude Code)

    Args:
        config: Configuration with optional strategist override.

    Returns:
        Model family name (e.g., "claude", "gemini", "codex").
    """
    # An explicit config override wins outright.
    configured = config.roles.strategist
    if configured:
        return configured

    # Next, honor the environment variable (normalized to lowercase).
    from_env = os.environ.get(ENV_STRATEGIST)
    if from_env:
        return from_env.lower()

    # Fall back to claude, the usual host when running from Claude Code.
    return "claude"
74
+
75
+
76
def assign_roles(config: Config) -> RoleAssignment:
    """Assign roles for a debate based on config.

    Supports two modes:
    - Explicit: `config.roles.critics` is set - use explicit critic list
    - Legacy: `config.roles.critics` is None - derive from models.available

    Args:
        config: Configuration with available models and role settings.

    Returns:
        RoleAssignment with strategist, critics, and judge.

    Raises:
        ValueError: If strategist is not in available models (legacy mode).
        InsufficientCriticsError: If too few critics remain for the mode.
    """
    strategist = detect_strategist_family(config)

    if config.roles.critics is None:
        # Legacy mode: every available model except the strategist critiques.
        available = config.models.available
        if strategist not in available:
            raise ValueError(
                f"Strategist model '{strategist}' not in available models: {available}. "
                f"Add it to [models].available or change the strategist."
            )
        critics = [model for model in available if model != strategist]
        # NOTE(review): legacy mode accepts a single critic while explicit
        # mode requires two - confirm this asymmetry is intentional.
        if len(critics) < 1:
            raise InsufficientCriticsError(strategist=strategist, available=available)
        # Judge defaults to the strategist (same family, isolated instance).
        return RoleAssignment(strategist=strategist, critics=critics, judge=strategist)

    # Explicit mode: copy the configured list so config is never mutated.
    critics = list(config.roles.critics)
    if strategist in critics:
        warnings.warn(
            f"Strategist '{strategist}' found in critics list, removing automatically",
            UserWarning,
            stacklevel=2,
        )
        critics = [candidate for candidate in critics if candidate != strategist]

    # A debate needs at least two opposing critics.
    if len(critics) < 2:
        raise InsufficientCriticsError(strategist=strategist, available=critics)

    # Judge: explicit override, otherwise the strategist's family.
    judge = config.roles.judge if config.roles.judge else strategist
    return RoleAssignment(strategist=strategist, critics=critics, judge=judge)
141
+
142
+
143
def get_critic_pair(roles: RoleAssignment) -> tuple[str, str]:
    """Get the first two critics for debate.

    In a 3-model setup (default), this returns the two non-strategist models.
    For example, if strategist is "claude", returns ("codex", "gemini").

    Args:
        roles: The role assignment.

    Returns:
        Tuple of (critic_a, critic_b) model names.

    Raises:
        ValueError: If fewer than 2 critics available.
    """
    critics = roles.critics
    if len(critics) < 2:
        raise ValueError(
            f"Need at least 2 critics for debate, got {len(critics)}: {critics}"
        )
    # Any extra critics beyond the first two are ignored here.
    critic_a, critic_b = critics[0], critics[1]
    return (critic_a, critic_b)
@@ -0,0 +1,17 @@
1
+ """Storage utilities for run management and artifacts."""
2
+
3
+ from multi_model_debate.storage.run import (
4
+ RunContext,
5
+ create_run,
6
+ find_latest_incomplete_run,
7
+ load_run,
8
+ verify_game_plan_integrity,
9
+ )
10
+
11
+ __all__ = [
12
+ "RunContext",
13
+ "create_run",
14
+ "find_latest_incomplete_run",
15
+ "load_run",
16
+ "verify_game_plan_integrity",
17
+ ]