multi-llm-consensus 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- multi_llm_consensus-0.1.3/.claude/settings.local.json +22 -0
- multi_llm_consensus-0.1.3/.github/banner.png +0 -0
- multi_llm_consensus-0.1.3/.gitignore +14 -0
- multi_llm_consensus-0.1.3/.python-version +1 -0
- multi_llm_consensus-0.1.3/LICENSE +21 -0
- multi_llm_consensus-0.1.3/PKG-INFO +171 -0
- multi_llm_consensus-0.1.3/README.md +139 -0
- multi_llm_consensus-0.1.3/llm_ensemble/__init__.py +4 -0
- multi_llm_consensus-0.1.3/llm_ensemble/consensus.py +165 -0
- multi_llm_consensus-0.1.3/llm_ensemble/prompts/judge.prompt +38 -0
- multi_llm_consensus-0.1.3/llm_ensemble/py.typed +0 -0
- multi_llm_consensus-0.1.3/llm_ensemble/run_llm.py +151 -0
- multi_llm_consensus-0.1.3/llm_ensemble/schemas/__init__.py +3 -0
- multi_llm_consensus-0.1.3/llm_ensemble/schemas/schemas.py +20 -0
- multi_llm_consensus-0.1.3/llm_ensemble/utils/__init__.py +4 -0
- multi_llm_consensus-0.1.3/llm_ensemble/utils/tavily_tool.py +30 -0
- multi_llm_consensus-0.1.3/llm_ensemble/utils/utils.py +27 -0
- multi_llm_consensus-0.1.3/pyproject.toml +52 -0
- multi_llm_consensus-0.1.3/tests/__init__.py +1 -0
- multi_llm_consensus-0.1.3/tests/test_consensus.py +109 -0
- multi_llm_consensus-0.1.3/tests/test_run_llm.py +87 -0
- multi_llm_consensus-0.1.3/uv.lock +2099 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"WebSearch",
|
|
5
|
+
"Bash(uv add:*)",
|
|
6
|
+
"Bash(tree:*)",
|
|
7
|
+
"Bash(uv run:*)",
|
|
8
|
+
"WebFetch(domain:docs.langchain.com)",
|
|
9
|
+
"WebFetch(domain:smith.langchain.com)",
|
|
10
|
+
"Bash(jq:*)",
|
|
11
|
+
"Bash(python:*)",
|
|
12
|
+
"Bash(python -c:*)",
|
|
13
|
+
"WebFetch(domain:github.com)",
|
|
14
|
+
"WebFetch(domain:raw.githubusercontent.com)",
|
|
15
|
+
"Bash(.venv/bin/python:*)",
|
|
16
|
+
"Bash(uv build:*)",
|
|
17
|
+
"Bash(unzip:*)",
|
|
18
|
+
"Bash(uv publish:*)",
|
|
19
|
+
"Bash(find:*)"
|
|
20
|
+
]
|
|
21
|
+
}
|
|
22
|
+
}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Roberto Pagliari
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: multi-llm-consensus
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: Achieve consensus across multiple LLMs through an AI judge coordinator
|
|
5
|
+
Project-URL: Homepage, https://github.com/zzzrbx/llm-ensemble
|
|
6
|
+
Project-URL: Repository, https://github.com/zzzrbx/llm-ensemble
|
|
7
|
+
Project-URL: Issues, https://github.com/zzzrbx/llm-ensemble/issues
|
|
8
|
+
Author-email: Roberto Pagliari <roberto.pagliari@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: agent,ai,consensus,langchain,langgraph,llm,multi-model
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.12
|
|
19
|
+
Requires-Dist: deepagents>=0.1.0
|
|
20
|
+
Requires-Dist: langchain-anthropic>=1.3.1
|
|
21
|
+
Requires-Dist: langchain-community>=0.4.1
|
|
22
|
+
Requires-Dist: langchain-google-genai>=4.1.3
|
|
23
|
+
Requires-Dist: langchain-openai>=1.1.7
|
|
24
|
+
Requires-Dist: langchain-xai>=1.2.1
|
|
25
|
+
Requires-Dist: langchain>=1.2.3
|
|
26
|
+
Requires-Dist: langgraph>=1.0.5
|
|
27
|
+
Requires-Dist: pydantic>=2.12.5
|
|
28
|
+
Requires-Dist: python-dotenv>=1.2.1
|
|
29
|
+
Requires-Dist: rich>=14.2.0
|
|
30
|
+
Requires-Dist: tavily-python>=0.7.17
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# LLM Ensemble
|
|
34
|
+
|
|
35
|
+

|
|
36
|
+
|
|
37
|
+
A Python library for achieving consensus across multiple Agents.
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- **Consensus**: Uses a moderator to iteratively coordinate multiple LLMs until consensus is reached
|
|
42
|
+
- Can use any API-based model supported by LangGraph, such as OpenAI, Anthropic, Gemini, or Grok
|
|
43
|
+
- Supports web search for real-time data
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
uv add multi-llm-consensus
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Or install from source:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
git clone https://github.com/zzzrbx/llm-ensemble.git
|
|
55
|
+
cd llm-ensemble
|
|
56
|
+
uv sync
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Environment Setup
|
|
60
|
+
|
|
61
|
+
Create a `.env` file with your API keys, for example:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
OPENAI_API_KEY=your_openai_key
|
|
65
|
+
ANTHROPIC_API_KEY=your_anthropic_key
|
|
66
|
+
GOOGLE_API_KEY=your_google_key
|
|
67
|
+
XAI_API_KEY=your_xai_key
|
|
68
|
+
TAVILY_API_KEY=your_tavily_key # For web search
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
You must provide at least two API keys for the models you want to use in the ensemble.
|
|
72
|
+
|
|
73
|
+
## How It Works
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
User Query → Judge (configurable, default: Claude Opus 4.5)
|
|
77
|
+
↓
|
|
78
|
+
Judge calls run_llms tool
|
|
79
|
+
├── Model A (parallel)
|
|
80
|
+
├── Model B (parallel)
|
|
81
|
+
├── Model C (parallel)
|
|
82
|
+
└── Model D (parallel)
|
|
83
|
+
↓
|
|
84
|
+
Judge analyzes responses
|
|
85
|
+
├── Consensus? → Return answer
|
|
86
|
+
└── No consensus? → Refine query and call run_llms again
|
|
87
|
+
↓
|
|
88
|
+
Repeat until consensus or limit reached
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
**Key Features:**
|
|
92
|
+
- **Dynamic queries** - Judge crafts different prompts each iteration:
|
|
93
|
+
- Iteration 1: Sends initial question with research instructions
|
|
94
|
+
- Iteration 2+: Summarizes agreements, highlights disagreements, requests refinements
|
|
95
|
+
- Final iteration: Presents refined consensus statement for confirmation
|
|
96
|
+
- **Error handling** - Returns default values on timeout or when the tool call limit is reached
|
|
97
|
+
|
|
98
|
+
**Tools currently available for LLMs:**
|
|
99
|
+
- `search_the_web` - Tavily web search for current events and factual data
|
|
100
|
+
- `add`, `subtract`, `multiply`, `divide` - Math operations
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
## Examples
|
|
104
|
+
|
|
105
|
+
### Example 1: With structured output
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from typing import TypedDict
|
|
109
|
+
from llm_ensemble import Consensus
|
|
110
|
+
|
|
111
|
+
class UserSchema(TypedDict):
|
|
112
|
+
consensus: bool
|
|
113
|
+
final_answer: str
|
|
114
|
+
notes: str
|
|
115
|
+
|
|
116
|
+
consensus = Consensus(
|
|
117
|
+
models=[
|
|
118
|
+
"openai:gpt-5-mini",
|
|
119
|
+
"google_genai:gemini-3-flash-preview",
|
|
120
|
+
"anthropic:claude-3-5-haiku-20241022",
|
|
121
|
+
"xai:grok-3-mini",
|
|
122
|
+
],
|
|
123
|
+
response_schema=UserSchema
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
result = consensus.invoke(
|
|
127
|
+
"If survival is arbitrary, is moral judgment arbitrary too?"
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
print(f"Consensus: {result['consensus']}")
|
|
131
|
+
print(f"Answer: {result['final_answer']}")
|
|
132
|
+
print(f"Notes: {result['notes']}")
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Example 2: No structured output with web search enabled (you just need to mention it in the prompt)
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
from llm_ensemble import Consensus
|
|
139
|
+
|
|
140
|
+
# No response_schema - returns full agent result
|
|
141
|
+
consensus = Consensus(
|
|
142
|
+
models=[
|
|
143
|
+
"openai:gpt-5-mini",
|
|
144
|
+
"google_genai:gemini-3-flash-preview",
|
|
145
|
+
"anthropic:claude-3-5-haiku-20241022",
|
|
146
|
+
"xai:grok-3-mini",
|
|
147
|
+
]
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
result = consensus.invoke(
|
|
151
|
+
"What are the latest developments in quantum computing?\n\n"
|
|
152
|
+
"Use the web search to research current news and breakthroughs."
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
# Access full agent result
|
|
156
|
+
print(result['messages'][-1].content)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Debugging and Observability
|
|
160
|
+
|
|
161
|
+
The library integrates with LangSmith for trace observability. Set `LANGSMITH_API_KEY` and `LANGSMITH_PROJECT` in your `.env` file to enable tracing.
|
|
162
|
+
|
|
163
|
+
## License
|
|
164
|
+
|
|
165
|
+
MIT License
|
|
166
|
+
|
|
167
|
+
## Contributing
|
|
168
|
+
|
|
169
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|
|
170
|
+
|
|
171
|
+
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# LLM Ensemble
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
A Python library for achieving consensus across multiple Agents.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Consensus**: Uses a moderator to iteratively coordinate multiple LLMs until consensus is reached
|
|
10
|
+
- Can use any API-based model supported by LangGraph, such as OpenAI, Anthropic, Gemini, or Grok
|
|
11
|
+
- Supports web search for real-time data
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
uv add multi-llm-consensus
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Or install from source:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
git clone https://github.com/zzzrbx/llm-ensemble.git
|
|
23
|
+
cd llm-ensemble
|
|
24
|
+
uv sync
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Environment Setup
|
|
28
|
+
|
|
29
|
+
Create a `.env` file with your API keys, for example:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
OPENAI_API_KEY=your_openai_key
|
|
33
|
+
ANTHROPIC_API_KEY=your_anthropic_key
|
|
34
|
+
GOOGLE_API_KEY=your_google_key
|
|
35
|
+
XAI_API_KEY=your_xai_key
|
|
36
|
+
TAVILY_API_KEY=your_tavily_key # For web search
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
You must provide at least two API keys for the models you want to use in the ensemble.
|
|
40
|
+
|
|
41
|
+
## How It Works
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
User Query → Judge (configurable, default: Claude Opus 4.5)
|
|
45
|
+
↓
|
|
46
|
+
Judge calls run_llms tool
|
|
47
|
+
├── Model A (parallel)
|
|
48
|
+
├── Model B (parallel)
|
|
49
|
+
├── Model C (parallel)
|
|
50
|
+
└── Model D (parallel)
|
|
51
|
+
↓
|
|
52
|
+
Judge analyzes responses
|
|
53
|
+
├── Consensus? → Return answer
|
|
54
|
+
└── No consensus? → Refine query and call run_llms again
|
|
55
|
+
↓
|
|
56
|
+
Repeat until consensus or limit reached
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
**Key Features:**
|
|
60
|
+
- **Dynamic queries** - Judge crafts different prompts each iteration:
|
|
61
|
+
- Iteration 1: Sends initial question with research instructions
|
|
62
|
+
- Iteration 2+: Summarizes agreements, highlights disagreements, requests refinements
|
|
63
|
+
- Final iteration: Presents refined consensus statement for confirmation
|
|
64
|
+
- **Error handling** - Returns default values on timeout or when the tool call limit is reached
|
|
65
|
+
|
|
66
|
+
**Tools currently available for LLMs:**
|
|
67
|
+
- `search_the_web` - Tavily web search for current events and factual data
|
|
68
|
+
- `add`, `subtract`, `multiply`, `divide` - Math operations
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
## Examples
|
|
72
|
+
|
|
73
|
+
### Example 1: With structured output
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from typing import TypedDict
|
|
77
|
+
from llm_ensemble import Consensus
|
|
78
|
+
|
|
79
|
+
class UserSchema(TypedDict):
|
|
80
|
+
consensus: bool
|
|
81
|
+
final_answer: str
|
|
82
|
+
notes: str
|
|
83
|
+
|
|
84
|
+
consensus = Consensus(
|
|
85
|
+
models=[
|
|
86
|
+
"openai:gpt-5-mini",
|
|
87
|
+
"google_genai:gemini-3-flash-preview",
|
|
88
|
+
"anthropic:claude-3-5-haiku-20241022",
|
|
89
|
+
"xai:grok-3-mini",
|
|
90
|
+
],
|
|
91
|
+
response_schema=UserSchema
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
result = consensus.invoke(
|
|
95
|
+
"If survival is arbitrary, is moral judgment arbitrary too?"
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
print(f"Consensus: {result['consensus']}")
|
|
99
|
+
print(f"Answer: {result['final_answer']}")
|
|
100
|
+
print(f"Notes: {result['notes']}")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Example 2: No structured output with web search enabled (you just need to mention it in the prompt)
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from llm_ensemble import Consensus
|
|
107
|
+
|
|
108
|
+
# No response_schema - returns full agent result
|
|
109
|
+
consensus = Consensus(
|
|
110
|
+
models=[
|
|
111
|
+
"openai:gpt-5-mini",
|
|
112
|
+
"google_genai:gemini-3-flash-preview",
|
|
113
|
+
"anthropic:claude-3-5-haiku-20241022",
|
|
114
|
+
"xai:grok-3-mini",
|
|
115
|
+
]
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
result = consensus.invoke(
|
|
119
|
+
"What are the latest developments in quantum computing?\n\n"
|
|
120
|
+
"Use the web search to research current news and breakthroughs."
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# Access full agent result
|
|
124
|
+
print(result['messages'][-1].content)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Debugging and Observability
|
|
128
|
+
|
|
129
|
+
The library integrates with LangSmith for trace observability. Set `LANGSMITH_API_KEY` and `LANGSMITH_PROJECT` in your `.env` file to enable tracing.
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT License
|
|
134
|
+
|
|
135
|
+
## Contributing
|
|
136
|
+
|
|
137
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|
|
138
|
+
|
|
139
|
+
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
from langchain.agents import create_agent
|
|
2
|
+
from langchain.agents.middleware import TodoListMiddleware, SummarizationMiddleware, ToolCallLimitMiddleware
|
|
3
|
+
from deepagents.middleware.filesystem import FilesystemMiddleware
|
|
4
|
+
from deepagents.backends import StateBackend
|
|
5
|
+
from langchain.chat_models import init_chat_model
|
|
6
|
+
from langchain.tools import tool
|
|
7
|
+
from .run_llm import RunLLM
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Type
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Consensus:
|
|
13
|
+
"""
|
|
14
|
+
Consensus class that uses a configurable judge model to orchestrate
|
|
15
|
+
multiple RunLLM invocations until consensus is reached among LLMs.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
def __init__(
|
|
19
|
+
self,
|
|
20
|
+
models: list[str],
|
|
21
|
+
judge_model: str = "anthropic:claude-opus-4-5-20251101",
|
|
22
|
+
summarization_model: str = "claude-4-5-sonnet-20250929",
|
|
23
|
+
summarization_trigger_tokens: int = 200_000,
|
|
24
|
+
summarization_keep_messages: int = 5,
|
|
25
|
+
run_limit: int = 20,
|
|
26
|
+
response_schema: Type | None = None
|
|
27
|
+
):
|
|
28
|
+
"""
|
|
29
|
+
Initialize the Consensus class.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
models: List of model strings in format "provider:model-name".
|
|
33
|
+
judge_model: Model string for the judge coordinator in format "provider:model-name".
|
|
34
|
+
Defaults to "anthropic:claude-opus-4-5-20251101".
|
|
35
|
+
summarization_model: Model string for summarization middleware in format "provider:model-name".
|
|
36
|
+
Defaults to "anthropic:claude-3-5-sonnet-20241022".
|
|
37
|
+
summarization_trigger_tokens: Token count to trigger summarization middleware.
|
|
38
|
+
summarization_keep_messages: Number of messages to keep after summarization.
|
|
39
|
+
run_limit: Maximum number of calls to run_llms tool per invocation.
|
|
40
|
+
response_schema: Optional schema for structured output (TypedDict or Pydantic model).
|
|
41
|
+
If None, returns full agent result without structured output.
|
|
42
|
+
|
|
43
|
+
Raises:
|
|
44
|
+
ValueError: If models list is empty or contains only one model.
|
|
45
|
+
"""
|
|
46
|
+
if not models:
|
|
47
|
+
raise ValueError("models list cannot be empty")
|
|
48
|
+
if len(models) < 2:
|
|
49
|
+
raise ValueError("models list must contain at least 2 models for consensus")
|
|
50
|
+
|
|
51
|
+
# Store as instance variables for use in tool creation
|
|
52
|
+
self.models = models
|
|
53
|
+
self.system_message = "You are a helpful AI assistant."
|
|
54
|
+
|
|
55
|
+
# Load judge prompt
|
|
56
|
+
judge_prompt_path = Path(__file__).parent / "prompts" / "judge.prompt"
|
|
57
|
+
judge_prompt = judge_prompt_path.read_text()
|
|
58
|
+
|
|
59
|
+
# Create the run_llms tool
|
|
60
|
+
run_llms = self._create_run_llms_tool()
|
|
61
|
+
|
|
62
|
+
# Create judge LLM using init_chat_model
|
|
63
|
+
llm = init_chat_model(judge_model)
|
|
64
|
+
|
|
65
|
+
# Create middleware
|
|
66
|
+
middleware = [
|
|
67
|
+
TodoListMiddleware(),
|
|
68
|
+
FilesystemMiddleware(backend=lambda rt: StateBackend(rt)),
|
|
69
|
+
SummarizationMiddleware(
|
|
70
|
+
model=summarization_model,
|
|
71
|
+
trigger=("tokens", summarization_trigger_tokens),
|
|
72
|
+
keep=("messages", summarization_keep_messages)
|
|
73
|
+
),
|
|
74
|
+
ToolCallLimitMiddleware(
|
|
75
|
+
tool_name="run_llms",
|
|
76
|
+
run_limit=run_limit,
|
|
77
|
+
exit_behavior="error"
|
|
78
|
+
)
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
# Create agent with optional structured output
|
|
82
|
+
if response_schema is not None:
|
|
83
|
+
self._agent = create_agent(
|
|
84
|
+
model=llm,
|
|
85
|
+
tools=[run_llms],
|
|
86
|
+
system_prompt=judge_prompt,
|
|
87
|
+
middleware=middleware,
|
|
88
|
+
response_format=response_schema
|
|
89
|
+
)
|
|
90
|
+
else:
|
|
91
|
+
self._agent = create_agent(
|
|
92
|
+
model=llm,
|
|
93
|
+
tools=[run_llms],
|
|
94
|
+
system_prompt=judge_prompt,
|
|
95
|
+
middleware=middleware
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
self._response_schema = response_schema
|
|
99
|
+
|
|
100
|
+
def _create_run_llms_tool(self):
|
|
101
|
+
"""
|
|
102
|
+
Creates the run_llms tool with access to instance variables.
|
|
103
|
+
|
|
104
|
+
Returns:
|
|
105
|
+
A LangChain tool that runs multiple LLMs in parallel.
|
|
106
|
+
"""
|
|
107
|
+
@tool
|
|
108
|
+
def run_llms(query: str) -> str:
|
|
109
|
+
"""
|
|
110
|
+
Runs multiple LLMs in parallel on the same query.
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
query: The prompt/question to send to all LLMs. Include full context and
|
|
114
|
+
any specific instructions (e.g., "use search_the_web for web search",
|
|
115
|
+
"use add/multiply/subtract/divide tools for calculations").
|
|
116
|
+
|
|
117
|
+
Returns:
|
|
118
|
+
Aggregated responses from all LLMs. Each response is prefixed with the
|
|
119
|
+
exact model identifier (e.g., "openai:gpt-5-mini:", "google_genai:gemini-3-flash-preview:").
|
|
120
|
+
Always refer to models by these exact identifiers in your analysis.
|
|
121
|
+
"""
|
|
122
|
+
run_llm = RunLLM(models=self.models, system_message=self.system_message)
|
|
123
|
+
return run_llm.invoke(query)
|
|
124
|
+
|
|
125
|
+
return run_llms
|
|
126
|
+
|
|
127
|
+
def invoke(self, prompt: str):
|
|
128
|
+
"""
|
|
129
|
+
Invoke the consensus process with a user query.
|
|
130
|
+
|
|
131
|
+
Args:
|
|
132
|
+
prompt: The user's initial query
|
|
133
|
+
|
|
134
|
+
Returns:
|
|
135
|
+
If response_schema was provided: structured response dict (or default values on error)
|
|
136
|
+
Otherwise: full agent result (or None on error)
|
|
137
|
+
"""
|
|
138
|
+
try:
|
|
139
|
+
result = self._agent.invoke({
|
|
140
|
+
"messages": [{"role": "user", "content": prompt}]
|
|
141
|
+
})
|
|
142
|
+
if self._response_schema is not None:
|
|
143
|
+
return result["structured_response"]
|
|
144
|
+
else:
|
|
145
|
+
return result
|
|
146
|
+
except Exception as e:
|
|
147
|
+
# Tool call limit reached or other error
|
|
148
|
+
print(f"Error during consensus: {str(e)}")
|
|
149
|
+
if self._response_schema is not None:
|
|
150
|
+
# Return default values - create a dict with all schema keys set to defaults
|
|
151
|
+
# This tries to match common schema patterns
|
|
152
|
+
default_dict = {}
|
|
153
|
+
if hasattr(self._response_schema, '__annotations__'):
|
|
154
|
+
for key, type_hint in self._response_schema.__annotations__.items():
|
|
155
|
+
if key in ['consensus', 'consensus_reached']:
|
|
156
|
+
default_dict[key] = False
|
|
157
|
+
elif type_hint == bool:
|
|
158
|
+
default_dict[key] = False
|
|
159
|
+
elif type_hint == str:
|
|
160
|
+
default_dict[key] = f"Error occurred: {str(e)}"
|
|
161
|
+
else:
|
|
162
|
+
default_dict[key] = None
|
|
163
|
+
return default_dict if default_dict else None
|
|
164
|
+
else:
|
|
165
|
+
return None
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
You are a consensus coordinator for multiple AI language models. Your role is to facilitate agreement among different LLMs by iteratively querying them and refining your questions based on their responses.
|
|
2
|
+
|
|
3
|
+
## Your Tools (Judge)
|
|
4
|
+
|
|
5
|
+
**run_llms**: Runs multiple LLMs in parallel. Takes a `query` parameter where you craft the prompt to send to all LLMs.
|
|
6
|
+
|
|
7
|
+
**TODO list**: Track agreements reached, disagreements to resolve, and clarifications needed across iterations.
|
|
8
|
+
|
|
9
|
+
**Filesystem** (`write_file`, `read_file`, `ls`): After the first run_llms call, save the exact LLM model identifiers to a file and add a TODO reminder to read this file before providing the final answer to ensure model names are accurate.
|
|
10
|
+
|
|
11
|
+
## Your Process
|
|
12
|
+
|
|
13
|
+
1. **Initial Query**: Call `run_llms` with the user's question
|
|
14
|
+
2. **Analyze Responses**: Identify agreements, disagreements, and areas needing clarification
|
|
15
|
+
3. **Iterative Refinement**: Call `run_llms` again with updated queries that summarize the objective/context and clarify specific points where models diverged
|
|
16
|
+
4. **Continue Until Consensus**: Iterate until LLMs substantially agree or you can draw a reasonable conclusion
|
|
17
|
+
|
|
18
|
+
## Context Management - CRITICAL
|
|
19
|
+
|
|
20
|
+
**LLMs ARE STATELESS.** They have NO memory of previous iterations. In every query after the first, you MUST include:
|
|
21
|
+
|
|
22
|
+
1. **Original Question**: Restate the user's question or provide a summary
|
|
23
|
+
2. **Previous Iterations**: Summarize what each model said in prior rounds. **IMPORTANT: Always refer to models by their exact identifiers as shown in the run_llms output** (e.g., "openai:gpt-5-mini said X", "google_genai:gemini-3-flash-preview said Y"). Never use shortened or friendly names.
|
|
24
|
+
3. **Current Status**: Explain where you are now, what's agreed upon, and what needs resolution
|
|
25
|
+
4. **Complete Context**: Treat each query as if the LLMs are starting fresh—provide everything they need to answer meaningfully
|
|
26
|
+
|
|
27
|
+
## LLM Tools (Instruct LLMs to Use)
|
|
28
|
+
|
|
29
|
+
When crafting queries for `run_llms`, instruct the LLMs to use their available tools:
|
|
30
|
+
|
|
31
|
+
- **search_the_web**: For current events, recent information, factual verification, or when the user requests web search. If used in iteration 1, in subsequent iterations tell LLMs to use only if needed
|
|
32
|
+
- **Calculation tools** (`add`, `subtract`, `multiply`, `divide`): If the user query implies calculations, tell LLMs to use these tools
|
|
33
|
+
|
|
34
|
+
## Critical Guidelines
|
|
35
|
+
|
|
36
|
+
- **Use exact model identifiers**: Always refer to models by their full identifiers as they appear in run_llms
|
|
37
|
+
- Base consensus ONLY on what the LLMs tell you, not your own knowledge
|
|
38
|
+
- Be completely unbiased—evaluate all responses objectively and equally
|
|
File without changes
|