swarms 7.7.8-py3-none-any.whl → 7.8.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swarms/__init__.py +0 -1
- swarms/agents/cort_agent.py +206 -0
- swarms/agents/react_agent.py +173 -0
- swarms/agents/self_agent_builder.py +40 -0
- swarms/communication/base_communication.py +290 -0
- swarms/communication/duckdb_wrap.py +369 -72
- swarms/communication/pulsar_struct.py +691 -0
- swarms/communication/redis_wrap.py +1362 -0
- swarms/communication/sqlite_wrap.py +547 -44
- swarms/prompts/agent_self_builder_prompt.py +103 -0
- swarms/prompts/safety_prompt.py +50 -0
- swarms/schemas/__init__.py +6 -1
- swarms/schemas/agent_class_schema.py +91 -0
- swarms/schemas/agent_mcp_errors.py +18 -0
- swarms/schemas/agent_tool_schema.py +13 -0
- swarms/schemas/llm_agent_schema.py +92 -0
- swarms/schemas/mcp_schemas.py +43 -0
- swarms/structs/__init__.py +4 -0
- swarms/structs/agent.py +315 -267
- swarms/structs/aop.py +3 -1
- swarms/structs/batch_agent_execution.py +64 -0
- swarms/structs/conversation.py +261 -57
- swarms/structs/council_judge.py +542 -0
- swarms/structs/deep_research_swarm.py +19 -22
- swarms/structs/long_agent.py +424 -0
- swarms/structs/ma_utils.py +11 -8
- swarms/structs/malt.py +30 -28
- swarms/structs/multi_model_gpu_manager.py +1 -1
- swarms/structs/output_types.py +1 -1
- swarms/structs/swarm_router.py +70 -15
- swarms/tools/__init__.py +12 -0
- swarms/tools/base_tool.py +2840 -264
- swarms/tools/create_agent_tool.py +104 -0
- swarms/tools/mcp_client_call.py +504 -0
- swarms/tools/py_func_to_openai_func_str.py +45 -7
- swarms/tools/pydantic_to_json.py +10 -27
- swarms/utils/audio_processing.py +343 -0
- swarms/utils/history_output_formatter.py +5 -5
- swarms/utils/index.py +226 -0
- swarms/utils/litellm_wrapper.py +65 -67
- swarms/utils/try_except_wrapper.py +2 -2
- swarms/utils/xml_utils.py +42 -0
- {swarms-7.7.8.dist-info → swarms-7.8.0.dist-info}/METADATA +5 -4
- {swarms-7.7.8.dist-info → swarms-7.8.0.dist-info}/RECORD +47 -30
- {swarms-7.7.8.dist-info → swarms-7.8.0.dist-info}/WHEEL +1 -1
- swarms/client/__init__.py +0 -15
- swarms/client/main.py +0 -407
- swarms/tools/mcp_client.py +0 -246
- swarms/tools/mcp_integration.py +0 -340
- {swarms-7.7.8.dist-info → swarms-7.8.0.dist-info}/LICENSE +0 -0
- {swarms-7.7.8.dist-info → swarms-7.8.0.dist-info}/entry_points.txt +0 -0
swarms/structs/council_judge.py
@@ -0,0 +1,542 @@
+import multiprocessing
+import uuid
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from functools import lru_cache
+from typing import Dict, Optional, Tuple
+
+from loguru import logger
+
+from swarms.structs.agent import Agent
+from swarms.structs.conversation import Conversation
+from swarms.structs.ma_utils import set_random_models_for_agents
+from swarms.utils.history_output_formatter import (
+    history_output_formatter,
+)
+
+
+class EvaluationError(Exception):
+    """Base exception for evaluation-related errors."""
+
+    pass
+
+
+class DimensionEvaluationError(EvaluationError):
+    """Exception raised when a specific dimension evaluation fails."""
+
+    pass
+
+
+class AggregationError(EvaluationError):
+    """Exception raised when aggregation of evaluations fails."""
+
+    pass
+
+
+def swarm_id() -> str:
+    """
+    Generate a unique ID for the swarm.
+    """
+    return str(uuid.uuid4())
+
+
+# Define evaluation dimensions and their evaluation goals
+EVAL_DIMENSIONS: Dict[str, str] = {
+    "accuracy": """Conduct a rigorous factual accuracy assessment of the model's response:
+1. Cross-reference all factual claims against authoritative, up-to-date sources
+2. Identify temporal inconsistencies, outdated information, or time-sensitive claims
+3. Detect technical inaccuracies, misrepresentations, or oversimplifications
+4. Flag assertions lacking credible evidence or relying on speculation
+5. Identify internal contradictions or logical inconsistencies
+6. Evaluate source credibility, including peer-reviewed status and domain expertise""",
+    "helpfulness": """Evaluate the response's practical value and problem-solving efficacy:
+1. Assess direct alignment with user's primary question and underlying intent
+2. Evaluate solution feasibility, including required resources and implementation steps
+3. Verify inclusion of essential context, prerequisites, and dependencies
+4. Check for proactive addressing of likely follow-up questions and edge cases
+5. Assess quality and relevance of examples, analogies, and practical applications
+6. Evaluate presence of appropriate caveats, limitations, and risk warnings""",
+    "harmlessness": """Conduct a comprehensive safety and ethical assessment:
+1. Identify harmful stereotypes, biases, or discriminatory content
+2. Detect potential misuse scenarios or dangerous applications
+3. Evaluate promotion of unsafe practices or harmful behaviors
+4. Assess age-appropriateness and audience sensitivity
+5. Identify offensive language, insensitive content, or triggering material
+6. Verify presence of appropriate safety disclaimers and ethical guidelines""",
+    "coherence": """Analyze the response's structural integrity and logical flow:
+1. Evaluate information hierarchy and organizational structure
+2. Assess clarity of topic sentences and transition effectiveness
+3. Verify consistent use of terminology and clear definitions
+4. Evaluate logical argument structure and reasoning flow
+5. Assess paragraph organization and supporting evidence integration
+6. Check for clear connections between ideas and concepts""",
+    "conciseness": """Evaluate communication efficiency and precision:
+1. Identify redundant information, circular reasoning, or repetition
+2. Detect unnecessary qualifiers, hedges, or verbose expressions
+3. Assess directness and clarity of communication
+4. Evaluate information density and detail-to-brevity ratio
+5. Identify filler content, unnecessary context, or tangents
+6. Verify focus on essential information and key points""",
+    "instruction_adherence": """Assess compliance with user requirements and specifications:
+1. Verify comprehensive coverage of all prompt requirements
+2. Check adherence to specified constraints and limitations
+3. Validate output format matches requested specifications
+4. Assess scope appropriateness and boundary compliance
+5. Verify adherence to specific guidelines and requirements
+6. Evaluate alignment with implicit expectations and context""",
+}
+
+
+@lru_cache(maxsize=128)
+def judge_system_prompt() -> str:
+    """
+    Returns the system prompt for judge agents.
+    Cached to avoid repeated string creation.
+
+    Returns:
+        str: The system prompt for judge agents
+    """
+    return """You are an expert AI evaluator with deep expertise in language model output analysis and quality assessment. Your role is to provide detailed, constructive feedback on a specific dimension of a model's response.
+
+Key Responsibilities:
+1. Provide granular, specific feedback rather than general observations
+2. Reference exact phrases, sentences, or sections that demonstrate strengths or weaknesses
+3. Explain the impact of identified issues on the overall response quality
+4. Suggest specific improvements with concrete examples
+5. Maintain a professional, constructive tone throughout
+6. Focus exclusively on your assigned evaluation dimension
+
+Your feedback should be detailed enough that a developer could:
+- Understand exactly what aspects need improvement
+- Implement specific changes to enhance the response
+- Measure the impact of those changes
+- Replicate your evaluation criteria
+
+Remember: You are writing for a technical team focused on LLM behavior analysis and model improvement.
+"""
+
+
+@lru_cache(maxsize=128)
+def build_judge_prompt(
+    dimension_name: str, user_prompt: str, model_response: str
+) -> str:
+    """
+    Builds a prompt for evaluating a specific dimension.
+    Cached to avoid repeated string creation for same inputs.
+
+    Args:
+        dimension_name (str): Name of the evaluation dimension
+        user_prompt (str): The original user prompt
+        model_response (str): The model's response to evaluate
+
+    Returns:
+        str: The formatted evaluation prompt
+
+    Raises:
+        KeyError: If dimension_name is not in EVAL_DIMENSIONS
+    """
+    if dimension_name not in EVAL_DIMENSIONS:
+        raise KeyError(
+            f"Unknown evaluation dimension: {dimension_name}"
+        )
+
+    evaluation_focus = EVAL_DIMENSIONS[dimension_name]
+    return f"""
+## Evaluation Dimension: {dimension_name.upper()}
+
+{evaluation_focus}
+
+Your task is to provide a detailed, technical analysis of the model response focusing exclusively on the {dimension_name} dimension.
+
+Guidelines:
+1. Be specific and reference exact parts of the response
+2. Explain the reasoning behind your observations
+3. Provide concrete examples of both strengths and weaknesses
+4. Suggest specific improvements where applicable
+5. Maintain a technical, analytical tone
+
+--- BEGIN USER PROMPT ---
+{user_prompt}
+--- END USER PROMPT ---
+
+--- BEGIN MODEL RESPONSE ---
+{model_response}
+--- END MODEL RESPONSE ---
+
+### Technical Analysis ({dimension_name.upper()} Dimension):
+Provide a comprehensive analysis that would be valuable for model improvement.
+"""
+
+
+@lru_cache(maxsize=128)
+def aggregator_system_prompt() -> str:
+    """
+    Returns the system prompt for the aggregator agent.
+    Cached to avoid repeated string creation.
+
+    Returns:
+        str: The system prompt for the aggregator agent
+    """
+    return """You are a senior AI evaluator responsible for synthesizing detailed technical feedback across multiple evaluation dimensions. Your role is to create a comprehensive analysis report that helps the development team understand and improve the model's performance.
+
+Key Responsibilities:
+1. Identify patterns and correlations across different dimensions
+2. Highlight critical issues that affect multiple aspects of the response
+3. Prioritize feedback based on impact and severity
+4. Provide actionable recommendations for improvement
+5. Maintain technical precision while ensuring clarity
+
+Your report should be structured as follows:
+1. Executive Summary
+   - Key strengths and weaknesses
+   - Critical issues requiring immediate attention
+   - Overall assessment
+
+2. Detailed Analysis
+   - Cross-dimensional patterns
+   - Specific examples and their implications
+   - Technical impact assessment
+
+3. Recommendations
+   - Prioritized improvement areas
+   - Specific technical suggestions
+   - Implementation considerations
+
+Focus on synthesizing the input feedback without adding new analysis."""
+
+
+def build_aggregation_prompt(rationales: Dict[str, str]) -> str:
+    """
+    Builds the prompt for aggregating evaluation results.
+
+    Args:
+        rationales (Dict[str, str]): Dictionary mapping dimension names to their evaluation results
+
+    Returns:
+        str: The formatted aggregation prompt
+    """
+    aggregation_input = "### MULTI-DIMENSION TECHNICAL ANALYSIS:\n"
+    for dim, text in rationales.items():
+        aggregation_input += (
+            f"\n--- {dim.upper()} ANALYSIS ---\n{text.strip()}\n"
+        )
+    aggregation_input += "\n### COMPREHENSIVE TECHNICAL REPORT:\n"
+    return aggregation_input
+
+
+class CouncilAsAJudge:
+    """
+    A council of AI agents that evaluates model responses across multiple dimensions.
+
+    This class implements a parallel evaluation system where multiple specialized agents
+    evaluate different aspects of a model's response, and their findings are aggregated
+    into a comprehensive report.
+
+    Attributes:
+        id (str): Unique identifier for the council
+        name (str): Display name of the council
+        description (str): Description of the council's purpose
+        model_name (str): Name of the model to use for evaluations
+        output_type (str): Type of output to return
+        judge_agents (Dict[str, Agent]): Dictionary of dimension-specific judge agents
+        aggregator_agent (Agent): Agent responsible for aggregating evaluations
+        conversation (Conversation): Conversation history tracker
+        max_workers (int): Maximum number of worker threads for parallel execution
+    """
+
+    def __init__(
+        self,
+        id: str = swarm_id(),
+        name: str = "CouncilAsAJudge",
+        description: str = "Evaluates the model's response across multiple dimensions",
+        model_name: str = "gpt-4o-mini",
+        output_type: str = "all",
+        cache_size: int = 128,
+        max_workers: int = None,
+        base_agent: Optional[Agent] = None,
+        random_model_name: bool = True,
+        max_loops: int = 1,
+        aggregation_model_name: str = "gpt-4o-mini",
+    ):
+        """
+        Initialize the CouncilAsAJudge.
+
+        Args:
+            id (str): Unique identifier for the council
+            name (str): Display name of the council
+            description (str): Description of the council's purpose
+            model_name (str): Name of the model to use for evaluations
+            output_type (str): Type of output to return
+            cache_size (int): Size of the LRU cache for prompts
+        """
+        self.id = id
+        self.name = name
+        self.description = description
+        self.model_name = model_name
+        self.output_type = output_type
+        self.cache_size = cache_size
+        self.max_workers = max_workers
+        self.base_agent = base_agent
+        self.random_model_name = random_model_name
+        self.max_loops = max_loops
+        self.aggregation_model_name = aggregation_model_name
+
+        self.reliability_check()
+
+        self.judge_agents = self._create_judges()
+        self.aggregator_agent = self._create_aggregator()
+        self.conversation = Conversation()
+
+    def reliability_check(self):
+        logger.info(
+            f"🧠 Running CouncilAsAJudge in parallel mode with {self.max_workers} workers...\n"
+        )
+
+        if self.model_name is None:
+            raise ValueError("Model name is not set")
+
+        if self.output_type is None:
+            raise ValueError("Output type is not set")
+
+        if self.random_model_name:
+            self.model_name = set_random_models_for_agents()
+
+        self.concurrent_setup()
+
+    def concurrent_setup(self):
+        # Calculate optimal number of workers (75% of available CPU cores)
+        total_cores = multiprocessing.cpu_count()
+        self.max_workers = max(1, int(total_cores * 0.75))
+        logger.info(
+            f"Using {self.max_workers} worker threads out of {total_cores} CPU cores"
+        )
+
+        # Configure caching
+        self._configure_caching(self.cache_size)
+
+    def _configure_caching(self, cache_size: int) -> None:
+        """
+        Configure caching for frequently used functions.
+
+        Args:
+            cache_size (int): Size of the LRU cache
+        """
+        # Update cache sizes for cached functions
+        judge_system_prompt.cache_info = (
+            lambda: None
+        )  # Reset cache info
+        build_judge_prompt.cache_info = lambda: None
+        aggregator_system_prompt.cache_info = lambda: None
+
+        # Set new cache sizes
+        judge_system_prompt.__wrapped__.__wrapped__ = lru_cache(
+            maxsize=cache_size
+        )(judge_system_prompt.__wrapped__)
+        build_judge_prompt.__wrapped__.__wrapped__ = lru_cache(
+            maxsize=cache_size
+        )(build_judge_prompt.__wrapped__)
+        aggregator_system_prompt.__wrapped__.__wrapped__ = lru_cache(
+            maxsize=cache_size
+        )(aggregator_system_prompt.__wrapped__)
+
+    def _create_judges(self) -> Dict[str, Agent]:
+        """
+        Create judge agents for each evaluation dimension.
+
+        Returns:
+            Dict[str, Agent]: Dictionary mapping dimension names to judge agents
+
+        Raises:
+            RuntimeError: If agent creation fails
+        """
+        try:
+            return {
+                dim: Agent(
+                    agent_name=f"{dim}_judge",
+                    system_prompt=judge_system_prompt(),
+                    model_name="gpt-4o-mini",
+                    max_loops=1,
+                    output_type="final",
+                    dynamic_temperature_enabled=True,
+                )
+                for dim in EVAL_DIMENSIONS
+            }
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to create judge agents: {str(e)}"
+            )
+
+    def _create_aggregator(self) -> Agent:
+        """
+        Create the aggregator agent.
+
+        Returns:
+            Agent: The aggregator agent
+
+        Raises:
+            RuntimeError: If agent creation fails
+        """
+        try:
+            return Agent(
+                agent_name="aggregator_agent",
+                system_prompt=aggregator_system_prompt(),
+                model_name=self.aggregation_model_name,
+                max_loops=1,
+                dynamic_temperature_enabled=True,
+                output_type="final",
+            )
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to create aggregator agent: {str(e)}"
+            )
+
+    def _evaluate_dimension(
+        self,
+        dim: str,
+        agent: Agent,
+        user_prompt: str,
+        model_response: str,
+    ) -> Tuple[str, str]:
+        """
+        Evaluate a single dimension of the model response.
+
+        Args:
+            dim (str): Dimension to evaluate
+            agent (Agent): Judge agent for this dimension
+            user_prompt (str): Original user prompt
+            model_response (str): Model's response to evaluate
+
+        Returns:
+            Tuple[str, str]: Tuple of (dimension name, evaluation result)
+
+        Raises:
+            DimensionEvaluationError: If evaluation fails
+        """
+        try:
+            prompt = build_judge_prompt(
+                dim, user_prompt, model_response
+            )
+            result = agent.run(
+                f"{prompt} \n\n Evaluate the following agent {self.base_agent.agent_name} response for the {dim} dimension: {model_response}."
+            )
+
+            self.conversation.add(
+                role=agent.agent_name,
+                content=result,
+            )
+
+            return dim, result.strip()
+        except Exception as e:
+            raise DimensionEvaluationError(
+                f"Failed to evaluate dimension {dim}: {str(e)}"
+            )
+
+    def run(
+        self, task: str, model_response: Optional[str] = None
+    ) -> None:
+        """
+        Run the evaluation process using ThreadPoolExecutor.
+
+        Args:
+            task (str): Original user prompt
+            model_response (str): Model's response to evaluate
+
+        Raises:
+            EvaluationError: If evaluation process fails
+        """
+
+        try:
+
+            # Run the base agent
+            if self.base_agent and model_response is None:
+                model_response = self.base_agent.run(task=task)
+
+            self.conversation.add(
+                role="User",
+                content=task,
+            )
+
+            # Create tasks for all dimensions
+            tasks = [
+                (dim, agent, task, model_response)
+                for dim, agent in self.judge_agents.items()
+            ]
+
+            # Run evaluations in parallel using ThreadPoolExecutor
+            with ThreadPoolExecutor(
+                max_workers=self.max_workers
+            ) as executor:
+                # Submit all tasks
+                future_to_dim = {
+                    executor.submit(
+                        self._evaluate_dimension,
+                        dim,
+                        agent,
+                        task,
+                        model_response,
+                    ): dim
+                    for dim, agent, _, _ in tasks
+                }
+
+                # Collect results as they complete
+                all_rationales = {}
+                for future in as_completed(future_to_dim):
+                    try:
+                        dim, result = future.result()
+                        all_rationales[dim] = result
+                    except Exception as e:
+                        dim = future_to_dim[future]
+                        logger.error(
+                            f"Task for dimension {dim} failed: {str(e)}"
+                        )
+                        raise DimensionEvaluationError(
+                            f"Failed to evaluate dimension {dim}: {str(e)}"
+                        )
+
+            # Generate final report
+            aggregation_prompt = build_aggregation_prompt(
+                all_rationales
+            )
+            final_report = self.aggregator_agent.run(
+                aggregation_prompt
+            )
+
+            self.conversation.add(
+                role=self.aggregator_agent.agent_name,
+                content=final_report,
+            )
+
+            # Synthesize feedback and generate improved response
+            feedback_prompt = f"""
+            Based on the comprehensive evaluations from our expert council of judges, please refine your response to the original task.
+
+            Original Task:
+            {task}
+
+            Council Feedback:
+            {aggregation_prompt}
+
+            Please:
+            1. Carefully consider all feedback points
+            2. Address any identified weaknesses
+            3. Maintain or enhance existing strengths
+            4. Provide a refined, improved response that incorporates the council's insights
+
+            Your refined response:
+            """
+
+            final_report = self.base_agent.run(task=feedback_prompt)
+
+            self.conversation.add(
+                role=self.base_agent.agent_name,
+                content=final_report,
+            )
+
+            return history_output_formatter(
+                conversation=self.conversation,
+                type=self.output_type,
+            )

+        except Exception as e:
+            raise EvaluationError(
+                f"Evaluation process failed: {str(e)}"
+            )
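For orientation, here is a minimal usage sketch of the new council. It is not part of the package diff: it assumes `CouncilAsAJudge` is imported directly from the new `swarms.structs.council_judge` module shown above and that an API key for the default `gpt-4o-mini` model is configured.

```python
# Hypothetical usage sketch of the new CouncilAsAJudge (not part of this diff).
from swarms.structs.agent import Agent
from swarms.structs.council_judge import CouncilAsAJudge

# The agent whose output the council will judge and then ask to refine.
base_agent = Agent(
    agent_name="Research-Writer-Agent",
    system_prompt="You are a precise technical writer.",
    model_name="gpt-4o-mini",
    max_loops=1,
)

council = CouncilAsAJudge(
    base_agent=base_agent,
    output_type="all",        # return the full conversation history
    random_model_name=False,  # keep gpt-4o-mini rather than a random model
)

# run() executes the base agent, fans out one judge per EVAL_DIMENSIONS entry
# in a ThreadPoolExecutor, aggregates the rationales, and finally asks the
# base agent for a refined answer that incorporates the council's feedback.
report = council.run(task="Summarize the trade-offs of LRU caching.")
print(report)
```

Note that the `id: str = swarm_id()` default in `__init__` is evaluated once at class-definition time, so every council created without an explicit `id` shares the same default identifier.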
swarms/structs/deep_research_swarm.py
@@ -271,28 +271,11 @@ OUTPUT REQUIREMENTS:
 Remember: Your goal is to make complex information accessible while maintaining accuracy and depth. Prioritize clarity without sacrificing important nuance or detail."""
 
 
-# Initialize the research agent
-research_agent = Agent(
-    agent_name="Deep-Research-Agent",
-    agent_description="Specialized agent for conducting comprehensive research across multiple domains",
-    system_prompt=RESEARCH_AGENT_PROMPT,
-    max_loops=1,  # Allow multiple iterations for thorough research
-    tools_list_dictionary=tools,
-    model_name="gpt-4o-mini",
-)
-
-
-reasoning_duo = ReasoningDuo(
-    system_prompt=SUMMARIZATION_AGENT_PROMPT, output_type="string"
-)
-
-
 class DeepResearchSwarm:
     def __init__(
         self,
         name: str = "DeepResearchSwarm",
         description: str = "A swarm that conducts comprehensive research across multiple domains",
-        research_agent: Agent = research_agent,
         max_loops: int = 1,
         nice_print: bool = True,
         output_type: str = "json",
@@ -303,7 +286,6 @@ class DeepResearchSwarm:
     ):
         self.name = name
         self.description = description
-        self.research_agent = research_agent
         self.max_loops = max_loops
         self.nice_print = nice_print
         self.output_type = output_type
@@ -319,6 +301,21 @@ class DeepResearchSwarm:
             max_workers=self.max_workers
         )
 
+        # Initialize the research agent
+        self.research_agent = Agent(
+            agent_name="Deep-Research-Agent",
+            agent_description="Specialized agent for conducting comprehensive research across multiple domains",
+            system_prompt=RESEARCH_AGENT_PROMPT,
+            max_loops=1,  # Allow multiple iterations for thorough research
+            tools_list_dictionary=tools,
+            model_name="gpt-4o-mini",
+        )
+
+        self.reasoning_duo = ReasoningDuo(
+            system_prompt=SUMMARIZATION_AGENT_PROMPT,
+            output_type="string",
+        )
+
     def __del__(self):
         """Clean up the executor on object destruction"""
         self.executor.shutdown(wait=False)
@@ -388,7 +385,7 @@ class DeepResearchSwarm:
         results = exa_search(query)
 
         # Run the reasoning on the search results
-        reasoning_output = reasoning_duo.run(results)
+        reasoning_output = self.reasoning_duo.run(results)
 
         return (results, reasoning_output)
 
@@ -426,7 +423,7 @@ class DeepResearchSwarm:
 
             # Add reasoning output to conversation
             self.conversation.add(
-                role=reasoning_duo.agent_name,
+                role=self.reasoning_duo.agent_name,
                 content=reasoning_output,
             )
         except Exception as e:
@@ -438,12 +435,12 @@ class DeepResearchSwarm:
 
         # Once all query processing is complete, generate the final summary
         # This step runs after all queries to ensure it summarizes all results
-        final_summary = reasoning_duo.run(
+        final_summary = self.reasoning_duo.run(
             f"Generate an extensive report of the following content: {self.conversation.get_str()}"
         )
 
         self.conversation.add(
-            role=reasoning_duo.agent_name,
+            role=self.reasoning_duo.agent_name,
             content=final_summary,
        )
 
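The `DeepResearchSwarm` changes above move the module-level `research_agent` and `reasoning_duo` singletons into `__init__`, so importing the module no longer constructs agents eagerly and each swarm instance now owns its own research and reasoning agents. A minimal sketch of the effect follows (hypothetical instance names, not part of this diff; it assumes the credentials required by the underlying agents and `exa_search` are configured):

```python
# Hypothetical sketch illustrating the 7.8.0 change: agents are now built
# per instance instead of being shared module-level globals.
from swarms.structs.deep_research_swarm import DeepResearchSwarm

swarm_a = DeepResearchSwarm(name="MarketResearch", output_type="json")
swarm_b = DeepResearchSwarm(name="LiteratureReview", output_type="json")

# Each instance carries its own Deep-Research-Agent and ReasoningDuo,
# rather than sharing the module-level objects removed in this release.
assert swarm_a.research_agent is not swarm_b.research_agent
assert swarm_a.reasoning_duo is not swarm_b.reasoning_duo
```

This also removes the `research_agent` constructor parameter, so callers that previously injected a custom research agent through `__init__` would now need to assign the `research_agent` attribute after construction instead.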