camel-ai 0.2.20a0__py3-none-any.whl → 0.2.21__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +2 -3
- camel/agents/knowledge_graph_agent.py +1 -5
- camel/benchmarks/apibench.py +1 -5
- camel/benchmarks/nexus.py +1 -5
- camel/benchmarks/ragbench.py +2 -2
- camel/bots/telegram_bot.py +1 -5
- camel/configs/__init__.py +3 -0
- camel/configs/aiml_config.py +80 -0
- camel/datagen/__init__.py +3 -1
- camel/datagen/self_improving_cot.py +821 -0
- camel/interpreters/subprocess_interpreter.py +72 -6
- camel/models/__init__.py +2 -0
- camel/models/aiml_model.py +147 -0
- camel/models/model_factory.py +3 -0
- camel/models/siliconflow_model.py +1 -1
- camel/societies/workforce/role_playing_worker.py +2 -4
- camel/societies/workforce/single_agent_worker.py +1 -6
- camel/societies/workforce/workforce.py +3 -9
- camel/toolkits/__init__.py +2 -0
- camel/toolkits/reddit_toolkit.py +8 -38
- camel/toolkits/sympy_toolkit.py +778 -0
- camel/toolkits/whatsapp_toolkit.py +11 -32
- camel/types/enums.py +29 -1
- camel/utils/__init__.py +7 -2
- camel/utils/commons.py +198 -21
- camel/utils/deduplication.py +232 -0
- camel/utils/token_counting.py +0 -38
- {camel_ai-0.2.20a0.dist-info → camel_ai-0.2.21.dist-info}/METADATA +10 -12
- {camel_ai-0.2.20a0.dist-info → camel_ai-0.2.21.dist-info}/RECORD +33 -28
- /camel/datagen/{cotdatagen.py → cot_datagen.py} +0 -0
- {camel_ai-0.2.20a0.dist-info → camel_ai-0.2.21.dist-info}/LICENSE +0 -0
- {camel_ai-0.2.20a0.dist-info → camel_ai-0.2.21.dist-info}/WHEEL +0 -0
camel/datagen/self_improving_cot.py (added, +821 lines)

@@ -0,0 +1,821 @@

```python
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import asyncio
import json
import math
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel

from camel.agents import ChatAgent
from camel.logger import get_logger
from camel.models.reward import BaseRewardModel, Evaluator
from camel.utils import BatchProcessor, retry_on_error

logger = get_logger(__name__)


class AgentTraceEvaluation(BaseModel):
    correctness: float
    clarity: float
    completeness: float
    feedback: str


class RewardTraceEvaluation(BaseModel):
    feedback: str

    def __init__(self, **data):
        # Allow dynamic score fields while ensuring feedback is present
        super().__init__(**data)

    class Config:
        extra = (
            "allow"  # Allow extra fields for different reward model dimensions
        )


class TraceIteration(BaseModel):
    iteration: int
    trace: str
    evaluation: Union[AgentTraceEvaluation, RewardTraceEvaluation]


class ProblemResult(BaseModel):
    id: Optional[str] = None
    type: Optional[str] = None
    problem: str
    solution: Optional[str] = None
    final_trace: str
    agent_evaluate_success: Optional[bool] = None
    boxed_answer_success: bool = False
    improvement_history: List[TraceIteration]


class SelfImprovingCoTPipeline:
    r"""Pipeline for generating self-taught reasoning traces
    using the self-improving methodology.

    This implements the STaR paper's approach of:
    1. Initial reasoning trace generation
    2. Self-evaluation
    3. Feedback-based improvement
    4. Iterative refinement
    """

    def __init__(
        self,
        reason_agent: ChatAgent,
        problems: List[Dict],
        max_iterations: int = 3,
        score_threshold: Union[float, Dict[str, float]] = 0.7,
        evaluate_agent: Optional[ChatAgent] = None,
        reward_model: Optional[BaseRewardModel] = None,
        output_path: Optional[str] = None,
        few_shot_examples: Optional[str] = None,
        batch_size: Optional[int] = None,
        max_workers: Optional[int] = None,
        solution_pattern: str = r'\\boxed{(.*?)}',
        trace_pattern: Optional[str] = None,
    ):
        r"""Initialize the self-improving cot pipeline.

        Args:
            reason_agent (ChatAgent): The chat agent used for generating and
                improving reasoning traces.
            problems (List[Dict]): List of problem dictionaries to process.
            max_iterations (int, optional): Maximum number of improvement
                iterations. If set to `0`, the pipeline will generate an
                initial trace without any improvement iterations.
                (default: :obj:`3`)
            score_threshold (Union[float, Dict[str, float]], optional):
                Quality threshold. Can be either a single float value applied
                to the average score, or a dictionary mapping score dimensions
                to their thresholds. For example: {"correctness": 0.8,
                "coherence": 0.7}. If using a reward model and the threshold
                for a dimension is not specified, the default value 0.7 is
                used. (default: :obj:`0.7`)
            evaluate_agent (Optional[ChatAgent]): The chat agent used for
                evaluating reasoning traces. (default: :obj:`None`)
            reward_model (BaseRewardModel, optional): Model used to evaluate
                reasoning traces. If `None`, uses Agent self-evaluation.
                (default: :obj:`None`)
            output_path (str, optional): Output path for saving traces. If
                `None`, results will only be returned without saving to file.
                (default: :obj:`None`)
            few_shot_examples (str, optional): Examples to use for few-shot
                generation. (default: :obj:`None`)
            batch_size (int, optional): Batch size for parallel processing.
                (default: :obj:`None`)
            max_workers (int, optional): Maximum number of worker threads.
                (default: :obj:`None`)
            solution_pattern (str, optional): Regular expression pattern with
                one capture group to extract answers from solution text.
                (default: :obj:`r'\\boxed{(.*?)}'`)
            trace_pattern (str, optional): Regular expression pattern with one
                capture group to extract answers from trace text. If `None`,
                uses the same pattern as solution_pattern.
                (default: :obj:`None`)
        """
        self.reason_agent = reason_agent
        self.evaluate_agent = evaluate_agent
        self.problems = problems
        self.output_path = output_path
        self.max_iterations = max_iterations
        self.score_threshold = score_threshold
        self.reward_model = reward_model
        self.evaluator = (
            Evaluator(reward_model=reward_model) if reward_model else None
        )
        self.reasoning_traces: List[Dict[str, Any]] = []
        self.few_shot_examples = few_shot_examples
        self.batch_processor = BatchProcessor(max_workers, batch_size)
        self.solution_pattern = solution_pattern
        self.trace_pattern = (
            trace_pattern if trace_pattern is not None else solution_pattern
        )

        # Initialize output file with empty results if path is specified
        if self.output_path:
            with open(self.output_path, 'w') as f:
                json.dump({'traces': []}, f, indent=2)
        self.lock = threading.Lock()

    def safe_write_json(self, file_path, data):
        temp_path = file_path + ".tmp"
        with open(temp_path, "w") as f:
            json.dump(data, f, indent=2)
        os.replace(temp_path, file_path)

    def clean_json(self, data):
        if isinstance(data, dict):
            return {k: self.clean_json(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [self.clean_json(v) for v in data]
        elif isinstance(data, float) and (
            math.isnan(data) or math.isinf(data)
        ):
            return None
        return data

    async def _batch_process_problems(
        self, problems: List[Dict], rationalization: bool
    ) -> List[ProblemResult]:
        r"""Process multiple problems in parallel batches with dynamic sizing.

        Args:
            problems (List[Dict]): List of problem dictionaries to process.
            rationalization (bool): Whether to use rationalization.

        Returns:
            List[ProblemResult]: List of problem results.
        """
        results = []
        total_problems = len(problems)
        processed = 0

        while processed < total_problems:
            batch_size = self.batch_processor.batch_size
            batch = problems[processed : processed + batch_size]
            batch_start_time = time.time()

            try:
                with ThreadPoolExecutor(
                    max_workers=self.batch_processor.max_workers
                ) as executor:
                    # Create futures with rationalization parameter
                    futures = [
                        executor.submit(
                            self.process_problem,
                            problem=problem,
                            rationalization=rationalization,
                        )
                        for problem in batch
                    ]

                    batch_results = []
                    batch_success = True
                    for future in as_completed(futures):
                        try:
                            result = future.result()
                            batch_results.append(result)
                        except Exception as e:
                            logger.error(f"Error processing problem: {e}")
                            batch_success = False
                            continue

                results.extend(batch_results)
                processed += len(batch)

                # Calculate processing time and adjust batch size
                processing_time = time.time() - batch_start_time
                self.batch_processor.adjust_batch_size(
                    batch_success, processing_time
                )

                # Log progress and performance metrics
                metrics = self.batch_processor.get_performance_metrics()
                logger.info(
                    f"Processed {processed}/{total_problems} problems "
                    f"(batch size: {batch_size}, workers: "
                    f"{metrics['current_workers']}, "
                    f"CPU: {metrics['current_cpu']:.1f}%, "
                    f"Memory: {metrics['current_memory']:.1f}%)"
                )
            except Exception as e:
                logger.error(f"Batch processing error: {e}")
                self.batch_processor.adjust_batch_size(False)
                continue

        return results

    async def _batch_evaluate_traces(
        self,
        problems: List[Dict[str, Any]],
        traces: List[str],
        solutions: Optional[List[str]] = None,
    ) -> List[Dict[str, Any]]:
        r"""Evaluate multiple traces in parallel batches with resource
        monitoring.

        Args:
            problems (List[Dict[str, Any]]): List of problem dictionaries.
            traces (List[str]): List of reasoning traces to evaluate.
            solutions (Optional[List[str]]): Optional list of solutions.

        Returns:
            List[Dict[str, Any]]: List of evaluation results.
        """
        if solutions is None:
            solutions = ["null"] * len(problems)

        results = []
        total_traces = len(traces)
        processed = 0

        while processed < total_traces:
            batch_size = self.batch_processor.batch_size
            problem_batch = problems[processed : processed + batch_size]
            trace_batch = traces[processed : processed + batch_size]
            solution_batch = solutions[processed : processed + batch_size]
            batch_start_time = time.time()

            try:
                with ThreadPoolExecutor(
                    max_workers=self.batch_processor.max_workers
                ) as executor:
                    futures = [
                        executor.submit(
                            self.evaluate_trace,
                            problem=problem["problem"],
                            trace=trace,
                            solution=solution,
                        )
                        for problem, trace, solution in zip(
                            problem_batch, trace_batch, solution_batch
                        )
                    ]

                    batch_results = []
                    batch_success = True
                    for future in as_completed(futures):
                        try:
                            result = future.result()
                            batch_results.append(result)
                        except Exception as e:
                            logger.error(f"Error evaluating trace: {e}")
                            batch_success = False
                            continue

                results.extend(batch_results)
                processed += len(batch_results)

                # Calculate processing time and adjust batch size
                processing_time = time.time() - batch_start_time
                self.batch_processor.adjust_batch_size(
                    batch_success, processing_time
                )

                # Log progress and performance metrics
                metrics = self.batch_processor.get_performance_metrics()
                logger.info(
                    f"Evaluated {processed}/{total_traces} traces "
                    f"(batch size: {batch_size}, workers: "
                    f"{metrics['current_workers']}, "
                    f"avg time: {metrics['avg_processing_time']:.2f}s, "
                    f"error rate: {metrics['error_rate']:.1f}%)"
                )
            except Exception as e:
                logger.error(f"Batch evaluation error: {e}")
                self.batch_processor.adjust_batch_size(False)
                continue

        return results

    def _check_score_threshold(self, scores: Dict[str, float]) -> bool:
        r"""Check if scores meet the threshold requirements.

        Args:
            scores (Dict[str, float]): Dictionary of scores for different
                dimensions.

        Returns:
            bool: True if scores meet threshold requirements, False otherwise.
        """
        # If score_threshold is a float, apply it to all dimensions
        if isinstance(self.score_threshold, float):
            return all(
                score >= self.score_threshold for score in scores.values()
            )

        # If score_threshold is a dict, check each dimension with its threshold
        # Use 0 as default threshold for unspecified dimensions
        if isinstance(self.score_threshold, dict):
            for dim, score in scores.items():
                threshold = self.score_threshold.get(dim, 0)
                if score < threshold:
                    return False
            return True

        # If score_threshold is None or invalid type, pass the check
        return True

    def _generate_feedback(self, scores: Dict[str, float]) -> str:
        r"""Generate feedback based on which dimensions need improvement.

        Args:
            scores (Dict[str, float]): Dictionary of scores for different
                dimensions.

        Returns:
            str: Feedback message indicating which dimensions need improvement.
        """
        if isinstance(self.score_threshold, float):
            below_threshold = [
                dim
                for dim, score in scores.items()
                if score < self.score_threshold
            ]
            if not below_threshold:
                return "All dimensions meet the required threshold"
            dims = ", ".join(below_threshold)
            return f"Need improvement in: {dims}"

        if isinstance(self.score_threshold, dict):
            default_threshold = 0
            below_threshold = [
                dim
                for dim, score in scores.items()
                if score < self.score_threshold.get(dim, default_threshold)
            ]
            if not below_threshold:
                return "All dimensions meet their respective thresholds"
            dims = ", ".join(below_threshold)
            return f"Need improvement in: {dims}"

        # If no threshold set, just list all dimensions and their scores
        dims = ", ".join(
            f"{dim}: {score:.2f}" for dim, score in scores.items()
        )
        return f"Current scores - {dims}"

    @retry_on_error()
    def generate_reasoning_trace(self, problem: str) -> str:
        r"""Generate initial reasoning trace for a given problem.

        Args:
            problem (str): The problem text to generate reasoning for.

        Returns:
            str: Generated reasoning trace.
        """
        self.reason_agent.reset()
        few_shot_examples = (
            f"Examples: {self.few_shot_examples}"
            if self.few_shot_examples
            else ""
        )
        prompt = self.REASONING_TEMPLATE.format(
            problem=problem, few_shot_examples=few_shot_examples
        )
        response = self.reason_agent.step(prompt)
        return response.msg.content

    @retry_on_error()
    def evaluate_trace(
        self, problem: str, trace: str, solution: Optional[str] = None
    ) -> Dict[str, Any]:
        r"""Evaluate the quality of a reasoning trace.

        Args:
            problem (str): The original problem text to evaluate against.
            trace (str): The reasoning trace to evaluate.
            solution (Optional[str]): The solution to the problem, if provided.
                (default: :obj:`None`)

        Returns:
            Dict[str, Any]: Evaluation results containing:
                - scores: Dict of evaluation dimensions and their scores
                - feedback: Detailed feedback for improvement

                For Agent self-evaluation, the scores will include:
                - correctness: Score for logical correctness
                - clarity: Score for clarity of explanation
                - completeness: Score for completeness of reasoning

                For reward model evaluation, the scores will depend on
                the model's evaluation dimensions.
        """
        self.evaluate_agent.reset()  # type: ignore[union-attr]
        if self.evaluator:
            # Use reward model evaluation
            messages = [
                {"role": "user", "content": problem},
                {"role": "assistant", "content": trace},
            ]
            scores = self.evaluator.evaluate(messages)

            # For models that return a single score
            if isinstance(scores, (int, float)) or (
                isinstance(scores, dict) and len(scores) == 1
            ):
                if isinstance(scores, dict):
                    score = next(iter(scores.values()))
                else:
                    score = scores
                scores_dict = {"overall": score}
                return {
                    **scores_dict,
                    "feedback": self._generate_feedback(scores_dict),
                }

            # For models that return multiple dimensions
            return {**scores, "feedback": self._generate_feedback(scores)}
        else:
            # Fallback to original Agent self-evaluation
            solution_text = f"Solution: {solution}" if solution else ""
            prompt = self.EVALUATION_TEMPLATE.format(
                problem=problem, trace=trace, solution=solution_text
            )
            response = self.evaluate_agent.step(  # type: ignore[union-attr]
                prompt, response_format=AgentTraceEvaluation
            )
            if response.msg.parsed is None:
                raise AttributeError("Failed to parse evaluation response")
            # Convert dict to AgentTraceEvaluation if needed
            if isinstance(response.msg.parsed, dict):
                evaluation = AgentTraceEvaluation(**response.msg.parsed)
            else:
                evaluation = response.msg.parsed

            return evaluation.model_dump()

    @retry_on_error()
    def improve_trace(
        self,
        problem: str,
        trace: str,
        feedback: str,
        solution: Optional[str] = None,
    ) -> str:
        r"""Generate improved reasoning trace based on feedback.

        Args:
            problem (str): The original problem text.
            trace (str): The current reasoning trace.
            feedback (str): Feedback for improving the trace.
            solution (Optional[str]): The solution to the problem, if provided.
                (default: :obj:`None`)

        Returns:
            str: Improved reasoning trace.
        """
        self.reason_agent.reset()
        solution_text = f"Solution: {solution}" if solution else ""
        prompt = self.IMPROVEMENT_TEMPLATE.format(
            problem=problem,
            trace=trace,
            feedback=feedback,
            solution=solution_text,
        )
        response = self.reason_agent.step(prompt)
        return response.msg.content

    def validate_problem_format(self, problem: Dict) -> None:
        r"""Validate that a problem dictionary has the required format.

        Args:
            problem (Dict): Problem dictionary to validate.

        Raises:
            ValueError: If the problem format is invalid.
        """
        if not isinstance(problem, dict):
            raise ValueError("Problem must be a dictionary.")

        # Check required problem field
        if "problem" not in problem:
            raise ValueError("Problem dictionary must contain 'problem' key.")
        if not isinstance(problem["problem"], str):
            raise ValueError("Problem 'problem' field must be a string.")

        # Optional fields validation
        optional_fields: dict[str, type | tuple[type, ...]] = {
            "id": (str, int, type(None)),
            "type": str,
            "solution": str,
        }

        for field, expected_type in optional_fields.items():
            if field in problem and not isinstance(
                problem[field], expected_type
            ):
                type_name = (
                    expected_type.__name__
                    if hasattr(expected_type, '__name__')
                    else str(expected_type)
                )
                raise ValueError(
                    f"Problem '{field}' must be of "
                    f"type {type_name} if present."
                )

    def _check_boxed_answers(self, solution: str, trace: str) -> bool:
        r"""Check if the answer in the trace matches the solution using the
        configured patterns.

        Args:
            solution (str): The problem solution string.
            trace (str): The reasoning trace string.

        Returns:
            bool: True if answers match, False otherwise
        """
        import re

        # Extract content using the configured patterns
        solution_match = re.search(self.solution_pattern, solution, re.DOTALL)
        trace_match = re.search(self.trace_pattern, trace, re.DOTALL)

        if solution_match and trace_match:
            # Clean up whitespace and normalize content
            solution_answer = solution_match.group(1).strip()
            trace_answer = trace_match.group(1).strip()
            return solution_answer == trace_answer

        return False

    def process_problem(
        self, problem: Dict, rationalization: bool = False
    ) -> ProblemResult:
        r"""Process a single problem through the self-improving cot pipeline.

        Args:
            problem (Dict): Problem dictionary containing the problem text.
            rationalization (bool, optional): Whether to use rationalization.
                (default: :obj:`False`)

        Returns:
            ProblemResult: Results with final trace and history.

        Raises:
            ValueError: If the problem format is invalid.
        """
        # Validate problem format before processing
        self.validate_problem_format(problem)

        problem_text = problem["problem"]
        solution_text = problem.get("solution", "")
        current_trace = self.generate_reasoning_trace(problem_text)
        improvement_history = []
        scores = {}

        # Only evaluate if evaluate_agent or reward_model is set
        if self.evaluate_agent or self.reward_model:
            # Create batches for parallel evaluation
            batch_problems = [problem]
            batch_traces = [current_trace]
            batch_solutions = [solution_text]

            # Evaluate current trace batch
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                eval_results = loop.run_until_complete(
                    self._batch_evaluate_traces(
                        batch_problems, batch_traces, batch_solutions
                    )
                )
            finally:
                loop.close()

            # Process evaluation results
            eval_dict = eval_results[-1]  # Get latest evaluation
            scores = {k: v for k, v in eval_dict.items() if k != "feedback"}

            # Record initial evaluation
            if self.evaluator:
                improvement_history.append(
                    TraceIteration(
                        iteration=0,
                        trace=current_trace,
                        evaluation=RewardTraceEvaluation(**eval_dict),
                    )
                )
            else:
                improvement_history.append(
                    TraceIteration(
                        iteration=0,
                        trace=current_trace,
                        evaluation=AgentTraceEvaluation(
                            **scores, feedback=eval_dict["feedback"]
                        ),
                    )
                )

            # Only do improvement iterations if max_iterations > 0
            if self.max_iterations > 0:
                for iteration in range(0, self.max_iterations):
                    # Check if quality threshold met
                    if self._check_score_threshold(scores):
                        break

                    # Generate improved trace
                    if rationalization:
                        current_trace = self.improve_trace(
                            problem_text,
                            current_trace,
                            eval_dict["feedback"],
                            solution_text,
                        )
                    else:
                        current_trace = self.improve_trace(
                            problem_text, current_trace, eval_dict["feedback"]
                        )

                    # Evaluate improved trace
                    batch_traces = [current_trace]
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                    try:
                        eval_results = loop.run_until_complete(
                            self._batch_evaluate_traces(
                                batch_problems, batch_traces, batch_solutions
                            )
                        )
                    finally:
                        loop.close()

                    eval_dict = eval_results[-1]
                    scores = {
                        k: v for k, v in eval_dict.items() if k != "feedback"
                    }

                    # Record iteration history
                    if self.evaluator:
                        improvement_history.append(
                            TraceIteration(
                                iteration=iteration + 1,
                                trace=current_trace,
                                evaluation=RewardTraceEvaluation(**eval_dict),
                            )
                        )
                    else:
                        improvement_history.append(
                            TraceIteration(
                                iteration=iteration + 1,
                                trace=current_trace,
                                evaluation=AgentTraceEvaluation(
                                    **scores, feedback=eval_dict["feedback"]
                                ),
                            )
                        )

        boxed_answer_success = self._check_boxed_answers(
            problem.get("solution", ""), current_trace
        )

        result = ProblemResult(
            id=problem.get("id", ""),
            type=problem.get("type", ""),
            problem=problem_text,
            solution=problem.get("solution", ""),
            final_trace=current_trace,
            agent_evaluate_success=self._check_score_threshold(scores)
            if scores
            else None,
            boxed_answer_success=boxed_answer_success,
            improvement_history=improvement_history,
        )

        # Write result to file immediately if output path is specified
        if self.output_path:
            with self.lock:
                try:
                    # Read existing results
                    with open(self.output_path, 'r') as f:
                        data = json.load(f)

                    cleaned_result = self.clean_json(result.model_dump())
                    data['traces'].append(cleaned_result)
                    self.safe_write_json(self.output_path, data)

                except Exception as e:
                    logger.error(f"Error writing result to file: {e}")

        return result

    def generate(self, rationalization: bool = False) -> List[Dict[str, Any]]:
        r"""Execute the self-improving cot pipeline on all problems.

        Process problems and return results. If output_path is specified,
        also save results to file.

        Args:
            rationalization (bool, optional): Whether to use rationalization.
                (default: :obj:`False`)

        Returns:
            List[Dict[str, Any]]: List of processed results
        """
        # Pre-allocate results list
        self.reasoning_traces = []

        # Process problems in batches
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            results = loop.run_until_complete(
                self._batch_process_problems(self.problems, rationalization)
            )
        finally:
            loop.close()

        self.reasoning_traces = [result.model_dump() for result in results]
        return self.reasoning_traces

    # Templates for generating reasoning, evaluation and improving them.
    REASONING_TEMPLATE = """Let's solve this step by step:
Problem: {problem}
1. First, let's understand what we're asked
2. Let's break this down into parts
3. Let's solve each part systematically
4. Finally, let's verify our solution

{few_shot_examples}

Please show your complete reasoning process."""

    EVALUATION_TEMPLATE = """Please evaluate this reasoning trace and
provide scores and feedback in valid JSON format.

Problem: {problem}

{solution}

Reasoning Trace:
{trace}

Evaluate for:
1. Correctness (Is each step logically sound?)
2. Clarity (Is the explanation clear and well-structured?)
3. Completeness (Are all necessary steps included?)

Respond ONLY with a JSON object in this exact format:
{{
    "correctness": <score between 0 and 1>,
    "clarity": <score between 0 and 1>,
    "completeness": <score between 0 and 1>,
    "feedback": "<specific feedback for improvement>"
}}"""

    IMPROVEMENT_TEMPLATE = """Based on this feedback, generate an
improved reasoning trace:
Problem: {problem}

{solution}

Previous Trace:
{trace}

Feedback:
{feedback}

Generate a new, improved reasoning trace that addresses the feedback."""
```
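For orientation, a minimal usage sketch of the new pipeline follows. It is not part of the release: it assumes `SelfImprovingCoTPipeline` is re-exported from `camel.datagen` (consistent with the `__init__.py` change listed above; otherwise import it from `camel.datagen.self_improving_cot`), and the agents, problem data, and output path are illustrative placeholders.

```python
from camel.agents import ChatAgent
from camel.datagen import SelfImprovingCoTPipeline  # assumed re-export

# Hypothetical agents; any configured ChatAgent works here.
reason_agent = ChatAgent("You are a careful mathematical reasoner.")
evaluate_agent = ChatAgent("You are a strict evaluator of reasoning traces.")

# Illustrative problem; the solution uses the default \boxed{...} pattern.
problems = [{"problem": "Compute 12 * 13.", "solution": r"\boxed{156}"}]

pipeline = SelfImprovingCoTPipeline(
    reason_agent=reason_agent,
    evaluate_agent=evaluate_agent,
    problems=problems,
    max_iterations=2,
    score_threshold=0.8,
    output_path="star_traces.json",  # omit to skip writing to disk
)

# Returns a list of dicts, one dumped ProblemResult per problem.
results = pipeline.generate()
```

Per the docstrings above, passing `reward_model` instead of `evaluate_agent` switches to reward-model scoring, and `score_threshold` can then be a per-dimension dict such as `{"correctness": 0.8}`.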