camel-ai 0.2.20a1__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of camel-ai might be problematic.

Files changed (42)
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +2 -3
  3. camel/agents/knowledge_graph_agent.py +1 -5
  4. camel/benchmarks/apibench.py +1 -5
  5. camel/benchmarks/nexus.py +1 -5
  6. camel/benchmarks/ragbench.py +2 -2
  7. camel/bots/telegram_bot.py +1 -5
  8. camel/configs/__init__.py +3 -0
  9. camel/configs/aiml_config.py +80 -0
  10. camel/datagen/__init__.py +3 -1
  11. camel/datagen/self_improving_cot.py +821 -0
  12. camel/datagen/self_instruct/self_instruct.py +1 -1
  13. camel/embeddings/openai_embedding.py +10 -1
  14. camel/interpreters/docker/Dockerfile +12 -0
  15. camel/interpreters/docker_interpreter.py +19 -1
  16. camel/interpreters/subprocess_interpreter.py +97 -6
  17. camel/loaders/__init__.py +2 -0
  18. camel/loaders/mineru_extractor.py +250 -0
  19. camel/models/__init__.py +2 -0
  20. camel/models/aiml_model.py +147 -0
  21. camel/models/base_model.py +54 -1
  22. camel/models/deepseek_model.py +0 -18
  23. camel/models/model_factory.py +3 -0
  24. camel/models/siliconflow_model.py +1 -1
  25. camel/societies/workforce/role_playing_worker.py +2 -4
  26. camel/societies/workforce/single_agent_worker.py +1 -6
  27. camel/societies/workforce/workforce.py +3 -9
  28. camel/toolkits/__init__.py +5 -0
  29. camel/toolkits/mineru_toolkit.py +178 -0
  30. camel/toolkits/reddit_toolkit.py +8 -38
  31. camel/toolkits/sympy_toolkit.py +816 -0
  32. camel/toolkits/whatsapp_toolkit.py +11 -32
  33. camel/types/enums.py +25 -1
  34. camel/utils/__init__.py +7 -2
  35. camel/utils/commons.py +198 -21
  36. camel/utils/deduplication.py +232 -0
  37. camel/utils/token_counting.py +0 -38
  38. {camel_ai-0.2.20a1.dist-info → camel_ai-0.2.22.dist-info}/METADATA +10 -13
  39. {camel_ai-0.2.20a1.dist-info → camel_ai-0.2.22.dist-info}/RECORD +42 -34
  40. {camel_ai-0.2.20a1.dist-info → camel_ai-0.2.22.dist-info}/WHEEL +1 -1
  41. /camel/datagen/{cotdatagen.py → cot_datagen.py} +0 -0
  42. {camel_ai-0.2.20a1.dist-info → camel_ai-0.2.22.dist-info}/LICENSE +0 -0
camel/datagen/self_improving_cot.py
@@ -0,0 +1,821 @@
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+ import asyncio
+ import json
+ import math
+ import os
+ import threading
+ import time
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Any, Dict, List, Optional, Union
+
+ from pydantic import BaseModel
+
+ from camel.agents import ChatAgent
+ from camel.logger import get_logger
+ from camel.models.reward import BaseRewardModel, Evaluator
+ from camel.utils import BatchProcessor, retry_on_error
+
+ logger = get_logger(__name__)
+
+
+ class AgentTraceEvaluation(BaseModel):
+     correctness: float
+     clarity: float
+     completeness: float
+     feedback: str
+
+
+ class RewardTraceEvaluation(BaseModel):
+     feedback: str
+
+     def __init__(self, **data):
+         # Allow dynamic score fields while ensuring feedback is present
+         super().__init__(**data)
+
+     class Config:
+         extra = (
+             "allow"  # Allow extra fields for different reward model dimensions
+         )
+
+
+ class TraceIteration(BaseModel):
+     iteration: int
+     trace: str
+     evaluation: Union[AgentTraceEvaluation, RewardTraceEvaluation]
+
+
+ class ProblemResult(BaseModel):
+     id: Optional[str] = None
+     type: Optional[str] = None
+     problem: str
+     solution: Optional[str] = None
+     final_trace: str
+     agent_evaluate_success: Optional[bool] = None
+     boxed_answer_success: bool = False
+     improvement_history: List[TraceIteration]
+
+
+ class SelfImprovingCoTPipeline:
+     r"""Pipeline for generating self-taught reasoning traces
+     using the self-improving methodology.
+
+     This implements the STaR paper's approach of:
+     1. Initial reasoning trace generation
+     2. Self-evaluation
+     3. Feedback-based improvement
+     4. Iterative refinement
+     """
+
+     def __init__(
+         self,
+         reason_agent: ChatAgent,
+         problems: List[Dict],
+         max_iterations: int = 3,
+         score_threshold: Union[float, Dict[str, float]] = 0.7,
+         evaluate_agent: Optional[ChatAgent] = None,
+         reward_model: Optional[BaseRewardModel] = None,
+         output_path: Optional[str] = None,
+         few_shot_examples: Optional[str] = None,
+         batch_size: Optional[int] = None,
+         max_workers: Optional[int] = None,
+         solution_pattern: str = r'\\boxed{(.*?)}',
+         trace_pattern: Optional[str] = None,
+     ):
+         r"""Initialize the self-improving CoT pipeline.
+
+         Args:
+             reason_agent (ChatAgent): The chat agent used for generating and
+                 improving reasoning traces.
+             problems (List[Dict]): List of problem dictionaries to process.
+             max_iterations (int, optional): Maximum number of improvement
+                 iterations. If set to `0`, the pipeline will generate an
+                 initial trace without any improvement iterations.
+                 (default: :obj:`3`)
+             score_threshold (Union[float, Dict[str, float]], optional):
+                 Quality threshold. Can be either a single float value applied
+                 to the average score, or a dictionary mapping score
+                 dimensions to their thresholds. For example:
+                 {"correctness": 0.8, "coherence": 0.7}. If a reward model is
+                 used and the threshold for a dimension is not specified, the
+                 default value 0.7 is used. (default: :obj:`0.7`)
+             evaluate_agent (Optional[ChatAgent]): The chat agent used for
+                 evaluating reasoning traces. (default: :obj:`None`)
+             reward_model (BaseRewardModel, optional): Model used to evaluate
+                 reasoning traces. If `None`, uses Agent self-evaluation.
+                 (default: :obj:`None`)
+             output_path (str, optional): Output path for saving traces. If
+                 `None`, results will only be returned without saving to file.
+                 (default: :obj:`None`)
+             few_shot_examples (str, optional): Examples to use for few-shot
+                 generation. (default: :obj:`None`)
+             batch_size (int, optional): Batch size for parallel processing.
+                 (default: :obj:`None`)
+             max_workers (int, optional): Maximum number of worker threads.
+                 (default: :obj:`None`)
+             solution_pattern (str, optional): Regular expression pattern with
+                 one capture group to extract answers from solution text.
+                 (default: :obj:`r'\\boxed{(.*?)}'`)
+             trace_pattern (str, optional): Regular expression pattern with one
+                 capture group to extract answers from trace text. If `None`,
+                 uses the same pattern as solution_pattern.
+                 (default: :obj:`None`)
+         """
+         self.reason_agent = reason_agent
+         self.evaluate_agent = evaluate_agent
+         self.problems = problems
+         self.output_path = output_path
+         self.max_iterations = max_iterations
+         self.score_threshold = score_threshold
+         self.reward_model = reward_model
+         self.evaluator = (
+             Evaluator(reward_model=reward_model) if reward_model else None
+         )
+         self.reasoning_traces: List[Dict[str, Any]] = []
+         self.few_shot_examples = few_shot_examples
+         self.batch_processor = BatchProcessor(max_workers, batch_size)
+         self.solution_pattern = solution_pattern
+         self.trace_pattern = (
+             trace_pattern if trace_pattern is not None else solution_pattern
+         )
+
+         # Initialize output file with empty results if path is specified
+         if self.output_path:
+             with open(self.output_path, 'w') as f:
+                 json.dump({'traces': []}, f, indent=2)
+         self.lock = threading.Lock()
+
+     def safe_write_json(self, file_path, data):
+         temp_path = file_path + ".tmp"
+         with open(temp_path, "w") as f:
+             json.dump(data, f, indent=2)
+         os.replace(temp_path, file_path)
+
+     def clean_json(self, data):
+         if isinstance(data, dict):
+             return {k: self.clean_json(v) for k, v in data.items()}
+         elif isinstance(data, list):
+             return [self.clean_json(v) for v in data]
+         elif isinstance(data, float) and (
+             math.isnan(data) or math.isinf(data)
+         ):
+             return None
+         return data
+
+     async def _batch_process_problems(
+         self, problems: List[Dict], rationalization: bool
+     ) -> List[ProblemResult]:
+         r"""Process multiple problems in parallel batches with dynamic sizing.
+
+         Args:
+             problems (List[Dict]): List of problem dictionaries to process.
+             rationalization (bool): Whether to use rationalization.
+
+         Returns:
+             List[ProblemResult]: List of problem results.
+         """
+         results = []
+         total_problems = len(problems)
+         processed = 0
+
+         while processed < total_problems:
+             batch_size = self.batch_processor.batch_size
+             batch = problems[processed : processed + batch_size]
+             batch_start_time = time.time()
+
+             try:
+                 with ThreadPoolExecutor(
+                     max_workers=self.batch_processor.max_workers
+                 ) as executor:
+                     # Create futures with rationalization parameter
+                     futures = [
+                         executor.submit(
+                             self.process_problem,
+                             problem=problem,
+                             rationalization=rationalization,
+                         )
+                         for problem in batch
+                     ]
+
+                     batch_results = []
+                     batch_success = True
+                     for future in as_completed(futures):
+                         try:
+                             result = future.result()
+                             batch_results.append(result)
+                         except Exception as e:
+                             logger.error(f"Error processing problem: {e}")
+                             batch_success = False
+                             continue
+
+                     results.extend(batch_results)
+                     processed += len(batch)
+
+                     # Calculate processing time and adjust batch size
+                     processing_time = time.time() - batch_start_time
+                     self.batch_processor.adjust_batch_size(
+                         batch_success, processing_time
+                     )
+
+                     # Log progress and performance metrics
+                     metrics = self.batch_processor.get_performance_metrics()
+                     logger.info(
+                         f"Processed {processed}/{total_problems} problems "
+                         f"(batch size: {batch_size}, workers: "
+                         f"{metrics['current_workers']}, "
+                         f"CPU: {metrics['current_cpu']:.1f}%, "
+                         f"Memory: {metrics['current_memory']:.1f}%)"
+                     )
+             except Exception as e:
+                 logger.error(f"Batch processing error: {e}")
+                 self.batch_processor.adjust_batch_size(False)
+                 continue
+
+         return results
+
+     async def _batch_evaluate_traces(
+         self,
+         problems: List[Dict[str, Any]],
+         traces: List[str],
+         solutions: Optional[List[str]] = None,
+     ) -> List[Dict[str, Any]]:
+         r"""Evaluate multiple traces in parallel batches with resource
+         monitoring.
+
+         Args:
+             problems (List[Dict[str, Any]]): List of problem dictionaries.
+             traces (List[str]): List of reasoning traces to evaluate.
+             solutions (Optional[List[str]]): Optional list of solutions.
+
+         Returns:
+             List[Dict[str, Any]]: List of evaluation results.
+         """
+         if solutions is None:
+             solutions = ["null"] * len(problems)
+
+         results = []
+         total_traces = len(traces)
+         processed = 0
+
+         while processed < total_traces:
+             batch_size = self.batch_processor.batch_size
+             problem_batch = problems[processed : processed + batch_size]
+             trace_batch = traces[processed : processed + batch_size]
+             solution_batch = solutions[processed : processed + batch_size]
+             batch_start_time = time.time()
+
+             try:
+                 with ThreadPoolExecutor(
+                     max_workers=self.batch_processor.max_workers
+                 ) as executor:
+                     futures = [
+                         executor.submit(
+                             self.evaluate_trace,
+                             problem=problem["problem"],
+                             trace=trace,
+                             solution=solution,
+                         )
+                         for problem, trace, solution in zip(
+                             problem_batch, trace_batch, solution_batch
+                         )
+                     ]
+
+                     batch_results = []
+                     batch_success = True
+                     for future in as_completed(futures):
+                         try:
+                             result = future.result()
+                             batch_results.append(result)
+                         except Exception as e:
+                             logger.error(f"Error evaluating trace: {e}")
+                             batch_success = False
+                             continue
+
+                     results.extend(batch_results)
+                     processed += len(batch_results)
+
+                     # Calculate processing time and adjust batch size
+                     processing_time = time.time() - batch_start_time
+                     self.batch_processor.adjust_batch_size(
+                         batch_success, processing_time
+                     )
+
+                     # Log progress and performance metrics
+                     metrics = self.batch_processor.get_performance_metrics()
+                     logger.info(
+                         f"Evaluated {processed}/{total_traces} traces "
+                         f"(batch size: {batch_size}, workers: "
+                         f"{metrics['current_workers']}, "
+                         f"avg time: {metrics['avg_processing_time']:.2f}s, "
+                         f"error rate: {metrics['error_rate']:.1f}%)"
+                     )
+             except Exception as e:
+                 logger.error(f"Batch evaluation error: {e}")
+                 self.batch_processor.adjust_batch_size(False)
+                 continue
+
+         return results
+
+     def _check_score_threshold(self, scores: Dict[str, float]) -> bool:
+         r"""Check if scores meet the threshold requirements.
+
+         Args:
+             scores (Dict[str, float]): Dictionary of scores for different
+                 dimensions.
+
+         Returns:
+             bool: True if scores meet threshold requirements, False otherwise.
+         """
+         # If score_threshold is a float, apply it to all dimensions
+         if isinstance(self.score_threshold, float):
+             return all(
+                 score >= self.score_threshold for score in scores.values()
+             )
+
+         # If score_threshold is a dict, check each dimension against its
+         # threshold, using 0 as the default for unspecified dimensions
+         if isinstance(self.score_threshold, dict):
+             for dim, score in scores.items():
+                 threshold = self.score_threshold.get(dim, 0)
+                 if score < threshold:
+                     return False
+             return True
+
+         # If score_threshold is None or an invalid type, pass the check
+         return True
+
+     def _generate_feedback(self, scores: Dict[str, float]) -> str:
+         r"""Generate feedback based on which dimensions need improvement.
+
+         Args:
+             scores (Dict[str, float]): Dictionary of scores for different
+                 dimensions.
+
+         Returns:
+             str: Feedback message indicating which dimensions need
+                 improvement.
+         """
+         if isinstance(self.score_threshold, float):
+             below_threshold = [
+                 dim
+                 for dim, score in scores.items()
+                 if score < self.score_threshold
+             ]
+             if not below_threshold:
+                 return "All dimensions meet the required threshold"
+             dims = ", ".join(below_threshold)
+             return f"Need improvement in: {dims}"
+
+         if isinstance(self.score_threshold, dict):
+             default_threshold = 0
+             below_threshold = [
+                 dim
+                 for dim, score in scores.items()
+                 if score < self.score_threshold.get(dim, default_threshold)
+             ]
+             if not below_threshold:
+                 return "All dimensions meet their respective thresholds"
+             dims = ", ".join(below_threshold)
+             return f"Need improvement in: {dims}"
+
+         # If no threshold is set, just list all dimensions and their scores
+         dims = ", ".join(
+             f"{dim}: {score:.2f}" for dim, score in scores.items()
+         )
+         return f"Current scores - {dims}"
+
+     @retry_on_error()
+     def generate_reasoning_trace(self, problem: str) -> str:
+         r"""Generate an initial reasoning trace for a given problem.
+
+         Args:
+             problem (str): The problem text to generate reasoning for.
+
+         Returns:
+             str: Generated reasoning trace.
+         """
+         self.reason_agent.reset()
+         few_shot_examples = (
+             f"Examples: {self.few_shot_examples}"
+             if self.few_shot_examples
+             else ""
+         )
+         prompt = self.REASONING_TEMPLATE.format(
+             problem=problem, few_shot_examples=few_shot_examples
+         )
+         response = self.reason_agent.step(prompt)
+         return response.msg.content
+
+     @retry_on_error()
+     def evaluate_trace(
+         self, problem: str, trace: str, solution: Optional[str] = None
+     ) -> Dict[str, Any]:
+         r"""Evaluate the quality of a reasoning trace.
+
+         Args:
+             problem (str): The original problem text to evaluate against.
+             trace (str): The reasoning trace to evaluate.
+             solution (Optional[str]): The solution to the problem, if
+                 provided. (default: :obj:`None`)
+
+         Returns:
+             Dict[str, Any]: Evaluation results containing:
+                 - scores: Dict of evaluation dimensions and their scores
+                 - feedback: Detailed feedback for improvement
+
+                 For Agent self-evaluation, the scores will include:
+                 - correctness: Score for logical correctness
+                 - clarity: Score for clarity of explanation
+                 - completeness: Score for completeness of reasoning
+
+                 For reward model evaluation, the scores will depend on
+                 the model's evaluation dimensions.
+         """
+         self.evaluate_agent.reset()  # type: ignore[union-attr]
+         if self.evaluator:
+             # Use reward model evaluation
+             messages = [
+                 {"role": "user", "content": problem},
+                 {"role": "assistant", "content": trace},
+             ]
+             scores = self.evaluator.evaluate(messages)
+
+             # For models that return a single score
+             if isinstance(scores, (int, float)) or (
+                 isinstance(scores, dict) and len(scores) == 1
+             ):
+                 if isinstance(scores, dict):
+                     score = next(iter(scores.values()))
+                 else:
+                     score = scores
+                 scores_dict = {"overall": score}
+                 return {
+                     **scores_dict,
+                     "feedback": self._generate_feedback(scores_dict),
+                 }
+
+             # For models that return multiple dimensions
+             return {**scores, "feedback": self._generate_feedback(scores)}
+         else:
+             # Fall back to the original Agent self-evaluation
+             solution_text = f"Solution: {solution}" if solution else ""
+             prompt = self.EVALUATION_TEMPLATE.format(
+                 problem=problem, trace=trace, solution=solution_text
+             )
+             response = self.evaluate_agent.step(  # type: ignore[union-attr]
+                 prompt, response_format=AgentTraceEvaluation
+             )
+             if response.msg.parsed is None:
+                 raise AttributeError("Failed to parse evaluation response")
+             # Convert dict to AgentTraceEvaluation if needed
+             if isinstance(response.msg.parsed, dict):
+                 evaluation = AgentTraceEvaluation(**response.msg.parsed)
+             else:
+                 evaluation = response.msg.parsed
+
+             return evaluation.model_dump()
+
+     @retry_on_error()
+     def improve_trace(
+         self,
+         problem: str,
+         trace: str,
+         feedback: str,
+         solution: Optional[str] = None,
+     ) -> str:
+         r"""Generate an improved reasoning trace based on feedback.
+
+         Args:
+             problem (str): The original problem text.
+             trace (str): The current reasoning trace.
+             feedback (str): Feedback for improving the trace.
+             solution (Optional[str]): The solution to the problem, if
+                 provided. (default: :obj:`None`)
+
+         Returns:
+             str: Improved reasoning trace.
+         """
+         self.reason_agent.reset()
+         solution_text = f"Solution: {solution}" if solution else ""
+         prompt = self.IMPROVEMENT_TEMPLATE.format(
+             problem=problem,
+             trace=trace,
+             feedback=feedback,
+             solution=solution_text,
+         )
+         response = self.reason_agent.step(prompt)
+         return response.msg.content
+
+     def validate_problem_format(self, problem: Dict) -> None:
+         r"""Validate that a problem dictionary has the required format.
+
+         Args:
+             problem (Dict): Problem dictionary to validate.
+
+         Raises:
+             ValueError: If the problem format is invalid.
+         """
+         if not isinstance(problem, dict):
+             raise ValueError("Problem must be a dictionary.")
+
+         # Check the required problem field
+         if "problem" not in problem:
+             raise ValueError("Problem dictionary must contain 'problem' key.")
+         if not isinstance(problem["problem"], str):
+             raise ValueError("Problem 'problem' field must be a string.")
+
+         # Optional fields validation
+         optional_fields: dict[str, type | tuple[type, ...]] = {
+             "id": (str, int, type(None)),
+             "type": str,
+             "solution": str,
+         }
+
+         for field, expected_type in optional_fields.items():
+             if field in problem and not isinstance(
+                 problem[field], expected_type
+             ):
+                 type_name = (
+                     expected_type.__name__
+                     if hasattr(expected_type, '__name__')
+                     else str(expected_type)
+                 )
+                 raise ValueError(
+                     f"Problem '{field}' must be of "
+                     f"type {type_name} if present."
+                 )
+
+     def _check_boxed_answers(self, solution: str, trace: str) -> bool:
+         r"""Check if the answer in the trace matches the solution using the
+         configured patterns.
+
+         Args:
+             solution (str): The problem solution string.
+             trace (str): The reasoning trace string.
+
+         Returns:
+             bool: True if the answers match, False otherwise.
+         """
+         import re
+
+         # Extract content using the configured patterns
+         solution_match = re.search(self.solution_pattern, solution, re.DOTALL)
+         trace_match = re.search(self.trace_pattern, trace, re.DOTALL)
+
+         if solution_match and trace_match:
+             # Clean up whitespace and normalize content
+             solution_answer = solution_match.group(1).strip()
+             trace_answer = trace_match.group(1).strip()
+             return solution_answer == trace_answer
+
+         return False
+
+     def process_problem(
+         self, problem: Dict, rationalization: bool = False
+     ) -> ProblemResult:
+         r"""Process a single problem through the self-improving CoT pipeline.
+
+         Args:
+             problem (Dict): Problem dictionary containing the problem text.
+             rationalization (bool, optional): Whether to use rationalization.
+                 (default: :obj:`False`)
+
+         Returns:
+             ProblemResult: Results with final trace and history.
+
+         Raises:
+             ValueError: If the problem format is invalid.
+         """
+         # Validate problem format before processing
+         self.validate_problem_format(problem)
+
+         problem_text = problem["problem"]
+         solution_text = problem.get("solution", "")
+         current_trace = self.generate_reasoning_trace(problem_text)
+         improvement_history = []
+         scores = {}
+
+         # Only evaluate if evaluate_agent or reward_model is set
+         if self.evaluate_agent or self.reward_model:
+             # Create batches for parallel evaluation
+             batch_problems = [problem]
+             batch_traces = [current_trace]
+             batch_solutions = [solution_text]
+
+             # Evaluate current trace batch
+             loop = asyncio.new_event_loop()
+             asyncio.set_event_loop(loop)
+             try:
+                 eval_results = loop.run_until_complete(
+                     self._batch_evaluate_traces(
+                         batch_problems, batch_traces, batch_solutions
+                     )
+                 )
+             finally:
+                 loop.close()
+
+             # Process evaluation results
+             eval_dict = eval_results[-1]  # Get latest evaluation
+             scores = {k: v for k, v in eval_dict.items() if k != "feedback"}
+
+             # Record initial evaluation
+             if self.evaluator:
+                 improvement_history.append(
+                     TraceIteration(
+                         iteration=0,
+                         trace=current_trace,
+                         evaluation=RewardTraceEvaluation(**eval_dict),
+                     )
+                 )
+             else:
+                 improvement_history.append(
+                     TraceIteration(
+                         iteration=0,
+                         trace=current_trace,
+                         evaluation=AgentTraceEvaluation(
+                             **scores, feedback=eval_dict["feedback"]
+                         ),
+                     )
+                 )
+
+             # Only do improvement iterations if max_iterations > 0
+             if self.max_iterations > 0:
+                 for iteration in range(0, self.max_iterations):
+                     # Check if the quality threshold is met
+                     if self._check_score_threshold(scores):
+                         break
+
+                     # Generate improved trace
+                     if rationalization:
+                         current_trace = self.improve_trace(
+                             problem_text,
+                             current_trace,
+                             eval_dict["feedback"],
+                             solution_text,
+                         )
+                     else:
+                         current_trace = self.improve_trace(
+                             problem_text, current_trace, eval_dict["feedback"]
+                         )
+
+                     # Evaluate improved trace
+                     batch_traces = [current_trace]
+                     loop = asyncio.new_event_loop()
+                     asyncio.set_event_loop(loop)
+                     try:
+                         eval_results = loop.run_until_complete(
+                             self._batch_evaluate_traces(
+                                 batch_problems, batch_traces, batch_solutions
+                             )
+                         )
+                     finally:
+                         loop.close()
+
+                     eval_dict = eval_results[-1]
+                     scores = {
+                         k: v for k, v in eval_dict.items() if k != "feedback"
+                     }
+
+                     # Record iteration history
+                     if self.evaluator:
+                         improvement_history.append(
+                             TraceIteration(
+                                 iteration=iteration + 1,
+                                 trace=current_trace,
+                                 evaluation=RewardTraceEvaluation(**eval_dict),
+                             )
+                         )
+                     else:
+                         improvement_history.append(
+                             TraceIteration(
+                                 iteration=iteration + 1,
+                                 trace=current_trace,
+                                 evaluation=AgentTraceEvaluation(
+                                     **scores, feedback=eval_dict["feedback"]
+                                 ),
+                             )
+                         )
+
+         boxed_answer_success = self._check_boxed_answers(
+             problem.get("solution", ""), current_trace
+         )
+
+         result = ProblemResult(
+             id=problem.get("id", ""),
+             type=problem.get("type", ""),
+             problem=problem_text,
+             solution=problem.get("solution", ""),
+             final_trace=current_trace,
+             agent_evaluate_success=self._check_score_threshold(scores)
+             if scores
+             else None,
+             boxed_answer_success=boxed_answer_success,
+             improvement_history=improvement_history,
+         )
+
+         # Write result to file immediately if output path is specified
+         if self.output_path:
+             with self.lock:
+                 try:
+                     # Read existing results
+                     with open(self.output_path, 'r') as f:
+                         data = json.load(f)
+
+                     cleaned_result = self.clean_json(result.model_dump())
+                     data['traces'].append(cleaned_result)
+                     self.safe_write_json(self.output_path, data)
+
+                 except Exception as e:
+                     logger.error(f"Error writing result to file: {e}")
+
+         return result
+
+     def generate(self, rationalization: bool = False) -> List[Dict[str, Any]]:
+         r"""Execute the self-improving CoT pipeline on all problems.
+
+         Process problems and return results. If output_path is specified,
+         also save results to file.
+
+         Args:
+             rationalization (bool, optional): Whether to use rationalization.
+                 (default: :obj:`False`)
+
+         Returns:
+             List[Dict[str, Any]]: List of processed results.
+         """
+         # Pre-allocate results list
+         self.reasoning_traces = []
+
+         # Process problems in batches
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+
+         try:
+             results = loop.run_until_complete(
+                 self._batch_process_problems(self.problems, rationalization)
+             )
+         finally:
+             loop.close()
+
+         self.reasoning_traces = [result.model_dump() for result in results]
+         return self.reasoning_traces
+
+     # Templates for generating reasoning traces, evaluating them, and
+     # improving them.
+     REASONING_TEMPLATE = """Let's solve this step by step:
+ Problem: {problem}
+ 1. First, let's understand what we're asked
+ 2. Let's break this down into parts
+ 3. Let's solve each part systematically
+ 4. Finally, let's verify our solution
+
+ {few_shot_examples}
+
+ Please show your complete reasoning process."""
+
+     EVALUATION_TEMPLATE = """Please evaluate this reasoning trace and
+ provide scores and feedback in valid JSON format.
+
+ Problem: {problem}
+
+ {solution}
+
+ Reasoning Trace:
+ {trace}
+
+ Evaluate for:
+ 1. Correctness (Is each step logically sound?)
+ 2. Clarity (Is the explanation clear and well-structured?)
+ 3. Completeness (Are all necessary steps included?)
+
+ Respond ONLY with a JSON object in this exact format:
+ {{
+     "correctness": <score between 0 and 1>,
+     "clarity": <score between 0 and 1>,
+     "completeness": <score between 0 and 1>,
+     "feedback": "<specific feedback for improvement>"
+ }}"""
+
+     IMPROVEMENT_TEMPLATE = """Based on this feedback, generate an
+ improved reasoning trace:
+ Problem: {problem}
+
+ {solution}
+
+ Previous Trace:
+ {trace}
+
+ Feedback:
+ {feedback}
+
+ Generate a new, improved reasoning trace that addresses the feedback."""
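
For orientation, below is a minimal usage sketch of the SelfImprovingCoTPipeline added in this release. It is not taken from the package itself: it assumes an OpenAI-compatible model is configured (e.g. via OPENAI_API_KEY), that the class is re-exported from camel.datagen (consistent with the camel/datagen/__init__.py change listed above), and that ChatAgent accepts a system-message string; the problems content and the "star_traces.json" file name are illustrative only.

# Hedged usage sketch; agent construction details may differ across versions.
from camel.agents import ChatAgent
from camel.datagen import SelfImprovingCoTPipeline

# One toy problem; "solution" uses the default \boxed{...} answer pattern.
problems = [
    {"id": "1", "problem": "Compute 2 + 2.", "solution": r"\boxed{4}"},
]

pipeline = SelfImprovingCoTPipeline(
    reason_agent=ChatAgent("You are a careful step-by-step reasoner."),
    evaluate_agent=ChatAgent("You strictly evaluate reasoning traces."),
    problems=problems,
    max_iterations=2,
    # Either a single float or per-dimension thresholds, as documented above
    score_threshold={"correctness": 0.8, "clarity": 0.7},
    output_path="star_traces.json",  # illustrative path; omit to skip saving
)

# generate() returns each ProblemResult as a dict (via model_dump)
results = pipeline.generate(rationalization=False)
print(results[0]["final_trace"], results[0]["boxed_answer_success"])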