opik-optimizer 1.1.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/base_optimizer.py +376 -19
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +80 -17
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +179 -39
- opik_optimizer/evolutionary_optimizer/llm_support.py +3 -1
- opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +17 -3
- opik_optimizer/evolutionary_optimizer/population_ops.py +5 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +47 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +12 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +65 -59
- opik_optimizer/gepa_optimizer/adapter.py +5 -3
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +163 -66
- opik_optimizer/mcp_utils/mcp_workflow.py +57 -3
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +75 -69
- opik_optimizer/mipro_optimizer/_lm.py +10 -3
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +1 -1
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +96 -21
- opik_optimizer/optimizable_agent.py +5 -0
- opik_optimizer/optimization_result.py +1 -0
- opik_optimizer/utils/core.py +56 -14
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.1.dist-info}/METADATA +97 -10
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.1.dist-info}/RECORD +27 -26
- /opik_optimizer/{colbert.py → utils/colbert.py} +0 -0
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.1.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,3 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
1
|
import logging
|
4
2
|
from contextlib import nullcontext
|
5
3
|
from typing import Any, ContextManager
|
@@ -12,15 +10,18 @@ from opik.evaluation.metrics.score_result import ScoreResult
|
|
12
10
|
from ..base_optimizer import BaseOptimizer
|
13
11
|
from ..optimization_config import chat_prompt, mappers
|
14
12
|
from ..optimization_result import OptimizationResult
|
15
|
-
from ..
|
16
|
-
from ..
|
13
|
+
from ..optimizable_agent import OptimizableAgent
|
14
|
+
from ..utils import (
|
15
|
+
optimization_context,
|
16
|
+
create_litellm_agent_class,
|
17
|
+
disable_experiment_reporting,
|
18
|
+
enable_experiment_reporting,
|
19
|
+
)
|
17
20
|
from .. import task_evaluator
|
18
21
|
from . import reporting as gepa_reporting
|
19
22
|
from .adapter import OpikDataInst, OpikGEPAAdapter
|
20
23
|
|
21
|
-
|
22
|
-
_setup_logging()
|
23
|
-
LOGGER = logging.getLogger("opik_optimizer.gepa.optimizer")
|
24
|
+
logger = logging.getLogger(__name__)
|
24
25
|
|
25
26
|
|
26
27
|
class GepaOptimizer(BaseOptimizer):
|
@@ -32,14 +33,63 @@ class GepaOptimizer(BaseOptimizer):
|
|
32
33
|
project_name: str | None = None,
|
33
34
|
reflection_model: str | None = None,
|
34
35
|
verbose: int = 1,
|
36
|
+
seed: int = 42,
|
35
37
|
**model_kwargs: Any,
|
36
38
|
) -> None:
|
37
|
-
|
39
|
+
# Validate required parameters
|
40
|
+
if model is None:
|
41
|
+
raise ValueError("model parameter is required and cannot be None")
|
42
|
+
if not isinstance(model, str):
|
43
|
+
raise ValueError(f"model must be a string, got {type(model).__name__}")
|
44
|
+
if not model.strip():
|
45
|
+
raise ValueError("model cannot be empty or whitespace-only")
|
46
|
+
|
47
|
+
# Validate optional parameters
|
48
|
+
if project_name is not None and not isinstance(project_name, str):
|
49
|
+
raise ValueError(
|
50
|
+
f"project_name must be a string or None, got {type(project_name).__name__}"
|
51
|
+
)
|
52
|
+
|
53
|
+
if reflection_model is not None and not isinstance(reflection_model, str):
|
54
|
+
raise ValueError(
|
55
|
+
f"reflection_model must be a string or None, got {type(reflection_model).__name__}"
|
56
|
+
)
|
57
|
+
|
58
|
+
if not isinstance(verbose, int):
|
59
|
+
raise ValueError(
|
60
|
+
f"verbose must be an integer, got {type(verbose).__name__}"
|
61
|
+
)
|
62
|
+
if verbose < 0:
|
63
|
+
raise ValueError("verbose must be non-negative")
|
64
|
+
|
65
|
+
if not isinstance(seed, int):
|
66
|
+
raise ValueError(f"seed must be an integer, got {type(seed).__name__}")
|
67
|
+
|
68
|
+
super().__init__(model=model, verbose=verbose, seed=seed, **model_kwargs)
|
38
69
|
self.project_name = project_name
|
39
70
|
self.reflection_model = reflection_model or model
|
40
71
|
self.num_threads = self.model_kwargs.pop("num_threads", 6)
|
41
|
-
self.seed = self.model_kwargs.pop("seed", 42)
|
42
72
|
self._gepa_live_metric_calls = 0
|
73
|
+
self._adapter = None # Will be set during optimization
|
74
|
+
|
75
|
+
def get_optimizer_metadata(self) -> dict[str, Any]:
    """Return this optimizer's project name and reflection model as a dict.

    Returns:
        A mapping with the keys ``"project_name"`` and ``"reflection_model"``,
        taken from the attributes of the same names on ``self``.
    """
    metadata: dict[str, Any] = {}
    metadata["project_name"] = self.project_name
    metadata["reflection_model"] = self.reflection_model
    return metadata
|
80
|
+
|
81
|
+
def cleanup(self) -> None:
    """Run the parent class cleanup, then reset GEPA-specific state.

    After delegating to ``super().cleanup()``, the live metric-call counter
    is set back to zero and the adapter reference is dropped.
    """
    super().cleanup()

    # GEPA-specific state: counter back to zero, adapter reference released.
    self._gepa_live_metric_calls = 0
    self._adapter = None

    logger.debug("Cleaned up GEPA-specific resources")
|
43
93
|
|
44
94
|
# ------------------------------------------------------------------
|
45
95
|
# Helpers
|
@@ -105,21 +155,62 @@ class GepaOptimizer(BaseOptimizer):
|
|
105
155
|
def optimize_prompt(
|
106
156
|
self,
|
107
157
|
prompt: chat_prompt.ChatPrompt,
|
108
|
-
dataset:
|
109
|
-
metric: Callable
|
110
|
-
experiment_config: dict
|
158
|
+
dataset: Dataset,
|
159
|
+
metric: Callable,
|
160
|
+
experiment_config: dict | None = None,
|
161
|
+
n_samples: int | None = None,
|
162
|
+
auto_continue: bool = False,
|
163
|
+
agent_class: type[OptimizableAgent] | None = None,
|
111
164
|
**kwargs: Any,
|
112
165
|
) -> OptimizationResult:
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
166
|
+
"""
|
167
|
+
Optimize a prompt using GEPA (Genetic-Pareto) algorithm.
|
168
|
+
|
169
|
+
Args:
|
170
|
+
prompt: The prompt to optimize
|
171
|
+
dataset: Opik Dataset to optimize on
|
172
|
+
metric: Metric function to evaluate on
|
173
|
+
experiment_config: Optional configuration for the experiment
|
174
|
+
n_samples: Optional number of items to test in the dataset
|
175
|
+
auto_continue: Whether to auto-continue optimization
|
176
|
+
agent_class: Optional agent class to use
|
177
|
+
**kwargs: GEPA-specific parameters:
|
178
|
+
max_metric_calls (int | None): Maximum number of metric evaluations (default: 30)
|
179
|
+
reflection_minibatch_size (int): Size of reflection minibatches (default: 3)
|
180
|
+
candidate_selection_strategy (str): Strategy for candidate selection (default: "pareto")
|
181
|
+
skip_perfect_score (bool): Skip candidates with perfect scores (default: True)
|
182
|
+
perfect_score (float): Score considered perfect (default: 1.0)
|
183
|
+
use_merge (bool): Enable merge operations (default: False)
|
184
|
+
max_merge_invocations (int): Maximum merge invocations (default: 5)
|
185
|
+
run_dir (str | None): Directory for run outputs (default: None)
|
186
|
+
track_best_outputs (bool): Track best outputs during optimization (default: False)
|
187
|
+
display_progress_bar (bool): Display progress bar (default: False)
|
188
|
+
seed (int): Random seed for reproducibility (default: 42)
|
189
|
+
raise_on_exception (bool): Raise exceptions instead of continuing (default: True)
|
190
|
+
mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
|
191
|
+
|
192
|
+
Returns:
|
193
|
+
OptimizationResult: Result of the optimization
|
194
|
+
"""
|
195
|
+
# Use base class validation and setup methods
|
196
|
+
self.validate_optimization_inputs(prompt, dataset, metric)
|
197
|
+
|
198
|
+
# Extract GEPA-specific parameters from kwargs
|
199
|
+
max_metric_calls: int | None = kwargs.get("max_metric_calls", 30)
|
118
200
|
reflection_minibatch_size: int = int(kwargs.get("reflection_minibatch_size", 3))
|
119
201
|
candidate_selection_strategy: str = str(
|
120
202
|
kwargs.get("candidate_selection_strategy", "pareto")
|
121
203
|
)
|
122
|
-
|
204
|
+
skip_perfect_score: bool = kwargs.get("skip_perfect_score", True)
|
205
|
+
perfect_score: float = float(kwargs.get("perfect_score", 1.0))
|
206
|
+
use_merge: bool = kwargs.get("use_merge", False)
|
207
|
+
max_merge_invocations: int = int(kwargs.get("max_merge_invocations", 5))
|
208
|
+
run_dir: str | None = kwargs.get("run_dir", None)
|
209
|
+
track_best_outputs: bool = kwargs.get("track_best_outputs", False)
|
210
|
+
display_progress_bar: bool = kwargs.get("display_progress_bar", False)
|
211
|
+
seed: int = int(kwargs.get("seed", 42))
|
212
|
+
raise_on_exception: bool = kwargs.get("raise_on_exception", True)
|
213
|
+
kwargs.pop("mcp_config", None) # Added for MCP support (for future use)
|
123
214
|
|
124
215
|
prompt = prompt.copy()
|
125
216
|
if self.project_name:
|
@@ -147,16 +238,19 @@ class GepaOptimizer(BaseOptimizer):
|
|
147
238
|
|
148
239
|
opik_client = opik.Opik(project_name=self.project_name)
|
149
240
|
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
241
|
+
disable_experiment_reporting()
|
242
|
+
|
243
|
+
try:
|
244
|
+
with optimization_context(
|
245
|
+
client=opik_client,
|
246
|
+
dataset_name=dataset.name,
|
247
|
+
objective_name=metric.__name__,
|
248
|
+
metadata={"optimizer": self.__class__.__name__},
|
249
|
+
) as optimization:
|
250
|
+
try:
|
251
|
+
opt_id = optimization.id if optimization is not None else None
|
252
|
+
except Exception:
|
253
|
+
opt_id = None
|
160
254
|
|
161
255
|
gepa_reporting.display_header(
|
162
256
|
algorithm="GEPA",
|
@@ -210,7 +304,7 @@ class GepaOptimizer(BaseOptimizer):
|
|
210
304
|
)
|
211
305
|
baseline.set_score(initial_score)
|
212
306
|
except Exception:
|
213
|
-
|
307
|
+
logger.exception("Baseline evaluation failed")
|
214
308
|
|
215
309
|
adapter_prompt = self._apply_system_text(base_prompt, seed_prompt_text)
|
216
310
|
adapter_prompt.project_name = self.project_name
|
@@ -244,10 +338,17 @@ class GepaOptimizer(BaseOptimizer):
|
|
244
338
|
"task_lm": None,
|
245
339
|
"reflection_lm": self.reflection_model,
|
246
340
|
"candidate_selection_strategy": candidate_selection_strategy,
|
341
|
+
"skip_perfect_score": skip_perfect_score,
|
247
342
|
"reflection_minibatch_size": reflection_minibatch_size,
|
343
|
+
"perfect_score": perfect_score,
|
344
|
+
"use_merge": use_merge,
|
345
|
+
"max_merge_invocations": max_merge_invocations,
|
248
346
|
"max_metric_calls": max_metric_calls,
|
249
|
-
"
|
250
|
-
"track_best_outputs":
|
347
|
+
"run_dir": run_dir,
|
348
|
+
"track_best_outputs": track_best_outputs,
|
349
|
+
"display_progress_bar": display_progress_bar,
|
350
|
+
"seed": seed,
|
351
|
+
"raise_on_exception": raise_on_exception,
|
251
352
|
"logger": gepa_reporting.RichGEPAOptimizerLogger(
|
252
353
|
self, verbose=self.verbose
|
253
354
|
),
|
@@ -265,10 +366,13 @@ class GepaOptimizer(BaseOptimizer):
|
|
265
366
|
with gepa_reporting.start_gepa_optimization(verbose=self.verbose):
|
266
367
|
gepa_result = gepa.optimize(**kwargs_gepa)
|
267
368
|
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
369
|
+
try:
|
370
|
+
opt_id = optimization.id if optimization is not None else None
|
371
|
+
except Exception:
|
372
|
+
opt_id = None
|
373
|
+
|
374
|
+
finally:
|
375
|
+
enable_experiment_reporting()
|
272
376
|
|
273
377
|
# ------------------------------------------------------------------
|
274
378
|
# Rescoring & result assembly
|
@@ -308,7 +412,7 @@ class GepaOptimizer(BaseOptimizer):
|
|
308
412
|
try:
|
309
413
|
score = float(self._evaluate_prompt_logged(**eval_kwargs))
|
310
414
|
except Exception:
|
311
|
-
|
415
|
+
logger.debug("Rescoring failed for candidate %s", idx, exc_info=True)
|
312
416
|
score = 0.0
|
313
417
|
|
314
418
|
rescored.append(score)
|
@@ -382,12 +486,12 @@ class GepaOptimizer(BaseOptimizer):
|
|
382
486
|
try:
|
383
487
|
self._evaluate_prompt_logged(**final_eval_kwargs)
|
384
488
|
except Exception:
|
385
|
-
|
489
|
+
logger.debug("Final evaluation failed", exc_info=True)
|
386
490
|
|
387
491
|
per_item_scores: list[dict[str, Any]] = []
|
388
492
|
try:
|
389
493
|
analysis_prompt = final_prompt.copy()
|
390
|
-
agent_cls = create_litellm_agent_class(analysis_prompt)
|
494
|
+
agent_cls = create_litellm_agent_class(analysis_prompt, optimizer_ref=self)
|
391
495
|
agent = agent_cls(analysis_prompt)
|
392
496
|
for item in items:
|
393
497
|
messages = analysis_prompt.get_messages(item)
|
@@ -408,7 +512,7 @@ class GepaOptimizer(BaseOptimizer):
|
|
408
512
|
}
|
409
513
|
)
|
410
514
|
except Exception:
|
411
|
-
|
515
|
+
logger.debug("Per-item diagnostics failed", exc_info=True)
|
412
516
|
|
413
517
|
details: dict[str, Any] = {
|
414
518
|
"model": self.model,
|
@@ -420,13 +524,13 @@ class GepaOptimizer(BaseOptimizer):
|
|
420
524
|
"val_scores": val_scores,
|
421
525
|
"opik_rescored_scores": rescored,
|
422
526
|
"candidate_summary": candidate_rows,
|
423
|
-
"best_candidate_iteration":
|
424
|
-
|
425
|
-
|
527
|
+
"best_candidate_iteration": (
|
528
|
+
candidate_rows[best_idx]["iteration"] if candidate_rows else 0
|
529
|
+
),
|
426
530
|
"selected_candidate_index": best_idx,
|
427
|
-
"selected_candidate_gepa_score":
|
428
|
-
|
429
|
-
|
531
|
+
"selected_candidate_gepa_score": (
|
532
|
+
val_scores[best_idx] if best_idx < len(val_scores) else None
|
533
|
+
),
|
430
534
|
"selected_candidate_opik_score": best_score,
|
431
535
|
"gepa_live_metric_used": True,
|
432
536
|
"gepa_live_metric_call_count": self._gepa_live_metric_calls,
|
@@ -446,16 +550,16 @@ class GepaOptimizer(BaseOptimizer):
|
|
446
550
|
best_prompt_text, best_score, verbose=self.verbose
|
447
551
|
)
|
448
552
|
|
449
|
-
if
|
553
|
+
if logger.isEnabledFor(logging.DEBUG):
|
450
554
|
for idx, row in enumerate(candidate_rows):
|
451
|
-
|
555
|
+
logger.debug(
|
452
556
|
"candidate=%s source=%s gepa=%s opik=%s",
|
453
557
|
idx,
|
454
558
|
row.get("source"),
|
455
559
|
row.get("gepa_score"),
|
456
560
|
row.get("opik_score"),
|
457
561
|
)
|
458
|
-
|
562
|
+
logger.debug(
|
459
563
|
"selected candidate idx=%s gepa=%s opik=%.4f",
|
460
564
|
best_idx,
|
461
565
|
details.get("selected_candidate_gepa_score"),
|
@@ -516,7 +620,8 @@ class GepaOptimizer(BaseOptimizer):
|
|
516
620
|
if prompt.model_kwargs is None:
|
517
621
|
prompt.model_kwargs = self.model_kwargs
|
518
622
|
|
519
|
-
agent_class = create_litellm_agent_class(prompt)
|
623
|
+
agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
|
624
|
+
self.agent_class = agent_class
|
520
625
|
agent = agent_class(prompt)
|
521
626
|
|
522
627
|
def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
|
@@ -524,22 +629,14 @@ class GepaOptimizer(BaseOptimizer):
|
|
524
629
|
raw = agent.invoke(messages)
|
525
630
|
return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}
|
526
631
|
|
527
|
-
|
528
|
-
experiment_config
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
"metric": metric.__name__,
|
536
|
-
"dataset": dataset.name,
|
537
|
-
"configuration": {
|
538
|
-
"prompt": prompt.get_messages(),
|
539
|
-
"gepa": (extra_metadata or {}),
|
540
|
-
},
|
541
|
-
},
|
542
|
-
}
|
632
|
+
configuration_updates = self._drop_none({"gepa": extra_metadata})
|
633
|
+
experiment_config = self._prepare_experiment_config(
|
634
|
+
prompt=prompt,
|
635
|
+
dataset=dataset,
|
636
|
+
metric=metric,
|
637
|
+
experiment_config=experiment_config,
|
638
|
+
configuration_updates=configuration_updates,
|
639
|
+
)
|
543
640
|
|
544
641
|
score = task_evaluator.evaluate(
|
545
642
|
dataset=dataset,
|
@@ -547,7 +644,7 @@ class GepaOptimizer(BaseOptimizer):
|
|
547
644
|
metric=metric,
|
548
645
|
evaluated_task=llm_task,
|
549
646
|
num_threads=self.num_threads,
|
550
|
-
project_name=
|
647
|
+
project_name=experiment_config.get("project_name"),
|
551
648
|
experiment_config=experiment_config,
|
552
649
|
optimization_id=optimization_id,
|
553
650
|
n_samples=n_samples,
|
@@ -11,6 +11,7 @@ from __future__ import annotations
|
|
11
11
|
import contextlib
|
12
12
|
import copy
|
13
13
|
import io
|
14
|
+
import json
|
14
15
|
import logging
|
15
16
|
import os
|
16
17
|
import textwrap
|
@@ -346,12 +347,19 @@ class MCPToolInvocation:
|
|
346
347
|
preview_label: str | None = None
|
347
348
|
preview_chars: int = 160
|
348
349
|
rate_limit_sleep: float = DEFAULT_MCP_RATELIMIT_SLEEP
|
350
|
+
cache_enabled: bool = True
|
349
351
|
_logger: logging.Logger = field(default_factory=lambda: logger)
|
352
|
+
_cache: dict[str, str] = field(default_factory=dict, init=False)
|
350
353
|
|
351
354
|
def __call__(self, **arguments: Any) -> str:
    """Make the instance callable by forwarding keyword arguments to ``invoke``.

    The keyword arguments are passed to ``self.invoke`` as a single dict and
    its return value is handed back unchanged.
    """
    outcome = self.invoke(arguments)
    return outcome
|
353
356
|
|
354
|
-
def
|
357
|
+
def clear_cache(self) -> None:
    """Remove every entry from ``self._cache`` in place."""
    cache = self._cache
    cache.clear()
|
359
|
+
|
360
|
+
def invoke(
|
361
|
+
self, arguments: Mapping[str, Any], *, use_cache: bool | None = None
|
362
|
+
) -> str:
|
355
363
|
def call_tool(name: str, payload: dict[str, Any]) -> Any:
|
356
364
|
if self.rate_limit_sleep > 0:
|
357
365
|
time.sleep(self.rate_limit_sleep)
|
@@ -367,6 +375,19 @@ class MCPToolInvocation:
|
|
367
375
|
if self.argument_adapter:
|
368
376
|
prepared = self.argument_adapter(prepared, call_tool)
|
369
377
|
|
378
|
+
effective_cache = self.cache_enabled if use_cache is None else use_cache
|
379
|
+
cache_key: str | None = None
|
380
|
+
if effective_cache:
|
381
|
+
cache_key = self._make_cache_key(prepared)
|
382
|
+
cached_summary = self._cache.get(cache_key)
|
383
|
+
if cached_summary is not None:
|
384
|
+
if self.summary_handler:
|
385
|
+
self.summary_handler.record_summary(cached_summary)
|
386
|
+
self._logger.debug(
|
387
|
+
"MCP tool %s cache hit arguments=%s", self.tool_name, prepared
|
388
|
+
)
|
389
|
+
return cached_summary
|
390
|
+
|
370
391
|
# TODO(opik-mcp): reuse a persistent MCP client so we avoid spawning a
|
371
392
|
# new stdio subprocess for each call. This currently mirrors the
|
372
393
|
# original blocking behaviour for stability.
|
@@ -391,11 +412,41 @@ class MCPToolInvocation:
|
|
391
412
|
if self.summary_handler:
|
392
413
|
self.summary_handler.record_summary(summary)
|
393
414
|
|
415
|
+
if effective_cache and cache_key is not None:
|
416
|
+
self._cache[cache_key] = summary
|
417
|
+
|
394
418
|
if os.getenv("OPIK_DEBUG_MCP"):
|
395
419
|
self._logger.info("MCP %s raw response:\n%s", label, text)
|
396
420
|
|
397
421
|
return summary
|
398
422
|
|
423
|
+
def _make_cache_key(self, payload: Mapping[str, Any]) -> str:
    """Serialise ``payload`` into a deterministic JSON string for cache lookups.

    The fast path dumps the payload directly with sorted keys, stringifying
    values JSON cannot encode. If that raises ``TypeError`` the payload is
    first passed through ``_normalise_cache_payload`` and dumped again.

    Args:
        payload: The prepared tool arguments.

    Returns:
        A JSON string; equal payloads produce equal strings.
    """
    try:
        return json.dumps(payload, sort_keys=True, default=str)
    except TypeError:
        # ``default=str`` only rescues unencodable *values*. Keys of an
        # unsupported type (e.g. tuples), or keys of mixed types that
        # ``sort_keys=True`` cannot compare, still raise TypeError, so the
        # fallback must rewrite the keys as well as the values.
        normalised = self._normalise_cache_payload(payload)
        return json.dumps(normalised, sort_keys=True, default=str)

@staticmethod
def _normalise_cache_payload(value: Any) -> Any:
    """Recursively convert ``value`` into a structure ``json.dumps`` can sort.

    * Mappings become dicts whose keys are ``str(key)``, visited in
      ``str(key)`` order, with normalised values.
    * Lists and tuples become lists of normalised items.
    * Sets and frozensets become lists of normalised items ordered by
      ``repr`` so the result does not depend on hash iteration order.
    * ``str``, ``int``, ``float``, ``bool`` and ``None`` pass through.
    * Anything else is replaced by ``str(value)``.

    Args:
        value: Any payload fragment.

    Returns:
        The normalised equivalent of ``value``.
    """

    def normalise(node: Any) -> Any:
        if isinstance(node, Mapping):
            # Keys are stringified: leaving them as-is would make the retry
            # in ``_make_cache_key`` fail with the very same TypeError.
            return {
                str(key): normalise(val)
                for key, val in sorted(node.items(), key=lambda item: str(item[0]))
            }
        if isinstance(node, (list, tuple)):
            return [normalise(item) for item in node]
        if isinstance(node, (set, frozenset)):
            return [normalise(item) for item in sorted(node, key=repr)]
        if node is None or isinstance(node, (str, int, float, bool)):
            return node
        return str(node)

    return normalise(value)
|
449
|
+
|
399
450
|
|
400
451
|
def summarise_with_template(template: str) -> SummaryBuilder:
|
401
452
|
"""Return a summary builder that fills the provided template."""
|
@@ -465,6 +516,7 @@ def preview_second_pass(
|
|
465
516
|
dataset_item: dict[str, Any],
|
466
517
|
coordinator: MCPSecondPassCoordinator,
|
467
518
|
agent_factory: Callable[[Any], Any],
|
519
|
+
seed: int = 42,
|
468
520
|
) -> None:
|
469
521
|
"""Debug helper mirroring the old inline scripts."""
|
470
522
|
|
@@ -472,7 +524,9 @@ def preview_second_pass(
|
|
472
524
|
agent = agent_factory(prompt)
|
473
525
|
base_messages = prompt.get_messages(dataset_item)
|
474
526
|
|
475
|
-
raw_output = agent.llm_invoke(
|
527
|
+
raw_output = agent.llm_invoke(
|
528
|
+
messages=base_messages, seed=seed, allow_tool_use=True
|
529
|
+
)
|
476
530
|
logger.debug("Raw model output: %s", raw_output)
|
477
531
|
|
478
532
|
second_pass_messages = coordinator.build_second_pass_messages(
|
@@ -484,7 +538,7 @@ def preview_second_pass(
|
|
484
538
|
logger.debug("Second-pass messages: %s", second_pass_messages)
|
485
539
|
final_output = agent.llm_invoke(
|
486
540
|
messages=second_pass_messages,
|
487
|
-
seed=
|
541
|
+
seed=seed,
|
488
542
|
allow_tool_use=True,
|
489
543
|
)
|
490
544
|
else:
|