opik-optimizer 2.1.1__tar.gz → 2.1.3__tar.gz
This diff compares publicly available package versions as published to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/PKG-INFO +1 -1
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/pyproject.toml +1 -1
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/__init__.py +2 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/base_optimizer.py +2 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/gepa_optimizer/gepa_optimizer.py +3 -3
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +146 -93
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py +38 -14
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/reporting.py +127 -65
- opik_optimizer-2.1.3/src/opik_optimizer/multi_metric_objective.py +33 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/task_evaluator.py +26 -9
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer.egg-info/PKG-INFO +1 -1
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer.egg-info/SOURCES.txt +1 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/LICENSE +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/README.md +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/setup.cfg +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/setup.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/_throttle.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/cache_config.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/data/context7_eval.jsonl +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/data/hotpot-500.json +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/ai2_arc.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/cnn_dailymail.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/context7_eval.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/election_questions.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/gsm8k.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/halu_eval.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/hotpot_qa.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/medhallu.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/rag_hallucinations.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/ragbench.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/tiny_test.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/truthful_qa.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/demo/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/demo/cache.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/demo/datasets.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/crossover_ops.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/evaluation_ops.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/helpers.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/llm_support.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/mcp.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/mutation_ops.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/population_ops.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/prompts.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/reporting.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/style_ops.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/few_shot_bayesian_optimizer/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/few_shot_bayesian_optimizer/reporting.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/gepa_optimizer/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/gepa_optimizer/adapter.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/gepa_optimizer/reporting.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/prompts.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/types.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/logging_config.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mcp_utils/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mcp_utils/mcp.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mcp_utils/mcp_second_pass.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mcp_utils/mcp_simulator.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mcp_utils/mcp_workflow.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/meta_prompt_optimizer/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/meta_prompt_optimizer/reporting.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mipro_optimizer/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mipro_optimizer/_lm.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mipro_optimizer/utils.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimizable_agent.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimization_config/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimization_config/chat_prompt.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimization_config/configs.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimization_config/mappers.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimization_result.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/parameter_optimizer.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/parameter_search_space.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/parameter_spec.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/search_space_types.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/sensitivity_analysis.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/py.typed +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/reporting_utils.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/utils/__init__.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/utils/colbert.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/utils/core.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/utils/dataset_utils.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/utils/prompt_segments.py +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer.egg-info/dependency_links.txt +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer.egg-info/requires.txt +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer.egg-info/top_level.txt +0 -0
- {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/tests/test_setup.py +0 -0
{opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/__init__.py
RENAMED
@@ -19,6 +19,7 @@ from .mipro_optimizer import MiproOptimizer
 from .hierarchical_reflective_optimizer import HierarchicalReflectiveOptimizer
 from .optimization_config.configs import TaskConfig
 from .optimization_result import OptimizationResult
+from .multi_metric_objective import MultiMetricObjective
 from .parameter_optimizer import (
     ParameterOptimizer,
     ParameterSearchSpace,
@@ -48,6 +49,7 @@ __all__ = [
     "setup_logging",
     "datasets",
     "TaskConfig",
+    "MultiMetricObjective",
     "ParameterSearchSpace",
     "ParameterSpec",
     "ParameterType",
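Version 2.1.3 adds a new module, src/opik_optimizer/multi_metric_objective.py, and re-exports its MultiMetricObjective class from the package root. Only the export is visible in this part of the diff, so the snippet below sticks to what it confirms:

```python
# New in 2.1.3: MultiMetricObjective is importable from the package root,
# alongside the symbols already listed in __all__.
from opik_optimizer import MultiMetricObjective, TaskConfig
```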
{opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/base_optimizer.py
RENAMED
@@ -280,6 +280,7 @@ class BaseOptimizer(ABC):
         agent_config["project_name"] = getattr(prompt, "project_name", None)
         agent_config["model"] = getattr(prompt, "model", None) or self.model
         agent_config["tools"] = self._serialize_tools(prompt)
+        agent_config["optimizer"] = self.__class__.__name__
         return self._drop_none(agent_config)
 
     def get_optimizer_metadata(self) -> dict[str, Any]:
@@ -341,6 +342,7 @@ class BaseOptimizer(ABC):
             "metric": getattr(metric, "__name__", str(metric)),
             "dataset": getattr(dataset, "name", None),
             "dataset_id": dataset_id,
+            "optimizer": self.__class__.__name__,
             "optimizer_metadata": self._build_optimizer_metadata(),
             "tool_signatures": self._summarize_tool_signatures(prompt),
             "configuration": {
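base_optimizer.py now records the optimizer class name in the agent config and in the run metadata, and gepa_optimizer.py (below) uses the same self.__class__.__name__ mechanism for its reporting fields, so subclasses are labeled automatically. A minimal illustration of the mechanism; the method name here is illustrative, not taken from the package:

```python
class BaseOptimizer:
    def build_agent_config(self) -> dict:
        # Resolves to the concrete subclass name at runtime.
        return {"optimizer": self.__class__.__name__}

class GepaOptimizer(BaseOptimizer):
    pass

print(GepaOptimizer().build_agent_config())  # {'optimizer': 'GepaOptimizer'}
```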
{opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/gepa_optimizer/gepa_optimizer.py
RENAMED
@@ -253,7 +253,7 @@ class GepaOptimizer(BaseOptimizer):
         opt_id = None
 
         gepa_reporting.display_header(
-            algorithm=[… truncated in this view …]
+            algorithm=self.__class__.__name__,
             optimization_id=opt_id,
             dataset_id=getattr(dataset, "id", None),
             verbose=self.verbose,
@@ -264,7 +264,7 @@ class GepaOptimizer(BaseOptimizer):
         _display_config(
             messages=prompt.get_messages(),
             optimizer_config={
-                "optimizer": [… truncated in this view …]
+                "optimizer": self.__class__.__name__,
                 "model": self.model,
                 "reflection_model": self.reflection_model,
                 "max_metric_calls": max_metric_calls,
@@ -422,7 +422,7 @@ class GepaOptimizer(BaseOptimizer):
                 "system_prompt": candidate_prompt,
                 "gepa_score": val_scores[idx] if idx < len(val_scores) else None,
                 "opik_score": score,
-                "source": [… truncated in this view …]
+                "source": self.__class__.__name__,
             }
         )
         history.append(
{opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py
RENAMED
@@ -1,4 +1,3 @@
-from opik.environment import get_tqdm_for_current_environment
 import os
 import logging
 
@@ -29,8 +28,6 @@ from .types import (
 )
 from .prompts import IMPROVE_PROMPT_TEMPLATE
 
-tqdm = get_tqdm_for_current_environment()
-
 # Using disk cache for LLM calls
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
@@ -61,10 +58,14 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         max_parallel_batches: Maximum number of batches to process concurrently during
             hierarchical root cause analysis (default: 5)
         batch_size: Number of test cases per batch for root cause analysis (default: 25)
+        max_iterations: Maximum number of optimization iterations (default: 5)
+        convergence_threshold: Stop if relative improvement is below this threshold (default: 0.01)
         **model_kwargs: Additional arguments passed to the LLM model
     """
 
     DEFAULT_ROUNDS = 10
+    DEFAULT_MAX_ITERATIONS = 5
+    DEFAULT_CONVERGENCE_THRESHOLD = 0.01  # Stop if improvement is less than 1%
 
     def __init__(
         self,
@@ -74,6 +75,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         seed: int = 42,
         max_parallel_batches: int = 5,
         batch_size: int = 25,
+        max_iterations: int = DEFAULT_MAX_ITERATIONS,
+        convergence_threshold: float = DEFAULT_CONVERGENCE_THRESHOLD,
         **model_kwargs: Any,
     ):
         super().__init__(
@@ -83,6 +86,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         self.num_threads = num_threads
         self.max_parallel_batches = max_parallel_batches
         self.batch_size = batch_size
+        self.max_iterations = max_iterations
+        self.convergence_threshold = convergence_threshold
 
         # Initialize hierarchical analyzer
         self._hierarchical_analyzer = HierarchicalRootCauseAnalyzer(
@@ -247,6 +252,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             "reasoning_model": self.reasoning_model,
             "num_threads": self.num_threads,
             "max_parallel_batches": self.max_parallel_batches,
+            "max_iterations": self.max_iterations,
+            "convergence_threshold": self.convergence_threshold,
             "seed": self.seed,
             "verbose": self.verbose,
         }
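Taken together, these constructor and metadata changes expose two new knobs on HierarchicalReflectiveOptimizer. A minimal usage sketch; only max_iterations and convergence_threshold are confirmed new by this diff, and the remaining constructor arguments are assumed to keep their existing names:

```python
from opik_optimizer import HierarchicalReflectiveOptimizer

optimizer = HierarchicalReflectiveOptimizer(
    reasoning_model="openai/gpt-4o",  # assumed existing argument; not part of this change
    max_iterations=3,                 # new in 2.1.3: cap on reflective optimization iterations (default 5)
    convergence_threshold=0.02,       # new in 2.1.3: stop once relative improvement drops below 2% (default 0.01)
)
```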
@@ -417,7 +424,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None,
         attempt: int,
         max_attempts: int,
-    ) -> tuple[chat_prompt.ChatPrompt, float]:
+    ) -> tuple[chat_prompt.ChatPrompt, float, EvaluationResult]:
         """
         Generate and evaluate a single improvement attempt for a failure mode.
 
@@ -434,7 +441,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             max_attempts: Total number of attempts
 
         Returns:
-            Tuple of (improved_prompt, improved_score)
+            Tuple of (improved_prompt, improved_score, improved_experiment_result)
         """
         # Generate improvement with progress indication
         with reporting.display_prompt_improvement(
@@ -485,7 +492,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         ) / len(improved_experiment_result.test_results)
         improved_reporter.set_score(improved_score)
 
-        return improved_chat_prompt, improved_score
+        return improved_chat_prompt, improved_score, improved_experiment_result
 
     def optimize_prompt(
         self,
@@ -528,6 +535,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                 "n_samples": n_samples,
                 "auto_continue": auto_continue,
                 "max_retries": max_retries,
+                "max_iterations": self.max_iterations,
+                "convergence_threshold": self.convergence_threshold,
             },
             verbose=self.verbose,
             tools=getattr(prompt, "tools", None),
@@ -557,53 +566,62 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             prompt.get_messages()
         )  # Store copy of initial messages for diff
 
-[… 24 removed lines, mostly truncated in this view; surviving fragments follow …]
-synthesis_notes=hierarchical_analysis.synthesis_notes,
-verbose=self.verbose,
-)
+        # Multi-iteration optimization loop
+        iteration = 0
+        previous_iteration_score = initial_score
+
+        for iteration in range(1, self.max_iterations + 1):
+            logger.info(f"Starting iteration {iteration}/{self.max_iterations}")
+
+            with reporting.display_optimization_iteration(
+                iteration=iteration, verbose=self.verbose
+            ) as iteration_reporter:
+                # Perform hierarchical root cause analysis
+                with reporting.display_root_cause_analysis(
+                    verbose=self.verbose
+                ) as analysis_reporter:
+                    hierarchical_analysis = self._hierarchical_root_cause_analysis(
+                        experiment_result
+                    )
+                    analysis_reporter.set_completed(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                    )
 
-[… 4 removed lines truncated in this view …]
+                # Display hierarchical synthesis and failure modes
+                if self.verbose:
+                    reporting.display_hierarchical_synthesis(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                        synthesis_notes=hierarchical_analysis.synthesis_notes,
+                        verbose=self.verbose,
+                    )
 
-[… 6 removed lines, mostly truncated in this view; surviving fragments follow …]
-):
-logger.debug(
-f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                    reporting.display_failure_modes(
+                        failure_modes=hierarchical_analysis.unified_failure_modes,
+                        verbose=self.verbose,
                     )
 
-[… 4 removed lines truncated in this view …]
+                # Generate improved prompt for each failure mode
+                for idx, root_cause in enumerate(
+                    hierarchical_analysis.unified_failure_modes, 1
+                ):
+                    logger.debug(
+                        f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                    )
 
-[… 4 removed lines, mostly truncated in this view; surviving fragment follows …]
-improved_chat_prompt
+                    # Try multiple attempts if needed
+                    max_attempts = max_retries + 1
+                    improved_chat_prompt = None
+                    improved_score = None
+
+                    for attempt in range(1, max_attempts + 1):
+                        # Generate and evaluate improvement
+                        (
+                            improved_chat_prompt,
+                            improved_score,
+                            improved_experiment_result,
+                        ) = self._generate_and_evaluate_improvement(
                             root_cause=root_cause,
                             best_prompt=best_prompt,
                             best_score=best_score,
@@ -615,64 +633,86 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                             attempt=attempt,
                             max_attempts=max_attempts,
                         )
-)
 
-[… 4 removed lines truncated in this view …]
+                        # Check if we got improvement
+                        if improved_score > best_score:
+                            logger.info(
+                                f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                            )
+                            break
+
+                        # No improvement - should we retry?
+                        if attempt < max_attempts:
+                            reporting.display_retry_attempt(
+                                attempt=attempt,
+                                max_attempts=max_attempts,
+                                failure_mode_name=root_cause.name,
+                                verbose=self.verbose,
+                            )
+                        else:
+                            logger.debug(
+                                f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            )
+
+                    # Check if final result is an improvement
+                    if (
+                        improved_score is not None
+                        and improved_chat_prompt is not None
+                        and improved_score > best_score
+                    ):
+                        improvement = self._calculate_improvement(
+                            improved_score, best_score
                         )
-break
 
-[… 6 removed lines, mostly truncated in this view; surviving fragment follows …]
-failure_mode_name=root_cause.name,
+                        # Display improvement for this iteration
+                        reporting.display_iteration_improvement(
+                            improvement=improvement,
+                            current_score=improved_score,
+                            best_score=best_score,
                             verbose=self.verbose,
                         )
+
+                        # Update best
+                        best_score = improved_score
+                        best_prompt = improved_chat_prompt
+                        best_messages = improved_chat_prompt.get_messages()
+                        experiment_result = improved_experiment_result
+                        logger.info(
+                            f"Updated best prompt after addressing '{root_cause.name}'"
+                        )
                     else:
                         logger.debug(
-[… 1 removed line truncated in this view …]
+                            f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
                         )
 
-[… 17 removed lines, mostly truncated in this view; surviving fragments follow …]
-):
-improvement = self._calculate_improvement(
-improved_score, best_score
-)
-# Display improvement for this iteration
-reporting.display_iteration_improvement(
-improvement=improvement,
-current_score=improved_score,
-best_score=best_score,
-verbose=self.verbose,
-)
+                # Mark iteration complete
+                improved_since_start = best_score > initial_score
+                iteration_reporter.iteration_complete(
+                    best_score=best_score, improved=improved_since_start
+                )
 
-[… 11 removed lines, mostly truncated in this view; surviving fragments follow …]
-logger.info(
-f"Updated best prompt after addressing '{root_cause.name}'"
-)
-else:
-logger.debug(
-f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
-)
+                # Check for convergence after iteration
+                iteration_improvement = self._calculate_improvement(
+                    best_score, previous_iteration_score
+                )
 
-[… 4 removed lines, mostly truncated in this view; surviving fragment follows …]
-best_score=best_score, improved=improved_since_start
+                logger.info(
+                    f"Iteration {iteration} complete. Score: {best_score:.4f}, "
+                    f"Improvement: {iteration_improvement:.2%}"
                 )
 
+                # Stop if improvement is below convergence threshold
+                if abs(iteration_improvement) < self.convergence_threshold:
+                    logger.info(
+                        f"Convergence achieved: improvement ({iteration_improvement:.2%}) "
+                        f"below threshold ({self.convergence_threshold:.2%}). "
+                        f"Stopping after {iteration} iterations."
+                    )
+                    break
+
+                # Update previous score for next iteration
+                previous_iteration_score = best_score
+
         # Display final optimization result with diff
         reporting.display_optimized_prompt_diff(
             initial_messages=initial_messages,
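The new stopping rule compares each iteration's relative improvement against convergence_threshold. The diff does not show _calculate_improvement itself; the sketch below assumes it returns the relative change (new - old) / old, which is consistent with the :.2% formatting and the 0.01 default:

```python
def calculate_improvement(new_score: float, old_score: float) -> float:
    """Relative improvement; assumption: mirrors the private _calculate_improvement helper."""
    if old_score == 0:
        return float("inf") if new_score > 0 else 0.0
    return (new_score - old_score) / old_score

convergence_threshold = 0.01            # default added in 2.1.3
previous, best = 0.62, 0.625            # illustrative scores
improvement = calculate_improvement(best, previous)      # ~0.0081, i.e. 0.81%
should_stop = abs(improvement) < convergence_threshold   # True: below 1%, loop breaks
```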
@@ -682,14 +722,27 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             verbose=self.verbose,
         )
 
+        # Update optimization status to completed
+        try:
+            optimization.update(status="completed")
+            logger.info(f"Optimization {optimization.id} status updated to completed.")
+        except Exception as e:
+            logger.warning(f"Failed to update optimization status: {e}")
+
         # Prepare details for the result
         details = {
+            "model": best_prompt.model or self.model,
+            "temperature": (best_prompt.model_kwargs or {}).get("temperature")
+            or self.model_kwargs.get("temperature"),
             "reasoning_model": self.reasoning_model,
             "num_threads": self.num_threads,
             "max_parallel_batches": self.max_parallel_batches,
             "max_retries": max_retries,
             "n_samples": n_samples,
             "auto_continue": auto_continue,
+            "max_iterations": self.max_iterations,
+            "convergence_threshold": self.convergence_threshold,
+            "iterations_completed": iteration,
         }
 
         # Extract tool prompts if tools exist
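The result details now record the effective model and temperature, preferring prompt-level settings and falling back to the optimizer's own kwargs. A small illustration of that fallback, with made-up values:

```python
# Mirrors the expression in the diff:
#   (best_prompt.model_kwargs or {}).get("temperature") or self.model_kwargs.get("temperature")
prompt_kwargs = None                      # the prompt carries no model kwargs
optimizer_kwargs = {"temperature": 0.2}   # optimizer-level default

temperature = (prompt_kwargs or {}).get("temperature") or optimizer_kwargs.get("temperature")
print(temperature)  # 0.2 — falls back to the optimizer setting
```

Note that because the expression uses `or`, a prompt-level temperature of 0.0 would also fall through to the optimizer value.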
{opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py
RENAMED
@@ -1,8 +1,8 @@
 import logging
 import asyncio
 from typing import Any
-from tqdm import tqdm
 
+from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn
 from opik.evaluation.evaluation_result import EvaluationResult
 from .types import (
     RootCauseAnalysis,
@@ -11,6 +11,7 @@ from .types import (
 )
 from . import reporting
 from .prompts import BATCH_ANALYSIS_PROMPT, SYNTHESIS_PROMPT
+from ..reporting_utils import get_console
 
 logger = logging.getLogger(__name__)
 
@@ -285,13 +286,11 @@ Scores:
 
         semaphore = asyncio.Semaphore(self.max_parallel_batches)
 
-        # Create progress bar for batch processing
-[… 1 removed line truncated in this view …]
-            total=len(batch_tasks), desc="Processing batches", unit="batch", leave=False
-        )
+        # Create progress bar for batch processing using Rich
+        console = get_console()
 
         async def run_with_semaphore(
-            batch_num: int, task: Any
+            batch_num: int, task: Any, progress: Progress | None, task_id: Any | None
         ) -> tuple[int, BatchAnalysis]:
             async with semaphore:
                 try:
@@ -300,19 +299,44 @@ Scores:
                         f"Completed batch {batch_num}: "
                         f"identified {len(result.failure_modes)} failure modes"
                     )
-[… 1 removed line truncated in this view …]
+                    if progress and task_id is not None:
+                        progress.update(task_id, advance=1)  # Update progress bar
                     return batch_num, result
                 except Exception as exc:
                     logger.error(f"Batch {batch_num} failed: {exc}")
-[… 1 removed line truncated in this view …]
+                    if progress and task_id is not None:
+                        progress.update(
+                            task_id, advance=1
+                        )  # Update progress bar even on error
                     raise
 
-        # Run all tasks with semaphore control
-[… 5 removed lines truncated in this view …]
+        # Run all tasks with semaphore control and rich progress bar
+        if self.verbose >= 1:
+            with Progress(
+                TextColumn("│ "),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console,
+                transient=True,
+            ) as progress:
+                task_id = progress.add_task(
+                    "Processing batches", total=len(batch_tasks)
+                )
+                results = await asyncio.gather(
+                    *[
+                        run_with_semaphore(num, task, progress, task_id)
+                        for num, task in batch_tasks
+                    ]
+                )
+        else:
+            # No progress bar in non-verbose mode
+            results = await asyncio.gather(
+                *[
+                    run_with_semaphore(num, task, None, None)
+                    for num, task in batch_tasks
+                ]
+            )
 
         # Sort by batch number to maintain order
         batch_analyses = [result for _, result in sorted(results)]