opik-optimizer 1.0.6 → 2.0.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +4 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +402 -28
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
- opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +154 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +30 -23
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +21 -16
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +22 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/utils/colbert.py +236 -0
- opik_optimizer/{utils.py → utils/core.py} +160 -33
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- opik_optimizer-2.0.0.dist-info/METADATA +345 -0
- opik_optimizer-2.0.0.dist-info/RECORD +74 -0
- opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.6.dist-info/METADATA +0 -181
- opik_optimizer-1.0.6.dist-info/RECORD +0 -50
- opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
opik_optimizer/datasets/tiny_test.py

```diff
@@ -1,42 +1,12 @@
 import opik
 
-TINY_TEST_ITEMS = [
-    {
-        "text": "What is the capital of France?",
-        "label": "Paris",
-        "metadata": {"context": "France is a country in Europe. Its capital is Paris."},
-    },
-    {
-        "text": "Who wrote Romeo and Juliet?",
-        "label": "William Shakespeare",
-        "metadata": {
-            "context": "Romeo and Juliet is a famous play written by William Shakespeare."
-        },
-    },
-    {
-        "text": "What is 2 + 2?",
-        "label": "4",
-        "metadata": {"context": "Basic arithmetic: 2 + 2 equals 4."},
-    },
-    {
-        "text": "What is the largest planet in our solar system?",
-        "label": "Jupiter",
-        "metadata": {"context": "Jupiter is the largest planet in our solar system."},
-    },
-    {
-        "text": "Who painted the Mona Lisa?",
-        "label": "Leonardo da Vinci",
-        "metadata": {"context": "The Mona Lisa was painted by Leonardo da Vinci."},
-    },
-]
-
 
 def tiny_test(test_mode: bool = False) -> opik.Dataset:
     """
-
+    Tiny QA benchmark (core_en subset from vincentkoc/tiny_qa_benchmark_pp).
     """
     dataset_name = "tiny_test" if not test_mode else "tiny_test_test"
-    nb_items =
+    nb_items = 5  # keep tiny dataset size consistent with tests/docs
 
     client = opik.Opik()
     dataset = client.get_or_create_dataset(dataset_name)
@@ -49,5 +19,34 @@ def tiny_test(test_mode: bool = False) -> opik.Dataset:
             f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. We recommend deleting the dataset and re-creating it."
         )
     elif len(items) == 0:
-
-
+        import datasets as ds
+
+        download_config = ds.DownloadConfig(download_desc=False, disable_tqdm=True)
+        ds.disable_progress_bar()
+        try:
+            # Load only the core_en subset JSONL from the repo
+            # Use the generic JSON loader with streaming for efficiency
+            hf_dataset = ds.load_dataset(
+                "json",
+                data_files="hf://datasets/vincentkoc/tiny_qa_benchmark_pp/data/core_en/core_en.jsonl",
+                streaming=True,
+                download_config=download_config,
+            )["train"]
+
+            data = []
+            for i, item in enumerate(hf_dataset):
+                if i >= nb_items:
+                    break
+                data.append(
+                    {
+                        "text": item.get("text", ""),
+                        "label": item.get("label", ""),
+                        # Preserve original tiny_test shape with metadata.context
+                        "metadata": {"context": item.get("context", "")},
+                    }
+                )
+
+            dataset.insert(data)
+            return dataset
+        finally:
+            ds.enable_progress_bar()
```
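For orientation, here is a hedged usage sketch of the reworked loader: `tiny_test()` now streams the first five `core_en` items from `vincentkoc/tiny_qa_benchmark_pp` instead of inserting a hard-coded list, but it still returns an `opik.Dataset` whose items keep the original `text` / `label` / `metadata.context` shape. The import path and the assumption of a configured Opik client are mine, not part of the diff.

```python
# Hedged sketch: exercising the reworked tiny_test() loader.
# Assumes an Opik client is configured and that the function is importable
# from opik_optimizer.datasets (path inferred from the file list above).
from opik_optimizer.datasets import tiny_test

dataset = tiny_test(test_mode=True)  # creates/fetches the "tiny_test_test" dataset

# Items keep the pre-2.0 shape: {"text": ..., "label": ..., "metadata": {"context": ...}}
for item in dataset.get_items():
    print(item["text"], "->", item["label"])
```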
opik_optimizer/datasets/truthful_qa.py

```diff
@@ -1,5 +1,5 @@
 import opik
-from typing import Any
+from typing import Any
 
 
 def truthful_qa(test_mode: bool = False) -> opik.Dataset:
@@ -33,7 +33,7 @@ def truthful_qa(test_mode: bool = False) -> opik.Dataset:
         "truthful_qa", "multiple_choice", download_config=download_config
     )
 
-    data:
+    data: list[dict[str, Any]] = []
     for gen_item, mc_item in zip(
         gen_dataset["validation"], mc_dataset["validation"]
     ):
```
opik_optimizer/evolutionary_optimizer/crossover_ops.py

```diff
@@ -0,0 +1,194 @@
+from typing import Any, TYPE_CHECKING
+
+import logging
+import random
+import json
+
+from deap import creator as _creator
+
+from . import prompts as evo_prompts
+from . import reporting
+from .. import utils
+
+
+logger = logging.getLogger(__name__)
+creator = _creator  # backward compt.
+
+
+class CrossoverOps:
+    if TYPE_CHECKING:
+        verbose: int
+        output_style_guidance: str
+        _call_model: Any
+
+    def _deap_crossover_chunking_strategy(
+        self, messages_1_str: str, messages_2_str: str
+    ) -> tuple[str, str]:
+        chunks1 = [
+            chunk.strip() for chunk in messages_1_str.split(".") if chunk.strip()
+        ]
+        chunks2 = [
+            chunk.strip() for chunk in messages_2_str.split(".") if chunk.strip()
+        ]
+
+        if len(chunks1) >= 2 and len(chunks2) >= 2:
+            min_num_chunks = min(len(chunks1), len(chunks2))
+            point = random.randint(1, min_num_chunks - 1)
+            child1_chunks = chunks1[:point] + chunks2[point:]
+            child2_chunks = chunks2[:point] + chunks1[point:]
+            child1_str = ". ".join(child1_chunks) + ("." if child1_chunks else "")
+            child2_str = ". ".join(child2_chunks) + ("." if child2_chunks else "")
+            return child1_str, child2_str
+        else:
+            raise ValueError(
+                "Not enough chunks in either prompt for chunk-level crossover"
+            )
+
+    def _deap_crossover_word_level(
+        self, messages_1_str: str, messages_2_str: str
+    ) -> tuple[str, str]:
+        words1 = messages_1_str.split()
+        words2 = messages_2_str.split()
+        if not words1 or not words2:
+            return messages_1_str, messages_2_str
+        min_word_len = min(len(words1), len(words2))
+        if min_word_len < 2:
+            return messages_1_str, messages_2_str
+        point = random.randint(1, min_word_len - 1)
+        child1_words = words1[:point] + words2[point:]
+        child2_words = words2[:point] + words1[point:]
+        return " ".join(child1_words), " ".join(child2_words)
+
+    def _deap_crossover(self, ind1: Any, ind2: Any) -> tuple[Any, Any]:
+        """Crossover operation that preserves semantic meaning.
+        Attempts chunk-level crossover first, then falls back to word-level.
+        """
+        reporting.display_message(
+            " Recombining prompts by mixing and matching words and sentences.",
+            verbose=self.verbose,
+        )
+        messages_1_orig: list[dict[str, str]] = ind1
+        messages_2_orig: list[dict[str, str]] = ind2
+
+        for i, message_1 in enumerate(messages_1_orig):
+            role: str = message_1["role"]
+            message_1_str: str = message_1["content"]
+            if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]["role"] == role):
+                message_2 = messages_2_orig[i]
+                message_2_str: str = message_2["content"]
+                try:
+                    child1_str, child2_str = self._deap_crossover_chunking_strategy(
+                        message_1_str, message_2_str
+                    )
+                except ValueError:
+                    child1_str, child2_str = self._deap_crossover_word_level(
+                        message_1_str, message_2_str
+                    )
+                messages_1_orig[i]["content"] = child1_str
+                messages_2_orig[i]["content"] = child2_str
+            else:
+                pass
+
+        return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)
+
+    def _llm_deap_crossover(self, ind1: Any, ind2: Any) -> tuple[Any, Any]:
+        """Perform crossover by asking an LLM to blend two parent prompts."""
+        reporting.display_message(
+            " Recombining prompts using an LLM.", verbose=self.verbose
+        )
+
+        parent1_messages: list[dict[str, str]] = ind1
+        parent2_messages: list[dict[str, str]] = ind2
+        current_output_style_guidance = self.output_style_guidance
+
+        user_prompt_for_llm_crossover = evo_prompts.llm_crossover_user_prompt(
+            parent1_messages, parent2_messages, current_output_style_guidance
+        )
+        try:
+            logger.debug(
+                f"Attempting LLM-driven crossover between: '{parent1_messages[:50]}...' and '{parent2_messages[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'"
+            )
+            response_content = self._call_model(
+                messages=[
+                    {
+                        "role": "system",
+                        "content": evo_prompts.llm_crossover_system_prompt(
+                            current_output_style_guidance
+                        ),
+                    },
+                    {"role": "user", "content": user_prompt_for_llm_crossover},
+                ],
+                is_reasoning=True,
+            )
+            logger.debug(f"Raw LLM response for crossover: {response_content}")
+
+            # First, try strict JSON parsing
+            json_response = None
+            try:
+                json_response = utils.json_to_dict(response_content)
+            except Exception:
+                # Continue with heuristic extraction below
+                json_response = None
+            children: list[list[dict[str, str]]] = []
+            if isinstance(json_response, list):
+                children = [c for c in json_response if isinstance(c, list)]
+
+            # If strict parse failed to yield children, try extracting arrays heuristically
+            if not children:
+                extracted = self._extract_json_arrays(response_content)
+                for arr in extracted:
+                    try:
+                        parsed = json.loads(arr)
+                        if isinstance(parsed, list) and all(
+                            isinstance(m, dict) and {"role", "content"} <= set(m.keys())
+                            for m in parsed
+                        ):
+                            children.append(parsed)
+                    except Exception:
+                        continue
+
+            if len(children) == 0:
+                raise ValueError("LLM response did not include any valid child prompts")
+
+            # We only need two children; if only one returned, duplicate pattern from DEAP
+            first_child = children[0]
+            second_child = children[1] if len(children) > 1 else children[0]
+            return creator.Individual(first_child), creator.Individual(second_child)
+        except Exception as e:
+            logger.warning(
+                f"LLM-driven crossover failed: {e}. Falling back to DEAP crossover."
+            )
+            return self._deap_crossover(ind1, ind2)
+
+    def _extract_json_arrays(self, text: str) -> list[str]:
+        """Extract top-level JSON array substrings from arbitrary text.
+        This helps when models return multiple arrays like `[...],\n[...]`.
+        """
+        arrays: list[str] = []
+        depth = 0
+        start: int | None = None
+        in_str = False
+        escape = False
+        for i, ch in enumerate(text):
+            if escape:
+                # current char is escaped; skip special handling
+                escape = False
+                continue
+            if ch == "\\":
+                escape = True
+                continue
+            if ch == '"':
+                in_str = not in_str
+                continue
+            if in_str:
+                continue
+            if ch == "[":
+                if depth == 0:
+                    start = i
+                depth += 1
+            elif ch == "]" and depth > 0:
+                depth -= 1
+                if depth == 0 and start is not None:
+                    arrays.append(text[start : i + 1])
+                    start = None
+        return arrays
```
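The chunk-level strategy above is easiest to see on a toy pair of prompts. The snippet below is a self-contained sketch of that idea (sentence chunks swapped at a single random cut point), not the library code itself; it mirrors `_deap_crossover_chunking_strategy` without DEAP or the class plumbing.

```python
import random

def chunk_crossover(parent_a: str, parent_b: str) -> tuple[str, str]:
    # Split each prompt into sentence-like chunks on "." and drop empties.
    chunks_a = [c.strip() for c in parent_a.split(".") if c.strip()]
    chunks_b = [c.strip() for c in parent_b.split(".") if c.strip()]
    if len(chunks_a) < 2 or len(chunks_b) < 2:
        raise ValueError("Not enough chunks in either prompt for chunk-level crossover")
    # Pick one cut point and swap the tails, as in the chunking strategy above.
    point = random.randint(1, min(len(chunks_a), len(chunks_b)) - 1)
    child_1 = ". ".join(chunks_a[:point] + chunks_b[point:]) + "."
    child_2 = ". ".join(chunks_b[:point] + chunks_a[point:]) + "."
    return child_1, child_2

random.seed(0)
parent_1 = "You are a helpful assistant. Answer concisely. Cite your sources."
parent_2 = "You are a strict grader. Explain your reasoning. Keep answers short."
print(chunk_crossover(parent_1, parent_2))
```

When either parent has fewer than two sentences, the real operator falls back to `_deap_crossover_word_level`, which performs the same single-point swap over whitespace-separated words instead.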
opik_optimizer/evolutionary_optimizer/evaluation_ops.py

```diff
@@ -0,0 +1,136 @@
+from typing import Any, TYPE_CHECKING, cast
+from collections.abc import Callable
+
+
+from .. import task_evaluator
+from ..optimization_config import mappers, chat_prompt
+from ..mcp_utils.mcp_workflow import MCPExecutionConfig
+import opik
+import copy
+
+if TYPE_CHECKING:  # pragma: no cover - typing only
+    from ..base_optimizer import BaseOptimizer
+
+
+class EvaluationOps:
+    if TYPE_CHECKING:
+        agent_class: type[Any]
+        num_threads: int
+
+    def _evaluate_prompt(
+        self,
+        prompt: chat_prompt.ChatPrompt,
+        messages: list[dict[str, str]],
+        dataset: opik.Dataset,
+        metric: Callable,
+        n_samples: int | None = None,
+        dataset_item_ids: list[str] | None = None,
+        experiment_config: dict | None = None,
+        optimization_id: str | None = None,
+        verbose: int = 0,
+        **kwargs: Any,
+    ) -> float:
+        """Evaluate a single prompt (individual) against the dataset and return the score."""
+        total_items = len(dataset.get_items())
+
+        new_prompt = prompt.copy()
+        new_prompt.set_messages(messages)
+        tools = getattr(messages, "tools", None)
+        if tools is not None:
+            new_prompt.tools = copy.deepcopy(tools)
+
+        optimizer = cast("BaseOptimizer", self)
+
+        configuration_updates = optimizer._drop_none(
+            {
+                "n_samples_for_eval": (
+                    len(dataset_item_ids) if dataset_item_ids is not None else n_samples
+                ),
+                "total_dataset_items": total_items,
+            }
+        )
+        evaluation_details = optimizer._drop_none(
+            {
+                "dataset_item_ids": dataset_item_ids,
+                "optimization_id": optimization_id,
+            }
+        )
+        additional_metadata = (
+            {"evaluation": evaluation_details} if evaluation_details else None
+        )
+
+        experiment_config = optimizer._prepare_experiment_config(
+            prompt=new_prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            configuration_updates=configuration_updates,
+            additional_metadata=additional_metadata,
+        )
+        try:
+            agent = self.agent_class(new_prompt)
+        except Exception:
+            return 0.0
+
+        mcp_execution_config: MCPExecutionConfig | None = kwargs.get("mcp_config")
+
+        def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
+            messages = new_prompt.get_messages(dataset_item)
+
+            if mcp_execution_config is None:
+                model_output = agent.invoke(messages)
+                return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
+
+            coordinator = mcp_execution_config.coordinator
+            coordinator.reset()
+
+            raw_model_output = agent.llm_invoke(
+                messages=messages,
+                seed=getattr(self, "seed", None),
+                allow_tool_use=True,
+            )
+
+            second_pass_messages = coordinator.build_second_pass_messages(
+                base_messages=messages,
+                dataset_item=dataset_item,
+            )
+
+            if (
+                second_pass_messages is None
+                and mcp_execution_config.fallback_invoker is not None
+            ):
+                fallback_args = mcp_execution_config.fallback_arguments(dataset_item)
+                if fallback_args:
+                    summary_override = mcp_execution_config.fallback_invoker(
+                        fallback_args
+                    )
+                    second_pass_messages = coordinator.build_second_pass_messages(
+                        base_messages=messages,
+                        dataset_item=dataset_item,
+                        summary_override=summary_override,
+                    )
+
+            if second_pass_messages is not None:
+                final_response = agent.llm_invoke(
+                    messages=second_pass_messages,
+                    seed=getattr(self, "seed", None),
+                    allow_tool_use=mcp_execution_config.allow_tool_use_on_second_pass,
+                )
+            else:
+                final_response = raw_model_output
+
+            return {mappers.EVALUATED_LLM_TASK_OUTPUT: final_response.strip()}
+
+        score = task_evaluator.evaluate(
+            dataset=dataset,
+            dataset_item_ids=dataset_item_ids,
+            metric=metric,
+            evaluated_task=llm_task,
+            num_threads=self.num_threads,
+            project_name=experiment_config.get("project_name"),
+            n_samples=n_samples if dataset_item_ids is None else None,
+            experiment_config=experiment_config,
+            optimization_id=optimization_id,
+            verbose=verbose,
+        )
+        return score
```
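The core contract in `_evaluate_prompt` is the `llm_task` callable handed to `task_evaluator.evaluate`: it takes one dataset item and returns a dict keyed by `mappers.EVALUATED_LLM_TASK_OUTPUT`. The sketch below shows only that single-pass (non-MCP) path with stand-in prompt and agent objects; the key string `"llm_output"` and both fake classes are placeholders of mine, not the package's actual values.

```python
from typing import Any

# Placeholder for mappers.EVALUATED_LLM_TASK_OUTPUT; the real constant lives in
# opik_optimizer.optimization_config.mappers and may use a different string.
EVALUATED_LLM_TASK_OUTPUT = "llm_output"

class FakePrompt:
    """Stand-in for chat_prompt.ChatPrompt: renders messages for one dataset item."""
    def get_messages(self, dataset_item: dict[str, Any]) -> list[dict[str, str]]:
        return [{"role": "user", "content": dataset_item["text"]}]

class FakeAgent:
    """Stand-in for an instance of the optimizer's agent_class."""
    def invoke(self, messages: list[dict[str, str]]) -> str:
        return "stub answer for: " + messages[-1]["content"]

new_prompt, agent = FakePrompt(), FakeAgent()

def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
    # Single-pass path (mcp_execution_config is None): render, invoke, wrap.
    messages = new_prompt.get_messages(dataset_item)
    return {EVALUATED_LLM_TASK_OUTPUT: agent.invoke(messages)}

print(llm_task({"text": "What is 2 + 2?"}))
```

When an `mcp_config` is supplied, the real method instead runs a tool-enabled first pass, asks the MCP coordinator to build second-pass messages (optionally via a fallback invoker), and only then produces the final response that is scored.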