mantisdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mantisdk might be problematic. Click here for more details.

Files changed (190) hide show
  1. mantisdk/__init__.py +22 -0
  2. mantisdk/adapter/__init__.py +15 -0
  3. mantisdk/adapter/base.py +94 -0
  4. mantisdk/adapter/messages.py +270 -0
  5. mantisdk/adapter/triplet.py +1028 -0
  6. mantisdk/algorithm/__init__.py +39 -0
  7. mantisdk/algorithm/apo/__init__.py +5 -0
  8. mantisdk/algorithm/apo/apo.py +889 -0
  9. mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
  10. mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
  11. mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
  12. mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
  13. mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
  14. mantisdk/algorithm/base.py +162 -0
  15. mantisdk/algorithm/decorator.py +264 -0
  16. mantisdk/algorithm/fast.py +250 -0
  17. mantisdk/algorithm/gepa/__init__.py +59 -0
  18. mantisdk/algorithm/gepa/adapter.py +459 -0
  19. mantisdk/algorithm/gepa/gepa.py +364 -0
  20. mantisdk/algorithm/gepa/lib/__init__.py +18 -0
  21. mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
  22. mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
  23. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
  24. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
  25. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
  26. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
  27. mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
  28. mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
  29. mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
  30. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
  31. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
  32. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
  33. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
  34. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
  35. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
  36. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
  37. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
  38. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
  39. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
  40. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
  41. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
  42. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
  43. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
  44. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
  45. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
  46. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
  47. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
  48. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
  49. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
  50. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
  51. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
  52. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
  53. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
  54. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
  55. mantisdk/algorithm/gepa/lib/api.py +375 -0
  56. mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
  57. mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
  58. mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
  59. mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
  60. mantisdk/algorithm/gepa/lib/core/result.py +233 -0
  61. mantisdk/algorithm/gepa/lib/core/state.py +636 -0
  62. mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
  63. mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
  64. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
  65. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
  66. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
  67. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
  68. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
  69. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
  70. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
  71. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
  72. mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
  73. mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
  74. mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
  75. mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
  76. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
  77. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
  78. mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
  79. mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
  80. mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
  81. mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
  82. mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
  83. mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
  84. mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
  85. mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
  86. mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
  87. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
  88. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
  89. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
  90. mantisdk/algorithm/gepa/lib/py.typed +0 -0
  91. mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
  92. mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
  93. mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
  94. mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
  95. mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
  96. mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
  97. mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
  98. mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
  99. mantisdk/algorithm/gepa/tracing.py +105 -0
  100. mantisdk/algorithm/utils.py +177 -0
  101. mantisdk/algorithm/verl/__init__.py +5 -0
  102. mantisdk/algorithm/verl/interface.py +202 -0
  103. mantisdk/cli/__init__.py +56 -0
  104. mantisdk/cli/prometheus.py +115 -0
  105. mantisdk/cli/store.py +131 -0
  106. mantisdk/cli/vllm.py +29 -0
  107. mantisdk/client.py +408 -0
  108. mantisdk/config.py +348 -0
  109. mantisdk/emitter/__init__.py +43 -0
  110. mantisdk/emitter/annotation.py +370 -0
  111. mantisdk/emitter/exception.py +54 -0
  112. mantisdk/emitter/message.py +61 -0
  113. mantisdk/emitter/object.py +117 -0
  114. mantisdk/emitter/reward.py +320 -0
  115. mantisdk/env_var.py +156 -0
  116. mantisdk/execution/__init__.py +15 -0
  117. mantisdk/execution/base.py +64 -0
  118. mantisdk/execution/client_server.py +443 -0
  119. mantisdk/execution/events.py +69 -0
  120. mantisdk/execution/inter_process.py +16 -0
  121. mantisdk/execution/shared_memory.py +282 -0
  122. mantisdk/instrumentation/__init__.py +119 -0
  123. mantisdk/instrumentation/agentops.py +314 -0
  124. mantisdk/instrumentation/agentops_langchain.py +45 -0
  125. mantisdk/instrumentation/litellm.py +83 -0
  126. mantisdk/instrumentation/vllm.py +81 -0
  127. mantisdk/instrumentation/weave.py +500 -0
  128. mantisdk/litagent/__init__.py +11 -0
  129. mantisdk/litagent/decorator.py +536 -0
  130. mantisdk/litagent/litagent.py +252 -0
  131. mantisdk/llm_proxy.py +1890 -0
  132. mantisdk/logging.py +370 -0
  133. mantisdk/reward.py +7 -0
  134. mantisdk/runner/__init__.py +11 -0
  135. mantisdk/runner/agent.py +845 -0
  136. mantisdk/runner/base.py +182 -0
  137. mantisdk/runner/legacy.py +309 -0
  138. mantisdk/semconv.py +170 -0
  139. mantisdk/server.py +401 -0
  140. mantisdk/store/__init__.py +23 -0
  141. mantisdk/store/base.py +897 -0
  142. mantisdk/store/client_server.py +2092 -0
  143. mantisdk/store/collection/__init__.py +30 -0
  144. mantisdk/store/collection/base.py +587 -0
  145. mantisdk/store/collection/memory.py +970 -0
  146. mantisdk/store/collection/mongo.py +1412 -0
  147. mantisdk/store/collection_based.py +1823 -0
  148. mantisdk/store/insight.py +648 -0
  149. mantisdk/store/listener.py +58 -0
  150. mantisdk/store/memory.py +396 -0
  151. mantisdk/store/mongo.py +165 -0
  152. mantisdk/store/sqlite.py +3 -0
  153. mantisdk/store/threading.py +357 -0
  154. mantisdk/store/utils.py +142 -0
  155. mantisdk/tracer/__init__.py +16 -0
  156. mantisdk/tracer/agentops.py +242 -0
  157. mantisdk/tracer/base.py +287 -0
  158. mantisdk/tracer/dummy.py +106 -0
  159. mantisdk/tracer/otel.py +555 -0
  160. mantisdk/tracer/weave.py +677 -0
  161. mantisdk/trainer/__init__.py +6 -0
  162. mantisdk/trainer/init_utils.py +263 -0
  163. mantisdk/trainer/legacy.py +367 -0
  164. mantisdk/trainer/registry.py +12 -0
  165. mantisdk/trainer/trainer.py +618 -0
  166. mantisdk/types/__init__.py +6 -0
  167. mantisdk/types/core.py +553 -0
  168. mantisdk/types/resources.py +204 -0
  169. mantisdk/types/tracer.py +515 -0
  170. mantisdk/types/tracing.py +218 -0
  171. mantisdk/utils/__init__.py +1 -0
  172. mantisdk/utils/id.py +18 -0
  173. mantisdk/utils/metrics.py +1025 -0
  174. mantisdk/utils/otel.py +578 -0
  175. mantisdk/utils/otlp.py +536 -0
  176. mantisdk/utils/server_launcher.py +1045 -0
  177. mantisdk/utils/system_snapshot.py +81 -0
  178. mantisdk/verl/__init__.py +8 -0
  179. mantisdk/verl/__main__.py +6 -0
  180. mantisdk/verl/async_server.py +46 -0
  181. mantisdk/verl/config.yaml +27 -0
  182. mantisdk/verl/daemon.py +1154 -0
  183. mantisdk/verl/dataset.py +44 -0
  184. mantisdk/verl/entrypoint.py +248 -0
  185. mantisdk/verl/trainer.py +549 -0
  186. mantisdk-0.1.0.dist-info/METADATA +119 -0
  187. mantisdk-0.1.0.dist-info/RECORD +190 -0
  188. mantisdk-0.1.0.dist-info/WHEEL +4 -0
  189. mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
  190. mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
@@ -0,0 +1,356 @@
1
+ # Copyright (c) 2025 Lakshya A Agrawal and the GEPA contributors
2
+ # https://github.com/gepa-ai/gepa
3
+
4
+ import traceback
5
+ from collections.abc import Sequence
6
+ from typing import Generic
7
+
8
+ from mantisdk.algorithm.gepa.lib.core.adapter import DataInst, GEPAAdapter, RolloutOutput, Trajectory
9
+ from mantisdk.algorithm.gepa.lib.core.data_loader import DataId, DataLoader, ensure_loader
10
+ from mantisdk.algorithm.gepa.lib.core.state import EvaluationCache, FrontierType, GEPAState, ValsetEvaluation, initialize_gepa_state
11
+ from mantisdk.algorithm.gepa.lib.logging.experiment_tracker import ExperimentTracker
12
+ from mantisdk.algorithm.gepa.lib.logging.logger import LoggerProtocol
13
+ from mantisdk.algorithm.gepa.lib.logging.utils import log_detailed_metrics_after_discovering_new_program
14
+ from mantisdk.algorithm.gepa.lib.proposer.merge import MergeProposer
15
+ from mantisdk.algorithm.gepa.lib.proposer.reflective_mutation.reflective_mutation import (
16
+ ReflectiveMutationProposer,
17
+ )
18
+ from mantisdk.algorithm.gepa.lib.strategies.eval_policy import EvaluationPolicy, FullEvaluationPolicy
19
+ from mantisdk.algorithm.gepa.lib.utils import StopperProtocol
20
+
21
+ # Import tqdm for progress bar functionality
22
+ try:
23
+ from tqdm import tqdm
24
+ except ImportError:
25
+ tqdm = None
26
+
27
+
28
class GEPAEngine(Generic[DataId, DataInst, Trajectory, RolloutOutput]):
    """Orchestrates the optimization loop using pluggable candidate proposers.

    The engine alternates between a reflective-mutation proposer and an
    optional merge proposer, accepting candidates that improve on a
    subsample, then running a full validation evaluation on accepted
    candidates and recording them in a :class:`GEPAState`.
    """

    def __init__(
        self,
        adapter: GEPAAdapter[DataInst, Trajectory, RolloutOutput],
        run_dir: str | None,
        valset: list[DataInst] | DataLoader[DataId, DataInst] | None,
        seed_candidate: dict[str, str],
        # Controls
        perfect_score: float,
        seed: int,
        # Strategies and helpers
        reflective_proposer: ReflectiveMutationProposer,
        merge_proposer: MergeProposer | None,
        frontier_type: FrontierType,
        # Logging
        logger: LoggerProtocol,
        experiment_tracker: ExperimentTracker,
        # Optional parameters
        track_best_outputs: bool = False,
        display_progress_bar: bool = False,
        raise_on_exception: bool = True,
        use_cloudpickle: bool = False,
        # Budget and Stop Condition
        stop_callback: StopperProtocol | None = None,
        val_evaluation_policy: EvaluationPolicy[DataId, DataInst] | None = None,
        # Evaluation caching (stored in state, passed here for initialization)
        evaluation_cache: EvaluationCache[RolloutOutput, DataId] | None = None,
    ):
        self.logger = logger
        self.run_dir = run_dir

        # Graceful stopping mechanism; set via request_stop() or stop_callback.
        self._stop_requested = False

        # Set up stopping mechanism
        self.stop_callback = stop_callback
        self.adapter = adapter

        # Store cache reference for state initialization (actual cache lives in GEPAState)
        self._initial_evaluation_cache = evaluation_cache

        def evaluator(
            batch: list[DataInst], program: dict[str, str]
        ) -> tuple[list[RolloutOutput], list[float], Sequence[dict[str, float]] | None]:
            # Trace capture is disabled here: this path is used for scoring only.
            eval_result = adapter.evaluate(batch, program, capture_traces=False)
            return eval_result.outputs, eval_result.scores, eval_result.objective_scores

        self.evaluator = evaluator

        # Normalize a raw list valset into a DataLoader; None stays None and
        # is rejected later in run().
        self.valset = ensure_loader(valset) if valset is not None else None
        self.seed_candidate = seed_candidate

        self.perfect_score = perfect_score
        self.seed = seed
        self.experiment_tracker = experiment_tracker

        self.reflective_proposer = reflective_proposer
        self.merge_proposer = merge_proposer
        self.frontier_type: FrontierType = frontier_type

        # Merge scheduling flags (mirroring previous behavior)
        if self.merge_proposer is not None:
            self.merge_proposer.last_iter_found_new_program = False

        self.track_best_outputs = track_best_outputs
        self.display_progress_bar = display_progress_bar
        self.use_cloudpickle = use_cloudpickle

        self.raise_on_exception = raise_on_exception
        self.val_evaluation_policy: EvaluationPolicy[DataId, DataInst] = (
            val_evaluation_policy if val_evaluation_policy is not None else FullEvaluationPolicy()
        )

    def _evaluate_on_valset(
        self,
        program: dict[str, str],
        state: GEPAState[RolloutOutput, DataId],
    ) -> ValsetEvaluation[RolloutOutput, DataId]:
        """Evaluate *program* on the policy-selected validation batch.

        Uses the state's evaluation cache, so only uncached instances cost
        actual metric calls; ``state.total_num_evals`` is incremented by the
        number of evaluations that actually ran.
        """
        valset = self.valset
        assert valset is not None

        val_ids = self.val_evaluation_policy.get_eval_batch(valset, state)

        outputs_by_val_idx, scores_by_val_idx, objective_by_val_idx, num_actual_evals = state.cached_evaluate_full(
            program, list(val_ids), valset.fetch, self.evaluator
        )
        state.total_num_evals += num_actual_evals

        return ValsetEvaluation(
            outputs_by_val_id=outputs_by_val_idx,
            scores_by_val_id=scores_by_val_idx,
            objective_scores_by_val_id=objective_by_val_idx,
        )

    def _run_full_eval_and_add(
        self,
        new_program: dict[str, str],
        state: GEPAState[RolloutOutput, DataId],
        parent_program_idx: list[int],
    ) -> tuple[int, int]:
        """Run a full valset evaluation for an accepted candidate and add it to state.

        Returns ``(new_program_idx, linear_pareto_front_program_idx)``.
        """
        # Snapshot the eval budget consumed up to the moment of discovery so
        # the new program's cost can be attributed in the state.
        num_metric_calls_by_discovery = state.total_num_evals
        valset_evaluation = self._evaluate_on_valset(new_program, state)
        state.num_full_ds_evals += 1

        new_program_idx = state.update_state_with_new_program(
            parent_program_idx=parent_program_idx,
            new_program=new_program,
            valset_evaluation=valset_evaluation,
            run_dir=self.run_dir,
            num_metric_calls_by_discovery_of_new_program=num_metric_calls_by_discovery,
        )
        state.full_program_trace[-1]["new_program_idx"] = new_program_idx
        state.full_program_trace[-1]["evaluated_val_indices"] = sorted(valset_evaluation.scores_by_val_id.keys())

        valset_score = self.val_evaluation_policy.get_valset_score(new_program_idx, state)

        linear_pareto_front_program_idx = self.val_evaluation_policy.get_best_program(state)
        if new_program_idx == linear_pareto_front_program_idx:
            self.logger.log(f"Iteration {state.i + 1}: Found a better program on the valset with score {valset_score}.")

        valset = self.valset
        assert valset is not None

        log_detailed_metrics_after_discovering_new_program(
            logger=self.logger,
            gepa_state=state,
            new_program_idx=new_program_idx,
            valset_evaluation=valset_evaluation,
            objective_scores=state.prog_candidate_objective_scores[new_program_idx],
            experiment_tracker=self.experiment_tracker,
            linear_pareto_front_program_idx=linear_pareto_front_program_idx,
            valset_size=len(valset),
            val_evaluation_policy=self.val_evaluation_policy,
        )
        return new_program_idx, linear_pareto_front_program_idx

    def run(self) -> GEPAState[RolloutOutput, DataId]:
        """Execute the optimization loop until a stop condition fires.

        Raises:
            ImportError: if ``display_progress_bar`` is set but tqdm is absent.
            ValueError: if no valset was provided.
        """
        # Check tqdm availability if progress bar is enabled
        progress_bar = None
        if self.display_progress_bar:
            if tqdm is None:
                raise ImportError("tqdm must be installed when display_progress_bar is enabled")

            # Check if stop_callback contains MaxMetricCallsStopper so the bar
            # can display a total; duck-typed via the max_metric_calls attribute.
            total_calls: int | None = None
            stop_cb = self.stop_callback
            if stop_cb is not None:
                max_calls_attr = getattr(stop_cb, "max_metric_calls", None)
                if isinstance(max_calls_attr, int):
                    # Direct MaxMetricCallsStopper
                    total_calls = max_calls_attr
                else:
                    stoppers = getattr(stop_cb, "stoppers", None)
                    if stoppers is not None:
                        # CompositeStopper - iterate to find MaxMetricCallsStopper
                        for stopper in stoppers:
                            stopper_max = getattr(stopper, "max_metric_calls", None)
                            if isinstance(stopper_max, int):
                                total_calls = stopper_max
                                break

            if total_calls is not None:
                progress_bar = tqdm(total=total_calls, desc="GEPA Optimization", unit="rollouts")
            else:
                progress_bar = tqdm(desc="GEPA Optimization", unit="rollouts")
            progress_bar.update(0)

        # Prepare valset
        valset = self.valset
        if valset is None:
            raise ValueError("valset must be provided to GEPAEngine.run()")

        def valset_evaluator(
            program: dict[str, str],
        ) -> ValsetEvaluation[RolloutOutput, DataId]:
            # Full (uncached) evaluation over every validation id; used only
            # for state initialization of the seed candidate.
            all_ids = list(valset.all_ids())
            outputs, scores, objective_scores = self.evaluator(valset.fetch(all_ids), program)
            outputs_dict = dict(zip(all_ids, outputs, strict=False))
            scores_dict = dict(zip(all_ids, scores, strict=False))
            objective_scores_dict = (
                dict(zip(all_ids, objective_scores, strict=False)) if objective_scores is not None else None
            )
            return ValsetEvaluation(
                outputs_by_val_id=outputs_dict,
                scores_by_val_id=scores_dict,
                objective_scores_by_val_id=objective_scores_dict,
            )

        # Initialize state
        state = initialize_gepa_state(
            run_dir=self.run_dir,
            logger=self.logger,
            seed_candidate=self.seed_candidate,
            valset_evaluator=valset_evaluator,
            track_best_outputs=self.track_best_outputs,
            frontier_type=self.frontier_type,
            evaluation_cache=self._initial_evaluation_cache,
        )

        # Log base program score
        base_val_avg, base_val_coverage = state.get_program_average_val_subset(0)
        self.experiment_tracker.log_metrics(
            {
                "base_program_full_valset_score": base_val_avg,
                "base_program_val_coverage": base_val_coverage,
                "iteration": state.i + 1,
            },
            step=state.i + 1,
        )

        self.logger.log(
            f"Iteration {state.i + 1}: Base program full valset score: {base_val_avg} "
            f"over {base_val_coverage} / {len(valset)} examples"
        )

        # Merge scheduling
        if self.merge_proposer is not None:
            self.merge_proposer.last_iter_found_new_program = False

        # Main loop
        last_pbar_val = 0
        while not self._should_stop(state):
            if self.display_progress_bar and progress_bar is not None:
                delta = state.total_num_evals - last_pbar_val
                progress_bar.update(delta)
                last_pbar_val = state.total_num_evals

            assert state.is_consistent()
            try:
                state.save(self.run_dir, use_cloudpickle=self.use_cloudpickle)
                state.i += 1
                state.full_program_trace.append({"i": state.i})

                # 1) Attempt merge first if scheduled and last iter found new program
                if self.merge_proposer is not None and self.merge_proposer.use_merge:
                    if self.merge_proposer.merges_due > 0 and self.merge_proposer.last_iter_found_new_program:
                        proposal = self.merge_proposer.propose(state)
                        self.merge_proposer.last_iter_found_new_program = False  # old behavior

                        if proposal is not None and proposal.tag == "merge":
                            parent_sums = proposal.subsample_scores_before or [
                                float("-inf"),
                                float("-inf"),
                            ]
                            new_sum = sum(proposal.subsample_scores_after or [])

                            if new_sum >= max(parent_sums):
                                # ACCEPTED: consume one merge attempt and record it
                                self._run_full_eval_and_add(
                                    new_program=proposal.candidate,
                                    state=state,
                                    parent_program_idx=proposal.parent_program_ids,
                                )
                                self.merge_proposer.merges_due -= 1
                                self.merge_proposer.total_merges_tested += 1
                                continue  # skip reflective this iteration
                            else:
                                # REJECTED: do NOT consume merges_due or total_merges_tested
                                self.logger.log(
                                    f"Iteration {state.i + 1}: New program subsample score {new_sum} "
                                    f"is worse than both parents {parent_sums}, skipping merge"
                                )
                                # Skip reflective this iteration (old behavior)
                                continue

                        # Old behavior: regardless of whether we attempted, clear the flag before reflective
                        self.merge_proposer.last_iter_found_new_program = False

                # 2) Reflective mutation proposer
                proposal = self.reflective_proposer.propose(state)
                if proposal is None:
                    self.logger.log(f"Iteration {state.i + 1}: Reflective mutation did not propose a new candidate")
                    continue

                # Acceptance: require strict improvement on subsample
                old_sum = sum(proposal.subsample_scores_before or [])
                new_sum = sum(proposal.subsample_scores_after or [])
                if new_sum <= old_sum:
                    self.logger.log(
                        f"Iteration {state.i + 1}: New subsample score {new_sum} is not better than old score {old_sum}, skipping"
                    )
                    continue
                else:
                    self.logger.log(
                        f"Iteration {state.i + 1}: New subsample score {new_sum} is better than old score {old_sum}. Continue to full eval and add to candidate pool."
                    )

                # Accept: full eval + add
                self._run_full_eval_and_add(
                    new_program=proposal.candidate,
                    state=state,
                    parent_program_idx=proposal.parent_program_ids,
                )

                # Schedule merge attempts like original behavior
                if self.merge_proposer is not None:
                    self.merge_proposer.last_iter_found_new_program = True
                    if self.merge_proposer.total_merges_tested < self.merge_proposer.max_merge_invocations:
                        self.merge_proposer.merges_due += 1

            except Exception as e:
                self.logger.log(f"Iteration {state.i + 1}: Exception during optimization: {e}")
                self.logger.log(traceback.format_exc())
                if self.raise_on_exception:
                    # Bare raise preserves the original traceback unchanged.
                    raise

        # Close progress bar if it exists, flushing evals from the last iteration first
        if self.display_progress_bar and progress_bar is not None:
            progress_bar.update(state.total_num_evals - last_pbar_val)
            progress_bar.close()

        # Final checkpoint: honor the configured pickler, matching the in-loop saves.
        state.save(self.run_dir, use_cloudpickle=self.use_cloudpickle)
        return state

    def _should_stop(self, state: GEPAState[RolloutOutput, DataId]) -> bool:
        """Check if the optimization should stop."""
        if self._stop_requested:
            return True
        if self.stop_callback and self.stop_callback(state):
            return True
        return False

    def request_stop(self) -> None:
        """Manually request the optimization to stop gracefully."""
        self.logger.log("Stop requested manually. Initiating graceful shutdown...")
        self._stop_requested = True
@@ -0,0 +1,233 @@
1
+ # Copyright (c) 2025 Lakshya A Agrawal and the GEPA contributors
2
+ # https://github.com/gepa-ai/gepa
3
+
4
+ from dataclasses import dataclass
5
+ from typing import TYPE_CHECKING, Any, ClassVar, Generic
6
+
7
+ from mantisdk.algorithm.gepa.lib.core.adapter import RolloutOutput
8
+ from mantisdk.algorithm.gepa.lib.core.data_loader import DataId
9
+ from mantisdk.algorithm.gepa.lib.core.state import ProgramIdx
10
+
11
+ if TYPE_CHECKING:
12
+ from mantisdk.algorithm.gepa.lib.core.state import GEPAState
13
+
14
+
15
+ @dataclass(frozen=True)
16
+ class GEPAResult(Generic[RolloutOutput, DataId]):
17
+ """
18
+ Immutable snapshot of a GEPA run with convenience accessors.
19
+
20
+ - candidates: list of proposed candidates (component_name -> component_text)
21
+ - parents: lineage info; for each candidate i, parents[i] is a list of parent indices or None
22
+ - val_aggregate_scores: per-candidate aggregate score on the validation set (higher is better)
23
+ - val_subscores: per-candidate mapping from validation id to score on the validation set (sparse dict)
24
+ - val_aggregate_subscores: optional per-candidate aggregate subscores across objectives
25
+ - per_val_instance_best_candidates: for each val instance t, a set of candidate indices achieving the current best score on t
26
+ - per_objective_best_candidates: optional per-objective set of candidate indices achieving best aggregate subscore
27
+ - discovery_eval_counts: number of metric calls accumulated up to the discovery of each candidate
28
+
29
+ Optional fields:
30
+ - best_outputs_valset: per-task best outputs on the validation set. [task_idx -> [(program_idx_1, output_1), (program_idx_2, output_2), ...]]
31
+
32
+ Run-level metadata:
33
+ - total_metric_calls: total number of metric calls made across the run
34
+ - num_full_val_evals: number of full validation evaluations performed
35
+ - run_dir: where artifacts were written (if any)
36
+ - seed: RNG seed for reproducibility (if known)
37
+ - tracked_scores: optional tracked aggregate scores (if different from val_aggregate_scores)
38
+
39
+ Convenience:
40
+ - best_idx: candidate index with the highest val_aggregate_scores
41
+ - best_candidate: the program text mapping for best_idx
42
+ - non_dominated_indices(): candidate indices that are not dominated across per-instance pareto fronts
43
+ - lineage(idx): parent chain from base to idx
44
+ - diff(parent_idx, child_idx, only_changed=True): component-wise diff between two candidates
45
+ - best_k(k): top-k candidates by aggregate val score
46
+ - instance_winners(t): set of candidates on the pareto front for val instance t
47
+ - to_dict(...), save_json(...): serialization helpers
48
+ """
49
+
50
+ # Core data
51
+ candidates: list[dict[str, str]]
52
+ parents: list[list[ProgramIdx | None]]
53
+ val_aggregate_scores: list[float]
54
+ val_subscores: list[dict[DataId, float]]
55
+ per_val_instance_best_candidates: dict[DataId, set[ProgramIdx]]
56
+ discovery_eval_counts: list[int]
57
+ val_aggregate_subscores: list[dict[str, float]] | None = None
58
+ per_objective_best_candidates: dict[str, set[ProgramIdx]] | None = None
59
+ objective_pareto_front: dict[str, float] | None = None
60
+
61
+ # Optional data
62
+ best_outputs_valset: dict[DataId, list[tuple[ProgramIdx, RolloutOutput]]] | None = None
63
+
64
+ # Run metadata (optional)
65
+ total_metric_calls: int | None = None
66
+ num_full_val_evals: int | None = None
67
+ run_dir: str | None = None
68
+ seed: int | None = None
69
+
70
+ _VALIDATION_SCHEMA_VERSION: ClassVar[int] = 2
71
+
72
+ # -------- Convenience properties --------
73
+ @property
74
+ def num_candidates(self) -> int:
75
+ return len(self.candidates)
76
+
77
+ @property
78
+ def num_val_instances(self) -> int:
79
+ return len(self.per_val_instance_best_candidates)
80
+
81
+ @property
82
+ def best_idx(self) -> int:
83
+ scores = self.val_aggregate_scores
84
+ return max(range(len(scores)), key=lambda i: scores[i])
85
+
86
+ @property
87
+ def best_candidate(self) -> dict[str, str]:
88
+ return self.candidates[self.best_idx]
89
+
90
+ def to_dict(self) -> dict[str, Any]:
91
+ cands = [dict(cand.items()) for cand in self.candidates]
92
+
93
+ return {
94
+ "candidates": cands,
95
+ "parents": self.parents,
96
+ "val_aggregate_scores": self.val_aggregate_scores,
97
+ "val_subscores": self.val_subscores,
98
+ "best_outputs_valset": self.best_outputs_valset,
99
+ "per_val_instance_best_candidates": {
100
+ val_id: list(front) for val_id, front in self.per_val_instance_best_candidates.items()
101
+ },
102
+ "val_aggregate_subscores": self.val_aggregate_subscores,
103
+ "per_objective_best_candidates": (
104
+ {k: list(v) for k, v in self.per_objective_best_candidates.items()}
105
+ if self.per_objective_best_candidates is not None
106
+ else None
107
+ ),
108
+ "objective_pareto_front": self.objective_pareto_front,
109
+ "discovery_eval_counts": self.discovery_eval_counts,
110
+ "total_metric_calls": self.total_metric_calls,
111
+ "num_full_val_evals": self.num_full_val_evals,
112
+ "run_dir": self.run_dir,
113
+ "seed": self.seed,
114
+ "best_idx": self.best_idx,
115
+ "validation_schema_version": GEPAResult._VALIDATION_SCHEMA_VERSION,
116
+ }
117
+
118
+ @staticmethod
119
+ def from_dict(d: dict[str, Any]) -> "GEPAResult[RolloutOutput, DataId]":
120
+ version = d.get("validation_schema_version") or 0
121
+ if version > GEPAResult._VALIDATION_SCHEMA_VERSION:
122
+ raise ValueError(
123
+ f"Unsupported GEPAResult validation schema version {version}; "
124
+ f"max supported is {GEPAResult._VALIDATION_SCHEMA_VERSION}"
125
+ )
126
+
127
+ if version <= 1:
128
+ return GEPAResult._migrate_from_dict_v0(d)
129
+
130
+ return GEPAResult._from_dict_v2(d)
131
+
132
+ @staticmethod
133
+ def _common_kwargs_from_dict(d: dict[str, Any]) -> dict[str, Any]:
134
+ return {
135
+ "candidates": [dict(candidate) for candidate in d.get("candidates", [])],
136
+ "parents": [list(parent_row) for parent_row in d.get("parents", [])],
137
+ "val_aggregate_scores": list(d.get("val_aggregate_scores", [])),
138
+ "discovery_eval_counts": list(d.get("discovery_eval_counts", [])),
139
+ "total_metric_calls": d.get("total_metric_calls"),
140
+ "num_full_val_evals": d.get("num_full_val_evals"),
141
+ "run_dir": d.get("run_dir"),
142
+ "seed": d.get("seed"),
143
+ }
144
+
145
+ @staticmethod
146
+ def _migrate_from_dict_v0(d: dict[str, Any]) -> "GEPAResult[RolloutOutput, DataId]":
147
+ kwargs = GEPAResult._common_kwargs_from_dict(d)
148
+ kwargs["val_subscores"] = [
149
+ {idx: score for idx, score in enumerate(scores)} for scores in d.get("val_subscores", [])
150
+ ]
151
+ kwargs["per_val_instance_best_candidates"] = {
152
+ idx: set(front) for idx, front in enumerate(d.get("per_val_instance_best_candidates", []))
153
+ }
154
+
155
+ best_outputs_valset = d.get("best_outputs_valset")
156
+ if best_outputs_valset is not None:
157
+ kwargs["best_outputs_valset"] = {
158
+ idx: [(program_idx, output) for program_idx, output in outputs]
159
+ for idx, outputs in enumerate(best_outputs_valset)
160
+ }
161
+ else:
162
+ kwargs["best_outputs_valset"] = None
163
+ return GEPAResult(**kwargs)
164
+
165
+ @staticmethod
166
+ def _from_dict_v2(d: dict[str, Any]) -> "GEPAResult[RolloutOutput, DataId]":
167
+ kwargs = GEPAResult._common_kwargs_from_dict(d)
168
+ kwargs["val_subscores"] = [dict(scores) for scores in d.get("val_subscores", [])]
169
+ per_val_instance_best_candidates_data = d.get("per_val_instance_best_candidates", {})
170
+ kwargs["per_val_instance_best_candidates"] = {
171
+ val_id: set(candidates_on_front)
172
+ for val_id, candidates_on_front in per_val_instance_best_candidates_data.items()
173
+ }
174
+
175
+ best_outputs_valset = d.get("best_outputs_valset")
176
+ if best_outputs_valset is not None:
177
+ kwargs["best_outputs_valset"] = {
178
+ val_id: [(program_idx, output) for program_idx, output in outputs]
179
+ for val_id, outputs in best_outputs_valset.items()
180
+ }
181
+ else:
182
+ kwargs["best_outputs_valset"] = None
183
+
184
+ val_aggregate_subscores = d.get("val_aggregate_subscores")
185
+ kwargs["val_aggregate_subscores"] = (
186
+ [dict(scores) for scores in val_aggregate_subscores] if val_aggregate_subscores is not None else None
187
+ )
188
+
189
+ per_objective_best_candidates = d.get("per_objective_best_candidates")
190
+ if per_objective_best_candidates is not None:
191
+ kwargs["per_objective_best_candidates"] = {
192
+ objective: set(program_indices) for objective, program_indices in per_objective_best_candidates.items()
193
+ }
194
+ else:
195
+ kwargs["per_objective_best_candidates"] = None
196
+
197
+ objective_pareto_front = d.get("objective_pareto_front")
198
+ kwargs["objective_pareto_front"] = dict(objective_pareto_front) if objective_pareto_front is not None else None
199
+
200
+ return GEPAResult(**kwargs)
201
+
202
+ @staticmethod
203
+ def from_state(
204
+ state: "GEPAState[RolloutOutput, DataId]",
205
+ run_dir: str | None = None,
206
+ seed: int | None = None,
207
+ ) -> "GEPAResult[RolloutOutput, DataId]":
208
+ """Build a GEPAResult from a GEPAState."""
209
+ objective_scores_list = [dict(scores) for scores in state.prog_candidate_objective_scores]
210
+ has_objective_scores = any(obj for obj in objective_scores_list)
211
+ per_objective_best = {
212
+ objective: set(front) for objective, front in state.program_at_pareto_front_objectives.items()
213
+ }
214
+ objective_front = dict(state.objective_pareto_front)
215
+
216
+ return GEPAResult(
217
+ candidates=list(state.program_candidates),
218
+ parents=list(state.parent_program_for_candidate),
219
+ val_aggregate_scores=list(state.program_full_scores_val_set),
220
+ best_outputs_valset=getattr(state, "best_outputs_valset", None),
221
+ val_subscores=[dict(scores) for scores in state.prog_candidate_val_subscores],
222
+ per_val_instance_best_candidates={
223
+ val_id: set(front) for val_id, front in state.program_at_pareto_front_valset.items()
224
+ },
225
+ val_aggregate_subscores=(objective_scores_list if has_objective_scores else None),
226
+ per_objective_best_candidates=(per_objective_best if per_objective_best else None),
227
+ objective_pareto_front=objective_front if objective_front else None,
228
+ discovery_eval_counts=list(state.num_metric_calls_by_discovery),
229
+ total_metric_calls=getattr(state, "total_num_evals", None),
230
+ num_full_val_evals=getattr(state, "num_full_ds_evals", None),
231
+ run_dir=run_dir,
232
+ seed=seed,
233
+ )