mantisdk-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mantisdk might be problematic.

Files changed (190)
  1. mantisdk/__init__.py +22 -0
  2. mantisdk/adapter/__init__.py +15 -0
  3. mantisdk/adapter/base.py +94 -0
  4. mantisdk/adapter/messages.py +270 -0
  5. mantisdk/adapter/triplet.py +1028 -0
  6. mantisdk/algorithm/__init__.py +39 -0
  7. mantisdk/algorithm/apo/__init__.py +5 -0
  8. mantisdk/algorithm/apo/apo.py +889 -0
  9. mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
  10. mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
  11. mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
  12. mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
  13. mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
  14. mantisdk/algorithm/base.py +162 -0
  15. mantisdk/algorithm/decorator.py +264 -0
  16. mantisdk/algorithm/fast.py +250 -0
  17. mantisdk/algorithm/gepa/__init__.py +59 -0
  18. mantisdk/algorithm/gepa/adapter.py +459 -0
  19. mantisdk/algorithm/gepa/gepa.py +364 -0
  20. mantisdk/algorithm/gepa/lib/__init__.py +18 -0
  21. mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
  22. mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
  23. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
  24. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
  25. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
  26. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
  27. mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
  28. mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
  29. mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
  30. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
  31. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
  32. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
  33. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
  34. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
  35. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
  36. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
  37. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
  38. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
  39. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
  40. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
  41. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
  42. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
  43. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
  44. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
  45. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
  46. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
  47. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
  48. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
  49. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
  50. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
  51. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
  52. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
  53. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
  54. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
  55. mantisdk/algorithm/gepa/lib/api.py +375 -0
  56. mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
  57. mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
  58. mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
  59. mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
  60. mantisdk/algorithm/gepa/lib/core/result.py +233 -0
  61. mantisdk/algorithm/gepa/lib/core/state.py +636 -0
  62. mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
  63. mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
  64. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
  65. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
  66. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
  67. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
  68. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
  69. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
  70. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
  71. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
  72. mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
  73. mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
  74. mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
  75. mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
  76. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
  77. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
  78. mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
  79. mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
  80. mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
  81. mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
  82. mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
  83. mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
  84. mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
  85. mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
  86. mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
  87. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
  88. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
  89. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
  90. mantisdk/algorithm/gepa/lib/py.typed +0 -0
  91. mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
  92. mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
  93. mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
  94. mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
  95. mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
  96. mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
  97. mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
  98. mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
  99. mantisdk/algorithm/gepa/tracing.py +105 -0
  100. mantisdk/algorithm/utils.py +177 -0
  101. mantisdk/algorithm/verl/__init__.py +5 -0
  102. mantisdk/algorithm/verl/interface.py +202 -0
  103. mantisdk/cli/__init__.py +56 -0
  104. mantisdk/cli/prometheus.py +115 -0
  105. mantisdk/cli/store.py +131 -0
  106. mantisdk/cli/vllm.py +29 -0
  107. mantisdk/client.py +408 -0
  108. mantisdk/config.py +348 -0
  109. mantisdk/emitter/__init__.py +43 -0
  110. mantisdk/emitter/annotation.py +370 -0
  111. mantisdk/emitter/exception.py +54 -0
  112. mantisdk/emitter/message.py +61 -0
  113. mantisdk/emitter/object.py +117 -0
  114. mantisdk/emitter/reward.py +320 -0
  115. mantisdk/env_var.py +156 -0
  116. mantisdk/execution/__init__.py +15 -0
  117. mantisdk/execution/base.py +64 -0
  118. mantisdk/execution/client_server.py +443 -0
  119. mantisdk/execution/events.py +69 -0
  120. mantisdk/execution/inter_process.py +16 -0
  121. mantisdk/execution/shared_memory.py +282 -0
  122. mantisdk/instrumentation/__init__.py +119 -0
  123. mantisdk/instrumentation/agentops.py +314 -0
  124. mantisdk/instrumentation/agentops_langchain.py +45 -0
  125. mantisdk/instrumentation/litellm.py +83 -0
  126. mantisdk/instrumentation/vllm.py +81 -0
  127. mantisdk/instrumentation/weave.py +500 -0
  128. mantisdk/litagent/__init__.py +11 -0
  129. mantisdk/litagent/decorator.py +536 -0
  130. mantisdk/litagent/litagent.py +252 -0
  131. mantisdk/llm_proxy.py +1890 -0
  132. mantisdk/logging.py +370 -0
  133. mantisdk/reward.py +7 -0
  134. mantisdk/runner/__init__.py +11 -0
  135. mantisdk/runner/agent.py +845 -0
  136. mantisdk/runner/base.py +182 -0
  137. mantisdk/runner/legacy.py +309 -0
  138. mantisdk/semconv.py +170 -0
  139. mantisdk/server.py +401 -0
  140. mantisdk/store/__init__.py +23 -0
  141. mantisdk/store/base.py +897 -0
  142. mantisdk/store/client_server.py +2092 -0
  143. mantisdk/store/collection/__init__.py +30 -0
  144. mantisdk/store/collection/base.py +587 -0
  145. mantisdk/store/collection/memory.py +970 -0
  146. mantisdk/store/collection/mongo.py +1412 -0
  147. mantisdk/store/collection_based.py +1823 -0
  148. mantisdk/store/insight.py +648 -0
  149. mantisdk/store/listener.py +58 -0
  150. mantisdk/store/memory.py +396 -0
  151. mantisdk/store/mongo.py +165 -0
  152. mantisdk/store/sqlite.py +3 -0
  153. mantisdk/store/threading.py +357 -0
  154. mantisdk/store/utils.py +142 -0
  155. mantisdk/tracer/__init__.py +16 -0
  156. mantisdk/tracer/agentops.py +242 -0
  157. mantisdk/tracer/base.py +287 -0
  158. mantisdk/tracer/dummy.py +106 -0
  159. mantisdk/tracer/otel.py +555 -0
  160. mantisdk/tracer/weave.py +677 -0
  161. mantisdk/trainer/__init__.py +6 -0
  162. mantisdk/trainer/init_utils.py +263 -0
  163. mantisdk/trainer/legacy.py +367 -0
  164. mantisdk/trainer/registry.py +12 -0
  165. mantisdk/trainer/trainer.py +618 -0
  166. mantisdk/types/__init__.py +6 -0
  167. mantisdk/types/core.py +553 -0
  168. mantisdk/types/resources.py +204 -0
  169. mantisdk/types/tracer.py +515 -0
  170. mantisdk/types/tracing.py +218 -0
  171. mantisdk/utils/__init__.py +1 -0
  172. mantisdk/utils/id.py +18 -0
  173. mantisdk/utils/metrics.py +1025 -0
  174. mantisdk/utils/otel.py +578 -0
  175. mantisdk/utils/otlp.py +536 -0
  176. mantisdk/utils/server_launcher.py +1045 -0
  177. mantisdk/utils/system_snapshot.py +81 -0
  178. mantisdk/verl/__init__.py +8 -0
  179. mantisdk/verl/__main__.py +6 -0
  180. mantisdk/verl/async_server.py +46 -0
  181. mantisdk/verl/config.yaml +27 -0
  182. mantisdk/verl/daemon.py +1154 -0
  183. mantisdk/verl/dataset.py +44 -0
  184. mantisdk/verl/entrypoint.py +248 -0
  185. mantisdk/verl/trainer.py +549 -0
  186. mantisdk-0.1.0.dist-info/METADATA +119 -0
  187. mantisdk-0.1.0.dist-info/RECORD +190 -0
  188. mantisdk-0.1.0.dist-info/WHEEL +4 -0
  189. mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
  190. mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
mantisdk/algorithm/gepa/lib/api.py
@@ -0,0 +1,375 @@
+ # Copyright (c) 2025 Lakshya A Agrawal and the GEPA contributors
+ # https://github.com/gepa-ai/gepa
+
+ import os
+ import random
+ from collections.abc import Sequence
+ from typing import Any, Literal, cast
+
+ from mantisdk.algorithm.gepa.lib.adapters.default_adapter.default_adapter import (
+     ChatCompletionCallable,
+     DefaultAdapter,
+     Evaluator,
+ )
+ from mantisdk.algorithm.gepa.lib.core.adapter import DataInst, GEPAAdapter, RolloutOutput, Trajectory
+ from mantisdk.algorithm.gepa.lib.core.data_loader import DataId, DataLoader, ensure_loader
+ from mantisdk.algorithm.gepa.lib.core.engine import GEPAEngine
+ from mantisdk.algorithm.gepa.lib.core.result import GEPAResult
+ from mantisdk.algorithm.gepa.lib.core.state import EvaluationCache, FrontierType
+ from mantisdk.algorithm.gepa.lib.logging.experiment_tracker import create_experiment_tracker
+ from mantisdk.algorithm.gepa.lib.logging.logger import LoggerProtocol, StdOutLogger
+ from mantisdk.algorithm.gepa.lib.proposer.merge import MergeProposer
+ from mantisdk.algorithm.gepa.lib.proposer.reflective_mutation.base import CandidateSelector, LanguageModel, ReflectionComponentSelector
+ from mantisdk.algorithm.gepa.lib.proposer.reflective_mutation.reflective_mutation import ReflectiveMutationProposer
+ from mantisdk.algorithm.gepa.lib.strategies.batch_sampler import BatchSampler, EpochShuffledBatchSampler
+ from mantisdk.algorithm.gepa.lib.strategies.candidate_selector import (
+     CurrentBestCandidateSelector,
+     EpsilonGreedyCandidateSelector,
+     ParetoCandidateSelector,
+ )
+ from mantisdk.algorithm.gepa.lib.strategies.component_selector import (
+     AllReflectionComponentSelector,
+     RoundRobinReflectionComponentSelector,
+ )
+ from mantisdk.algorithm.gepa.lib.strategies.eval_policy import EvaluationPolicy, FullEvaluationPolicy
+ from mantisdk.algorithm.gepa.lib.utils import FileStopper, StopperProtocol
+
+
+ def optimize(
+     seed_candidate: dict[str, str],
+     trainset: list[DataInst] | DataLoader[DataId, DataInst],
+     valset: list[DataInst] | DataLoader[DataId, DataInst] | None = None,
+     adapter: GEPAAdapter[DataInst, Trajectory, RolloutOutput] | None = None,
+     task_lm: str | ChatCompletionCallable | None = None,
+     evaluator: Evaluator | None = None,
+     # Reflection-based configuration
+     reflection_lm: LanguageModel | str | None = None,
+     candidate_selection_strategy: CandidateSelector | Literal["pareto", "current_best", "epsilon_greedy"] = "pareto",
+     frontier_type: FrontierType = "instance",
+     skip_perfect_score: bool = True,
+     batch_sampler: BatchSampler | Literal["epoch_shuffled"] = "epoch_shuffled",
+     reflection_minibatch_size: int | None = None,
+     perfect_score: float = 1.0,
+     reflection_prompt_template: str | None = None,
+     # Component selection configuration
+     module_selector: ReflectionComponentSelector | str = "round_robin",
+     # Merge-based configuration
+     use_merge: bool = False,
+     max_merge_invocations: int = 5,
+     merge_val_overlap_floor: int = 5,
+     # Budget and Stop Condition
+     max_metric_calls: int | None = None,
+     stop_callbacks: StopperProtocol | Sequence[StopperProtocol] | None = None,
+     # Logging
+     logger: LoggerProtocol | None = None,
+     run_dir: str | None = None,
+     use_wandb: bool = False,
+     wandb_api_key: str | None = None,
+     wandb_init_kwargs: dict[str, Any] | None = None,
+     use_mlflow: bool = False,
+     mlflow_tracking_uri: str | None = None,
+     mlflow_experiment_name: str | None = None,
+     track_best_outputs: bool = False,
+     display_progress_bar: bool = False,
+     use_cloudpickle: bool = False,
+     # Evaluation caching
+     cache_evaluation: bool = False,
+     # Reproducibility
+     seed: int = 0,
+     raise_on_exception: bool = True,
+     val_evaluation_policy: EvaluationPolicy[DataId, DataInst] | Literal["full_eval"] | None = None,
+ ) -> GEPAResult[RolloutOutput, DataId]:
+     """
+     GEPA is an evolutionary optimizer that evolves (multiple) text components of a complex system to optimize them towards a given metric.
+     GEPA can also leverage rich textual feedback obtained from the system's execution environment, evaluation,
+     and the system's own execution traces to iteratively improve the system's performance.
+
+     Concepts:
+     - System: A harness that uses text components to perform a task. Each text component of the system to be optimized is a named component of the system.
+     - Candidate: A mapping from component names to component text. A concrete instantiation of the system is realized by setting the text of each system component
+       to the text provided by the candidate mapping.
+     - `DataInst`: An (uninterpreted) data type over which the system operates.
+     - `RolloutOutput`: The output of the system on a `DataInst`.
+
+     Each execution of the system produces a `RolloutOutput`, which can be evaluated to produce a score. The execution of the system also produces a trajectory,
+     which consists of the operations performed by different components of the system, including the text of the components that were executed.
+
+     GEPA can be applied to optimize any system that uses text components (e.g., prompts in an AI system, code snippets/code files/functions/classes in a codebase, etc.).
+     In order for GEPA to plug into your system's environment, GEPA requires an adapter, `GEPAAdapter`, to be implemented. The adapter is responsible for:
+     1. Evaluating a proposed candidate on a batch of inputs.
+         - The adapter receives a candidate proposed by GEPA, along with a batch of inputs selected from the training/validation set.
+         - The adapter instantiates the system with the texts proposed in the candidate.
+         - The adapter then evaluates the candidate on the batch of inputs, and returns the scores.
+         - The adapter should also capture relevant information from the execution of the candidate, like system and evaluation traces.
+     2. Identifying textual information relevant to a component of the candidate
+         - Given the trajectories captured during the execution of the candidate, GEPA selects a component of the candidate to update.
+         - The adapter receives the candidate, the batch of inputs, and the trajectories captured during the execution of the candidate.
+         - The adapter is responsible for identifying the textual information relevant to the component to update.
+         - This information is used by GEPA to reflect on the performance of the component, and propose new component texts.
+
+     At each iteration, GEPA proposes a new candidate using one of the following strategies:
+     1. Reflective mutation: GEPA proposes a new candidate by mutating the current candidate, leveraging rich textual feedback.
+     2. Merge: GEPA proposes a new candidate by merging 2 candidates that are on the Pareto frontier.
+
+     GEPA also tracks the Pareto frontier of performance achieved by different candidates on the validation set. This way, it can leverage candidates that
+     work well on a subset of inputs to improve the system's performance on the entire validation set, by evolving from the Pareto frontier.
+
+     Parameters:
+     - seed_candidate: The initial candidate to start with.
+     - trainset: Training data supplied as an in-memory sequence or a `DataLoader` yielding batches for reflective updates.
+     - valset: Validation data source (sequence or `DataLoader`) used for tracking Pareto scores. If not provided, GEPA reuses the trainset.
+     - adapter: A `GEPAAdapter` instance that implements the adapter interface. This allows GEPA to plug into your system's environment. If not provided, GEPA will use a default adapter: `gepa.adapters.default_adapter.default_adapter.DefaultAdapter`, with the model defined by `task_lm`.
+     - task_lm: Optional. The model to use for the task. This is only used if `adapter` is not provided, and is used to initialize the default adapter.
+     - evaluator: Optional. A custom evaluator to use for evaluating the candidate program. If not provided, GEPA will use the default evaluator: `gepa.adapters.default_adapter.default_adapter.ContainsAnswerEvaluator`. Only used if `adapter` is not provided.
+
+     # Reflection-based configuration
+     - reflection_lm: A `LanguageModel` instance that is used to reflect on the performance of the candidate program.
+     - candidate_selection_strategy: The strategy to use for selecting the candidate to update. Supported strategies: 'pareto', 'current_best', 'epsilon_greedy'. Defaults to 'pareto'.
+     - frontier_type: Strategy for tracking Pareto frontiers. 'instance' tracks per validation example, 'objective' tracks per objective metric, 'hybrid' combines both, 'cartesian' tracks per (example, objective) pair. Defaults to 'instance'.
+     - skip_perfect_score: Whether to skip updating the candidate if it achieves a perfect score on the minibatch.
+     - batch_sampler: Strategy for selecting training examples. Can be a [BatchSampler](src/gepa/strategies/batch_sampler.py) instance or a string for a predefined strategy from ['epoch_shuffled']. Defaults to 'epoch_shuffled', which creates an [EpochShuffledBatchSampler](src/gepa/strategies/batch_sampler.py).
+     - reflection_minibatch_size: The number of examples to use for reflection in each proposal step. Defaults to 3. Only valid when batch_sampler='epoch_shuffled' (default), and is ignored otherwise.
+     - perfect_score: The perfect score to achieve.
+     - reflection_prompt_template: The prompt template to use for reflection. If not provided, GEPA will use the default prompt template (see [InstructionProposalSignature](src/gepa/strategies/instruction_proposal.py)). The prompt template must contain the following placeholders, which will be replaced with actual values: `<curr_instructions>` (will be replaced by the instructions to evolve) and `<inputs_outputs_feedback>` (replaced with the inputs, outputs, and feedback generated with the current instruction). This will be ignored if the adapter provides its own `propose_new_texts` method.
+
+     # Component selection configuration
+     - module_selector: Component selection strategy. Can be a ReflectionComponentSelector instance or a string ('round_robin', 'all'). Defaults to 'round_robin'. The 'round_robin' strategy cycles through components in order. The 'all' strategy selects all components for modification in every GEPA iteration.
+
+     # Merge-based configuration
+     - use_merge: Whether to use the merge strategy.
+     - max_merge_invocations: The maximum number of merge invocations to perform.
+     - merge_val_overlap_floor: Minimum number of shared validation ids required between parents before attempting a merge subsample. Only relevant when using a `val_evaluation_policy` other than `full_eval`.
+
+     # Budget and Stop Condition
+     - max_metric_calls: Optional maximum number of metric calls to perform. If not provided, stop_callbacks must be provided.
+     - stop_callbacks: Optional stopper(s) that return True when optimization should stop. Can be a single StopperProtocol or a list or tuple of StopperProtocol instances. Examples: FileStopper, TimeoutStopCondition, SignalStopper, NoImprovementStopper, or custom stopping logic. If not provided, max_metric_calls must be provided.
+
+     # Logging
+     - logger: A `LoggerProtocol` instance that is used to log the progress of the optimization.
+     - run_dir: The directory to save the results to. Optimization state and results will be saved to this directory. If the directory already exists, GEPA will read the state from this directory and resume the optimization from the last saved state. If provided, a FileStopper is automatically created which checks for the presence of "gepa.stop" in this directory, allowing graceful stopping of the optimization process upon its presence.
+     - use_wandb: Whether to use Weights and Biases to log the progress of the optimization.
+     - wandb_api_key: The API key to use for Weights and Biases.
+     - wandb_init_kwargs: Additional keyword arguments to pass to the Weights and Biases initialization.
+     - use_mlflow: Whether to use MLflow to log the progress of the optimization.
+       Both wandb and mlflow can be used simultaneously if desired.
+     - mlflow_tracking_uri: The tracking URI to use for MLflow.
+     - mlflow_experiment_name: The experiment name to use for MLflow.
+     - track_best_outputs: Whether to track the best outputs on the validation set. If True, GEPAResult will contain the best outputs obtained for each task in the validation set.
+     - display_progress_bar: Show a tqdm progress bar over metric calls when enabled.
+     - use_cloudpickle: Use cloudpickle instead of pickle. This can be helpful when the serialized state contains dynamically generated DSPy signatures.
+
+     # Evaluation caching
+     - cache_evaluation: Whether to cache the (score, output, objective_scores) of (candidate, example) pairs. If True and a cache entry exists, GEPA will skip the fitness evaluation and use the cached results. This helps avoid redundant evaluations and saves metric calls. Defaults to False.
+
+     # Reproducibility
+     - seed: The seed to use for the random number generator.
+     - val_evaluation_policy: Strategy controlling which validation ids to score each iteration and which candidate is currently best. Supported strings: "full_eval" (evaluate every id each time). Passing None defaults to "full_eval".
+     - raise_on_exception: Whether to propagate proposer/evaluator exceptions instead of stopping gracefully.
+     """
+     active_adapter: GEPAAdapter[DataInst, Trajectory, RolloutOutput] | None = None
+     if adapter is None:
+         assert task_lm is not None, (
+             "Since no adapter is provided, GEPA requires a task LM to be provided. Please set the `task_lm` parameter."
+         )
+         active_adapter = cast(
+             GEPAAdapter[DataInst, Trajectory, RolloutOutput], DefaultAdapter(model=task_lm, evaluator=evaluator)
+         )
+     else:
+         assert task_lm is None, (
+             "Since an adapter is provided, GEPA does not require a task LM to be provided. Please set the `task_lm` parameter to None."
+         )
+         assert evaluator is None, (
+             "Since an adapter is provided, GEPA does not require an evaluator to be provided. Please set the `evaluator` parameter to None."
+         )
+         active_adapter = adapter
+
+     # Normalize datasets to DataLoader instances
+     train_loader = ensure_loader(trainset)
+     val_loader = ensure_loader(valset) if valset is not None else train_loader
+
+     # Comprehensive stop_callback logic
+     # Convert stop_callbacks to a list if it's not already
+     stop_callbacks_list: list[StopperProtocol] = []
+     if stop_callbacks is not None:
+         if isinstance(stop_callbacks, Sequence):
+             stop_callbacks_list.extend(stop_callbacks)
+         else:
+             stop_callbacks_list.append(stop_callbacks)
+
+     # Add file stopper if run_dir is provided
+     if run_dir is not None:
+         stop_file_path = os.path.join(run_dir, "gepa.stop")
+         file_stopper = FileStopper(stop_file_path)
+         stop_callbacks_list.append(file_stopper)
+
+     # Add max_metric_calls stopper if provided
+     if max_metric_calls is not None:
+         from mantisdk.algorithm.gepa.lib.utils import MaxMetricCallsStopper
+
+         max_calls_stopper = MaxMetricCallsStopper(max_metric_calls)
+         stop_callbacks_list.append(max_calls_stopper)
+
+     # Assert that at least one stopping condition is provided
+     if not stop_callbacks_list:
+         raise ValueError(
+             "The user must provide at least one of stop_callbacks or max_metric_calls to specify a stopping condition."
+         )
+
+     # Create composite stopper if multiple stoppers, or use single stopper
+     stop_callback: StopperProtocol
+     if len(stop_callbacks_list) == 1:
+         stop_callback = stop_callbacks_list[0]
+     else:
+         from mantisdk.algorithm.gepa.lib.utils import CompositeStopper
+
+         stop_callback = CompositeStopper(*stop_callbacks_list)
+
+     if not hasattr(active_adapter, "propose_new_texts"):
+         assert reflection_lm is not None, (
+             f"reflection_lm was not provided. The adapter used '{active_adapter!s}' does not provide a propose_new_texts method, "
+             + "and hence, GEPA will use the default proposer, which requires a reflection_lm to be specified."
+         )
+
+     if isinstance(reflection_lm, str):
+         import litellm
+
+         reflection_lm_name = reflection_lm
+
+         def _reflection_lm(prompt: str) -> str:
+             completion = litellm.completion(model=reflection_lm_name, messages=[{"role": "user", "content": prompt}])
+             return completion.choices[0].message.content  # type: ignore
+
+         reflection_lm = _reflection_lm
+
+     if logger is None:
+         logger = StdOutLogger()
+
+     rng = random.Random(seed)
+
+     candidate_selector: CandidateSelector
+     if isinstance(candidate_selection_strategy, str):
+         factories = {
+             "pareto": lambda: ParetoCandidateSelector(rng=rng),
+             "current_best": lambda: CurrentBestCandidateSelector(),
+             "epsilon_greedy": lambda: EpsilonGreedyCandidateSelector(epsilon=0.1, rng=rng),
+         }
+
+         try:
+             candidate_selector = factories[candidate_selection_strategy]()
+         except KeyError as exc:
+             raise ValueError(
+                 f"Unknown candidate_selector strategy: {candidate_selection_strategy}. "
+                 "Supported strategies: 'pareto', 'current_best', 'epsilon_greedy'"
+             ) from exc
+     elif isinstance(candidate_selection_strategy, CandidateSelector):
+         candidate_selector = candidate_selection_strategy
+     else:
+         raise TypeError(
+             "candidate_selection_strategy must be a supported string strategy or an instance of CandidateSelector."
+         )
+
+     if val_evaluation_policy is None or val_evaluation_policy == "full_eval":
+         val_evaluation_policy = FullEvaluationPolicy()
+     elif not isinstance(val_evaluation_policy, EvaluationPolicy):
+         raise ValueError(
+             f"val_evaluation_policy should be one of 'full_eval' or an instance of EvaluationPolicy, but got {type(val_evaluation_policy)}"
+         )
+
+     if isinstance(module_selector, str):
+         module_selector_cls = {
+             "round_robin": RoundRobinReflectionComponentSelector,
+             "all": AllReflectionComponentSelector,
+         }.get(module_selector)
+
+         assert module_selector_cls is not None, (
+             f"Unknown module_selector strategy: {module_selector}. Supported strategies: 'round_robin', 'all'"
+         )
+
+         module_selector_instance: ReflectionComponentSelector = module_selector_cls()
+     else:
+         module_selector_instance = module_selector
+
+     if batch_sampler == "epoch_shuffled":
+         batch_sampler = EpochShuffledBatchSampler(minibatch_size=reflection_minibatch_size or 3, rng=rng)
+     else:
+         assert reflection_minibatch_size is None, (
+             "reflection_minibatch_size only accepted if batch_sampler is 'epoch_shuffled'"
+         )
+
+     experiment_tracker = create_experiment_tracker(
+         use_wandb=use_wandb,
+         wandb_api_key=wandb_api_key,
+         wandb_init_kwargs=wandb_init_kwargs,
+         use_mlflow=use_mlflow,
+         mlflow_tracking_uri=mlflow_tracking_uri,
+         mlflow_experiment_name=mlflow_experiment_name,
+     )
+
+     if reflection_prompt_template is not None:
+         assert not (adapter is not None and getattr(adapter, "propose_new_texts", None) is not None), (
+             f"Adapter {adapter!s} provides its own propose_new_texts method; reflection_prompt_template will be ignored. "
+             "Set reflection_prompt_template to None."
+         )
+
+     # Create evaluation cache if enabled
+     evaluation_cache: EvaluationCache[RolloutOutput, DataId] | None = None
+     if cache_evaluation:
+         evaluation_cache = EvaluationCache[RolloutOutput, DataId]()
+
+     reflective_proposer = ReflectiveMutationProposer(
+         logger=logger,
+         trainset=train_loader,
+         adapter=active_adapter,
+         candidate_selector=candidate_selector,
+         module_selector=module_selector_instance,
+         batch_sampler=batch_sampler,
+         perfect_score=perfect_score,
+         skip_perfect_score=skip_perfect_score,
+         experiment_tracker=experiment_tracker,
+         reflection_lm=reflection_lm,
+         reflection_prompt_template=reflection_prompt_template,
+     )
+
+     def evaluator_fn(
+         inputs: list[DataInst], prog: dict[str, str]
+     ) -> tuple[list[RolloutOutput], list[float], Sequence[dict[str, float]] | None]:
+         eval_out = active_adapter.evaluate(inputs, prog, capture_traces=False)
+         return eval_out.outputs, eval_out.scores, eval_out.objective_scores
+
+     merge_proposer: MergeProposer | None = None
+     if use_merge:
+         merge_proposer = MergeProposer(
+             logger=logger,
+             valset=val_loader,
+             evaluator=evaluator_fn,
+             use_merge=use_merge,
+             max_merge_invocations=max_merge_invocations,
+             rng=rng,
+             val_overlap_floor=merge_val_overlap_floor,
+         )
+
+     engine = GEPAEngine(
+         adapter=active_adapter,
+         run_dir=run_dir,
+         valset=val_loader,
+         seed_candidate=seed_candidate,
+         perfect_score=perfect_score,
+         seed=seed,
+         reflective_proposer=reflective_proposer,
+         merge_proposer=merge_proposer,
+         frontier_type=frontier_type,
+         logger=logger,
+         experiment_tracker=experiment_tracker,
+         track_best_outputs=track_best_outputs,
+         display_progress_bar=display_progress_bar,
+         raise_on_exception=raise_on_exception,
+         stop_callback=stop_callback,
+         val_evaluation_policy=val_evaluation_policy,
+         use_cloudpickle=use_cloudpickle,
+         evaluation_cache=evaluation_cache,
+     )
+
+     with experiment_tracker:
+         state = engine.run()
+
+     return GEPAResult.from_state(state, run_dir=run_dir, seed=seed)
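The call surface above reduces to: provide a seed candidate, data, something that can run the task (an adapter or `task_lm`), a `reflection_lm`, and at least one stopping condition. A minimal sketch of a call through the default adapter follows; the import path mirrors `mantisdk/algorithm/gepa/lib/api.py`, while the model names, the `instruction_prompt` component name, the example dict layout, and the `best_candidate` accessor on the result are illustrative assumptions rather than guarantees of this package.

    from mantisdk.algorithm.gepa.lib.api import optimize

    # Toy examples; whether DefaultAdapter expects this exact dict shape is an assumption.
    trainset = [
        {"input": "What is 2 + 2?", "answer": "4"},
        {"input": "What is the capital of France?", "answer": "Paris"},
    ]

    result = optimize(
        seed_candidate={"instruction_prompt": "Answer the question concisely."},
        trainset=trainset,
        valset=trainset,                # reuse the trainset here for brevity
        task_lm="openai/gpt-4o-mini",   # used only because no adapter is passed
        reflection_lm="openai/gpt-4o",  # drives the default instruction proposer
        max_metric_calls=100,           # stopping condition (or pass stop_callbacks)
        run_dir="./gepa_runs/demo",     # enables resume and "gepa.stop" file stopping
    )

    print(result.best_candidate)        # assumed GEPAResult accessor for the evolved texts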
mantisdk/algorithm/gepa/lib/core/__init__.py (file without changes)
mantisdk/algorithm/gepa/lib/core/adapter.py
@@ -0,0 +1,180 @@
+ # Copyright (c) 2025 Lakshya A Agrawal and the GEPA contributors
+ # https://github.com/gepa-ai/gepa
+
+ from collections.abc import Mapping, Sequence
+ from dataclasses import dataclass
+ from typing import Any, Generic, Protocol, TypeVar
+
+ # Generic type aliases matching your original
+ RolloutOutput = TypeVar("RolloutOutput")
+ Trajectory = TypeVar("Trajectory")
+ DataInst = TypeVar("DataInst")
+ Candidate = dict[str, str]
+
+
+ @dataclass
+ class EvaluationBatch(Generic[Trajectory, RolloutOutput]):
+     """
+     Container for the result of evaluating a proposed candidate on a batch of data.
+
+     - outputs: raw per-example outputs from executing the candidate. GEPA does not interpret these;
+       they are forwarded to other parts of the user's code or logging as-is.
+     - scores: per-example numeric scores (floats). GEPA sums these for minibatch acceptance
+       and averages them over the full validation set for tracking/pareto fronts.
+     - trajectories: optional per-example traces used by make_reflective_dataset to build
+       a reflective dataset (See `GEPAAdapter.make_reflective_dataset`). If capture_traces=True is passed to `evaluate`, trajectories
+       should be provided and align one-to-one with `outputs` and `scores`.
+     - objective_scores: optional per-example maps of objective name -> score. Leave None when
+       the evaluator does not expose multi-objective metrics.
+     """
+
+     outputs: list[RolloutOutput]
+     scores: list[float]
+     trajectories: list[Trajectory] | None = None
+     objective_scores: list[dict[str, float]] | None = None
+
+
+ class ProposalFn(Protocol):
+     def __call__(
+         self,
+         candidate: dict[str, str],
+         reflective_dataset: Mapping[str, Sequence[Mapping[str, Any]]],
+         components_to_update: list[str],
+     ) -> dict[str, str]:
+         """
+         - Given the current `candidate`, a reflective dataset (as returned by
+           `GEPAAdapter.make_reflective_dataset`), and a list of component names to update,
+           return a mapping component_name -> new component text (str). This allows the user
+           to implement their own instruction proposal logic. For example, the user can use
+           a different LLM, implement DSPy signatures, etc. Another example can be situations
+           where 2 or more components need to be updated together (coupled updates).
+
+         Returns
+         - Dict[str, str] mapping component names to newly proposed component texts.
+         """
+         ...
+
+
+ class GEPAAdapter(Protocol[DataInst, Trajectory, RolloutOutput]):
+     """
+     GEPAAdapter is the single integration point between your system
+     and the GEPA optimization engine. Implementers provide three responsibilities:
+
+     The following are user-defined types that are not interpreted by GEPA but are used by the user's code
+     to define the adapter:
+         DataInst: User-defined type of input data to the program under optimization.
+         Trajectory: User-defined type of trajectory data, which typically captures the
+             different steps of the program candidate execution.
+         RolloutOutput: User-defined type of output data from the program candidate.
+
+     The following are the responsibilities of the adapter:
+     1) Program construction and evaluation (evaluate):
+        Given a batch of DataInst and a "candidate" program (mapping from named components
+        -> component text), execute the program to produce per-example scores and
+        optionally rich trajectories (capturing intermediate states) needed for reflection.
+
+     2) Reflective dataset construction (make_reflective_dataset):
+        Given the candidate, EvaluationBatch (trajectories, outputs, scores), and the list of components to update,
+        produce a small JSON-serializable dataset for each component that you want to update. This
+        dataset is fed to the teacher LM to propose improved component text.
+
+     3) Optional instruction proposal (propose_new_texts):
+        GEPA provides a default implementation (instruction_proposal.py) that serializes the reflective dataset
+        to propose new component texts. However, users can implement their own proposal logic by implementing this method.
+        This method receives the current candidate, the reflective dataset, and the list of components to update,
+        and returns a mapping from component name to new component text.
+
+     Key concepts and contracts:
+     - candidate: Dict[str, str] mapping a named component of the system to its corresponding text.
+     - scores: higher is better. GEPA uses:
+         - minibatch: sum(scores) to compare old vs. new candidate (acceptance test),
+         - full valset: mean(scores) for tracking and Pareto-front selection.
+       Ensure your metric is calibrated accordingly or normalized to a consistent scale.
+     - trajectories: opaque to GEPA (the engine never inspects them). They must be
+       consumable by your own make_reflective_dataset implementation to extract the
+       minimal context needed to produce meaningful feedback for every component of
+       the system under optimization.
+     - error handling: Never raise for individual example failures. Instead:
+         - Return a valid `EvaluationBatch` with per-example failure scores (e.g., 0.0)
+           when formatting/parsing fails. Even better if the trajectories are also populated
+           with the failed example, including the error message, identifying the reason for the failure.
+         - Reserve exceptions for unrecoverable, systemic failures (e.g., missing model,
+           misconfigured program, schema mismatch).
+         - If an exception is raised, the engine will log the error and proceed to the next iteration.
+     """
+
+     def evaluate(
+         self,
+         batch: list[DataInst],
+         candidate: dict[str, str],
+         capture_traces: bool = False,
+     ) -> EvaluationBatch[Trajectory, RolloutOutput]:
+         """
+         Run the program defined by `candidate` on a batch of data.
+
+         Parameters
+         - batch: list of task-specific inputs (DataInst).
+         - candidate: mapping from component name -> component text. You must instantiate
+           your full system with the component text for each component, and execute it on the batch.
+         - capture_traces: when True, you must populate `EvaluationBatch.trajectories`
+           with a per-example trajectory object that your `make_reflective_dataset` can
+           later consume. When False, you may set trajectories=None to save time/memory.
+           capture_traces=True is used by the reflective mutation proposer to build a reflective dataset.
+
+         Returns
+         - EvaluationBatch with:
+             - outputs: raw per-example outputs (opaque to GEPA).
+             - scores: per-example floats, length == len(batch). Higher is better.
+             - trajectories:
+                 - if capture_traces=True: list[Trajectory] with length == len(batch).
+                 - if capture_traces=False: None.
+
+         Scoring semantics
+         - The engine uses sum(scores) on minibatches to decide whether to accept a
+           candidate mutation and average(scores) over the full valset for tracking.
+         - Prefer to return per-example scores that can be aggregated via summation.
+         - If an example fails (e.g., parse error), use a fallback score (e.g., 0.0).
+
+         Correctness constraints
+         - len(outputs) == len(scores) == len(batch)
+         - If capture_traces=True: trajectories must be provided and len(trajectories) == len(batch)
+         - Do not mutate `batch` or `candidate` in-place. Construct a fresh program
+           instance or deep-copy as needed.
+         """
+         ...
+
+     def make_reflective_dataset(
+         self,
+         candidate: dict[str, str],
+         eval_batch: EvaluationBatch[Trajectory, RolloutOutput],
+         components_to_update: list[str],
+     ) -> Mapping[str, Sequence[Mapping[str, Any]]]:
+         """
+         Build a small, JSON-serializable dataset (per component) to drive instruction
+         refinement by a teacher LLM.
+
+         Parameters
+         - candidate: the same candidate evaluated in evaluate().
+         - eval_batch: The result of evaluate(..., capture_traces=True) on
+           the same batch. You should extract everything you need from eval_batch.trajectories
+           (and optionally outputs/scores) to assemble concise, high-signal examples.
+         - components_to_update: subset of component names for which the proposer has
+           requested updates. In each iteration, GEPA identifies a subset of components to update.
+
+         Returns
+         - A dict: component_name -> list of dict records (the "reflective dataset").
+           Each record should be JSON-serializable and is passed verbatim to the
+           instruction proposal prompt. A recommended schema is:
+             {
+                 "Inputs": Dict[str, str],  # Minimal, clean view of the inputs to the component
+                 "Generated Outputs": Dict[str, str] | str,  # Model outputs or raw text
+                 "Feedback": str  # Feedback on the component's performance, including correct answer, error messages, etc.
+             }
+           You may include additional keys (e.g., "score", "rationale", "trace_id") if useful.
+
+         Determinism
+         - If you subsample trace instances, use a seeded RNG to keep runs reproducible.
+         """
+         ...
+
+     propose_new_texts: ProposalFn | None = None
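As a concrete reading of this contract, the sketch below implements the two required methods for a hypothetical single-component system whose candidate holds only a "system_prompt". The `call_model` callable, the question/answer dict shape of each example, and the substring-match scoring are assumptions made for illustration; `propose_new_texts` is left unset so GEPA falls back to its default proposer.

    from dataclasses import dataclass

    from mantisdk.algorithm.gepa.lib.core.adapter import EvaluationBatch

    @dataclass
    class SimpleTrajectory:
        question: str
        model_output: str
        expected: str

    class SingleSystemPromptAdapter:
        """Schematic adapter for a one-prompt system; satisfies GEPAAdapter structurally."""

        def __init__(self, call_model):
            # call_model: user-supplied callable (system_prompt, question) -> model answer text
            self.call_model = call_model

        def evaluate(self, batch, candidate, capture_traces=False):
            outputs, scores = [], []
            trajectories = [] if capture_traces else None
            for example in batch:
                answer = self.call_model(candidate["system_prompt"], example["question"])
                # Fallback score of 0.0 on a miss; never raise for a single bad example.
                score = 1.0 if example["answer"].strip().lower() in answer.lower() else 0.0
                outputs.append(answer)
                scores.append(score)
                if capture_traces:
                    trajectories.append(SimpleTrajectory(example["question"], answer, example["answer"]))
            return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories)

        def make_reflective_dataset(self, candidate, eval_batch, components_to_update):
            records = [
                {
                    "Inputs": {"question": t.question},
                    "Generated Outputs": t.model_output,
                    "Feedback": f"Expected answer: {t.expected}",
                }
                for t in (eval_batch.trajectories or [])
            ]
            # The same records serve every requested component; here there is only "system_prompt".
            return {name: records for name in components_to_update}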
mantisdk/algorithm/gepa/lib/core/data_loader.py
@@ -0,0 +1,74 @@
+ """Data loader protocols and concrete helpers."""
+
+ from __future__ import annotations
+
+ from typing import Any, Hashable, Protocol, Sequence, TypeVar, cast, runtime_checkable
+
+ from mantisdk.algorithm.gepa.lib.core.adapter import DataInst
+
+
+ class ComparableHashable(Hashable, Protocol):
+     """Protocol requiring hashing and rich comparison support."""
+
+     def __lt__(self, other: Any, /) -> bool: ...
+
+     def __gt__(self, other: Any, /) -> bool: ...
+
+     def __le__(self, other: Any, /) -> bool: ...
+
+     def __ge__(self, other: Any, /) -> bool: ...
+
+
+ DataId = TypeVar("DataId", bound=ComparableHashable)
+ """ Generic for the identifier for data examples """
+
+
+ @runtime_checkable
+ class DataLoader(Protocol[DataId, DataInst]):
+     """Minimal interface for retrieving validation examples keyed by opaque ids."""
+
+     def all_ids(self) -> Sequence[DataId]:
+         """Return the ordered universe of ids currently available. This may change over time."""
+         ...
+
+     def fetch(self, ids: Sequence[DataId]) -> list[DataInst]:
+         """Materialise the payloads corresponding to `ids`, preserving order."""
+         ...
+
+     def __len__(self) -> int:
+         """Return current number of items in the loader."""
+         ...
+
+
+ class MutableDataLoader(DataLoader[DataId, DataInst], Protocol):
+     """A data loader that can be mutated."""
+
+     def add_items(self, items: list[DataInst]) -> None:
+         """Add items to the loader."""
+
+
+ class ListDataLoader(MutableDataLoader[int, DataInst]):
+     """In-memory reference implementation backed by a list."""
+
+     def __init__(self, items: Sequence[DataInst]):
+         self.items = list(items)
+
+     def all_ids(self) -> Sequence[int]:
+         return list(range(len(self.items)))
+
+     def fetch(self, ids: Sequence[int]) -> list[DataInst]:
+         return [self.items[data_id] for data_id in ids]
+
+     def __len__(self) -> int:
+         return len(self.items)
+
+     def add_items(self, items: Sequence[DataInst]) -> None:
+         self.items.extend(items)
+
+
+ def ensure_loader(data_or_loader: Sequence[DataInst] | DataLoader[DataId, DataInst]) -> DataLoader[DataId, DataInst]:
+     if isinstance(data_or_loader, DataLoader):
+         return data_or_loader
+     if isinstance(data_or_loader, Sequence):
+         return cast(DataLoader[DataId, DataInst], ListDataLoader(data_or_loader))
+     raise TypeError(f"Unable to cast to a DataLoader type: {type(data_or_loader)}")
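Because `DataLoader` is a `runtime_checkable` protocol, anything exposing `all_ids`, `fetch`, and `__len__` can be passed in directly, and plain sequences are wrapped by `ensure_loader`. A brief sketch, with a hypothetical JSONL-backed loader as the custom case:

    import json

    from mantisdk.algorithm.gepa.lib.core.data_loader import ensure_loader

    # Plain lists are wrapped automatically; ids are simply list indices.
    loader = ensure_loader([{"question": "2 + 2?", "answer": "4"}])
    print(loader.all_ids())   # [0]
    print(loader.fetch([0]))  # [{'question': '2 + 2?', 'answer': '4'}]

    class JsonlDataLoader:
        """Hypothetical loader that keys examples by line number in a JSONL file."""

        def __init__(self, path: str):
            with open(path) as f:
                self._rows = [json.loads(line) for line in f if line.strip()]

        def all_ids(self):
            return list(range(len(self._rows)))

        def fetch(self, ids):
            return [self._rows[i] for i in ids]

        def __len__(self):
            return len(self._rows)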