mantisdk-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mantisdk might be problematic.

Files changed (190)
  1. mantisdk/__init__.py +22 -0
  2. mantisdk/adapter/__init__.py +15 -0
  3. mantisdk/adapter/base.py +94 -0
  4. mantisdk/adapter/messages.py +270 -0
  5. mantisdk/adapter/triplet.py +1028 -0
  6. mantisdk/algorithm/__init__.py +39 -0
  7. mantisdk/algorithm/apo/__init__.py +5 -0
  8. mantisdk/algorithm/apo/apo.py +889 -0
  9. mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
  10. mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
  11. mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
  12. mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
  13. mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
  14. mantisdk/algorithm/base.py +162 -0
  15. mantisdk/algorithm/decorator.py +264 -0
  16. mantisdk/algorithm/fast.py +250 -0
  17. mantisdk/algorithm/gepa/__init__.py +59 -0
  18. mantisdk/algorithm/gepa/adapter.py +459 -0
  19. mantisdk/algorithm/gepa/gepa.py +364 -0
  20. mantisdk/algorithm/gepa/lib/__init__.py +18 -0
  21. mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
  22. mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
  23. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
  24. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
  25. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
  26. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
  27. mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
  28. mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
  29. mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
  30. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
  31. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
  32. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
  33. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
  34. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
  35. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
  36. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
  37. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
  38. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
  39. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
  40. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
  41. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
  42. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
  43. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
  44. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
  45. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
  46. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
  47. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
  48. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
  49. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
  50. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
  51. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
  52. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
  53. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
  54. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
  55. mantisdk/algorithm/gepa/lib/api.py +375 -0
  56. mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
  57. mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
  58. mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
  59. mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
  60. mantisdk/algorithm/gepa/lib/core/result.py +233 -0
  61. mantisdk/algorithm/gepa/lib/core/state.py +636 -0
  62. mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
  63. mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
  64. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
  65. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
  66. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
  67. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
  68. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
  69. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
  70. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
  71. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
  72. mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
  73. mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
  74. mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
  75. mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
  76. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
  77. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
  78. mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
  79. mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
  80. mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
  81. mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
  82. mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
  83. mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
  84. mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
  85. mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
  86. mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
  87. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
  88. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
  89. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
  90. mantisdk/algorithm/gepa/lib/py.typed +0 -0
  91. mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
  92. mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
  93. mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
  94. mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
  95. mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
  96. mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
  97. mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
  98. mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
  99. mantisdk/algorithm/gepa/tracing.py +105 -0
  100. mantisdk/algorithm/utils.py +177 -0
  101. mantisdk/algorithm/verl/__init__.py +5 -0
  102. mantisdk/algorithm/verl/interface.py +202 -0
  103. mantisdk/cli/__init__.py +56 -0
  104. mantisdk/cli/prometheus.py +115 -0
  105. mantisdk/cli/store.py +131 -0
  106. mantisdk/cli/vllm.py +29 -0
  107. mantisdk/client.py +408 -0
  108. mantisdk/config.py +348 -0
  109. mantisdk/emitter/__init__.py +43 -0
  110. mantisdk/emitter/annotation.py +370 -0
  111. mantisdk/emitter/exception.py +54 -0
  112. mantisdk/emitter/message.py +61 -0
  113. mantisdk/emitter/object.py +117 -0
  114. mantisdk/emitter/reward.py +320 -0
  115. mantisdk/env_var.py +156 -0
  116. mantisdk/execution/__init__.py +15 -0
  117. mantisdk/execution/base.py +64 -0
  118. mantisdk/execution/client_server.py +443 -0
  119. mantisdk/execution/events.py +69 -0
  120. mantisdk/execution/inter_process.py +16 -0
  121. mantisdk/execution/shared_memory.py +282 -0
  122. mantisdk/instrumentation/__init__.py +119 -0
  123. mantisdk/instrumentation/agentops.py +314 -0
  124. mantisdk/instrumentation/agentops_langchain.py +45 -0
  125. mantisdk/instrumentation/litellm.py +83 -0
  126. mantisdk/instrumentation/vllm.py +81 -0
  127. mantisdk/instrumentation/weave.py +500 -0
  128. mantisdk/litagent/__init__.py +11 -0
  129. mantisdk/litagent/decorator.py +536 -0
  130. mantisdk/litagent/litagent.py +252 -0
  131. mantisdk/llm_proxy.py +1890 -0
  132. mantisdk/logging.py +370 -0
  133. mantisdk/reward.py +7 -0
  134. mantisdk/runner/__init__.py +11 -0
  135. mantisdk/runner/agent.py +845 -0
  136. mantisdk/runner/base.py +182 -0
  137. mantisdk/runner/legacy.py +309 -0
  138. mantisdk/semconv.py +170 -0
  139. mantisdk/server.py +401 -0
  140. mantisdk/store/__init__.py +23 -0
  141. mantisdk/store/base.py +897 -0
  142. mantisdk/store/client_server.py +2092 -0
  143. mantisdk/store/collection/__init__.py +30 -0
  144. mantisdk/store/collection/base.py +587 -0
  145. mantisdk/store/collection/memory.py +970 -0
  146. mantisdk/store/collection/mongo.py +1412 -0
  147. mantisdk/store/collection_based.py +1823 -0
  148. mantisdk/store/insight.py +648 -0
  149. mantisdk/store/listener.py +58 -0
  150. mantisdk/store/memory.py +396 -0
  151. mantisdk/store/mongo.py +165 -0
  152. mantisdk/store/sqlite.py +3 -0
  153. mantisdk/store/threading.py +357 -0
  154. mantisdk/store/utils.py +142 -0
  155. mantisdk/tracer/__init__.py +16 -0
  156. mantisdk/tracer/agentops.py +242 -0
  157. mantisdk/tracer/base.py +287 -0
  158. mantisdk/tracer/dummy.py +106 -0
  159. mantisdk/tracer/otel.py +555 -0
  160. mantisdk/tracer/weave.py +677 -0
  161. mantisdk/trainer/__init__.py +6 -0
  162. mantisdk/trainer/init_utils.py +263 -0
  163. mantisdk/trainer/legacy.py +367 -0
  164. mantisdk/trainer/registry.py +12 -0
  165. mantisdk/trainer/trainer.py +618 -0
  166. mantisdk/types/__init__.py +6 -0
  167. mantisdk/types/core.py +553 -0
  168. mantisdk/types/resources.py +204 -0
  169. mantisdk/types/tracer.py +515 -0
  170. mantisdk/types/tracing.py +218 -0
  171. mantisdk/utils/__init__.py +1 -0
  172. mantisdk/utils/id.py +18 -0
  173. mantisdk/utils/metrics.py +1025 -0
  174. mantisdk/utils/otel.py +578 -0
  175. mantisdk/utils/otlp.py +536 -0
  176. mantisdk/utils/server_launcher.py +1045 -0
  177. mantisdk/utils/system_snapshot.py +81 -0
  178. mantisdk/verl/__init__.py +8 -0
  179. mantisdk/verl/__main__.py +6 -0
  180. mantisdk/verl/async_server.py +46 -0
  181. mantisdk/verl/config.yaml +27 -0
  182. mantisdk/verl/daemon.py +1154 -0
  183. mantisdk/verl/dataset.py +44 -0
  184. mantisdk/verl/entrypoint.py +248 -0
  185. mantisdk/verl/trainer.py +549 -0
  186. mantisdk-0.1.0.dist-info/METADATA +119 -0
  187. mantisdk-0.1.0.dist-info/RECORD +190 -0
  188. mantisdk-0.1.0.dist-info/WHEEL +4 -0
  189. mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
  190. mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
mantisdk/verl/trainer.py
@@ -0,0 +1,549 @@
+ # Copyright (c) Microsoft. All rights reserved.
+
+ # type: ignore
+
+ from __future__ import annotations
+
+ import random
+ from contextlib import contextmanager
+ from copy import deepcopy
+ from pprint import pprint
+ from typing import Any, Dict, Tuple, Type
+
+ import numpy as np
+ import torch
+ import verl
+ from codetiming import Timer
+ from omegaconf import OmegaConf
+ from tqdm import tqdm
+ from verl import DataProto
+ from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
+ from verl.trainer.ppo.core_algos import agg_loss
+ from verl.trainer.ppo.metric_utils import (
+     _compute_response_info,
+     compute_throughout_metrics,
+     compute_timing_metrics,
+ )
+ from verl.trainer.ppo.ray_trainer import (
+     AdvantageEstimator,
+     RayPPOTrainer,
+     apply_kl_penalty,
+     compute_advantage,
+     compute_response_mask,
+ )
+ from verl.utils.metric import reduce_metrics
+ from verl.utils.tracking import Tracking
+
+ from mantisdk.adapter import TraceAdapter, TraceToTripletBase
+ from mantisdk.llm_proxy import LLMProxy
+ from mantisdk.store.base import LightningStore
+
+ from .daemon import AgentModeDaemon
+
+ __all__ = [
+     "MantisdkTrainer",
+ ]
+
+
+ @contextmanager
+ def _timer(name: str, timing_raw: Dict[str, float]):
+     with Timer(name=name, logger=None) as timer:
+         yield
+     if name not in timing_raw:
+         timing_raw[name] = 0
+     timing_raw[name] += timer.last
+
+
+ # This function is adapted from verl.
+ # We introduce a new parameter `suffix` to distinguish between metrics computed
+ # before and after Mantisdk's post-processing.
+ # - "Before" refers to raw reward and advantage values.
+ # - "After" refers to values computed following post-processing, which involves:
+ #   (1) Dropping prompts that exceed the maximum allowed length.
+ #   (2) Adjusting the batch size to be a multiple of the mini PPO size.
+ # Different suffixes are used to label these two stages accordingly.
+ def compute_data_metrics(batch: DataProto, use_critic: bool = True, suffix: str = "") -> Dict[str, Any]:
+     """
+     Computes various metrics from a batch of data for PPO training.
+
+     This function calculates metrics related to scores, rewards, advantages, returns, values,
+     and sequence lengths from a batch of data. It provides statistical information (mean, max, min)
+     for each metric category.
+
+     Args:
+         batch: A DataProto object containing batch data with token-level scores, rewards, advantages, etc.
+         use_critic: Whether to include critic-specific metrics. Defaults to True.
+
+     Returns:
+         A dictionary of metrics including:
+             - critic/score/mean, max, min: Statistics about sequence scores
+             - critic/rewards/mean, max, min: Statistics about sequence rewards
+             - critic/advantages/mean, max, min: Statistics about advantages
+             - critic/returns/mean, max, min: Statistics about returns
+             - critic/values/mean, max, min: Statistics about critic values (if use_critic=True)
+             - critic/vf_explained_var: Explained variance of the value function (if use_critic=True)
+             - response_length/mean, max, min, clip_ratio: Statistics about response lengths
+             - prompt_length/mean, max, min, clip_ratio: Statistics about prompt lengths
+     """
+     sequence_score = batch.batch["token_level_scores"].sum(-1)
+     sequence_reward = batch.batch["token_level_rewards"].sum(-1)
+
+     advantages = batch.batch["advantages"]
+     returns = batch.batch["returns"]
+
+     max_response_length = batch.batch["responses"].shape[-1]
+
+     prompt_mask = batch.batch["attention_mask"][:, :-max_response_length].bool()
+     response_mask = batch.batch["attention_mask"][:, -max_response_length:].bool()
+
+     max_prompt_length = prompt_mask.size(-1)
+
+     response_info = _compute_response_info(batch)
+     prompt_length = response_info["prompt_length"]
+     response_length = response_info["response_length"]
+
+     valid_adv = torch.masked_select(advantages, response_mask)
+     valid_returns = torch.masked_select(returns, response_mask)
+
+     if use_critic:
+         values = batch.batch["values"]
+         valid_values = torch.masked_select(values, response_mask)
+         return_diff_var = torch.var(valid_returns - valid_values)
+         return_var = torch.var(valid_returns)
+
+     metrics = {
+         # score
+         "critic/score/mean" + suffix: torch.mean(sequence_score).detach().item(),
+         "critic/score/max" + suffix: torch.max(sequence_score).detach().item(),
+         "critic/score/min" + suffix: torch.min(sequence_score).detach().item(),
+         # reward
+         "critic/rewards/mean" + suffix: torch.mean(sequence_reward).detach().item(),
+         "critic/rewards/max" + suffix: torch.max(sequence_reward).detach().item(),
+         "critic/rewards/min" + suffix: torch.min(sequence_reward).detach().item(),
+         # adv
+         "critic/advantages/mean" + suffix: torch.mean(valid_adv).detach().item(),
+         "critic/advantages/max" + suffix: torch.max(valid_adv).detach().item(),
+         "critic/advantages/min" + suffix: torch.min(valid_adv).detach().item(),
+         # returns
+         "critic/returns/mean" + suffix: torch.mean(valid_returns).detach().item(),
+         "critic/returns/max" + suffix: torch.max(valid_returns).detach().item(),
+         "critic/returns/min" + suffix: torch.min(valid_returns).detach().item(),
+         **(
+             {
+                 # values
+                 "critic/values/mean" + suffix: torch.mean(valid_values).detach().item(),
+                 "critic/values/max" + suffix: torch.max(valid_values).detach().item(),
+                 "critic/values/min" + suffix: torch.min(valid_values).detach().item(),
+                 # vf explained var
+                 "critic/vf_explained_var" + suffix: (1.0 - return_diff_var / (return_var + 1e-5)).detach().item(),
+             }
+             if use_critic
+             else {}
+         ),
+         # response length
+         "response_length/mean" + suffix: torch.mean(response_length).detach().item(),
+         "response_length/max" + suffix: torch.max(response_length).detach().item(),
+         "response_length/min" + suffix: torch.min(response_length).detach().item(),
+         "response_length/clip_ratio"
+         + suffix: torch.mean(torch.eq(response_length, max_response_length).float()).detach().item(),
+         # prompt length
+         "prompt_length/mean" + suffix: torch.mean(prompt_length).detach().item(),
+         "prompt_length/max" + suffix: torch.max(prompt_length).detach().item(),
+         "prompt_length/min" + suffix: torch.min(prompt_length).detach().item(),
+         "prompt_length/clip_ratio"
+         + suffix: torch.mean(torch.eq(prompt_length, max_prompt_length).float()).detach().item(),
+     }
+     return metrics
+
+
+ class MantisdkTrainer(RayPPOTrainer):
+     """
+     Specialized PPO trainer for agent-based reinforcement learning.
+
+     This trainer is designed specifically for scenarios where the model interacts with
+     external environments, tools, or APIs through a MantisdkServer. It simplifies
+     the training loop by removing the complex conditional logic present in the original
+     RayPPOTrainer and focusing on the agent mode workflow.
+
+     Key differences from RayPPOTrainer:
+
+     1. Uses AgentModeDaemon for server communication
+     2. Simplified data flow without pop/union operations
+     3. Direct batch processing through agent daemon
+     4. Streamlined validation using agent_mode validation
+     """
+
+     def __init__(
+         self,
+         store: LightningStore | None,
+         llm_proxy: LLMProxy | None,
+         adapter: TraceAdapter | None,
+         daemon_cls: Type[AgentModeDaemon],
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.store = store
+         self.llm_proxy = llm_proxy
+         self.adapter = adapter
+         self.daemon_cls = daemon_cls
+
+     def _validate(self):
+         assert len(self.val_dataloader) == 1, "Please set val_batch_size to None for better throughput."
+
+         test_data = next(iter(self.val_dataloader))
+         test_batch = DataProto.from_single_dict(test_data)
+
+         self.async_rollout_manager.wake_up()
+         self.agent_mode_daemon.set_up_data_and_server(
+             test_batch.non_tensor_batch,
+             self.async_rollout_manager.server_addresses,
+             is_train=False,
+         )
+         self.agent_mode_daemon.run_until_all_finished()
+         test_metrics = self.agent_mode_daemon.get_test_metrics()
+         self.agent_mode_daemon.clear_data_and_server()
+         self.async_rollout_manager.sleep()
+         return test_metrics
+
+     def _compute_reference_log_prob(self, batch: DataProto) -> DataProto:
+         """Compute reference log probability using the correct worker based on LoRA configuration.
+
+         In verl 0.6.0+, when LoRA is detected (indicated by ref_in_actor=True),
+         the reference policy is computed by the actor rollout worker instead of a separate
+         ref policy worker. This method handles both scenarios by checking the ref_in_actor flag.
+         Note: verl sets ref_in_actor=True when it detects LoRA configuration (e.g., lora_rank > 0 or lora_adapter_path is set).
+
+         Args:
+             batch: The data batch to compute reference log probabilities for.
+
+         Returns:
+             DataProto with reference log probabilities added.
+
+         Raises:
+             RuntimeError: If the required worker is not available.
+         """
+         if getattr(self, "ref_in_actor", False):
+             actor_worker = getattr(self, "actor_rollout_wg", None)
+             if actor_worker is None:
+                 raise RuntimeError("actor_rollout_wg is required when ref_in_actor is True.")
+             return actor_worker.compute_ref_log_prob(batch)
+
+         ref_worker = getattr(self, "ref_policy_wg", None)
+         if ref_worker is None:
+             raise RuntimeError(
+                 "Reference policy worker was not initialized. "
+                 "Ensure `use_reference_policy` is enabled and the VERL config exposes the ref worker."
+             )
+         return ref_worker.compute_ref_log_prob(batch)
+
+     def _train_step(self, batch_dict: dict) -> dict:
+         # Isolate in a separate method to automatically recycle the variables before validation.
+         batch: DataProto = DataProto.from_single_dict(batch_dict)
+         metrics = {}
+         timing_raw = {}
+
+         with _timer("step", timing_raw):
+
+             # When agent mode is enabled, we read the batch as it is.
+             gen_batch = batch
+
+             # generate a batch
+             with _timer("gen", timing_raw):
+                 self.async_rollout_manager.wake_up()
+                 self.agent_mode_daemon.set_up_data_and_server(
+                     gen_batch.non_tensor_batch, self.async_rollout_manager.server_addresses
+                 )
+                 self.agent_mode_daemon.run_until_all_finished()
+                 batch, agent_metrics = self.agent_mode_daemon.get_train_data_batch(
+                     max_prompt_length=(
+                         self.config.mantisdk.trace_aggregator.trajectory_max_prompt_length
+                         if self.config.mantisdk.trace_aggregator.level.startswith("trajectory")
+                         else self.config.data.max_prompt_length
+                     ),
+                     max_response_length=(
+                         self.config.mantisdk.trace_aggregator.trajectory_max_response_length
+                         if self.config.mantisdk.trace_aggregator.level.startswith("trajectory")
+                         else self.config.data.max_response_length
+                     ),
+                     device=gen_batch.batch["fake_ids"].device,
+                     global_steps=self.global_steps,
+                 )
+                 metrics.update(agent_metrics)
+                 self.agent_mode_daemon.clear_data_and_server()
+                 self.async_rollout_manager.sleep()
+
+             if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+                 with _timer("gen_max", timing_raw):
+                     gen_baseline_batch = deepcopy(gen_batch)
+                     gen_baseline_batch.meta_info["do_sample"] = False
+                     gen_baseline_output = self.async_rollout_manager.generate_sequences(gen_baseline_batch)
+
+                     batch = batch.union(gen_baseline_output)
+                     reward_baseline_tensor = self.reward_fn(batch)
+                     reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1)
+
+                     batch.pop(batch_keys=list(gen_baseline_output.batch.keys()))
+
+                     batch.batch["reward_baselines"] = reward_baseline_tensor
+
+                     del gen_baseline_batch, gen_baseline_output
+
+             # uid is used for algorithm like GRPO, should be aligned to data id
+             batch.non_tensor_batch["uid"] = batch.non_tensor_batch["data_id_list"]
+
+             if "response_mask" not in batch.batch:
+                 batch.batch["response_mask"] = compute_response_mask(batch)
+
+             # compute global_valid tokens
+             batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
+
+             with _timer("reward", timing_raw):
+                 # compute reward model score
+                 if self.use_rm:
+                     reward_tensor = self.rm_wg.compute_rm_score(batch)
+                     batch = batch.union(reward_tensor)
+
+                 reward_extra_infos_dict = {}
+
+             # for agent mode, pad the lengths to calculate old log prob, ref, and values
+             batch, pad_size = pad_dataproto_to_divisor(batch, self.actor_rollout_wg.world_size)
+
+             # recompute old_log_probs
+             with _timer("old_log_prob", timing_raw):
+                 old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
+                 entropys = old_log_prob.batch["entropys"]
+                 response_masks = batch.batch["response_mask"]
+                 loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
+                 entropy_loss = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
+                 old_log_prob_metrics = {"actor/entropy_loss": entropy_loss.detach().item()}
+                 metrics.update(old_log_prob_metrics)
+                 old_log_prob.batch.pop("entropys")
+                 batch = batch.union(old_log_prob)
+
+             if self.use_reference_policy:
+                 # compute reference log_prob
+                 with _timer("ref", timing_raw):
+                     ref_log_prob = self._compute_reference_log_prob(batch)
+                     batch = batch.union(ref_log_prob)
+
+             # compute values
+             if self.use_critic:
+                 with _timer("values", timing_raw):
+                     values = self.critic_wg.compute_values(batch)
+                     batch = batch.union(values)
+
+             # for agent mode, unpad to calculate adv
+             # it is important, as adv should be based on the raw traces
+             batch = unpad_dataproto(batch, pad_size=pad_size)
+
+             with _timer("adv", timing_raw):
+                 # if agent_mode is enabled, there is already token_level_scores
+                 # token_level_scores is not needed to compute here
+
+                 # compute rewards. apply_kl_penalty if available
+                 if self.config.algorithm.use_kl_in_reward:
+                     batch, kl_metrics = apply_kl_penalty(
+                         batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
+                     )
+                     metrics.update(kl_metrics)
+                 else:
+                     batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
+
+                 # compute advantages, executed on the driver process
+
+                 norm_adv_by_std_in_grpo = self.config.algorithm.get(
+                     "norm_adv_by_std_in_grpo", True
+                 )  # GRPO adv normalization factor
+
+                 batch = compute_advantage(
+                     batch,
+                     adv_estimator=self.config.algorithm.adv_estimator,
+                     gamma=self.config.algorithm.gamma,
+                     lam=self.config.algorithm.lam,
+                     num_repeat=self.config.actor_rollout_ref.rollout.n,
+                     norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
+                     config=self.config.algorithm,
+                 )
+
+             # Calculate the metrics before processing. Refer to the comments of function `compute_data_metrics` for details.
+             metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic, suffix="_before_processing"))
+
+             # after advantages are assigned, we begin to drop (1) long prompt (2) floor to ppo minisize
+             keep_indices = (~batch.batch["is_drop_mask"]).nonzero(as_tuple=True)[0]
+             metrics["training/n_triplets_prompt_too_long"] = (
+                 batch.batch["is_drop_mask"].shape[0] - keep_indices.shape[0]
+             )
+             batch = batch[keep_indices]
+             # next, round to minibatch size
+             mini_batch_size = self.config.actor_rollout_ref.actor.ppo_mini_batch_size
+             n_transition = len(batch)
+             random_indices = list(range(n_transition))
+             random.shuffle(random_indices)
+             batch.reorder(torch.tensor(random_indices).type(torch.int32))
+             n_remained_transition = n_transition // mini_batch_size * mini_batch_size
+             batch = batch[list(range(n_remained_transition))]
+             metrics["training/n_triplets_dropped_remainder"] = n_transition - n_remained_transition
+
+             # Agent mode note: Change the order of balance batch;
+             # 1. first calculate advantage
+             # 2. then drop the samples (too long prompt & floor to ppo minisize)
+             # 3. balance
+             # balance the number of valid tokens on each dp rank.
+             # Note that this breaks the order of data inside the batch.
+             # Please take care when you implement group based adv computation such as GRPO and rloo
+             if self.config.trainer.balance_batch:
+                 self._balance_batch(batch, metrics=metrics)
+
+             # update critic
+             if self.use_critic:
+                 with _timer("update_critic", timing_raw):
+                     critic_output = self.critic_wg.update_critic(batch)
+                 critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
+                 metrics.update(critic_output_metrics)
+
+             # implement critic warmup
+             if self.config.trainer.critic_warmup <= self.global_steps:
+                 # update actor
+                 with _timer("update_actor", timing_raw):
+                     batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
+                     actor_output = self.actor_rollout_wg.update_actor(batch)
+                 actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
+                 metrics.update(actor_output_metrics)
+
+             # Log rollout generations if enabled
+             rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
+             if rollout_data_dir:
+                 with _timer("dump_rollout_generations", timing_raw):
+                     print(batch.batch.keys())
+                     inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
+                     outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
+                     scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                     self._dump_generations(
+                         inputs=inputs,
+                         outputs=outputs,
+                         scores=scores,
+                         reward_extra_infos_dict=reward_extra_infos_dict,
+                         dump_path=rollout_data_dir,
+                     )
+
+         # compute training metrics
+         metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic, suffix="_after_processing"))
+         metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
+         # TODO: implement actual tflpo and theoretical tflpo
+         n_gpus = self.resource_pool_manager.get_n_gpus()
+         metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
+
+         return metrics
+
+     def fit(self):
+         logger = Tracking(
+             project_name=self.config.trainer.project_name,
+             experiment_name=self.config.trainer.experiment_name,
+             default_backend=self.config.trainer.logger,
+             config=OmegaConf.to_container(self.config, resolve=True),
+         )
+
+         self.global_steps = 0
+
+         # load checkpoint before doing anything
+         self._load_checkpoint()
+
+         assert self.async_rollout_mode, "If agent mode is enabled, async server must be enabled"
+         if self.adapter is not None and not isinstance(self.adapter, TraceToTripletBase):
+             raise ValueError("Adapter must be a TraceToTripletBase for the current VERL implementation.")
+         verl_version = verl.__version__
+         if verl_version == "0.5.0":
+             # Note (Zhiyuan): To avoid further patch into vllm async server, using the same sentence to get the naming here.
+             # However, it is possible that verl updates the naming and causes incompatibility.
+             # Reference: https://github.com/volcengine/verl/blob/5b5e09d9cc20625e436d01f69d9cc739ff681c54/verl/workers/rollout/vllm_rollout/vllm_async_server.py#L217
+             model = "/".join(self.config.actor_rollout_ref.model.path.split("/")[-2:])
+         else:
+             # For other versions (e.g., 0.6.0), we use the full path to the model.
+             model = self.config.actor_rollout_ref.model.path
+         self.agent_mode_daemon = self.daemon_cls(
+             self.config.mantisdk.port,
+             self.config.actor_rollout_ref.rollout.n,
+             train_information={
+                 "model": model,
+                 "temperature": self.config.actor_rollout_ref.rollout.temperature,
+             },
+             tokenizer=self.tokenizer,
+             mini_batch_size=self.config.actor_rollout_ref.actor.ppo_mini_batch_size,
+             pad_token_id=self.tokenizer.pad_token_id,
+             mode="v1" if self.store is not None else "v0",
+             store=self.store,
+             llm_proxy=self.llm_proxy,
+             adapter=self.adapter,
+             processor=self.processor,  # For Qwen2-VL mrope position_ids
+             image_base_dir=getattr(self.config.data, "image_base_dir", None),
+             trace_aggregator=self.config.mantisdk.trace_aggregator,
+         )
+         self.agent_mode_daemon.start()
+
+         # perform validation before training
+         # currently, we only support validation using the reward_function.
+         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
+             val_metrics = self._validate()
+             assert val_metrics, f"{val_metrics=}"
+             pprint(f"Initial validation metrics: {val_metrics}")
+             logger.log(data=val_metrics, step=self.global_steps)
+             if self.config.trainer.get("val_only", False):
+                 return
+
+         # add tqdm
+         progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
+
+         # we start from step 1
+         self.global_steps += 1
+         last_val_metrics = None
+
+         for epoch in range(self.config.trainer.total_epochs):
+             for batch_dict in self.train_dataloader:
+                 metrics = {}
+                 timing_raw = {}
+                 is_last_step = self.global_steps >= self.total_training_steps
+
+                 # train step
+                 metrics = self._train_step(batch_dict)
+
+                 # validate
+                 if (
+                     self.val_reward_fn is not None
+                     and self.config.trainer.test_freq > 0
+                     and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
+                 ):
+                     with _timer("validate", timing_raw):
+                         val_metrics: dict = self._validate()
+                     if is_last_step:
+                         last_val_metrics = val_metrics
+                     metrics.update(val_metrics)
+
+                 if self.config.trainer.save_freq > 0 and (
+                     is_last_step or self.global_steps % self.config.trainer.save_freq == 0
+                 ):
+                     with _timer("save_checkpoint", timing_raw):
+                         self._save_checkpoint()
+
+                 # step metrics
+                 metrics.update(
+                     {
+                         "training/global_step": self.global_steps,
+                         "training/epoch": epoch,
+                     }
+                 )
+
+                 # TODO: make a canonical logger that supports various backend
+                 logger.log(data=metrics, step=self.global_steps)
+
+                 if is_last_step:
+                     pprint(f"Final validation metrics: {last_val_metrics}")
+                     progress_bar.close()
+
+                     # This exit logic is to ensure a robust CI.
+                     pprint("Flush the logger...")
+                     del logger  # Make sure the loggers are flushed and closed properly
+                     pprint(f"Training finished at step {self.global_steps}.")
+                     return
+
+                 progress_bar.update(1)
+                 self.global_steps += 1
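
The `_timer` helper near the top of this file is the only timing primitive the trainer uses: every `with _timer(name, timing_raw)` block adds its elapsed wall-clock time to `timing_raw[name]`, so phases that run several times per step accumulate into a single entry. A minimal standalone sketch of that accumulation pattern, reusing the helper exactly as defined above (requires only `codetiming`; the `time.sleep` is just a stand-in for real work):

```python
import time
from contextlib import contextmanager
from typing import Dict

from codetiming import Timer


@contextmanager
def _timer(name: str, timing_raw: Dict[str, float]):
    # Same pattern as in the trainer: accumulate per-phase wall time.
    with Timer(name=name, logger=None) as timer:
        yield
    if name not in timing_raw:
        timing_raw[name] = 0
    timing_raw[name] += timer.last


timing_raw: Dict[str, float] = {}
for _ in range(3):
    with _timer("gen", timing_raw):
        time.sleep(0.01)  # stand-in for rollout generation

print(timing_raw)  # e.g. {'gen': 0.03...}: three "gen" phases summed into one entry
```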
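
In `_train_step`, after advantages are assigned, the batch is shrunk in two stages: triplets whose prompts were too long (`is_drop_mask`) are removed, and the remainder is floored to a multiple of `ppo_mini_batch_size`; both counts are logged as `training/n_triplets_prompt_too_long` and `training/n_triplets_dropped_remainder`. A small worked example of that bookkeeping with made-up sizes (plain Python lists stand in for the real `DataProto`):

```python
import random

mini_batch_size = 8  # stands in for config.actor_rollout_ref.actor.ppo_mini_batch_size
is_drop_mask = [False] * 37 + [True] * 3  # 3 prompts exceeded the max length

# (1) drop triplets whose prompts were too long
keep_indices = [i for i, drop in enumerate(is_drop_mask) if not drop]
n_triplets_prompt_too_long = len(is_drop_mask) - len(keep_indices)  # 3

# (2) shuffle, then floor to a multiple of the PPO mini-batch size
n_transition = len(keep_indices)  # 37
random.shuffle(keep_indices)
n_remained_transition = n_transition // mini_batch_size * mini_batch_size  # 32
n_triplets_dropped_remainder = n_transition - n_remained_transition  # 5

print(n_triplets_prompt_too_long, n_remained_transition, n_triplets_dropped_remainder)
# 3 32 5
```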
mantisdk-0.1.0.dist-info/METADATA
@@ -0,0 +1,119 @@
+ Metadata-Version: 2.4
+ Name: mantisdk
+ Version: 0.1.0
+ Summary: Mantisdk - AI Agent Training and Evaluation Platform
+ Project-URL: Homepage, https://github.com/withmetis/mantis
+ Project-URL: Documentation, https://withmetis.github.io/mantis/mantisdk/
+ Project-URL: Repository, https://github.com/withmetis/mantis
+ Project-URL: Issues, https://github.com/withmetis/mantis/issues
+ Author-email: Metis Team <team@withmetis.ai>
+ License: MIT
+ License-File: LICENSE
+ Keywords: agents,ai,evaluation,llm,mantis,observability,reinforcement-learning,training
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.10
+ Requires-Dist: agentops>=0.4.13
+ Requires-Dist: aiohttp
+ Requires-Dist: aiologic
+ Requires-Dist: fastapi
+ Requires-Dist: flask
+ Requires-Dist: gepa>=0.0.24
+ Requires-Dist: gpustat
+ Requires-Dist: graphviz
+ Requires-Dist: gunicorn
+ Requires-Dist: litellm[proxy]>=1.74
+ Requires-Dist: openai
+ Requires-Dist: opentelemetry-api>=1.35
+ Requires-Dist: opentelemetry-exporter-otlp>=1.35
+ Requires-Dist: opentelemetry-sdk>=1.35
+ Requires-Dist: portpicker
+ Requires-Dist: psutil
+ Requires-Dist: pydantic>=2.11
+ Requires-Dist: rich
+ Requires-Dist: setproctitle
+ Requires-Dist: uvicorn
+ Requires-Dist: uvicorn-worker
+ Provides-Extra: apo
+ Requires-Dist: poml; extra == 'apo'
+ Provides-Extra: mongo
+ Requires-Dist: pymongo; extra == 'mongo'
+ Provides-Extra: verl
+ Requires-Dist: verl>=0.5.0; extra == 'verl'
+ Requires-Dist: vllm>=0.8.4; extra == 'verl'
+ Provides-Extra: weave
+ Requires-Dist: weave>=0.52.22; extra == 'weave'
+ Description-Content-Type: text/markdown
+
+ # Mantisdk
+
+ [![PyPI version](https://badge.fury.io/py/mantisdk.svg)](https://badge.fury.io/py/mantisdk)
+ [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
+
+ **AI Agent Training and Evaluation Platform**
+
+ Mantisdk is a comprehensive toolkit for training and evaluating AI agents using reinforcement learning, automatic prompt optimization, and supervised fine-tuning.
+
+ ## Core Features
+
+ - Turn your agent into an optimizable beast with **minimal code changes**
+ - Build with **any** agent framework (LangChain, OpenAI Agent SDK, AutoGen, CrewAI, and more)
+ - **Selectively** optimize one or more agents in a multi-agent system
+ - Embraces **algorithms** like Reinforcement Learning, Automatic Prompt Optimization, Supervised Fine-tuning and more
+
+ ## Installation
+
+ ```bash
+ pip install mantisdk
+ ```
+
+ For optional dependencies:
+
+ ```bash
+ # For APO (Automatic Prompt Optimization)
+ pip install mantisdk[apo]
+
+ # For VERL integration
+ pip install mantisdk[verl]
+
+ # For Weave integration
+ pip install mantisdk[weave]
+
+ # For MongoDB store
+ pip install mantisdk[mongo]
+ ```
+
+ ## Quick Start
+
+ ```python
+ import mantisdk as msk
+
+ # Initialize the client
+ client = msk.MantisdkClient()
+
+ # Your agent code here...
+ ```
+
+ ## CLI Usage
+
+ ```bash
+ # Start the Mantisdk server
+ msk store serve
+
+ # Run with vLLM
+ msk vllm start
+ ```
+
+ ## Documentation
+
+ For full documentation, visit [https://withmetis.github.io/mantis/mantisdk/](https://withmetis.github.io/mantis/mantisdk/)
+
+ ## License
+
+ MIT License - see [LICENSE](LICENSE) for details.