universal-mcp-agents 0.1.23rc3__tar.gz → 0.1.23rc4__tar.gz

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of universal-mcp-agents might be problematic.

Files changed (70)
  1. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/PKG-INFO +5 -4
  2. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/pyproject.toml +6 -9
  3. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/evals/datasets/exact.jsonl +1 -0
  4. universal_mcp_agents-0.1.23rc4/src/evals/datasets/test.jsonl +1 -0
  5. universal_mcp_agents-0.1.23rc4/src/evals/evaluators.py +14 -0
  6. universal_mcp_agents-0.1.23rc4/src/evals/prompts.py +47 -0
  7. universal_mcp_agents-0.1.23rc4/src/evals/run.py +207 -0
  8. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/tests/test_agents.py +90 -153
  9. universal_mcp_agents-0.1.23rc4/src/tests/test_sandbox.py +244 -0
  10. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/__init__.py +1 -7
  11. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/base.py +1 -1
  12. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/state.py +1 -1
  13. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/__main__.py +1 -1
  14. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/agent.py +58 -33
  15. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/prompts.py +26 -44
  16. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/sandbox.py +2 -1
  17. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/state.py +1 -1
  18. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/tools.py +11 -9
  19. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/utils.py +76 -1
  20. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/react.py +3 -3
  21. universal_mcp_agents-0.1.23rc4/src/universal_mcp/agents/sandbox.py +123 -0
  22. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/uv.lock +403 -487
  23. universal_mcp_agents-0.1.23rc3/src/evals/datasets/codeact.jsonl +0 -11
  24. universal_mcp_agents-0.1.23rc3/src/evals/evaluators.py +0 -83
  25. universal_mcp_agents-0.1.23rc3/src/evals/prompts.py +0 -66
  26. universal_mcp_agents-0.1.23rc3/src/evals/run.py +0 -176
  27. universal_mcp_agents-0.1.23rc3/src/universal_mcp/agents/sandbox.py +0 -90
  28. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.github/workflows/evals.yml +0 -0
  29. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.github/workflows/lint.yml +0 -0
  30. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.github/workflows/release-please.yml +0 -0
  31. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.github/workflows/tests.yml +0 -0
  32. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.gitignore +0 -0
  33. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.pre-commit-config.yaml +0 -0
  34. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/GEMINI.md +0 -0
  35. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/PROMPTS.md +0 -0
  36. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/README.md +0 -0
  37. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/bump_and_release.sh +0 -0
  38. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/evals/__init__.py +0 -0
  39. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/evals/dataset.py +0 -0
  40. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/evals/datasets/tasks.jsonl +0 -0
  41. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/evals/utils.py +0 -0
  42. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/__init__.py +0 -0
  43. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/__main__.py +0 -0
  44. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/agent.py +0 -0
  45. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/context.py +0 -0
  46. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/graph.py +0 -0
  47. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/prompts.py +0 -0
  48. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/tools.py +0 -0
  49. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/builder/__main__.py +0 -0
  50. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/builder/builder.py +0 -0
  51. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/builder/helper.py +0 -0
  52. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/builder/prompts.py +0 -0
  53. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/builder/state.py +0 -0
  54. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/cli.py +0 -0
  55. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/__init__.py +0 -0
  56. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/config.py +0 -0
  57. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/langgraph_agent.py +0 -0
  58. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/llm_tool.py +0 -0
  59. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/hil.py +0 -0
  60. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/llm.py +0 -0
  61. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/shared/__main__.py +0 -0
  62. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/shared/prompts.py +0 -0
  63. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/shared/tool_node.py +0 -0
  64. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/simple.py +0 -0
  65. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/utils.py +0 -0
  66. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/applications/filesystem/__init__.py +0 -0
  67. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/applications/filesystem/app.py +0 -0
  68. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/applications/llm/__init__.py +0 -0
  69. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/applications/llm/app.py +0 -0
  70. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/applications/ui/app.py +0 -0
PKG-INFO
@@ -1,23 +1,24 @@
  Metadata-Version: 2.4
  Name: universal-mcp-agents
- Version: 0.1.23rc3
+ Version: 0.1.23rc4
  Summary: Add your description here
  Project-URL: Homepage, https://github.com/universal-mcp/applications
  Project-URL: Repository, https://github.com/universal-mcp/applications
  Author-email: Manoj Bajaj <manojbajaj95@gmail.com>
  License: MIT
  Requires-Python: >=3.11
+ Requires-Dist: cloudpickle>=3.1.1
  Requires-Dist: langchain-anthropic>=0.3.19
  Requires-Dist: langchain-google-genai>=2.1.10
  Requires-Dist: langchain-openai>=0.3.32
  Requires-Dist: langgraph>=0.6.6
- Requires-Dist: typer>=0.17.4
  Requires-Dist: universal-mcp-applications>=0.1.25
- Requires-Dist: universal-mcp>=0.1.24rc26
+ Requires-Dist: universal-mcp>=0.1.24rc27
  Provides-Extra: dev
  Requires-Dist: pre-commit; extra == 'dev'
  Requires-Dist: ruff; extra == 'dev'
+ Requires-Dist: typer>=0.17.4; extra == 'dev'
  Provides-Extra: test
- Requires-Dist: pytest-asyncio>=1.1.0; extra == 'test'
+ Requires-Dist: pytest-asyncio>=1.2.0; extra == 'test'
  Requires-Dist: pytest-cov; extra == 'test'
  Requires-Dist: pytest<9.0.0,>=7.0.0; extra == 'test'

pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"

  [project]
  name = "universal-mcp-agents"
- version = "0.1.23-rc3"
+ version = "0.1.23-rc4"
  description = "Add your description here"
  readme = "README.md"
  authors = [
@@ -14,12 +14,12 @@ authors = [
  ]
  requires-python = ">=3.11"
  dependencies = [
+     "cloudpickle>=3.1.1",
      "langchain-anthropic>=0.3.19",
      "langchain-google-genai>=2.1.10",
      "langchain-openai>=0.3.32",
      "langgraph>=0.6.6",
-     "typer>=0.17.4",
-     "universal-mcp>=0.1.24rc26",
+     "universal-mcp>=0.1.24rc27",
      "universal-mcp-applications>=0.1.25",
  ]

@@ -29,11 +29,12 @@ text = "MIT"
  [project.optional-dependencies]
  test = [
      "pytest>=7.0.0,<9.0.0",
-     "pytest-asyncio>=1.1.0",
+     "pytest-asyncio>=1.2.0",
      "pytest-cov",
  ]
  dev = [
      "ruff",
+     "typer>=0.17.4",
      "pre-commit",
  ]

@@ -66,6 +67,7 @@ lint.select = [
  ]
  lint.ignore = [
      "E501", # Ignore line length errors
+     "PLR2004" # Ignore errors caused due to constants
  ]

  [tool.ruff.lint.pylint]
@@ -84,8 +86,3 @@ pythonpath = [
  ]
  asyncio_mode = "strict"
  asyncio_default_fixture_loop_scope = "module"
-
- [dependency-groups]
- dev = [
-     "ruff>=0.13.0",
- ]
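
For context on the new lint ignore: PLR2004 is ruff's port of pylint's "magic value used in comparison" rule, which comparisons like the difficulty buckets in the new src/evals/run.py would otherwise trip. A small illustration with a hypothetical function (not from this package):

    def is_ok(status_code: int) -> bool:
        # Without the new "PLR2004" entry in lint.ignore, ruff flags the bare 200 here
        # as a magic-value comparison and suggests extracting it into a named constant.
        return status_code == 200

    print(is_ok(200))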

src/evals/datasets/exact.jsonl
@@ -4,3 +4,4 @@
  {"user_input": "What is the capital of France?", "expected_output": "Paris"}
  {"user_input": "Who wrote 'To Kill a Mockingbird'?", "expected_output": "Harper Lee"}
  {"user_input": "What is the boiling point of water at sea level in Celsius?", "expected_output": "100"}
+ {"user_input": "Find the 80th fibonnacci number", "expected_output": "23416728348467685"}
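
The new exact.jsonl row expects the 80th Fibonacci number to be 23416728348467685, using the fib(0) = 0, fib(1) = 1 convention that the multi-turn test later in this diff spells out. A quick sanity check of that expected output:

    def fib(n: int) -> int:
        # Iterative Fibonacci with fib(0) = 0 and fib(1) = 1.
        a, b = 0, 1
        for _ in range(n):
            a, b = b, a + b
        return a

    assert fib(80) == 23416728348467685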

src/evals/datasets/test.jsonl (new file)
@@ -0,0 +1 @@
+ {"user_input": "What is 2 + 2?", "expected_output": "4"}

src/evals/evaluators.py (new file)
@@ -0,0 +1,14 @@
+ from openevals.llm import create_llm_as_judge
+
+ from evals.prompts import CODEACT_EVALUATOR_PROMPT, CORRECTNESS_PROMPT
+
+ correctness_evaluator = create_llm_as_judge(
+     prompt=CORRECTNESS_PROMPT,
+     model="anthropic:claude-4-sonnet-20250514",
+ )
+
+
+ codeact_evaluator = create_llm_as_judge(
+     prompt=CODEACT_EVALUATOR_PROMPT,
+     model="anthropic:claude-4-sonnet-20250514",
+ )
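
Both judges above are plain callables returned by openevals' create_llm_as_judge. A rough sketch of invoking one directly, assuming the usual openevals convention that the judge accepts inputs/outputs/reference_outputs keyword arguments matching the prompt placeholders (the example values below are made up):

    from evals.evaluators import correctness_evaluator

    result = correctness_evaluator(
        inputs={"user_input": "What is 2 + 2?"},
        outputs={"messages": ["The answer is 4."]},
        reference_outputs={"expected_output": "4"},
    )
    print(result)  # expected to carry a score plus an explanatory comment from the judge model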

src/evals/prompts.py (new file)
@@ -0,0 +1,47 @@
+ CORRECTNESS_PROMPT = """You are an expert at evaluating LLM trajectories and responses, for an agent that uses code-writing to perform actions. You will be able to see the entire run, including the human input prompt, the system prompt containing tool information for additional tools (call_llm, ai_classify, creative_writer, data_extractor, smart_print) , and the code inputs/outputs.
+
+ Judge the correctness of the trajectory based on the following-
+ <Rubric>
+ - The agent returns the correct output to the user at the end, or has completed the task the user asked it to do.
+ - There are no remaining errors in the code. Do not penalise for errors that the LLM corrects based on the output.
+ - The agent calls the functions with correct arguments as per the user's task.
+ - The agent utilizes the correct functions/tools for the task.
+ - During the run, the agent will search for tools from different applications. Ensure that the following is followed by the agent-
+ -Prioritize connected applications over unconnected ones from the output of `search_functions`.
+ - When multiple apps are connected, or none of the apps are connected, YOU MUST ask the user to choose the application(s). The search results will inform you when such a case occurs (including some irrelevant apps), and you must stop and ask the user if multiple apps are relevant.
+
+ - When there is no output at all, the run has failed. Give 0 for this.
+ </Rubric>
+
+ <input>
+ {inputs}
+ </input>
+
+ <output>
+ {outputs}
+ </output>
+
+ Use the reference outputs below to help you evaluate the correctness of the response:
+
+ <reference_outputs>
+ {reference_outputs}
+ </reference_outputs>
+ """
+
+ CODEACT_EVALUATOR_PROMPT = """
+ You are a code execution evaluator. You will be given the entire run of an agent, starting with a human input task, the intermediate steps taken, and the final output of the agent given to the user.
+ These steps will contain code written by the agent to solve the problem as well as its outputs. Your job is to check ONLY if the code executes correctly.
+ Keep in mind that the agent has access to tools like- ai_classify, call_llm, creative_writer, data_extractor, smart_print as pre-loaded tools. These calls are to be treated as valid if they run without errors.
+ These are the only criteria you should evaluate-
+
+ <Rubric>
+ - The code written by the agent in tool calls should be syntactically correct and use existing or loaded objects.
+ - The code outputs should not have an error or empty/unexpected outputs
+ - The output should not be empty, since that indicates a failed run.
+ </Rubric>
+ If either of the above are not satisfied, you should give 0.
+
+ <Reminder>
+ You must not judge whether the code is helpful to the task or not, only if the code itself is correct or not.
+ </Reminder>
+ """
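
The {inputs}, {outputs}, and {reference_outputs} braces in CORRECTNESS_PROMPT are template slots that the judge fills with the serialized run. As a rough illustration only, plain str.format is used here as a stand-in for whatever templating openevals actually applies:

    from evals.prompts import CORRECTNESS_PROMPT

    rendered = CORRECTNESS_PROMPT.format(
        inputs='{"user_input": "What is 2 + 2?"}',
        outputs="Final agent message: 4",
        reference_outputs='{"expected_output": "4"}',
    )
    print(rendered.splitlines()[0])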

src/evals/run.py (new file)
@@ -0,0 +1,207 @@
+ import asyncio
+ from datetime import datetime
+ from enum import Enum
+ from typing import Annotated, Any
+
+ import typer
+ from langsmith import Client, aevaluate
+ from langsmith.utils import LangSmithConflictError
+ from universal_mcp.agentr.client import AgentrClient
+ from universal_mcp.agentr.registry import AgentrRegistry
+
+ from evals.dataset import load_dataset
+ from evals.evaluators import (
+     codeact_evaluator,
+     correctness_evaluator,
+ )
+ from universal_mcp.agents import get_agent
+
+ # 2. Evaluator Registry
+ EVALUATORS: dict[str, Any] = {
+     "correctness": correctness_evaluator,
+     "codeact": codeact_evaluator,
+ }
+
+
+ class Difficulty(str, Enum):
+     easy = "easy"
+     medium = "medium"
+     hard = "hard"
+
+
+ async def agent_runner(inputs: dict) -> dict:
+     """
+     Runs the agent and returns a dictionary with the final output.
+     """
+     agent_name = "codeact-repl"
+     current_date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     client = AgentrClient()
+     registry = AgentrRegistry(client=client)
+     common_params = {
+         "instructions": f"You are a helpful assistant. The current date and time is {current_date_time}",
+         "model": "anthropic:claude-haiku-4-5",
+         "registry": registry,
+         "tools": inputs.get("tools", {}),
+     }
+     agent = get_agent(agent_name)(name=agent_name, **common_params)
+     result = await agent.invoke(user_input=inputs["user_input"])
+     # The trajectory evaluator expects the raw output dict, with serialized messages
+     # result["messages"] = messages_to_list(result["messages"])
+     return result
+
+
+ async def run_evaluation(
+     dataset_name: str,
+     difficulty_split: str | None = None,
+     dataset_version: str | None = None,
+     max_concurrency: int = 1,
+     description: str | None = None,
+ ):
+     """
+     The main async function for the evaluation.
+     """
+     agent_name = "codeact-repl"
+     evaluators = [correctness_evaluator]  # TODO: Add codeact_evaluator
+
+     # Create a callable for aevaluate
+     async def target_func(inputs: dict):
+         return await agent_runner(inputs)
+
+     # 2. Run the evaluation
+     client = Client()
+     data = dataset_name
+     if difficulty_split or dataset_version:
+         kwargs = {"dataset_name": dataset_name}
+         if difficulty_split:
+             kwargs["metadata"] = {"difficulty": difficulty_split}
+         if dataset_version:
+             kwargs["as_of"] = dataset_version
+         data = client.list_examples(**kwargs)
+
+     await aevaluate(
+         target_func,
+         data=data,
+         evaluators=evaluators,
+         experiment_prefix=f"{agent_name}-eval",
+         max_concurrency=max_concurrency,
+         description=description,
+     )
+
+
+ def upload_dataset(
+     dataset_path: str,
+ ):
+     """
+     Loads a dataset from a file and uploads it to LangSmith, creating a new version.
+     If a dataset with the same name already exists, all previous examples are deleted
+     before adding the new ones, ensuring a clean new version.
+     """
+     dataset_examples = load_dataset(dataset_path)
+
+     client = Client()
+     dataset_name = f"{dataset_path.split('/')[-1].split('.')[0]}"
+
+     try:
+         dataset = client.create_dataset(
+             dataset_name,
+             description="Dataset for codeact-repl agent evaluation.",
+         )
+     except LangSmithConflictError:
+         dataset = client.read_dataset(dataset_name=dataset_name)
+         # Delete existing examples to create a clean slate for the new version
+         example_ids = [example.id for example in client.list_examples(dataset_id=dataset.id)]
+         if example_ids:
+             client.delete_examples(example_ids=example_ids)
+
+     examples = []
+     for ex in dataset_examples:
+         metadata = {}
+         if "difficulty" in ex:
+             difficulty = ex["difficulty"]
+             metadata["difficulty_score"] = difficulty
+             if difficulty in {1, 2}:
+                 metadata["difficulty"] = "easy"
+             elif difficulty == 3:
+                 metadata["difficulty"] = "medium"
+             elif difficulty in {4, 5}:
+                 metadata["difficulty"] = "hard"
+
+         examples.append(
+             {
+                 "inputs": {"user_input": ex["user_input"], "tools": ex.get("required_tools", {})},
+                 "outputs": {
+                     "expected_output": ex.get("expected_output", ""),
+                     "required_tools": ex.get("required_tools", {}),
+                 },
+                 "metadata": metadata,
+             }
+         )
+
+     client.create_examples(
+         dataset_id=dataset.id,
+         examples=examples,
+     )
+
+
+ app = typer.Typer()
+
+
+ @app.command()
+ def upload(
+     dataset_path: Annotated[
+         str,
+         typer.Argument(help="Path to the dataset file (e.g., src/evals/datasets/tasks.jsonl)."),
+     ],
+ ):
+     """
+     Uploads a dataset to LangSmith.
+     """
+     upload_dataset(dataset_path)
+
+
+ @app.command()
+ def run(
+     dataset_name: Annotated[str, typer.Argument(help="The name of the dataset in LangSmith.")],
+     difficulty: Annotated[
+         Difficulty | None,
+         typer.Option(
+             help="The difficulty split to use from the dataset.",
+             case_sensitive=False,
+         ),
+     ] = None,
+     dataset_version: Annotated[
+         str | None,
+         typer.Option(
+             help="The dataset version to use (e.g., 'latest', a timestamp, or a tag).",
+         ),
+     ] = None,
+     concurrency: Annotated[
+         int,
+         typer.Option(
+             help="The number of concurrent runs to execute.",
+         ),
+     ] = 5,
+     description: Annotated[
+         str | None,
+         typer.Option(
+             help="A description for the evaluation experiment.",
+         ),
+     ] = None,
+ ):
+     """
+     Run evaluations on the codeact-repl agent.
+     """
+     difficulty_value = difficulty.value if difficulty else None
+     asyncio.run(
+         run_evaluation(
+             dataset_name=dataset_name,
+             difficulty_split=difficulty_value,
+             dataset_version=dataset_version,
+             max_concurrency=concurrency,
+             description=description,
+         )
+     )
+
+
+ if __name__ == "__main__":
+     app()
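
The two entry points added here can also be driven without the Typer CLI. A minimal sketch, assuming LangSmith and Anthropic credentials are already configured in the environment; the dataset name "exact" follows upload_dataset's file-stem convention:

    import asyncio

    from evals.run import run_evaluation, upload_dataset

    # Creates (or re-versions) a LangSmith dataset named "exact" from the JSONL file.
    upload_dataset("src/evals/datasets/exact.jsonl")

    # Evaluate the codeact-repl agent on that dataset with two concurrent runs.
    asyncio.run(
        run_evaluation(
            dataset_name="exact",
            max_concurrency=2,
            description="Smoke run against the exact dataset",
        )
    )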

src/tests/test_agents.py
@@ -1,16 +1,14 @@
  from typing import Any
+ from uuid import uuid4

  import pytest
- from langchain_core.messages import HumanMessage
- from langchain_core.tools import tool
+ from langchain_core.tools import StructuredTool
+ from langgraph.checkpoint.memory import MemorySaver
  from universal_mcp.tools.registry import ToolRegistry
  from universal_mcp.types import ToolFormat

  from universal_mcp.agents import get_agent
- from universal_mcp.agents.base import BaseAgent
- from universal_mcp.agents.builder.builder import BuilderAgent
- from universal_mcp.agents.llm import load_chat_model
- from universal_mcp.agents.shared.tool_node import build_tool_node_graph
+ from universal_mcp.agents.utils import get_message_text


  class MockToolRegistry(ToolRegistry):
@@ -129,6 +127,11 @@ class MockToolRegistry(ToolRegistry):
                  "code": ["create_pull_request", "get_repository"],
              },
          }
+         super().__init__(**kwargs)
+
+     def _load_tools_from_app(self, app_id: str, tools: list[str]) -> None:
+         """Mock implementation for loading tools."""
+         pass

      async def list_all_apps(self) -> list[dict[str, Any]]:
          """Get list of available apps."""
@@ -197,22 +200,35 @@

      async def export_tools(
          self,
-         tools: list[str],
-         format: ToolFormat,
+         tools: list[str] | None = None,
+         format: ToolFormat = ToolFormat.NATIVE,
      ) -> list[Any]:
-         """Exports a list of mock LangChain tools."""
+         """Exports a list of mock tools."""
+
+         async def mock_send_email(to: str, body: str):
+             """Sends an email."""
+             return {"status": f"Email sent to {to} with body '{body}'"}

-         @tool
-         async def mock_tool_callable(query: str):
+         if tools and "google_mail__send_email" in tools:
+             if format == ToolFormat.NATIVE:
+                 return [mock_send_email]
+             elif format == ToolFormat.LANGCHAIN:
+                 return [StructuredTool.from_function(mock_send_email)]
+
+         async def mock_tool_callable(**kwargs: str):
              """A mock tool that confirms the task is done."""
-             return {"status": "task has been done"}
+             return {"status": "Task has been done"}

-         # Return a list of mock tools for the ReAct agent to use
-         return [mock_tool_callable]
+         if format == ToolFormat.NATIVE:
+             return [mock_tool_callable]
+         elif format == ToolFormat.LANGCHAIN:
+             return [StructuredTool.from_function(mock_tool_callable)]
+         else:
+             raise ValueError(f"Invalid format: {format}")

      async def call_tool(self, tool_name: str, tool_args: dict[str, Any]) -> dict[str, Any]:
          """Call a tool with the given name and arguments."""
-         return {"status": f"task has been done by tool {tool_name}"}
+         return {"status": f"Task has been done by tool {tool_name}"}

      async def list_connected_apps(self) -> list[dict[str, str]]:
          """
@@ -222,141 +238,62 @@
          return [{"app_id": app_id} for app_id in self._connected_apps]


- class TestToolFinderGraph:
-     @pytest.fixture
-     def llm(self):
-         return load_chat_model("anthropic/claude-sonnet-4-20250514", thinking=False)
-
-     @pytest.fixture
-     def registry(self):
-         return MockToolRegistry()
-
-     @pytest.mark.asyncio
-     async def test_simple_case(self, llm, registry):
-         """Test Case 1: Simple task requiring a single app and tool."""
-         task = "Send an email to my manager about the project update."
-         graph = build_tool_node_graph(llm, registry)
-         final_state = await graph.ainvoke(
-             {"original_task": task, "messages": [HumanMessage(content=task)], "decomposition_attempts": 0}
-         )
-
-         tool_config = final_state.get("execution_plan")
-
-         # FIX: Assert against the correct, hyphenated app ID.
-         assert "google_mail" in tool_config
-         assert "send_email" in tool_config["google_mail"]
-
-     @pytest.mark.asyncio
-     async def test_multi_step_task(self, llm, registry):
-         """Test Case 2: A task requiring multiple tools from different apps."""
-         task = "Create a new issue for a bug in our github repository, and send a message on slack about the issue."
-         graph = build_tool_node_graph(llm, registry)
-         final_state = await graph.ainvoke(
-             {"original_task": task, "messages": [HumanMessage(content=task)], "decomposition_attempts": 0}
-         )
-
-         tool_config = final_state.get("execution_plan")
-         assert tool_config, "Execution plan should not be empty"
-
-         assert "github" in tool_config
-         assert "create_issue" in tool_config["github"]
-         assert "slack" in tool_config
-         assert "send_message" in tool_config["slack"]
-
-     @pytest.mark.asyncio
-     async def test_no_relevant_app(self, llm, registry):
-         """Test Case 3: A task for which no tools or apps are available."""
-         task = "Can you create a blog post on my wordpress site?"
-         graph = build_tool_node_graph(llm, registry)
-         final_state = await graph.ainvoke(
-             {"original_task": task, "messages": [HumanMessage(content=task)], "decomposition_attempts": 0}
-         )
-         plan = final_state.get("execution_plan")
-         assert not plan
-         last_message = final_state.get("messages", [])[-1].content
-         assert "could not create a final plan" in last_message.lower()
-
-
- @pytest.mark.parametrize(
-     "agent_name",
-     [
-         "react",
-         "simple",
-         "builder",
-         "bigtool",
-         # "codeact-script",
-         # "codeact-repl",
-     ],
- )
- class TestAgents:
-     @pytest.fixture
-     def agent(self, agent_name: str):
-         """Set up the test environment for the agent."""
-         registry = MockToolRegistry()
-         agent_class = get_agent(agent_name)
-         agent = agent_class(
-             name=f"Test {agent_name}",
-             instructions="Test instructions",
-             model="anthropic/claude-sonnet-4-20250514",
-             registry=registry,
-         )
-         return agent
-
-     @pytest.mark.asyncio
-     async def test_end_to_end_with_tool(self, agent: BaseAgent):
-         """Tests the full flow from task to tool execution."""
-         task = "Send an email to my manager."
-         thread_id = f"test-thread-{agent.name.replace(' ', '-')}"
-
-         await agent.ainit()
-         # Invoke the agent graph to get the final state
-         final_state = await agent.invoke(
-             user_input={"userInput": task} if agent.name == "Test builder" else task,
-             thread_id=thread_id,
-         )
-
-         # Extract the content of the last message
-         if agent.name != "Test builder":
-             final_messages = final_state.get("messages", [])
-             assert final_messages, "The agent should have produced at least one message."
-             last_message = final_messages[-1]
-
-             final_response = last_message.content if hasattr(last_message, "content") else str(last_message)
-
-             assert final_response is not None, "The final response should not be None."
-             assert final_response != "", "The final response should not be an empty string."
-
-
- class TestAgentBuilder:
-     @pytest.fixture
-     def agent_builder(self):
-         """Set up the agent builder."""
-         registry = MockToolRegistry()
-         agent = BuilderAgent(
-             name="Test Builder Agent",
-             instructions="Test instructions for builder",
-             model="gemini/gemini-2.5-flash",
-             registry=registry,
-         )
-         yield agent
-
-     @pytest.mark.asyncio
-     async def test_create_agent(self, agent_builder: BuilderAgent):
-         """Test case for creating an agent with the builder."""
-         task = "Send a daily email to manoj@agentr.dev with daily agenda of the day"
-         thread_id = "test-thread-create-agent"
-
-         result = await agent_builder.invoke(thread_id=thread_id, user_input={"userInput": task})
-
-         assert "generated_agent" in result
-         generated_agent = result["generated_agent"]
-
-         assert generated_agent.name
-         assert generated_agent.description
-         assert generated_agent.expertise
-         assert "manoj@agentr.dev" in generated_agent.instructions
-         assert generated_agent.schedule is not None
-
-         assert "tool_config" in result
-         tool_config = result["tool_config"]
-         assert "google_mail" in tool_config
+ @pytest.mark.asyncio
+ async def test_simple_agent():
+     """Tests the simple agent."""
+     agent = get_agent("simple")(
+         name="Test Simple",
+         instructions="Test instructions",
+         model="anthropic/claude-haiku-4-5",
+     )
+     result = await agent.invoke(user_input="What is the capital of France?")
+     assert result is not None
+     last_message = result["messages"][-1]
+     last_message_text = get_message_text(last_message)
+     assert "paris" in last_message_text.lower()
+
+
+ @pytest.mark.asyncio
+ async def test_codeact_single_turn():
+     """Tests the codeact-repl agent."""
+     agent = get_agent("codeact-repl")(
+         name="Test Codeact Repl",
+         instructions="Test instructions",
+         model="anthropic/claude-haiku-4-5",
+         registry=MockToolRegistry(),
+     )
+     result = await agent.invoke(user_input="What is 2+2?")
+     assert result is not None
+     last_message = result["messages"][-1]
+     last_message_text = get_message_text(last_message)
+     assert "4" in last_message_text.lower()
+
+
+ @pytest.mark.asyncio
+ async def test_codeact_multi_turn():
+     """Tests the codeact-repl agent."""
+     checkpoint_saver = MemorySaver()
+     agent = get_agent("codeact-repl")(
+         name="Test Codeact Repl",
+         instructions="You are a helpful assistant",
+         model="anthropic/claude-haiku-4-5",
+         registry=MockToolRegistry(),
+         memory=checkpoint_saver,
+     )
+     thread_id = str(uuid4())
+     result = await agent.invoke(
+         user_input="Generate a function to calculate fibonnaci number, and get 10th number in the sequence. Use fib(0) = 0 and fib(1) = 1 as the base cases. Set x = fib(10)",
+         thread_id=thread_id,
+     )
+     assert result is not None
+     last_message = result["messages"][-1]
+     last_message_text = get_message_text(last_message)
+     assert "55" in last_message_text.lower()
+     turn2 = await agent.invoke(
+         user_input="What is the x+5?",
+         thread_id=thread_id,
+     )
+     assert turn2 is not None
+     last_message2 = turn2["messages"][-1]
+     last_message2_text = get_message_text(last_message2)
+     assert "60" in last_message2_text.lower()
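
The rewritten tests call a live Anthropic model through the mock registry, so they need model credentials plus the test extra installed. A hedged sketch of running just these tests programmatically through pytest's public API, from the repository root:

    import sys

    import pytest

    # Select only the rewritten agent tests; -q keeps the output short.
    sys.exit(pytest.main(["-q", "src/tests/test_agents.py", "-k", "simple or codeact"]))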