universal-mcp-agents 0.1.23rc3__tar.gz → 0.1.23rc4__tar.gz

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of universal-mcp-agents might be problematic.

Files changed (70)
  1. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/PKG-INFO +5 -4
  2. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/pyproject.toml +6 -9
  3. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/evals/datasets/exact.jsonl +1 -0
  4. universal_mcp_agents-0.1.23rc4/src/evals/datasets/test.jsonl +1 -0
  5. universal_mcp_agents-0.1.23rc4/src/evals/evaluators.py +14 -0
  6. universal_mcp_agents-0.1.23rc4/src/evals/prompts.py +47 -0
  7. universal_mcp_agents-0.1.23rc4/src/evals/run.py +207 -0
  8. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/tests/test_agents.py +90 -153
  9. universal_mcp_agents-0.1.23rc4/src/tests/test_sandbox.py +244 -0
  10. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/__init__.py +1 -7
  11. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/base.py +1 -1
  12. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/state.py +1 -1
  13. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/__main__.py +1 -1
  14. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/agent.py +58 -33
  15. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/prompts.py +26 -44
  16. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/sandbox.py +2 -1
  17. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/state.py +1 -1
  18. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/tools.py +11 -9
  19. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/utils.py +76 -1
  20. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/react.py +3 -3
  21. universal_mcp_agents-0.1.23rc4/src/universal_mcp/agents/sandbox.py +123 -0
  22. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/uv.lock +403 -487
  23. universal_mcp_agents-0.1.23rc3/src/evals/datasets/codeact.jsonl +0 -11
  24. universal_mcp_agents-0.1.23rc3/src/evals/evaluators.py +0 -83
  25. universal_mcp_agents-0.1.23rc3/src/evals/prompts.py +0 -66
  26. universal_mcp_agents-0.1.23rc3/src/evals/run.py +0 -176
  27. universal_mcp_agents-0.1.23rc3/src/universal_mcp/agents/sandbox.py +0 -90
  28. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.github/workflows/evals.yml +0 -0
  29. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.github/workflows/lint.yml +0 -0
  30. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.github/workflows/release-please.yml +0 -0
  31. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.github/workflows/tests.yml +0 -0
  32. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.gitignore +0 -0
  33. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/.pre-commit-config.yaml +0 -0
  34. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/GEMINI.md +0 -0
  35. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/PROMPTS.md +0 -0
  36. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/README.md +0 -0
  37. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/bump_and_release.sh +0 -0
  38. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/evals/__init__.py +0 -0
  39. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/evals/dataset.py +0 -0
  40. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/evals/datasets/tasks.jsonl +0 -0
  41. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/evals/utils.py +0 -0
  42. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/__init__.py +0 -0
  43. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/__main__.py +0 -0
  44. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/agent.py +0 -0
  45. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/context.py +0 -0
  46. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/graph.py +0 -0
  47. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/prompts.py +0 -0
  48. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/bigtool/tools.py +0 -0
  49. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/builder/__main__.py +0 -0
  50. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/builder/builder.py +0 -0
  51. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/builder/helper.py +0 -0
  52. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/builder/prompts.py +0 -0
  53. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/builder/state.py +0 -0
  54. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/cli.py +0 -0
  55. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/__init__.py +0 -0
  56. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/config.py +0 -0
  57. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/langgraph_agent.py +0 -0
  58. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/codeact0/llm_tool.py +0 -0
  59. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/hil.py +0 -0
  60. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/llm.py +0 -0
  61. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/shared/__main__.py +0 -0
  62. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/shared/prompts.py +0 -0
  63. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/shared/tool_node.py +0 -0
  64. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/simple.py +0 -0
  65. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/agents/utils.py +0 -0
  66. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/applications/filesystem/__init__.py +0 -0
  67. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/applications/filesystem/app.py +0 -0
  68. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/applications/llm/__init__.py +0 -0
  69. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/applications/llm/app.py +0 -0
  70. {universal_mcp_agents-0.1.23rc3 → universal_mcp_agents-0.1.23rc4}/src/universal_mcp/applications/ui/app.py +0 -0
PKG-INFO
@@ -1,23 +1,24 @@
  Metadata-Version: 2.4
  Name: universal-mcp-agents
- Version: 0.1.23rc3
+ Version: 0.1.23rc4
  Summary: Add your description here
  Project-URL: Homepage, https://github.com/universal-mcp/applications
  Project-URL: Repository, https://github.com/universal-mcp/applications
  Author-email: Manoj Bajaj <manojbajaj95@gmail.com>
  License: MIT
  Requires-Python: >=3.11
+ Requires-Dist: cloudpickle>=3.1.1
  Requires-Dist: langchain-anthropic>=0.3.19
  Requires-Dist: langchain-google-genai>=2.1.10
  Requires-Dist: langchain-openai>=0.3.32
  Requires-Dist: langgraph>=0.6.6
- Requires-Dist: typer>=0.17.4
  Requires-Dist: universal-mcp-applications>=0.1.25
- Requires-Dist: universal-mcp>=0.1.24rc26
+ Requires-Dist: universal-mcp>=0.1.24rc27
  Provides-Extra: dev
  Requires-Dist: pre-commit; extra == 'dev'
  Requires-Dist: ruff; extra == 'dev'
+ Requires-Dist: typer>=0.17.4; extra == 'dev'
  Provides-Extra: test
- Requires-Dist: pytest-asyncio>=1.1.0; extra == 'test'
+ Requires-Dist: pytest-asyncio>=1.2.0; extra == 'test'
  Requires-Dist: pytest-cov; extra == 'test'
  Requires-Dist: pytest<9.0.0,>=7.0.0; extra == 'test'

pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"

  [project]
  name = "universal-mcp-agents"
- version = "0.1.23-rc3"
+ version = "0.1.23-rc4"
  description = "Add your description here"
  readme = "README.md"
  authors = [
@@ -14,12 +14,12 @@ authors = [
  ]
  requires-python = ">=3.11"
  dependencies = [
+     "cloudpickle>=3.1.1",
      "langchain-anthropic>=0.3.19",
      "langchain-google-genai>=2.1.10",
      "langchain-openai>=0.3.32",
      "langgraph>=0.6.6",
-     "typer>=0.17.4",
-     "universal-mcp>=0.1.24rc26",
+     "universal-mcp>=0.1.24rc27",
      "universal-mcp-applications>=0.1.25",
  ]

@@ -29,11 +29,12 @@ text = "MIT"
  [project.optional-dependencies]
  test = [
      "pytest>=7.0.0,<9.0.0",
-     "pytest-asyncio>=1.1.0",
+     "pytest-asyncio>=1.2.0",
      "pytest-cov",
  ]
  dev = [
      "ruff",
+     "typer>=0.17.4",
      "pre-commit",
  ]

@@ -66,6 +67,7 @@ lint.select = [
  ]
  lint.ignore = [
      "E501", # Ignore line length errors
+     "PLR2004" # Ignore errors caused due to constants
  ]

  [tool.ruff.lint.pylint]
@@ -84,8 +86,3 @@ pythonpath = [
  ]
  asyncio_mode = "strict"
  asyncio_default_fixture_loop_scope = "module"
-
- [dependency-groups]
- dev = [
-     "ruff>=0.13.0",
- ]
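
For context on the new lint ignore: PLR2004 is ruff's port of pylint's "magic value used in comparison" rule, which comparisons like the difficulty buckets in the new src/evals/run.py would otherwise trip. A small illustration with a hypothetical function (not from this package):

    def is_ok(status_code: int) -> bool:
        # Without the new "PLR2004" entry in lint.ignore, ruff flags the bare 200 here
        # as a magic-value comparison and suggests extracting it into a named constant.
        return status_code == 200

    print(is_ok(200))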

src/evals/datasets/exact.jsonl
@@ -4,3 +4,4 @@
  {"user_input": "What is the capital of France?", "expected_output": "Paris"}
  {"user_input": "Who wrote 'To Kill a Mockingbird'?", "expected_output": "Harper Lee"}
  {"user_input": "What is the boiling point of water at sea level in Celsius?", "expected_output": "100"}
+ {"user_input": "Find the 80th fibonnacci number", "expected_output": "23416728348467685"}
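
The new exact.jsonl row expects the 80th Fibonacci number to be 23416728348467685, using the fib(0) = 0, fib(1) = 1 convention that the multi-turn test later in this diff spells out. A quick sanity check of that expected output:

    def fib(n: int) -> int:
        # Iterative Fibonacci with fib(0) = 0 and fib(1) = 1.
        a, b = 0, 1
        for _ in range(n):
            a, b = b, a + b
        return a

    assert fib(80) == 23416728348467685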

src/evals/datasets/test.jsonl (new file)
@@ -0,0 +1 @@
+ {"user_input": "What is 2 + 2?", "expected_output": "4"}

src/evals/evaluators.py (new file)
@@ -0,0 +1,14 @@
+ from openevals.llm import create_llm_as_judge
+
+ from evals.prompts import CODEACT_EVALUATOR_PROMPT, CORRECTNESS_PROMPT
+
+ correctness_evaluator = create_llm_as_judge(
+     prompt=CORRECTNESS_PROMPT,
+     model="anthropic:claude-4-sonnet-20250514",
+ )
+
+
+ codeact_evaluator = create_llm_as_judge(
+     prompt=CODEACT_EVALUATOR_PROMPT,
+     model="anthropic:claude-4-sonnet-20250514",
+ )
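
Both judges above are plain callables returned by openevals' create_llm_as_judge. A rough sketch of invoking one directly, assuming the usual openevals convention that the judge accepts inputs/outputs/reference_outputs keyword arguments matching the prompt placeholders (the example values below are made up):

    from evals.evaluators import correctness_evaluator

    result = correctness_evaluator(
        inputs={"user_input": "What is 2 + 2?"},
        outputs={"messages": ["The answer is 4."]},
        reference_outputs={"expected_output": "4"},
    )
    print(result)  # expected to carry a score plus an explanatory comment from the judge model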

src/evals/prompts.py (new file)
@@ -0,0 +1,47 @@
+ CORRECTNESS_PROMPT = """You are an expert at evaluating LLM trajectories and responses, for an agent that uses code-writing to perform actions. You will be able to see the entire run, including the human input prompt, the system prompt containing tool information for additional tools (call_llm, ai_classify, creative_writer, data_extractor, smart_print) , and the code inputs/outputs.
+
+ Judge the correctness of the trajectory based on the following-
+ <Rubric>
+ - The agent returns the correct output to the user at the end, or has completed the task the user asked it to do.
+ - There are no remaining errors in the code. Do not penalise for errors that the LLM corrects based on the output.
+ - The agent calls the functions with correct arguments as per the user's task.
+ - The agent utilizes the correct functions/tools for the task.
+ - During the run, the agent will search for tools from different applications. Ensure that the following is followed by the agent-
+ -Prioritize connected applications over unconnected ones from the output of `search_functions`.
+ - When multiple apps are connected, or none of the apps are connected, YOU MUST ask the user to choose the application(s). The search results will inform you when such a case occurs (including some irrelevant apps), and you must stop and ask the user if multiple apps are relevant.
+
+ - When there is no output at all, the run has failed. Give 0 for this.
+ </Rubric>
+
+ <input>
+ {inputs}
+ </input>
+
+ <output>
+ {outputs}
+ </output>
+
+ Use the reference outputs below to help you evaluate the correctness of the response:
+
+ <reference_outputs>
+ {reference_outputs}
+ </reference_outputs>
+ """
+
+ CODEACT_EVALUATOR_PROMPT = """
+ You are a code execution evaluator. You will be given the entire run of an agent, starting with a human input task, the intermediate steps taken, and the final output of the agent given to the user.
+ These steps will contain code written by the agent to solve the problem as well as its outputs. Your job is to check ONLY if the code executes correctly.
+ Keep in mind that the agent has access to tools like- ai_classify, call_llm, creative_writer, data_extractor, smart_print as pre-loaded tools. These calls are to be treated as valid if they run without errors.
+ These are the only criteria you should evaluate-
+
+ <Rubric>
+ - The code written by the agent in tool calls should be syntactically correct and use existing or loaded objects.
+ - The code outputs should not have an error or empty/unexpected outputs
+ - The output should not be empty, since that indicates a failed run.
+ </Rubric>
+ If either of the above are not satisfied, you should give 0.
+
+ <Reminder>
+ You must not judge whether the code is helpful to the task or not, only if the code itself is correct or not.
+ </Reminder>
+ """
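
The {inputs}, {outputs}, and {reference_outputs} braces in CORRECTNESS_PROMPT are template slots that the judge fills with the serialized run. As a rough illustration only, plain str.format is used here as a stand-in for whatever templating openevals actually applies:

    from evals.prompts import CORRECTNESS_PROMPT

    rendered = CORRECTNESS_PROMPT.format(
        inputs='{"user_input": "What is 2 + 2?"}',
        outputs="Final agent message: 4",
        reference_outputs='{"expected_output": "4"}',
    )
    print(rendered.splitlines()[0])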

src/evals/run.py (new file)
@@ -0,0 +1,207 @@
+ import asyncio
+ from datetime import datetime
+ from enum import Enum
+ from typing import Annotated, Any
+
+ import typer
+ from langsmith import Client, aevaluate
+ from langsmith.utils import LangSmithConflictError
+ from universal_mcp.agentr.client import AgentrClient
+ from universal_mcp.agentr.registry import AgentrRegistry
+
+ from evals.dataset import load_dataset
+ from evals.evaluators import (
+     codeact_evaluator,
+     correctness_evaluator,
+ )
+ from universal_mcp.agents import get_agent
+
+ # 2. Evaluator Registry
+ EVALUATORS: dict[str, Any] = {
+     "correctness": correctness_evaluator,
+     "codeact": codeact_evaluator,
+ }
+
+
+ class Difficulty(str, Enum):
+     easy = "easy"
+     medium = "medium"
+     hard = "hard"
+
+
+ async def agent_runner(inputs: dict) -> dict:
+     """
+     Runs the agent and returns a dictionary with the final output.
+     """
+     agent_name = "codeact-repl"
+     current_date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     client = AgentrClient()
+     registry = AgentrRegistry(client=client)
+     common_params = {
+         "instructions": f"You are a helpful assistant. The current date and time is {current_date_time}",
+         "model": "anthropic:claude-haiku-4-5",
+         "registry": registry,
+         "tools": inputs.get("tools", {}),
+     }
+     agent = get_agent(agent_name)(name=agent_name, **common_params)
+     result = await agent.invoke(user_input=inputs["user_input"])
+     # The trajectory evaluator expects the raw output dict, with serialized messages
+     # result["messages"] = messages_to_list(result["messages"])
+     return result
+
+
+ async def run_evaluation(
+     dataset_name: str,
+     difficulty_split: str | None = None,
+     dataset_version: str | None = None,
+     max_concurrency: int = 1,
+     description: str | None = None,
+ ):
+     """
+     The main async function for the evaluation.
+     """
+     agent_name = "codeact-repl"
+     evaluators = [correctness_evaluator]  # TODO: Add codeact_evaluator
+
+     # Create a callable for aevaluate
+     async def target_func(inputs: dict):
+         return await agent_runner(inputs)
+
+     # 2. Run the evaluation
+     client = Client()
+     data = dataset_name
+     if difficulty_split or dataset_version:
+         kwargs = {"dataset_name": dataset_name}
+         if difficulty_split:
+             kwargs["metadata"] = {"difficulty": difficulty_split}
+         if dataset_version:
+             kwargs["as_of"] = dataset_version
+         data = client.list_examples(**kwargs)
+
+     await aevaluate(
+         target_func,
+         data=data,
+         evaluators=evaluators,
+         experiment_prefix=f"{agent_name}-eval",
+         max_concurrency=max_concurrency,
+         description=description,
+     )
+
+
+ def upload_dataset(
+     dataset_path: str,
+ ):
+     """
+     Loads a dataset from a file and uploads it to LangSmith, creating a new version.
+     If a dataset with the same name already exists, all previous examples are deleted
+     before adding the new ones, ensuring a clean new version.
+     """
+     dataset_examples = load_dataset(dataset_path)
+
+     client = Client()
+     dataset_name = f"{dataset_path.split('/')[-1].split('.')[0]}"
+
+     try:
+         dataset = client.create_dataset(
+             dataset_name,
+             description="Dataset for codeact-repl agent evaluation.",
+         )
+     except LangSmithConflictError:
+         dataset = client.read_dataset(dataset_name=dataset_name)
+         # Delete existing examples to create a clean slate for the new version
+         example_ids = [example.id for example in client.list_examples(dataset_id=dataset.id)]
+         if example_ids:
+             client.delete_examples(example_ids=example_ids)
+
+     examples = []
+     for ex in dataset_examples:
+         metadata = {}
+         if "difficulty" in ex:
+             difficulty = ex["difficulty"]
+             metadata["difficulty_score"] = difficulty
+             if difficulty in {1, 2}:
+                 metadata["difficulty"] = "easy"
+             elif difficulty == 3:
+                 metadata["difficulty"] = "medium"
+             elif difficulty in {4, 5}:
+                 metadata["difficulty"] = "hard"
+
+         examples.append(
+             {
+                 "inputs": {"user_input": ex["user_input"], "tools": ex.get("required_tools", {})},
+                 "outputs": {
+                     "expected_output": ex.get("expected_output", ""),
+                     "required_tools": ex.get("required_tools", {}),
+                 },
+                 "metadata": metadata,
+             }
+         )
+
+     client.create_examples(
+         dataset_id=dataset.id,
+         examples=examples,
+     )
+
+
+ app = typer.Typer()
+
+
+ @app.command()
+ def upload(
+     dataset_path: Annotated[
+         str,
+         typer.Argument(help="Path to the dataset file (e.g., src/evals/datasets/tasks.jsonl)."),
+     ],
+ ):
+     """
+     Uploads a dataset to LangSmith.
+     """
+     upload_dataset(dataset_path)
+
+
+ @app.command()
+ def run(
+     dataset_name: Annotated[str, typer.Argument(help="The name of the dataset in LangSmith.")],
+     difficulty: Annotated[
+         Difficulty | None,
+         typer.Option(
+             help="The difficulty split to use from the dataset.",
+             case_sensitive=False,
+         ),
+     ] = None,
+     dataset_version: Annotated[
+         str | None,
+         typer.Option(
+             help="The dataset version to use (e.g., 'latest', a timestamp, or a tag).",
+         ),
+     ] = None,
+     concurrency: Annotated[
+         int,
+         typer.Option(
+             help="The number of concurrent runs to execute.",
+         ),
+     ] = 5,
+     description: Annotated[
+         str | None,
+         typer.Option(
+             help="A description for the evaluation experiment.",
+         ),
+     ] = None,
+ ):
+     """
+     Run evaluations on the codeact-repl agent.
+     """
+     difficulty_value = difficulty.value if difficulty else None
+     asyncio.run(
+         run_evaluation(
+             dataset_name=dataset_name,
+             difficulty_split=difficulty_value,
+             dataset_version=dataset_version,
+             max_concurrency=concurrency,
+             description=description,
+         )
+     )
+
+
+ if __name__ == "__main__":
+     app()
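
The two entry points added here can also be driven without the Typer CLI. A minimal sketch, assuming LangSmith and Anthropic credentials are already configured in the environment; the dataset name "exact" follows upload_dataset's file-stem convention:

    import asyncio

    from evals.run import run_evaluation, upload_dataset

    # Creates (or re-versions) a LangSmith dataset named "exact" from the JSONL file.
    upload_dataset("src/evals/datasets/exact.jsonl")

    # Evaluate the codeact-repl agent on that dataset with two concurrent runs.
    asyncio.run(
        run_evaluation(
            dataset_name="exact",
            max_concurrency=2,
            description="Smoke run against the exact dataset",
        )
    )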

src/tests/test_agents.py
@@ -1,16 +1,14 @@
  from typing import Any
+ from uuid import uuid4

  import pytest
- from langchain_core.messages import HumanMessage
- from langchain_core.tools import tool
+ from langchain_core.tools import StructuredTool
+ from langgraph.checkpoint.memory import MemorySaver
  from universal_mcp.tools.registry import ToolRegistry
  from universal_mcp.types import ToolFormat

  from universal_mcp.agents import get_agent
- from universal_mcp.agents.base import BaseAgent
- from universal_mcp.agents.builder.builder import BuilderAgent
- from universal_mcp.agents.llm import load_chat_model
- from universal_mcp.agents.shared.tool_node import build_tool_node_graph
+ from universal_mcp.agents.utils import get_message_text


  class MockToolRegistry(ToolRegistry):
@@ -129,6 +127,11 @@ class MockToolRegistry(ToolRegistry):
                  "code": ["create_pull_request", "get_repository"],
              },
          }
+         super().__init__(**kwargs)
+
+     def _load_tools_from_app(self, app_id: str, tools: list[str]) -> None:
+         """Mock implementation for loading tools."""
+         pass

      async def list_all_apps(self) -> list[dict[str, Any]]:
          """Get list of available apps."""
@@ -197,22 +200,35 @@

      async def export_tools(
          self,
-         tools: list[str],
-         format: ToolFormat,
+         tools: list[str] | None = None,
+         format: ToolFormat = ToolFormat.NATIVE,
      ) -> list[Any]:
-         """Exports a list of mock LangChain tools."""
+         """Exports a list of mock tools."""
+
+         async def mock_send_email(to: str, body: str):
+             """Sends an email."""
+             return {"status": f"Email sent to {to} with body '{body}'"}

-         @tool
-         async def mock_tool_callable(query: str):
+         if tools and "google_mail__send_email" in tools:
+             if format == ToolFormat.NATIVE:
+                 return [mock_send_email]
+             elif format == ToolFormat.LANGCHAIN:
+                 return [StructuredTool.from_function(mock_send_email)]
+
+         async def mock_tool_callable(**kwargs: str):
              """A mock tool that confirms the task is done."""
-             return {"status": "task has been done"}
+             return {"status": "Task has been done"}

-         # Return a list of mock tools for the ReAct agent to use
-         return [mock_tool_callable]
+         if format == ToolFormat.NATIVE:
+             return [mock_tool_callable]
+         elif format == ToolFormat.LANGCHAIN:
+             return [StructuredTool.from_function(mock_tool_callable)]
+         else:
+             raise ValueError(f"Invalid format: {format}")

      async def call_tool(self, tool_name: str, tool_args: dict[str, Any]) -> dict[str, Any]:
          """Call a tool with the given name and arguments."""
-         return {"status": f"task has been done by tool {tool_name}"}
+         return {"status": f"Task has been done by tool {tool_name}"}

      async def list_connected_apps(self) -> list[dict[str, str]]:
          """
@@ -222,141 +238,62 @@
          return [{"app_id": app_id} for app_id in self._connected_apps]


- class TestToolFinderGraph:
-     @pytest.fixture
-     def llm(self):
-         return load_chat_model("anthropic/claude-sonnet-4-20250514", thinking=False)
-
-     @pytest.fixture
-     def registry(self):
-         return MockToolRegistry()
-
-     @pytest.mark.asyncio
-     async def test_simple_case(self, llm, registry):
-         """Test Case 1: Simple task requiring a single app and tool."""
-         task = "Send an email to my manager about the project update."
-         graph = build_tool_node_graph(llm, registry)
-         final_state = await graph.ainvoke(
-             {"original_task": task, "messages": [HumanMessage(content=task)], "decomposition_attempts": 0}
-         )
-
-         tool_config = final_state.get("execution_plan")
-
-         # FIX: Assert against the correct, hyphenated app ID.
-         assert "google_mail" in tool_config
-         assert "send_email" in tool_config["google_mail"]
-
-     @pytest.mark.asyncio
-     async def test_multi_step_task(self, llm, registry):
-         """Test Case 2: A task requiring multiple tools from different apps."""
-         task = "Create a new issue for a bug in our github repository, and send a message on slack about the issue."
-         graph = build_tool_node_graph(llm, registry)
-         final_state = await graph.ainvoke(
-             {"original_task": task, "messages": [HumanMessage(content=task)], "decomposition_attempts": 0}
-         )
-
-         tool_config = final_state.get("execution_plan")
-         assert tool_config, "Execution plan should not be empty"
-
-         assert "github" in tool_config
-         assert "create_issue" in tool_config["github"]
-         assert "slack" in tool_config
-         assert "send_message" in tool_config["slack"]
-
-     @pytest.mark.asyncio
-     async def test_no_relevant_app(self, llm, registry):
-         """Test Case 3: A task for which no tools or apps are available."""
-         task = "Can you create a blog post on my wordpress site?"
-         graph = build_tool_node_graph(llm, registry)
-         final_state = await graph.ainvoke(
-             {"original_task": task, "messages": [HumanMessage(content=task)], "decomposition_attempts": 0}
-         )
-         plan = final_state.get("execution_plan")
-         assert not plan
-         last_message = final_state.get("messages", [])[-1].content
-         assert "could not create a final plan" in last_message.lower()
-
-
- @pytest.mark.parametrize(
-     "agent_name",
-     [
-         "react",
-         "simple",
-         "builder",
-         "bigtool",
-         # "codeact-script",
-         # "codeact-repl",
-     ],
- )
- class TestAgents:
-     @pytest.fixture
-     def agent(self, agent_name: str):
-         """Set up the test environment for the agent."""
-         registry = MockToolRegistry()
-         agent_class = get_agent(agent_name)
-         agent = agent_class(
-             name=f"Test {agent_name}",
-             instructions="Test instructions",
-             model="anthropic/claude-sonnet-4-20250514",
-             registry=registry,
-         )
-         return agent
-
-     @pytest.mark.asyncio
-     async def test_end_to_end_with_tool(self, agent: BaseAgent):
-         """Tests the full flow from task to tool execution."""
-         task = "Send an email to my manager."
-         thread_id = f"test-thread-{agent.name.replace(' ', '-')}"
-
-         await agent.ainit()
-         # Invoke the agent graph to get the final state
-         final_state = await agent.invoke(
-             user_input={"userInput": task} if agent.name == "Test builder" else task,
-             thread_id=thread_id,
-         )
-
-         # Extract the content of the last message
-         if agent.name != "Test builder":
-             final_messages = final_state.get("messages", [])
-             assert final_messages, "The agent should have produced at least one message."
-             last_message = final_messages[-1]
-
-             final_response = last_message.content if hasattr(last_message, "content") else str(last_message)
-
-             assert final_response is not None, "The final response should not be None."
-             assert final_response != "", "The final response should not be an empty string."
-
-
- class TestAgentBuilder:
-     @pytest.fixture
-     def agent_builder(self):
-         """Set up the agent builder."""
-         registry = MockToolRegistry()
-         agent = BuilderAgent(
-             name="Test Builder Agent",
-             instructions="Test instructions for builder",
-             model="gemini/gemini-2.5-flash",
-             registry=registry,
-         )
-         yield agent
-
-     @pytest.mark.asyncio
-     async def test_create_agent(self, agent_builder: BuilderAgent):
-         """Test case for creating an agent with the builder."""
-         task = "Send a daily email to manoj@agentr.dev with daily agenda of the day"
-         thread_id = "test-thread-create-agent"
-
-         result = await agent_builder.invoke(thread_id=thread_id, user_input={"userInput": task})
-
-         assert "generated_agent" in result
-         generated_agent = result["generated_agent"]
-
-         assert generated_agent.name
-         assert generated_agent.description
-         assert generated_agent.expertise
-         assert "manoj@agentr.dev" in generated_agent.instructions
-         assert generated_agent.schedule is not None
-
-         assert "tool_config" in result
-         tool_config = result["tool_config"]
-         assert "google_mail" in tool_config
+ @pytest.mark.asyncio
+ async def test_simple_agent():
+     """Tests the simple agent."""
+     agent = get_agent("simple")(
+         name="Test Simple",
+         instructions="Test instructions",
+         model="anthropic/claude-haiku-4-5",
+     )
+     result = await agent.invoke(user_input="What is the capital of France?")
+     assert result is not None
+     last_message = result["messages"][-1]
+     last_message_text = get_message_text(last_message)
+     assert "paris" in last_message_text.lower()
+
+
+ @pytest.mark.asyncio
+ async def test_codeact_single_turn():
+     """Tests the codeact-repl agent."""
+     agent = get_agent("codeact-repl")(
+         name="Test Codeact Repl",
+         instructions="Test instructions",
+         model="anthropic/claude-haiku-4-5",
+         registry=MockToolRegistry(),
+     )
+     result = await agent.invoke(user_input="What is 2+2?")
+     assert result is not None
+     last_message = result["messages"][-1]
+     last_message_text = get_message_text(last_message)
+     assert "4" in last_message_text.lower()
+
+
+ @pytest.mark.asyncio
+ async def test_codeact_multi_turn():
+     """Tests the codeact-repl agent."""
+     checkpoint_saver = MemorySaver()
+     agent = get_agent("codeact-repl")(
+         name="Test Codeact Repl",
+         instructions="You are a helpful assistant",
+         model="anthropic/claude-haiku-4-5",
+         registry=MockToolRegistry(),
+         memory=checkpoint_saver,
+     )
+     thread_id = str(uuid4())
+     result = await agent.invoke(
+         user_input="Generate a function to calculate fibonnaci number, and get 10th number in the sequence. Use fib(0) = 0 and fib(1) = 1 as the base cases. Set x = fib(10)",
+         thread_id=thread_id,
+     )
+     assert result is not None
+     last_message = result["messages"][-1]
+     last_message_text = get_message_text(last_message)
+     assert "55" in last_message_text.lower()
+     turn2 = await agent.invoke(
+         user_input="What is the x+5?",
+         thread_id=thread_id,
+     )
+     assert turn2 is not None
+     last_message2 = turn2["messages"][-1]
+     last_message2_text = get_message_text(last_message2)
+     assert "60" in last_message2_text.lower()
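
The rewritten tests call a live Anthropic model through the mock registry, so they need model credentials plus the test extra installed. A hedged sketch of running just these tests programmatically through pytest's public API, from the repository root:

    import sys

    import pytest

    # Select only the rewritten agent tests; -q keeps the output short.
    sys.exit(pytest.main(["-q", "src/tests/test_agents.py", "-k", "simple or codeact"]))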