universal-mcp-agents 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (75)
  1. universal_mcp_agents-0.1.5/.pre-commit-config.yaml +58 -0
  2. universal_mcp_agents-0.1.5/GEMINI.md +47 -0
  3. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/PKG-INFO +3 -2
  4. universal_mcp_agents-0.1.5/PROMPTS.md +27 -0
  5. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/bump_and_release.sh +4 -4
  6. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/pyproject.toml +6 -3
  7. universal_mcp_agents-0.1.5/src/evals/datasets/tasks.jsonl +22 -0
  8. universal_mcp_agents-0.1.5/src/evals/datasets/test.jsonl +1 -0
  9. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/evals/evaluators.py +20 -7
  10. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/evals/run.py +37 -47
  11. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/evals/utils.py +39 -17
  12. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/tests/test_agents.py +123 -41
  13. universal_mcp_agents-0.1.5/src/universal_mcp/agents/__init__.py +38 -0
  14. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/autoagent/graph.py +32 -13
  15. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/autoagent/studio.py +3 -8
  16. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/base.py +80 -22
  17. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool/__init__.py +13 -9
  18. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool/__main__.py +6 -7
  19. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool/graph.py +84 -40
  20. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool/prompts.py +3 -3
  21. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool2/__init__.py +16 -6
  22. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool2/__main__.py +7 -6
  23. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool2/agent.py +4 -2
  24. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool2/graph.py +78 -36
  25. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool2/prompts.py +1 -1
  26. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtoolcache/__init__.py +8 -4
  27. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtoolcache/agent.py +5 -3
  28. {universal_mcp_agents-0.1.3/src/universal_mcp/agents/bigtool2 → universal_mcp_agents-0.1.5/src/universal_mcp/agents/bigtoolcache}/context.py +0 -1
  29. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtoolcache/graph.py +99 -69
  30. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtoolcache/prompts.py +28 -0
  31. universal_mcp_agents-0.1.5/src/universal_mcp/agents/bigtoolcache/tools_all.txt +956 -0
  32. universal_mcp_agents-0.1.5/src/universal_mcp/agents/bigtoolcache/tools_important.txt +474 -0
  33. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/builder.py +62 -20
  34. universal_mcp_agents-0.1.5/src/universal_mcp/agents/cli.py +41 -0
  35. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/codeact/__init__.py +16 -4
  36. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/codeact/test.py +2 -1
  37. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/hil.py +16 -4
  38. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/llm.py +12 -4
  39. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/planner/__init__.py +14 -4
  40. universal_mcp_agents-0.1.5/src/universal_mcp/agents/planner/__main__.py +28 -0
  41. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/planner/graph.py +9 -3
  42. universal_mcp_agents-0.1.5/src/universal_mcp/agents/planner/prompts.py +14 -0
  43. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/planner/state.py +0 -1
  44. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/react.py +36 -22
  45. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/shared/tool_node.py +26 -11
  46. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/simple.py +27 -4
  47. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/tools.py +9 -4
  48. universal_mcp_agents-0.1.5/src/universal_mcp/agents/ui_tools.py +305 -0
  49. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/utils.py +55 -17
  50. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/uv.lock +360 -276
  51. universal_mcp_agents-0.1.3/src/evals/datasets/tasks.jsonl +0 -22
  52. universal_mcp_agents-0.1.3/src/evals/test.py +0 -41
  53. universal_mcp_agents-0.1.3/src/universal_mcp/agents/__init__.py +0 -19
  54. universal_mcp_agents-0.1.3/src/universal_mcp/agents/bigtool/context.py +0 -24
  55. universal_mcp_agents-0.1.3/src/universal_mcp/agents/bigtoolcache/context.py +0 -33
  56. universal_mcp_agents-0.1.3/src/universal_mcp/agents/cli.py +0 -27
  57. universal_mcp_agents-0.1.3/src/universal_mcp/agents/planner/__main__.py +0 -24
  58. universal_mcp_agents-0.1.3/src/universal_mcp/agents/planner/prompts.py +0 -1
  59. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/.gitignore +0 -0
  60. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/README.md +0 -0
  61. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/evals/__init__.py +0 -0
  62. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/evals/dataset.py +0 -0
  63. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/evals/datasets/exact.jsonl +0 -0
  64. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/autoagent/__init__.py +1 -1
  65. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/autoagent/__main__.py +1 -1
  66. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/autoagent/context.py +0 -0
  67. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/autoagent/prompts.py +0 -0
  68. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/autoagent/state.py +0 -0
  69. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/autoagent/utils.py +0 -0
  70. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool/state.py +0 -0
  71. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtool2/state.py +0 -0
  72. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtoolcache/__main__.py +1 -1
  73. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/bigtoolcache/state.py +0 -0
  74. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/codeact/sandbox.py +0 -0
  75. {universal_mcp_agents-0.1.3 → universal_mcp_agents-0.1.5}/src/universal_mcp/agents/codeact/utils.py +0 -0
@@ -0,0 +1,58 @@
+ fail_fast: false
+
+ repos:
+   - repo: https://github.com/pre-commit/mirrors-prettier
+     rev: v3.1.0
+     hooks:
+       - id: prettier
+         types_or: [yaml, json5]
+
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.11.13
+     hooks:
+       # Run the linter.
+       - id: ruff-check
+         args: [--fix]
+       # Run the formatter.
+       - id: ruff-format
+
+   # - repo: https://github.com/pre-commit/mirrors-mypy
+   #   rev: v1.8.0
+   #   hooks:
+   #     - id: mypy
+   #       additional_dependencies: []
+   #       args: ["--install-types", "--non-interactive"]
+
+   - repo: https://github.com/pre-commit/pygrep-hooks
+     rev: v1.10.0
+     hooks:
+       - id: python-check-blanket-noqa
+       # - id: python-no-eval
+       - id: python-no-log-warn
+       - id: python-use-type-annotations
+       - id: python-check-mock-methods
+
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.5.0
+     hooks:
+       - id: trailing-whitespace
+       - id: end-of-file-fixer
+       - id: check-yaml
+       - id: check-added-large-files
+       - id: check-ast
+       - id: check-json
+       - id: check-merge-conflict
+       - id: detect-private-key
+       - id: mixed-line-ending
+       - id: debug-statements
+       # - id: name-tests-test
+       - id: requirements-txt-fixer
+
+   # - repo: local
+   #   hooks:
+   #     - id: uv-lock-check
+   #       name: Check uv.lock is up to date
+   #       entry: uv lock --check
+   #       language: system
+   #       files: ^(pyproject\.toml|uv\.lock)$
+   #       pass_filenames: false
@@ -0,0 +1,47 @@
+ # Agent.md — Python + uv
+
+ Purpose
+ - This repository uses Python managed by uv for dependency resolution, virtual environments, locking, and execution. Always prefer uv subcommands (add/remove/run/sync/export) over raw pip/venv commands.
+
+ Core rules
+ - Use `uv add` to add or upgrade dependencies so that both `pyproject.toml` and `uv.lock` stay in sync; do not use `pip install` directly.
+ - Keep runtime dependencies in `[project.dependencies]` and development-only tools in the `dev` group via `uv add --dev ...`.
+ - Use `uv run` to execute Python, test, and tooling commands without manually activating a virtual environment.
+
+ Project bootstrap
+ - New project (scaffold files): `uv init`
+ - First install or clean install: `uv sync`
+ - Run the app: `uv run python -m <your_module>` or `uv run main.py`
+ - REPL: `uv run python`
+ - Scripts in pyproject: prefer `uv run <command>` to ensure the correct environment is used
+
+ Managing dependencies
+ - Add runtime dependency: `uv add <name>` (e.g., `uv add httpx`)
+ - Add dev dependencies: `uv add --dev pytest ruff`
+ - Pin/upgrade by constraint: `uv add "httpx>=0.27"` or adjust `pyproject.toml` and then `uv sync`
+ - Remove dependency: `uv remove <name>`
+ - Export lock for external tooling: `uv export --format requirements-txt --output-file requirements.txt`
+
+ Locking and environments
+ - `uv run` and `uv sync` will ensure the environment matches `pyproject.toml` and `uv.lock`
+ - Avoid manual `pip install` or manual `venv` activation; let uv manage the environment
+ - Commit `uv.lock` to version control for reproducible installs
+
+ pyproject guidance
+ - Dependencies live under `[project]` → `dependencies = [...]`
+ - Development-only tooling should go under a dev group (e.g., `uv add --dev ruff pytest`) for clean separation
+ - Keep `requires-python` current (e.g., `>=3.12`) to match the team’s baseline
+
+ Usage in this repo
+ - When adding libraries or changing versions, propose `uv add ...` changes that update both `pyproject.toml` and `uv.lock`, then run `uv run pytest -q` to validate
+ - Prefer minimal diffs, explain the plan, apply changes, and run tests/tooling via `uv run`
+ - If build/test fails, inspect error context, adjust constraints or code, and re-run via `uv run`
+
+ Common commands (copy/paste)
+ - Initialize: `uv init` | Install deps: `uv sync`
+ - Add runtime: `uv add <pkg>` | Add dev: `uv add --dev <pkg>`
+ - Remove: `uv remove <pkg>`
+ - Run app: `uv run python -m <your_module>` or `uv run main.py`
+ - Tests: `uv run pytest -q`
+ - Lint/format: `uv run ruff check .` and/or `uv run ruff format .`
+ - Export: `uv export --format requirements-txt --output-file requirements.txt`
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: universal-mcp-agents
- Version: 0.1.3
+ Version: 0.1.5
  Summary: Add your description here
  Project-URL: Homepage, https://github.com/universal-mcp/applications
  Project-URL: Repository, https://github.com/universal-mcp/applications
@@ -11,11 +11,12 @@ Requires-Dist: langchain-anthropic>=0.3.19
  Requires-Dist: langchain-google-genai>=2.1.10
  Requires-Dist: langchain-openai>=0.3.32
  Requires-Dist: langgraph>=0.6.6
- Requires-Dist: universal-mcp-applications>=0.1.2
+ Requires-Dist: universal-mcp-applications>=0.1.4
  Requires-Dist: universal-mcp>=0.1.24rc17
  Provides-Extra: dev
  Requires-Dist: pre-commit; extra == 'dev'
  Requires-Dist: ruff; extra == 'dev'
  Provides-Extra: test
+ Requires-Dist: pytest-asyncio>=1.1.0; extra == 'test'
  Requires-Dist: pytest-cov; extra == 'test'
  Requires-Dist: pytest<9.0.0,>=7.0.0; extra == 'test'
@@ -0,0 +1,27 @@
+ # Differentiating Developer and System Prompts
+
+ This document explains the roles of the two different types of prompts used by agents: the Developer Prompt and the System Prompt.
+
+ ## Developer Prompt
+
+ * **Role:** Defines the core identity, capabilities, and constraints of the agent. It's the fundamental instruction set that governs the agent's behavior.
+ * **Author:** The agent developer.
+ * **Nature:** Static. It is part of the agent's source code and does not change between different runs or users.
+ * **Example:** "You are a helpful assistant that can write code. You are an expert in Python. You must not engage in harmful conversations."
+
+ ## System Prompt
+
+ * **Role:** Provides init-time context to the agent. This includes user-specific information, environment details, or any other dynamic data that can influence the agent's response for a specific interaction.
+ * **Author:** The agent platform or the system running the agent.
+ * **Nature:** Dynamic. It can change with every request or for every user.
+ * **Example:** "The current user is John Doe. The current date is 2025-09-09. The user's timezone is UTC. The user is working on a project located at /path/to/project."
+
+ ## User Input
+
+ * Provided every time by the user to trigger the agent.
+
+ ## How they work together
+
+ The developer prompt and system prompt are combined to form the final set of instructions for the LLM. Typically, the developer prompt comes first, establishing the agent's persona and rules, followed by the system prompt, which provides the immediate context for the current task.
+
+ This separation allows for building robust, general-purpose agents (via developer prompts) that can be adapted to specific situations (via system prompts) without altering their core logic.
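To make the layering concrete, here is a minimal sketch of how a developer prompt and a system prompt might be combined into one message list ahead of the user turn. The helper names and message-dict shapes here are illustrative assumptions, not the package's actual API.

```python
# Illustrative sketch only: helper names and message shapes are
# assumptions, not the universal-mcp-agents API.
from datetime import datetime, timezone

DEVELOPER_PROMPT = (
    "You are a helpful assistant that can write code. "
    "You are an expert in Python. "
    "You must not engage in harmful conversations."
)

def build_system_prompt(user_name: str, tz: str) -> str:
    # Dynamic, per-request context supplied by the platform at init time.
    today = datetime.now(timezone.utc).date().isoformat()
    return f"The current user is {user_name}. The current date is {today}. The user's timezone is {tz}."

def build_messages(user_input: str) -> list[dict]:
    # Developer prompt first (static persona and rules), then the system
    # prompt (per-request context), then the user's message.
    return [
        {"role": "system", "content": DEVELOPER_PROMPT},
        {"role": "system", "content": build_system_prompt("John Doe", "UTC")},
        {"role": "user", "content": user_input},
    ]
```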
@@ -5,11 +5,11 @@ set -x
  
  # Ensure dependencies are installed
  echo "Syncing dependencies..."
- uv sync
+ uv sync --all-extras
  
  # Run tests with pytest
  echo "Running tests with pytest..."
- uv run pytest
+ # uv run pytest # --cov=src --cov-report=term-missing
  
  echo "Tests passed!"
  
@@ -55,7 +55,7 @@ sed -i '' "s/^version = ".*"/version = \"$NEW_VERSION\"/" pyproject.toml
  echo "Version bumped from $CURRENT_VERSION to $NEW_VERSION"
  
  # Stage the changed file
- git add pyproject.toml
+ git add pyproject.toml uv.lock
  
  # Commit the change
  git commit -m "bump: version $CURRENT_VERSION → $NEW_VERSION"
@@ -87,7 +87,7 @@ if [ "$1" = "release" ]; then
      rm -rf .idea
      rm -rf .vscode
  
-     uv sync && uv build && uv publish
+     uv build && uv publish
      echo "Release complete!"
  else
      echo "Skipping release steps"
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
  
  [project]
  name = "universal-mcp-agents"
- version = "0.1.3"
+ version = "0.1.5"
  description = "Add your description here"
  readme = "README.md"
  authors = [
@@ -19,7 +19,7 @@ dependencies = [
      "langchain-openai>=0.3.32",
      "langgraph>=0.6.6",
      "universal-mcp >= 0.1.24rc17",
-     "universal-mcp-applications>=0.1.2",
+     "universal-mcp-applications>=0.1.4",
  ]
  
  [project.license]
@@ -28,6 +28,7 @@ text = "MIT"
  [project.optional-dependencies]
  test = [
      "pytest>=7.0.0,<9.0.0",
+     "pytest-asyncio>=1.1.0",
      "pytest-cov",
  ]
  dev = [
@@ -58,7 +59,7 @@ show_missing = true
  fail_under = 70
  
  [tool.ruff]
- line-length = 88
+ line-length = 120
  select = [
      "E", "W", "F", "I", "UP", "PL", "T20",
  ]
@@ -72,4 +73,6 @@ quote-style = "double"
  pythonpath = [
      "src",
  ]
+ asyncio_mode = "strict"
+ asyncio_default_fixture_loop_scope = "module"
  
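For context on the two new pytest settings: `asyncio_mode = "strict"` makes pytest-asyncio run only coroutine tests that are explicitly marked, and the module-scoped fixture loop default shares one event loop per test module. A minimal sketch of a test that satisfies strict mode (the sleep stands in for an async agent call):

```python
# Minimal sketch of a test under asyncio_mode = "strict": the explicit
# marker is required because strict mode does not auto-collect async tests.
import asyncio

import pytest

@pytest.mark.asyncio
async def test_exact_match_answer():
    await asyncio.sleep(0)  # stand-in for an async agent.invoke() call
    assert str(2 + 2) == "4"
```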
@@ -0,0 +1,22 @@
+ {"user_input": "Send an email to manoj@agentr.dev from my Gmail account", "difficulty": 1}
+ {"user_input": "Show me events from today's Google Calendar.", "difficulty": 1}
+ {"user_input": "Create a Google Doc summarizing the last 5 merged pull requests in my GitHub repo- universal-mcp/universal-mcp, including links and commit highlights.", "difficulty": 4}
+ {"user_input": "Summarize the key insights from all marketing emails received this week from my Gmail and add a section in a Google Doc with action points.", "difficulty": 4}
+ {"user_input": "Create a Google Sheet of the best cafes and restaurants near IIT Bombay", "difficulty": 3}
+ {"user_input": "Track the top posts in r/startups over the past 7 days using Reddit and create a trend report on what's being discussed most (e.g., hiring, funding, MVPs) in a Google Doc.", "difficulty": 5}
+ {"user_input": "Find the best restaurants in Goa using perplexity web search", "difficulty": 2}
+ {"user_input": "List the unread emails from the last 24 hours from my Gmail, sorted by sender.", "difficulty": 2}
+ {"user_input": "Tell me how many meetings I have tomorrow and when they start from my Google Calendar.", "difficulty": 1}
+ {"user_input": "Create a meeting with aditakarsh@example.com on the topic of the latest trends in AI at 8PM today using Google Calendar.", "difficulty": 2}
+ {"user_input": "What are the topics of my meetings today from Google Calendar and who are the attendees? Give a 1-line context for each attendee using LinkedIn or web search.", "difficulty": 4}
+ {"user_input": "Fetch my last inbox mail from Microsoft Outlook", "difficulty": 1}
+ {"user_input": "Fetch unsubscribe links from my Gmail inbox for promo emails I have received in the last 7 days", "difficulty": 3}
+ {"user_input": "Fetch all unread emails from Gmail and new tickets from ClickUp for me from last night", "difficulty": 4}
+ {"user_input": "Give me a report on the earnings of Oklo using web search, and projections for the company revenue, stock price", "difficulty": 4}
+ {"user_input": "Create a weekly expense report from my credit card transactions and categorize spending by type (food, transport, entertainment, etc.) in a Google Sheet", "difficulty": 3}
+ {"user_input": "Generate a comparison table of SaaS tools for project management using web search, including pricing, features, and user ratings in a Google Sheet", "difficulty": 4}
+ {"user_input": "Research the top 10 Y Combinator startups from the latest batch using web search and create a report on their industries and funding status in Google Docs", "difficulty": 5}
+ {"user_input": "Find and summarize the key takeaways from the latest earnings calls of FAANG companies using web search and create a report in Google Docs", "difficulty": 5}
+ {"user_input": "Draft personalized LinkedIn outreach messages for 10 potential collaborators in the fintech space based on their recent posts using LinkedIn data in a Google Sheet", "difficulty": 5}
+ {"user_input": "Monitor my Twitter mentions and DMs from the past 48 hours and create a response priority list in Google Sheets", "difficulty": 4}
+ {"user_input": "Create a content calendar for next month with trending AI/ML topics using web search and optimal posting times based on my audience analytics in Google Sheets", "difficulty": 5}
@@ -0,0 +1 @@
+ {"user_input": "What is 2 + 2?", "expected_output": "4"}
@@ -1,3 +1,8 @@
+ 
+ from agentevals.trajectory.llm import (
+     TRAJECTORY_ACCURACY_PROMPT,
+     create_trajectory_llm_as_judge,
+ )
  from langsmith.evaluation import EvaluationResult, run_evaluator
  from langsmith.schemas import Example, Run
  from openevals.llm import create_llm_as_judge
@@ -11,24 +16,26 @@ def exact_match_evaluator(run: Run, example: Example | None = None) -> Evaluatio
      and the expected output from the dataset.
      """
      if example is None or "expected_output" not in example.outputs:
-         return EvaluationResult(key="exact_match", score=0, comment="No expected output provided.")
+         return EvaluationResult(
+             key="exact_match", score=0, comment="No expected output provided. Example: " + str(example)
+         )
  
      # The agent's response might be in a list of messages
      agent_response_raw = run.outputs.get("output", "")
      if isinstance(agent_response_raw, list):
          # Extract text from the last dictionary in the list
-         agent_response = agent_response_raw[-1].get("text", "").strip() if agent_response_raw else ""
-     else:
-         agent_response = str(agent_response_raw).strip()
+         agent_response_raw = agent_response_raw[-1]
+ 
  
-     expected_output = example.outputs["expected_output"].strip()
+     final_answer = agent_response_raw.get("content", "").strip().lower()
  
-     if agent_response == expected_output:
+     expected_output = example.outputs["expected_output"].strip().lower()
+     if final_answer == expected_output:
          score = 1
          comment = "Exact match."
      else:
          score = 0
-         comment = f"Mismatch: Expected '{expected_output}', but got '{agent_response}'."
+         comment = f"Mismatch: Expected '{expected_output}', but got '{final_answer}'."
  
      return EvaluationResult(key="exact_match", score=score, comment=comment)
  
@@ -38,3 +45,9 @@ correctness_evaluator = create_llm_as_judge(
      feedback_key="correctness",
      model="anthropic:claude-4-sonnet-20250514",
  )
+ 
+ 
+ trajectory_evaluator = create_trajectory_llm_as_judge(
+     prompt=TRAJECTORY_ACCURACY_PROMPT,
+     model="anthropic:claude-4-sonnet-20250514",
+ )
@@ -2,45 +2,43 @@ import argparse
  import asyncio
  from typing import Any
  
- from dotenv import load_dotenv
  from langsmith import Client, aevaluate
  from langsmith.evaluation import RunEvaluator
+ from universal_mcp.agentr.client import AgentrClient
+ from universal_mcp.agentr.registry import AgentrRegistry
  
  from evals.dataset import load_dataset
- from evals.evaluators import correctness_evaluator, exact_match_evaluator
- from universal_mcp.agentr.registry import AgentrRegistry
- from universal_mcp.agents.auto import AutoAgent
+ from evals.evaluators import (
+     correctness_evaluator,
+     exact_match_evaluator,
+     trajectory_evaluator,
+ )
+ from universal_mcp.agents import get_agent
  from universal_mcp.agents.base import BaseAgent
- from universal_mcp.agents.react import ReactAgent
- from universal_mcp.agents.simple import SimpleAgent
- 
- load_dotenv()
+ from universal_mcp.agents.utils import messages_to_list
  
  
  # 1. Agent Factory
- def get_agent(agent_name: str) -> BaseAgent:
+ def build_agent(agent_name: str):
      """
      Factory function to get an agent instance by name.
      """
+     client = AgentrClient()
      common_params = {
-         "instructions": "You are a helpful assistant.",
+         "instructions": "You are a helpful assistant. Respond to the final answer in one or two words. Eg, if the answer is 4, you should respond with '4'. Do not provide with any explanation",
          "model": "anthropic/claude-4-sonnet-20250514",
-         "registry": AgentrRegistry() if agent_name != "simple" else None,
+         "registry": AgentrRegistry(client=client) if agent_name != "simple" else None,
      }
-     if agent_name == "simple":
-         return SimpleAgent(name="simple-agent", **common_params)
-     elif agent_name == "react":
-         return ReactAgent(name="react-agent", **common_params)
-     elif agent_name == "auto":
-         return AutoAgent(name="auto-agent", **common_params)
-     else:
-         raise ValueError(f"Unknown agent: {agent_name}. Available agents: simple, react, auto")
+     agent = get_agent(agent_name)(name=agent_name, **common_params)
+     return agent
+ 
  
  
  # 2. Evaluator Registry
  EVALUATORS: dict[str, Any] = {
      "llm_as_judge": correctness_evaluator,
      "exact_match": exact_match_evaluator,
+     "trajectory": trajectory_evaluator,
  }
  
  
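`build_agent` now delegates the name-to-class lookup to the shared `get_agent` helper in `universal_mcp.agents` (rewritten in this release, +38 lines). Its implementation is not shown in this diff; a registry along the following lines would behave equivalently, and is offered only as a sketch:

```python
# Hypothetical sketch of get_agent; the real version lives in
# src/universal_mcp/agents/__init__.py and likely registers more agents.
from universal_mcp.agents.react import ReactAgent
from universal_mcp.agents.simple import SimpleAgent

_AGENT_CLASSES = {
    "simple": SimpleAgent,
    "react": ReactAgent,
}

def get_agent(agent_name: str):
    # Returns the agent class, not an instance; the caller instantiates it
    # with its own name/instructions/model/registry keyword arguments.
    try:
        return _AGENT_CLASSES[agent_name]
    except KeyError as exc:
        available = ", ".join(_AGENT_CLASSES)
        raise ValueError(f"Unknown agent: {agent_name}. Available agents: {available}") from exc
```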
@@ -50,43 +48,28 @@ def get_evaluator(evaluator_name: str) -> RunEvaluator:
      """
      evaluator = EVALUATORS.get(evaluator_name)
      if evaluator is None:
-         raise ValueError(f"Unknown evaluator: {evaluator_name}. Available evaluators: {', '.join(EVALUATORS.keys())}")
+         raise ValueError(
+             f"Unknown evaluator: {evaluator_name}. Available evaluators: {', '.join(EVALUATORS.keys())}"
+         )
      return evaluator
  
  
- # Wrapper to run the agent and format the output consistently
+ 
  async def agent_runner(agent: BaseAgent, inputs: dict):
      """
      Runs the agent and returns a dictionary with the final output.
      """
      result = await agent.invoke(user_input=inputs["user_input"])
-     # Extract the last message content as the final response
-     if isinstance(result, dict) and "messages" in result and result["messages"]:
-         content = result["messages"][-1].content
-         if isinstance(content, str):
-             final_response = content
-         elif isinstance(content, list):
-             # Handle list of content blocks (e.g., from Anthropic)
-             text_parts = []
-             for item in content:
-                 if isinstance(item, dict) and item.get("type") == "text":
-                     text_parts.append(item.get("text", ""))
-             final_response = "\n".join(text_parts)
-         else:
-             final_response = str(content)
-     else:
-         final_response = str(result)
-     return {"output": final_response}
- 
+     messages = messages_to_list(result["messages"])
+     return {"output": messages}
  
  async def main(agent_name: str, dataset_path: str, evaluator_name: str):
      """
      The main function for the evaluation CLI.
      """
-     print(f"Starting evaluation with agent='{agent_name}', dataset='{dataset_path}', evaluator='{evaluator_name}'")
  
      # 1. Get the agent and evaluator
-     agent = get_agent(agent_name)
+     agent = build_agent(agent_name)
      evaluator = get_evaluator(evaluator_name)
  
      # Create a callable for aevaluate
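`agent_runner` now returns the whole message trajectory via `messages_to_list` instead of just the final text, which is what lets the new trajectory evaluator score intermediate steps. The helper itself is not shown in this diff; a plausible sketch, assuming LangChain message objects:

```python
# Hypothetical sketch of messages_to_list; the real helper lives in
# src/universal_mcp/agents/utils.py and may differ in detail.
from langchain_core.messages import BaseMessage

def messages_to_list(messages: list[BaseMessage]) -> list[dict]:
    # Normalize LangChain message objects into plain dicts so evaluators
    # (e.g. exact_match reading the last message's "content") can consume them.
    return [{"role": m.type, "content": m.content} for m in messages]
```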
@@ -104,17 +87,19 @@ async def main(agent_name: str, dataset_path: str, evaluator_name: str):
          # If dataset with same name and examples exists, read it.
          # Otherwise, a new one is created.
          dataset = client.create_dataset(
-             dataset_name, description=f"Dataset for {agent_name} evaluation with {evaluator_name}."
+             dataset_name,
+             description=f"Dataset for {agent_name} evaluation with {evaluator_name}.",
          )
          for example in dataset_examples:
              client.create_example(
                  inputs={"user_input": example["user_input"]},
-                 outputs={"output": example.get("expected_output")} if "expected_output" in example else None,
+                 outputs={"expected_output": example.get("expected_output")}
+                 if "expected_output" in example
+                 else None,
                  dataset_id=dataset.id,
              )
-         print(f"Created and populated dataset '{dataset_name}' for this run.")
      except Exception:
-         print(f"Using existing dataset '{dataset_name}'.")
+         pass
  
      # 4. Run the evaluation
      await aevaluate(
@@ -130,7 +115,6 @@ if __name__ == "__main__":
      parser.add_argument(
          "agent",
          type=str,
-         choices=["simple", "react", "auto"],
          help="The name of the agent to evaluate.",
      )
      parser.add_argument(
@@ -146,4 +130,10 @@ if __name__ == "__main__":
      )
      args = parser.parse_args()
  
-     asyncio.run(main(agent_name=args.agent, dataset_path=args.dataset, evaluator_name=args.evaluator))
+     asyncio.run(
+         main(
+             agent_name=args.agent,
+             dataset_path=args.dataset,
+             evaluator_name=args.evaluator,
+         )
+     )
@@ -8,7 +8,9 @@ from evals.dataset import load_dataset
  load_dotenv()
  
  
- def upload_runs_to_dataset(project_name: str, dataset_name: str, dataset_description: str = ""):
+ def upload_runs_to_dataset(
+     project_name: str, dataset_name: str, dataset_description: str = ""
+ ):
      """
      Uploads runs from a LangSmith project to a dataset.
  
@@ -20,10 +22,8 @@ def upload_runs_to_dataset(project_name: str, dataset_name: str, dataset_descrip
      client = Client()
      try:
          dataset = client.create_dataset(dataset_name, description=dataset_description)
-         print(f"Created new dataset: '{dataset_name}'")
      except Exception:
          dataset = client.read_dataset(dataset_name=dataset_name)
-         print(f"Using existing dataset: '{dataset_name}'")
  
      runs = client.list_runs(project_name=project_name)
  
@@ -36,7 +36,6 @@ def upload_runs_to_dataset(project_name: str, dataset_name: str, dataset_descrip
          )
          example_count += 1
  
-     print(f"✅ Successfully uploaded {example_count} runs from project '{project_name}' to dataset '{dataset_name}'.")
  
  
  def upload_dataset_from_file(
@@ -61,17 +60,14 @@ def upload_dataset_from_file(
  
      try:
          dataset = client.create_dataset(dataset_name, description=dataset_description)
-         print(f"Created new dataset: '{dataset_name}'")
      except Exception:
          dataset = client.read_dataset(dataset_name=dataset_name)
-         print(f"Using existing dataset: '{dataset_name}'")
  
      for example in examples:
          inputs = {key: example[key] for key in input_keys if key in example}
          outputs = {key: example[key] for key in output_keys if key in example}
          client.create_example(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
  
-     print(f"✅ Successfully uploaded {len(examples)} examples from '{file_path}' to dataset '{dataset_name}'.")
  
  
  if __name__ == "__main__":
@@ -79,22 +75,48 @@ if __name__ == "__main__":
      subparsers = parser.add_subparsers(dest="command", required=True)
  
      # Sub-parser for uploading runs from a project
-     parser_runs = subparsers.add_parser("upload-runs", help="Upload runs from a project to a dataset.")
-     parser_runs.add_argument("--project-name", required=True, help="The LangSmith project name.")
-     parser_runs.add_argument("--dataset-name", required=True, help="The target dataset name.")
+     parser_runs = subparsers.add_parser(
+         "upload-runs", help="Upload runs from a project to a dataset."
+     )
+     parser_runs.add_argument(
+         "--project-name", required=True, help="The LangSmith project name."
+     )
      parser_runs.add_argument(
-         "--dataset-description", default="Dataset from project runs.", help="Description for the dataset."
+         "--dataset-name", required=True, help="The target dataset name."
+     )
+     parser_runs.add_argument(
+         "--dataset-description",
+         default="Dataset from project runs.",
+         help="Description for the dataset.",
      )
  
      # Sub-parser for uploading a dataset from a file
-     parser_file = subparsers.add_parser("upload-file", help="Upload a dataset from a local file.")
-     parser_file.add_argument("--file-path", required=True, help="Path to the local dataset file (CSV or JSONL).")
-     parser_file.add_argument("--dataset-name", required=True, help="The name for the dataset in LangSmith.")
+     parser_file = subparsers.add_parser(
+         "upload-file", help="Upload a dataset from a local file."
+     )
+     parser_file.add_argument(
+         "--file-path",
+         required=True,
+         help="Path to the local dataset file (CSV or JSONL).",
+     )
+     parser_file.add_argument(
+         "--dataset-name", required=True, help="The name for the dataset in LangSmith."
+     )
+     parser_file.add_argument(
+         "--dataset-description",
+         default="Dataset uploaded from file.",
+         help="Description for the dataset.",
+     )
+     parser_file.add_argument(
+         "--input-keys",
+         required=True,
+         help="Comma-separated list of input column names.",
+     )
      parser_file.add_argument(
-         "--dataset-description", default="Dataset uploaded from file.", help="Description for the dataset."
+         "--output-keys",
+         required=True,
+         help="Comma-separated list of output column names.",
      )
-     parser_file.add_argument("--input-keys", required=True, help="Comma-separated list of input column names.")
-     parser_file.add_argument("--output-keys", required=True, help="Comma-separated list of output column names.")
  
      args = parser.parse_args()