universal-mcp-agents 0.1.8__tar.gz → 0.1.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of universal-mcp-agents might be problematic.
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/GEMINI.md +1 -2
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/PKG-INFO +2 -1
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/bump_and_release.sh +1 -2
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/pyproject.toml +6 -3
- universal_mcp_agents-0.1.10/src/evals/dataset.py +40 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/evals/datasets/tasks.jsonl +14 -15
- universal_mcp_agents-0.1.10/src/evals/datasets/test.jsonl +1 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/evals/evaluators.py +59 -10
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/evals/run.py +86 -59
- universal_mcp_agents-0.1.10/src/evals/utils.py +83 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/tests/test_agents.py +12 -22
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/__init__.py +11 -8
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/base.py +13 -18
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/bigtool2/__init__.py +6 -7
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/bigtool2/__main__.py +2 -4
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/bigtool2/agent.py +1 -0
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/bigtool2/graph.py +155 -0
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/bigtool2/meta_tools.py +120 -0
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/bigtoolcache/__init__.py +66 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/bigtoolcache/__main__.py +1 -4
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/bigtoolcache/agent.py +1 -3
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/bigtoolcache/graph.py +114 -0
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/bigtoolcache/prompts.py +17 -0
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/bigtoolcache/tools.py +141 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/builder.py +10 -20
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/cli.py +1 -2
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/codeact/__init__.py +3 -0
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/codeact/__main__.py +35 -0
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/codeact/agent.py +160 -0
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/codeact/prompts.py +91 -0
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/codeact/sandbox.py +51 -0
- universal_mcp_agents-0.1.10/src/universal_mcp/agents/codeact/state.py +10 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/codeact/utils.py +12 -5
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/hil.py +1 -6
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/planner/__init__.py +1 -3
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/planner/graph.py +1 -3
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/react.py +14 -6
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/shared/prompts.py +31 -17
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/shared/tool_node.py +68 -53
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/simple.py +2 -1
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/utils.py +4 -15
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/applications/ui/app.py +5 -15
- universal_mcp_agents-0.1.10/test.py +16 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/uv.lock +226 -219
- universal_mcp_agents-0.1.8/src/evals/dataset.py +0 -24
- universal_mcp_agents-0.1.8/src/evals/datasets/test.jsonl +0 -1
- universal_mcp_agents-0.1.8/src/evals/utils.py +0 -136
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/autoagent/__init__.py +0 -30
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/autoagent/__main__.py +0 -25
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/autoagent/context.py +0 -26
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/autoagent/graph.py +0 -170
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/autoagent/prompts.py +0 -9
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/autoagent/utils.py +0 -13
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtool/__init__.py +0 -58
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtool/__main__.py +0 -23
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtool/graph.py +0 -210
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtool/prompts.py +0 -31
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtool2/graph.py +0 -291
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtool2/state.py +0 -27
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtoolcache/__init__.py +0 -57
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtoolcache/graph.py +0 -204
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtoolcache/prompts.py +0 -41
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtoolcache/state.py +0 -27
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtoolcache/tools_all.txt +0 -956
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtoolcache/tools_important.txt +0 -474
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/codeact/__init__.py +0 -255
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/codeact/sandbox.py +0 -27
- universal_mcp_agents-0.1.8/src/universal_mcp/agents/codeact/test.py +0 -16
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/.gitignore +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/.pre-commit-config.yaml +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/PROMPTS.md +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/README.md +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/evals/__init__.py +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/evals/datasets/exact.jsonl +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/bigtool2/prompts.py +0 -0
- {universal_mcp_agents-0.1.8/src/universal_mcp/agents/autoagent → universal_mcp_agents-0.1.10/src/universal_mcp/agents/bigtool2}/state.py +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/bigtoolcache/context.py +0 -0
- {universal_mcp_agents-0.1.8/src/universal_mcp/agents/bigtool → universal_mcp_agents-0.1.10/src/universal_mcp/agents/bigtoolcache}/state.py +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/llm.py +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/planner/__main__.py +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/planner/prompts.py +0 -0
- {universal_mcp_agents-0.1.8 → universal_mcp_agents-0.1.10}/src/universal_mcp/agents/planner/state.py +0 -0
--- a/GEMINI.md
+++ b/GEMINI.md
@@ -43,5 +43,4 @@ Common commands (copy/paste)
 - Remove: `uv remove <pkg>`
 - Run app: `uv run python -m <your_module>` or `uv run main.py`
 - Tests: `uv run pytest -q`
-- Lint/format: `uv run ruff check .` and/or `uv run ruff format .`
-- Export: `uv export --format requirements-txt --output-file requirements.txt`
+- Lint/format: `uv run ruff check .` and/or `uv run ruff format .`
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: universal-mcp-agents
-Version: 0.1.8
+Version: 0.1.10
 Summary: Add your description here
 Project-URL: Homepage, https://github.com/universal-mcp/applications
 Project-URL: Repository, https://github.com/universal-mcp/applications
@@ -11,6 +11,7 @@ Requires-Dist: langchain-anthropic>=0.3.19
 Requires-Dist: langchain-google-genai>=2.1.10
 Requires-Dist: langchain-openai>=0.3.32
 Requires-Dist: langgraph>=0.6.6
+Requires-Dist: typer>=0.17.4
 Requires-Dist: universal-mcp-applications>=0.1.14
 Requires-Dist: universal-mcp>=0.1.24rc21
 Provides-Extra: dev
--- a/bump_and_release.sh
+++ b/bump_and_release.sh
@@ -9,7 +9,7 @@ uv sync --all-extras
 
 # Run tests with pytest
 echo "Running tests with pytest..."
-uv run pytest
+# uv run pytest
 
 echo "Tests passed!"
 
@@ -81,7 +81,6 @@ if [ "$1" = "release" ]; then
 rm -rf .pytest_cache
 rm -rf .ruff_cache
 rm -rf .mypy_cache
-rm -rf .venv
 rm -rf .cache
 rm -rf .DS_Store
 rm -rf .idea
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "universal-mcp-agents"
-version = "0.1.8"
+version = "0.1.10"
 description = "Add your description here"
 readme = "README.md"
 authors = [
@@ -18,6 +18,7 @@ dependencies = [
     "langchain-google-genai>=2.1.10",
     "langchain-openai>=0.3.32",
     "langgraph>=0.6.6",
+    "typer>=0.17.4",
     "universal-mcp>=0.1.24rc21",
     "universal-mcp-applications>=0.1.14",
 ]
@@ -60,10 +61,12 @@ fail_under = 70
 
 [tool.ruff]
 line-length = 120
-select = [
+lint.select = [
     "E", "W", "F", "I", "UP", "PL", "T20",
 ]
-ignore = [
+lint.ignore = [
+    "E501", # Ignore line length errors
+]
 
 
 [tool.ruff.format]
--- /dev/null
+++ b/src/evals/dataset.py
@@ -0,0 +1,40 @@
+import csv
+import json
+from typing import Any
+
+
+def load_dataset(file_path: str, difficulty_split: str | None = None) -> list[dict[str, Any]]:
+    """
+    Loads a dataset from a CSV or JSONL file.
+
+    Args:
+        file_path: The path to the dataset file.
+        difficulty_split: Optional difficulty split to apply.
+            Can be "easy", "medium", "hard".
+
+    Returns:
+        A list of dictionaries, where each dictionary represents an example.
+    """
+    if file_path.endswith(".csv"):
+        with open(file_path, encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            examples = list(reader)
+    elif file_path.endswith(".jsonl"):
+        with open(file_path, encoding="utf-8") as f:
+            examples = [json.loads(line) for line in f]
+    else:
+        raise ValueError("Unsupported file format. Please use CSV or JSONL.")
+
+    if difficulty_split:
+        if difficulty_split == "easy":
+            difficulty_range = {1, 2}
+        elif difficulty_split == "medium":
+            difficulty_range = {3}
+        elif difficulty_split == "hard":
+            difficulty_range = {4, 5}
+        else:
+            raise ValueError("Invalid difficulty split. Please use 'easy', 'medium', or 'hard'.")
+
+        return [ex for ex in examples if "difficulty" in ex and ex["difficulty"] in difficulty_range]
+
+    return examples
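For orientation, a minimal usage sketch of the new loader (the path is the tasks dataset shipped in this release; assumes it is present locally):

```python
# Minimal sketch, assuming src/evals/datasets/tasks.jsonl from this package.
from evals.dataset import load_dataset

all_examples = load_dataset("src/evals/datasets/tasks.jsonl")
easy_examples = load_dataset("src/evals/datasets/tasks.jsonl", difficulty_split="easy")
print(len(all_examples), len(easy_examples))
```

Note that `csv.DictReader` yields string values, so the integer membership test in the difficulty filter only matches JSONL rows, where `difficulty` is parsed as an int.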
--- a/src/evals/datasets/tasks.jsonl
+++ b/src/evals/datasets/tasks.jsonl
@@ -1,22 +1,21 @@
-{"user_input": "Send an email to manoj@agentr.dev from my Gmail account", "difficulty": 1, "required_tools": {"google_mail": ["send_email"]}}
+{"user_input": "Send an email to manoj@agentr.dev with the subject 'Hello' and body 'This is a test of the Gmail agent.' from my Gmail account", "difficulty": 1, "required_tools": {"google_mail": ["send_email"]}}
 {"user_input": "Show me events from today's Google Calendar.", "difficulty": 1, "required_tools": {"google_calendar": ["get_upcoming_events"]}}
-{"user_input": "
-{"user_input": "Summarize the key insights from all marketing emails received this week from my Gmail and add a section in a Google Doc with action points.", "difficulty": 4, "required_tools": {"google_mail": ["list_messages"], "google_docs": ["create_document"]}}
-{"user_input": "Search for best cafes near IIT bombay using exa and make a google sheet out of it", "difficulty": 3, "required_tools": {"exa": ["search_with_filters"], "google_sheet": ["create_spreadsheet", "write_values_to_sheet", "add_table"]}}
-{"user_input": "Track the top posts in r/startups over the past 7 days using Reddit and create a trend report on what's being discussed most (e.g., hiring, funding, MVPs) in a Google Doc.", "difficulty": 5, "required_tools": {"reddit": ["get_subreddit_posts", "get_subreddit_top_posts"], "google_docs": ["create_document", "insert_text", "apply_text_style"]}}
-{"user_input": "Find the best restaurants in Goa using perplexity web search", "difficulty": 2, "required_tools": {"perplexity": ["answer_with_search"]}}
-{"user_input": "List the unread emails from the last 24 hours from my Gmail, sorted by sender.", "difficulty": 2, "required_tools": {"google_mail": ["list_messages"]}}
+{"user_input": "Fetch my last inbox mail from Microsoft Outlook", "difficulty": 1, "required_tools": {"outlook": ["list_user_messages"]}}
 {"user_input": "Tell me how many meetings I have tomorrow and when they start from my Google Calendar.", "difficulty": 1, "required_tools": {"google_calendar": ["get_upcoming_events", "list_events"]}}
+{"user_input": "Find the best restaurants in Goa using exa web search", "difficulty": 2, "required_tools": {"exa": ["search_with_filters"]}}
+{"user_input": "List the unread emails from the last 24 hours from my Gmail, sorted by sender.", "difficulty": 2, "required_tools": {"google_mail": ["list_messages"]}}
 {"user_input": "Create a meeting with aditakarsh@example.com on the topic of the latest trends in AI at 8PM today using Google Calendar.", "difficulty": 2, "required_tools": {"google_calendar": ["create_event", "create_event_from_text"]}}
-{"user_input": "
-{"user_input": "Fetch my last inbox mail from Microsoft Outlook", "difficulty": 1, "required_tools": {"outlook": ["list_user_messages"]}}
-{"user_input": "Fetch unsubscribe links from my Gmail inbox for promo emails I have received in the last 7 days", "difficulty": 3, "required_tools": {"google_mail": ["list_messages"]}}
-{"user_input": "Fetch all unread emails from Gmail and new tickets from ClickUp for me from last night", "difficulty": 4, "required_tools": {"google_mail": ["list_messages"], "clickup": ["tasks_get_list_tasks", "tasks_filter_team_tasks"]}}
-{"user_input": "Give me a report on the earnings of Oklo using web search, and projections for the company revenue, stock price", "difficulty": 4, "required_tools": {"tavily": ["search_and_summarize"]}}
+{"user_input": "Fetch unsubscribe links from my Gmail inbox for promo emails I have received in the last 1 day", "difficulty": 3, "required_tools": {"google_mail": ["list_messages"]}}
 {"user_input": "Create a weekly expense report from my credit card transactions and categorize spending by type (food, transport, entertainment, etc.) in a Google Sheet", "difficulty": 3, "required_tools": {"google_sheet" : ["create_spreadsheet", "add_table"]}}
+{"user_input": "search reddit for posts on elon musk and then post a meme on him on linkedin", "difficulty": 3, "required_tools": {"reddit" : ["search_reddit"], "linkedin": ["create_post"]}}
+{"user_input": "Search for best cafes near IIT bombay using exa and make a google sheet out of it", "difficulty": 3, "required_tools": {"exa": ["search_with_filters"], "google_sheet": ["create_spreadsheet", "write_values_to_sheet", "add_table"]}}
+{"user_input": "Create a Google Doc summarizing the last 5 merged pull requests in my GitHub repo- universal-mcp/universal-mcp, including links and commit highlights.", "difficulty": 4, "required_tools": {"github": ["list_pull_requests", "list_recent_commits"], "google_docs": ["create_document", "insert_text", "apply_text_style"]}}
+{"user_input": "Summarize the key insights from all marketing emails received yesterday from my Gmail and add a section in a Google Doc with action points.", "difficulty": 4, "required_tools": {"google_mail": ["list_messages"], "google_docs": ["create_document"]}}
+{"user_input": "Give me a report on the earnings of Oklo using web search, and projections for the company revenue, stock price", "difficulty": 4, "required_tools": {"tavily": ["search_and_summarize"]}}
+{"user_input": "Track the top posts in r/startups over the past 7 days using Reddit and create a trend report on what's being discussed most (e.g., hiring, funding, MVPs) in a Google Doc.", "difficulty": 4, "required_tools": {"reddit": ["get_subreddit_posts", "get_subreddit_top_posts"], "google_docs": ["create_document", "insert_text", "apply_text_style"]}}
 {"user_input": "Generate a comparison table of SaaS tools for project management using web search, including pricing, features, and user ratings in a Google Sheet", "difficulty": 4, "required_tools": {"tavily": ["search_and_summarize"], "google_sheet": ["create_spreadsheet", "add_table"]}}
-{"user_input": "
-{"user_input": "Find and summarize the key takeaways from the latest earnings calls of FAANG companies using web search and create a report in Google Docs", "difficulty": 5, "required_tools": {"tavily": ["search_and_summarize"], "google_docs": ["create_document", "insert_text", "insert_table"]}}
+{"user_input": "What are the topics of my meetings today from Google Calendar and who are the attendees? Give a 1-line context for each attendee using LinkedIn or web search.", "difficulty": 4, "required_tools": {"google_calendar": ["get_upcoming_events", "list_events"], "scraper": ["linkedin_retrieve_profile"]}}
 {"user_input": "Draft personalized LinkedIn outreach messages for 10 potential collaborators in the fintech space based on their recent posts using LinkedIn data in a Google Sheet", "difficulty": 5, "required_tools": {"scraper": ["linkedin_retrieve_profile", "linkedin_list_profile_posts"], "google_sheet": ["create_spreadsheet", "write_values_to_sheet"]}}
-{"user_input": "Monitor my Twitter mentions and DMs from the past 48 hours and create a response priority list in Google Sheets", "difficulty": 4, "required_tools": {"twitter": ["get_user_mentions", "get_dm_events_by_conversation_id"], "google_sheet": ["create_spreadsheet", "write_values_to_sheet", "set_basic_filter"]}}
 {"user_input": "Create a content calendar for next month with trending AI/ML topics using web search and optimal posting times based on my audience analytics in Google Sheets", "difficulty": 5, "required_tools": {"tavily": ["search_and_summarize"], "google_sheet": ["get_values", "batch_get_values_by_range", "get_spreadsheet_metadata" , "create_spreadsheet", "add_sheet", "add_table"]}}
+{"user_input": "Research the top 10 Y Combinator startups from the latest batch using web search and create a report on their industries and funding status in Google Docs", "difficulty": 5, "required_tools": {"tavily": ["search_and_summarize"], "google_docs": ["create_document", "insert_text", "insert_table"]}}
+{"user_input": "Find and summarize the key takeaways from the latest earnings calls of FAANG companies using web search and create a report in Google Docs", "difficulty": 5, "required_tools": {"tavily": ["search_and_summarize"], "google_docs": ["create_document", "insert_text", "insert_table"]}}
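Every task line follows the same schema; a sketch of parsing one record (fields taken from the dataset above):

```python
import json

# One line from tasks.jsonl: a prompt, a 1-5 difficulty rating, and the tools
# the agent is expected to load, keyed by application id.
line = (
    '{"user_input": "Show me events from today\'s Google Calendar.", '
    '"difficulty": 1, "required_tools": {"google_calendar": ["get_upcoming_events"]}}'
)
task = json.loads(line)
print(task["user_input"], task["difficulty"], task["required_tools"])
```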
--- /dev/null
+++ b/src/evals/datasets/test.jsonl
@@ -0,0 +1 @@
+{"user_input": "Send an email to manoj@agentr.dev with the subject 'Hello' and body 'This is a test of the Gmail agent.' from my Gmail account", "difficulty": 1, "required_tools": {"google_mail": ["send_email"]}}
--- a/src/evals/evaluators.py
+++ b/src/evals/evaluators.py
@@ -1,4 +1,3 @@
-
 from agentevals.trajectory.llm import (
     TRAJECTORY_ACCURACY_PROMPT,
     create_trajectory_llm_as_judge,
@@ -7,7 +7,6 @@ from google.ai.generativelanguage_v1beta import ToolConfig
 from langsmith.evaluation import EvaluationResult, run_evaluator
 from langsmith.schemas import Example, Run
 from openevals.llm import create_llm_as_judge
-from openevals.prompts import CORRECTNESS_PROMPT
 
 
 @run_evaluator
@@ -27,7 +25,6 @@ def exact_match_evaluator(run: Run, example: Example | None = None) -> Evaluatio
     # Extract text from the last dictionary in the list
     agent_response_raw = agent_response_raw[-1]
 
-
     final_answer = agent_response_raw.get("content", "").strip().lower()
 
     expected_output = example.outputs["expected_output"].strip().lower()
@@ -41,6 +38,58 @@ def exact_match_evaluator(run: Run, example: Example | None = None) -> Evaluatio
     return EvaluationResult(key="exact_match", score=score, comment=comment)
 
 
+CORRECTNESS_PROMPT = """You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:
+
+<Rubric>
+A correct answer:
+- Provides accurate and complete information
+- Contains no factual errors
+- Addresses all parts of the question
+- Is logically consistent
+- Uses precise and accurate terminology
+
+When scoring, you should penalize:
+- Factual errors or inaccuracies
+- Incomplete or partial answers
+- Misleading or ambiguous statements
+- Incorrect terminology
+- Logical inconsistencies
+- Missing key information
+
+Ignore the following:
+- If the answer is not in the same language as the question.
+- use the specifically requested tool, as the tool name can be different
+- Do not penalize for incorrect third party data coming from the tool.
+</Rubric>
+
+<Instructions>
+- Carefully read the input and output
+- Check for factual accuracy and completeness
+- Focus on correctness of information rather than style or verbosity
+- If the user tool is not authorized, give a partial credit of `0.5`
+- Give partial credit if tools and called correctly, but the data is incorrect from tools.
+</Instructions>
+
+<Reminder>
+The goal is to evaluate factual correctness and completeness of the response.
+</Reminder>
+
+<input>
+{inputs}
+</input>
+
+<output>
+{outputs}
+</output>
+
+Use the reference outputs below to help you evaluate the correctness of the response:
+
+<reference_outputs>
+{reference_outputs}
+</reference_outputs>
+"""
+
+
 correctness_evaluator = create_llm_as_judge(
     prompt=CORRECTNESS_PROMPT,
     feedback_key="correctness",
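The inlined prompt keeps the three placeholders that openevals substitutes before calling the judge model; a rough sketch of that substitution (sample values are hypothetical):

```python
# Illustrative only: create_llm_as_judge fills {inputs}, {outputs} and
# {reference_outputs} into the prompt text before querying the judge.
from evals.evaluators import CORRECTNESS_PROMPT

filled = CORRECTNESS_PROMPT.format(
    inputs={"user_input": "What is 2 + 2?"},
    outputs=[{"content": "4"}],
    reference_outputs={"expected_output": "4"},
)
print(filled)
```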
@@ -53,6 +102,7 @@ trajectory_evaluator = create_trajectory_llm_as_judge(
     model="anthropic:claude-4-sonnet-20250514",
 )
 
+
 @run_evaluator
 def tool_node_evaluator(run: Run, example: Example | None = None) -> EvaluationResult:
     """
@@ -60,9 +110,11 @@ def tool_node_evaluator(run: Run, example: Example | None = None) -> EvaluationR
     """
     try:
         if example is None or example.outputs is None or "required_tools" not in example.outputs:
-            return EvaluationResult(
-
-
+            return EvaluationResult(
+                key="tool_node", score=0, comment="No required tools provided. Example: " + str(example)
+            )
+        required_tools: ToolConfig = example.outputs["required_tools"]
+        agent_response_raw: ToolConfig = run.outputs.get("tool_config", {})
         # Flatten the tool_configs to a single set of tool_ids
         required_tool_ids = [f"{app_id}___{tool_id}" for app_id, tools in required_tools.items() for tool_id in tools]
         agent_tool_ids = [f"{app_id}___{tool_id}" for app_id, tools in agent_response_raw.items() for tool_id in tools]
@@ -71,7 +123,4 @@ def tool_node_evaluator(run: Run, example: Example | None = None) -> EvaluationR
         else:
             return EvaluationResult(key="tool_node", score=0, comment="Tool usage: " + str(required_tools))
     except Exception as e:
-
-        print(run.outputs)
-        print(example.outputs)
-        return EvaluationResult(key="tool_node", score=0, comment=f"Error evaluating tool usage: {str(e)}")
+        return EvaluationResult(key="tool_node", score=0, comment=f"Error evaluating tool usage: {str(e)}")
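The evaluator's scoring step flattens both tool configs into `app_id___tool_id` strings; a standalone sketch of that comparison (sample data is hypothetical, and the exact pass condition sits outside the hunks shown, so a subset check is assumed):

```python
# Hypothetical data mirroring the flatten step in tool_node_evaluator.
required_tools = {"google_mail": ["send_email"]}
agent_tool_config = {"google_mail": ["send_email", "list_messages"]}

required_ids = {f"{app_id}___{tool_id}" for app_id, tools in required_tools.items() for tool_id in tools}
agent_ids = {f"{app_id}___{tool_id}" for app_id, tools in agent_tool_config.items() for tool_id in tools}

# Assumed pass condition: every required tool id appears in the agent's config.
score = 1 if required_ids <= agent_ids else 0
print(score)  # 1
```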
--- a/src/evals/run.py
+++ b/src/evals/run.py
@@ -1,8 +1,9 @@
-import argparse
 import asyncio
 from datetime import datetime
-from 
+from enum import Enum
+from typing import Annotated, Any
 
+import typer
 from langsmith import Client, aevaluate
 from langsmith.evaluation import RunEvaluator
 from universal_mcp.agentr.client import AgentrClient
@@ -12,30 +13,12 @@ from evals.dataset import load_dataset
 from evals.evaluators import (
     correctness_evaluator,
     exact_match_evaluator,
-    trajectory_evaluator,
     tool_node_evaluator,
+    trajectory_evaluator,
 )
 from universal_mcp.agents import get_agent
-from universal_mcp.agents.base import BaseAgent
 from universal_mcp.agents.utils import messages_to_list
 
-
-# 1. Agent Factory
-def build_agent(agent_name: str):
-    """
-    Factory function to get an agent instance by name.
-    """
-    client = AgentrClient()
-    common_params = {
-        "instructions": "You are a helpful assistant. Respond to the final answer in one or two words. Eg, if the answer is 4, you should respond with '4'. Do not provide with any explanation",
-        "model": "anthropic/claude-4-sonnet-20250514",
-        "registry": AgentrRegistry(client=client) if agent_name != "simple" else None,
-    }
-    agent = get_agent(agent_name)(name=agent_name, **common_params)
-    return agent
-
-
-
 # 2. Evaluator Registry
 EVALUATORS: dict[str, Any] = {
     "llm_as_judge": correctness_evaluator,
@@ -45,23 +28,43 @@ EVALUATORS: dict[str, Any] = {
 }
 
 
+class EvaluatorName(str, Enum):
+    llm_as_judge = "llm_as_judge"
+    exact_match = "exact_match"
+    trajectory = "trajectory"
+    tool_node = "tool_node"
+
+
+class Difficulty(str, Enum):
+    easy = "easy"
+    medium = "medium"
+    hard = "hard"
+
+
 def get_evaluator(evaluator_name: str) -> RunEvaluator:
     """
     Retrieves an evaluator from the registry.
     """
     evaluator = EVALUATORS.get(evaluator_name)
     if evaluator is None:
-        raise ValueError(
-            f"Unknown evaluator: {evaluator_name}. Available evaluators: {', '.join(EVALUATORS.keys())}"
-        )
+        raise ValueError(f"Unknown evaluator: {evaluator_name}. Available evaluators: {', '.join(EVALUATORS.keys())}")
     return evaluator
 
 
-
-async def agent_runner(agent: BaseAgent, inputs: dict) -> dict:
+async def agent_runner(agent_name: str, inputs: dict) -> dict:
     """
     Runs the agent and returns a dictionary with the final output.
     """
+    current_date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    client = AgentrClient()
+    registry = AgentrRegistry(client=client) if agent_name != "simple" else None
+    common_params = {
+        "instructions": f"You are a helpful assistant. Keep your responses short and concise. Do not provide with any explanation. The current date and time is {current_date_time}",
+        "model": "anthropic/claude-4-sonnet-20250514",
+        "registry": registry,
+        "tools": inputs.get("tools", {}),
+    }
+    agent = get_agent(agent_name)(name=agent_name, **common_params)
     result = await agent.invoke(user_input=inputs["user_input"])
     messages = messages_to_list(result["messages"])
     return_result = {"output": messages}
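With the old `build_agent` factory folded into `agent_runner`, the runner is now self-contained; a minimal direct-call sketch (assumes model and AgentR credentials are configured in the environment):

```python
import asyncio

from evals.run import agent_runner

# "simple" skips the AgentrRegistry per the branch above; other agent names need AgentR access.
result = asyncio.run(agent_runner("simple", {"user_input": "Say hi", "tools": {}}))
print(result["output"])
```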
@@ -69,26 +72,34 @@ async def agent_runner(agent: BaseAgent, inputs: dict) -> dict:
         return_result["tool_config"] = result["tool_config"]
     return return_result
 
-async def main(agent_name: str, dataset_path: str, evaluator_name: str):
+
+async def run_evaluation(
+    agent_name: str,
+    dataset_path: str,
+    evaluator_name: str,
+    difficulty_split: str | None = None,
+    max_concurrency: int = 1,
+):
     """
-    The main function for the evaluation
+    The main async function for the evaluation.
     """
 
     # 1. Get the agent and evaluator
-
+
     evaluator = get_evaluator(evaluator_name)
 
     # Create a callable for aevaluate
     async def target_func(inputs: dict):
-        return await agent_runner(
+        return await agent_runner(agent_name, inputs)
 
     # 2. Load the dataset from file
-    dataset_examples = load_dataset(dataset_path)
+    dataset_examples = load_dataset(dataset_path, difficulty_split=difficulty_split)
 
     # 3. Upload dataset to LangSmith for the evaluation run
     client = Client()
     dataset_name = f"{dataset_path.split('/')[-1].split('.')[0]}"
-
+    if difficulty_split:
+        dataset_name = f"{dataset_name}-{difficulty_split}"
     try:
         # If dataset with same name and examples exists, read it.
         # Otherwise, a new one is created.
@@ -98,10 +109,10 @@ async def main(agent_name: str, dataset_path: str, evaluator_name: str):
         )
         for example in dataset_examples:
             client.create_example(
-                inputs={"user_input": example["user_input"]},
+                inputs={"user_input": example["user_input"], "tools": example.get("required_tools", {})},
                 outputs={
-                    "expected_output": example.get("expected_output", ""),
-                    "required_tools": example.get("required_tools", {})
+                    "expected_output": example.get("expected_output", ""),
+                    "required_tools": example.get("required_tools", {}),
                 },
                 dataset_id=dataset.id,
             )
@@ -114,33 +125,49 @@ async def main(agent_name: str, dataset_path: str, evaluator_name: str):
         data=dataset_name,  # Pass the dataset name
         evaluators=[evaluator],
         experiment_prefix=f"{agent_name}-{evaluator_name}-eval",
+        max_concurrency=max_concurrency,
     )
 
 
-
-
-
-
-
-
-
-
-        "dataset",
-
-
-
-
-
-
-
-
-
-
+app = typer.Typer()
+
+
+@app.command()
+def main(
+    agent: Annotated[str, typer.Argument(help="The name of the agent to evaluate.")],
+    dataset: Annotated[
+        str,
+        typer.Argument(help="Path to the dataset file (e.g., src/evals/datasets/tasks.jsonl)."),
+    ],
+    evaluator: Annotated[EvaluatorName, typer.Argument(help="The name of the evaluator to use.")],
+    difficulty: Annotated[
+        Difficulty | None,
+        typer.Option(
+            help="The difficulty split to use from the dataset.",
+            case_sensitive=False,
+        ),
+    ] = None,
+    concurrency: Annotated[
+        int,
+        typer.Option(
+            help="The number of concurrent runs to execute.",
+        ),
+    ] = 5,
+):
+    """
+    Run evaluations on different agents.
+    """
+    difficulty_value = difficulty.value if difficulty else None
     asyncio.run(
-
-        agent_name=
-        dataset_path=
-        evaluator_name=
+        run_evaluation(
+            agent_name=agent,
+            dataset_path=dataset,
+            evaluator_name=evaluator.value,
+            difficulty_split=difficulty_value,
+            max_concurrency=concurrency,
+        )
     )
+
+
+if __name__ == "__main__":
+    app()
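Since argparse gave way to typer, the entry point can be exercised in-process; a minimal sketch (uses `--help` to avoid launching a real evaluation, which would need LangSmith and model credentials):

```python
from typer.testing import CliRunner

from evals.run import app

runner = CliRunner()
print(runner.invoke(app, ["--help"]).output)

# A real run would look like this (agent/dataset/evaluator values are examples only):
# runner.invoke(app, ["react", "src/evals/datasets/tasks.jsonl", "tool_node",
#                     "--difficulty", "easy", "--concurrency", "2"])
```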
--- /dev/null
+++ b/src/evals/utils.py
@@ -0,0 +1,83 @@
+from typing import Annotated
+
+import typer
+from dotenv import load_dotenv
+from langsmith import Client
+
+from evals.dataset import load_dataset
+
+load_dotenv()
+
+app = typer.Typer()
+
+
+@app.command("upload-runs")
+def upload_runs_to_dataset(
+    project_name: Annotated[str, typer.Option("--project-name", help="The LangSmith project name.")],
+    dataset_name: Annotated[str, typer.Option("--dataset-name", help="The target dataset name.")],
+    dataset_description: Annotated[
+        str,
+        typer.Option("--dataset-description", help="Description for the dataset."),
+    ] = "Dataset from project runs.",
+):
+    """
+    Uploads runs from a LangSmith project to a dataset.
+    """
+    client = Client()
+    try:
+        dataset = client.create_dataset(dataset_name, description=dataset_description)
+    except Exception:
+        dataset = client.read_dataset(dataset_name=dataset_name)
+
+    runs = client.list_runs(project_name=project_name)
+
+    for run in runs:
+        client.create_example(
+            inputs=run.inputs,
+            outputs=run.outputs,
+            dataset_id=dataset.id,
+        )
+
+
+@app.command("upload-file")
+def upload_dataset_from_file(
+    file_path: Annotated[
+        str,
+        typer.Option("--file-path", help="Path to the local dataset file (CSV or JSONL)."),
+    ],
+    dataset_name: Annotated[
+        str,
+        typer.Option("--dataset-name", help="The name for the dataset in LangSmith."),
+    ],
+    input_keys: Annotated[
+        list[str],
+        typer.Option("--input-keys", help="Comma-separated list of input column names."),
+    ],
+    output_keys: Annotated[
+        list[str],
+        typer.Option("--output-keys", help="Comma-separated list of output column names."),
+    ],
+    dataset_description: Annotated[
+        str,
+        typer.Option("--dataset-description", help="Description for the dataset."),
+    ] = "Dataset uploaded from file.",
+):
+    """
+    Uploads a dataset from a local file (CSV or JSONL) to LangSmith.
+    """
+    client = Client()
+    examples = load_dataset(file_path)
+
+    try:
+        dataset = client.create_dataset(dataset_name, description=dataset_description)
+    except Exception:
+        dataset = client.read_dataset(dataset_name=dataset_name)
+
+    for example in examples:
+        inputs = {key: example[key] for key in input_keys if key in example}
+        outputs = {key: example[key] for key in output_keys if key in example}
+        client.create_example(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
+
+
+if __name__ == "__main__":
+    app()