universal-mcp-agents 0.1.12 (tar.gz) → 0.1.14 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of universal-mcp-agents might be problematic. Click here for more details.

Files changed (95)
  1. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/.gitignore +1 -0
  2. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/PKG-INFO +1 -1
  3. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/pyproject.toml +1 -1
  4. universal_mcp_agents-0.1.14/src/evals/datasets/codeact.jsonl +11 -0
  5. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/evals/datasets/tasks.jsonl +1 -1
  6. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/evals/evaluators.py +9 -52
  7. universal_mcp_agents-0.1.14/src/evals/prompts.py +66 -0
  8. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/evals/run.py +5 -2
  9. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/tests/test_agents.py +2 -0
  10. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/base.py +2 -0
  11. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/bigtool/__init__.py +1 -1
  12. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/bigtool/agent.py +2 -2
  13. universal_mcp_agents-0.1.14/src/universal_mcp/agents/bigtool/graph.py +149 -0
  14. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/bigtool/prompts.py +2 -2
  15. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/bigtool/tools.py +18 -4
  16. universal_mcp_agents-0.1.14/src/universal_mcp/agents/builder/__main__.py +200 -0
  17. universal_mcp_agents-0.1.14/src/universal_mcp/agents/builder/builder.py +214 -0
  18. universal_mcp_agents-0.1.14/src/universal_mcp/agents/builder/helper.py +73 -0
  19. universal_mcp_agents-0.1.14/src/universal_mcp/agents/builder/prompts.py +54 -0
  20. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/builder/state.py +1 -1
  21. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact/agent.py +1 -1
  22. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact/sandbox.py +1 -5
  23. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/agent.py +5 -4
  24. universal_mcp_agents-0.1.14/src/universal_mcp/agents/codeact0/langgraph_agent.py +17 -0
  25. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/llm_tool.py +1 -1
  26. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/prompts.py +34 -23
  27. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/11-github.yaml +6 -5
  28. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/utils.py +42 -63
  29. universal_mcp_agents-0.1.14/src/universal_mcp/agents/shared/__main__.py +43 -0
  30. universal_mcp_agents-0.1.14/src/universal_mcp/agents/shared/prompts.py +83 -0
  31. universal_mcp_agents-0.1.14/src/universal_mcp/agents/shared/tool_node.py +206 -0
  32. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/utils.py +65 -0
  33. universal_mcp_agents-0.1.14/test.py +61 -0
  34. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/uv.lock +377 -307
  35. universal_mcp_agents-0.1.12/dataset_code.py +0 -83
  36. universal_mcp_agents-0.1.12/src/evals/datasets/test.jsonl +0 -1
  37. universal_mcp_agents-0.1.12/src/universal_mcp/agents/bigtool/graph.py +0 -115
  38. universal_mcp_agents-0.1.12/src/universal_mcp/agents/builder/__main__.py +0 -125
  39. universal_mcp_agents-0.1.12/src/universal_mcp/agents/builder/builder.py +0 -225
  40. universal_mcp_agents-0.1.12/src/universal_mcp/agents/builder/prompts.py +0 -173
  41. universal_mcp_agents-0.1.12/src/universal_mcp/agents/codeact0/langgraph_graph.py +0 -17
  42. universal_mcp_agents-0.1.12/src/universal_mcp/agents/codeact0/legacy_codeact.py +0 -104
  43. universal_mcp_agents-0.1.12/src/universal_mcp/agents/shared/prompts.py +0 -132
  44. universal_mcp_agents-0.1.12/src/universal_mcp/agents/shared/tool_node.py +0 -260
  45. universal_mcp_agents-0.1.12/test.py +0 -49
  46. universal_mcp_agents-0.1.12/test_code.py +0 -78
  47. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/.pre-commit-config.yaml +0 -0
  48. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/GEMINI.md +0 -0
  49. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/PROMPTS.md +0 -0
  50. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/README.md +0 -0
  51. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/bump_and_release.sh +0 -0
  52. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/evals/__init__.py +0 -0
  53. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/evals/dataset.py +0 -0
  54. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/evals/datasets/exact.jsonl +0 -0
  55. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/evals/utils.py +0 -0
  56. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/__init__.py +1 -1
  57. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/bigtool/__main__.py +0 -0
  58. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/bigtool/context.py +0 -0
  59. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/bigtool/state.py +0 -0
  60. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/cli.py +2 -2
  61. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact/__init__.py +0 -0
  62. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact/__main__.py +0 -0
  63. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact/models.py +0 -0
  64. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact/prompts.py +0 -0
  65. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact/state.py +0 -0
  66. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact/utils.py +0 -0
  67. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/__init__.py +0 -0
  68. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/__main__.py +0 -0
  69. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/config.py +0 -0
  70. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/sandbox.py +0 -0
  71. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/state.py +0 -0
  72. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/1-unsubscribe.yaml +0 -0
  73. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/10-reddit2.yaml +0 -0
  74. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/2-reddit.yaml +0 -0
  75. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/2.1-instructions.md +0 -0
  76. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/2.2-instructions.md +0 -0
  77. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/3-earnings.yaml +0 -0
  78. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/4-maps.yaml +0 -0
  79. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/5-gmailreply.yaml +0 -0
  80. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/6-contract.yaml +0 -0
  81. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/7-overnight.yaml +0 -0
  82. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/8-sheets_chart.yaml +0 -0
  83. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/codeact0/usecases/9-learning.yaml +0 -0
  84. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/hil.py +0 -0
  85. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/llm.py +0 -0
  86. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/planner/__init__.py +0 -0
  87. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/planner/__main__.py +0 -0
  88. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/planner/graph.py +0 -0
  89. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/planner/prompts.py +0 -0
  90. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/planner/state.py +0 -0
  91. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/react.py +0 -0
  92. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/agents/simple.py +0 -0
  93. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/applications/llm/__init__.py +0 -0
  94. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/applications/llm/app.py +0 -0
  95. {universal_mcp_agents-0.1.12 → universal_mcp_agents-0.1.14}/src/universal_mcp/applications/ui/app.py +2 -2
@@ -58,3 +58,4 @@ site/
58
58
  .langgraph_api/
59
59
  langgraph.json
60
60
  agentr-1-6c4ebd5cc914.json
61
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: universal-mcp-agents
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary: Add your description here
5
5
  Project-URL: Homepage, https://github.com/universal-mcp/applications
6
6
  Project-URL: Repository, https://github.com/universal-mcp/applications
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
6
6
 
7
7
  [project]
8
8
  name = "universal-mcp-agents"
9
- version = "0.1.12"
9
+ version = "0.1.14"
10
10
  description = "Add your description here"
11
11
  readme = "README.md"
12
12
  authors = [
@@ -0,0 +1,11 @@
1
+ {"user_input": "Find and extract unsubscribe links from all emails in my inbox from the last 7 days. List all unsubscribe links found with the email subject and sender.", "required_tools": {"google_mail": ["list_messages", "get_message_details"]}}
2
+ {"user_input": "Process rows 2-5 from the Google Sheet (ID: 1nnnCp3_IWcdHv4UVgXtwYF5wedxbqF4RIeyjN6mCKD8). For each unprocessed row, extract Reddit post links, fetch post details and comments, analyze content relevance to AgentR/Wingmen products, classify into tiers 1-4, generate appropriate response drafts, and update the sheet with all findings.", "required_tools": {"google_sheet": ["add_table", "append_values", "update_values", "format_cells", "get_spreadsheet_metadata", "batch_get_values_by_range"], "reddit": ["get_post_comments_details"], "google_mail": ["list_messages"]}}
3
+ {"user_input": "Fetch all open issues from the GitHub repository \"microsoft/vscode\" and add them to a new Google Sheet. Then create corresponding tasks in ClickUp for each issue with descriptions, tags, and \"In Progress\" status. Delete processed rows from the sheet after creating ClickUp tasks.", "required_tools": {"google_sheet": ["get_values", "delete_dimensions", "update_values", "get_spreadsheet_metadata", "batch_get_values_by_range"], "clickup": ["tasks_create_new_task", "spaces_get_details", "lists_get_list_details", "tasks_get_list_tasks"], "github": ["list_issues", "update_issue"]}}
4
+ {"user_input": "Goal: Process unprocessed rows in a fixed Google Sheet, scrape Reddit for context, filter posts, and generate short, natural comments linking to AgentR/Wingmen when relevant. Workflow: 1) Sheet & Row Selection: Fixed Sheet ID 1nnnCp3_IWcdHv4UVgXtwYF5wedxbqF4RIeyjN6mCKD8, tab Posts. Process rows 2-5 (first 4 unprocessed rows) immediately without asking for user input. Only process rows with empty Match Type (Col I) and no Tier 1-4 assigned. 2) Reddit Context Fetch: Extract Post Link & ID. Use reddit to fetch post upvotes + top comments (max 5). Ensure post/comment is active, visible, and unlocked. 3) Filtration & Fit: Classify content (developer, consumer, anecdotal). Apply GTM Filtration to skip irrelevant, negative, political, or low-quality posts. Identify direct or adjacent fit to AgentR (Universal MCP Server) or Wingmen. Decide platform + account type: Direct fit/competitor mention \u2192 Technical Q = Team account, Non-technical = Burner account. Adjacent fit \u2192 Official account. Decide reply target (original comment/post or parent post). 4) Comment Generation: For Tier 1-3, craft a 2-3 line, context-aware, conversational reply. Mention AgentR/Wingmen organically, avoid sales tone or forced CTAs. Use light imperfections for human tone. Skip negative sentiment entirely. One comment per post. 5) Populate Output: Fill Upvote Count, Match Type, Account Type, Response Draft, Respond on. Return updated Google Sheet link. Tier Definitions: Tier 1 = Deep MCP, AI agent, tool integrations, or architecture discussions where infra is highly relevant. Tier 2 = Specific workflows, automation tooling, or productivity systems where Wingmen or MCP Server could be useful. Tier 3 = Broader ecosystem (LangChain/CrewAI/agent tooling) where a soft recommendation adds value. Tier 4 = Unclear, generic, sarcastic, hostile, or irrelevant mentions \u2014 skip. 
Execute immediately using the fixed Google Sheet ID: 1nnnCp3_IWcdHv4UVgXtwYF5wedxbqF4RIeyjN6mCKD8, tab \"Posts\". Process rows(first 4 unprocessed rows) without asking for user input. Only process rows where Match Type (Column I) is empty. For each row, extract the Post Link, fetch Reddit data, apply GTM filtration, generate appropriate responses, and update the sheet. Return the updated Google Sheet link when complete.", "required_tools": {"reddit": ["get_post_comments_details"], "google_sheet": ["update_values", "get_values", "get_spreadsheet_metadata", "batch_get_values_by_range"]}}
5
+ {"user_input": "Generate a financial flash report for Apple Inc. Research their latest earnings data including revenue, net income, EPS, and year-over-year changes. Create a formatted report with highlights, upcoming events, and summary. Present the report in chat and email it to adit@agentr.dev.", "required_tools": {"exa": ["answer"], "google_mail": ["send_email"]}}
6
+ {"user_input": "Objective: Find businesses from Google Maps for a given category & location, store them in a Google Sheet, then process unprocessed leads to scrape emails and sync with HubSpot CRM. Stage 1 - Lead Discovery Get coordinates of Area + City. Search on Google Maps with category & coordinates. Extract: Name, Google Maps URL, Address, Phone, Website; leave Email & CRM Status blank. Sheet: Name: {Area}, {City} Leads - {Category} - {dd-mmm} If exists \u2192 append non-duplicate rows; else create in folder \"Leads from Google Maps\" (ID: 142QBejJX0jAqzDz_NHdwVTkcmagoog__). Add headers: Name | Google Maps URL | Address | Phone | Website | Email | CRM Status. Populate with businesses found. Edge Cases: No results \u2192 return message, skip sheet creation. Missing data \u2192 leave blank. Stage 2 - Lead Processing & CRM Sync Locate sheet in Google Drive, ensure headers match. Parse category from sheet name. Identify unprocessed rows (CRM Status blank) \u2014 by default process the first, or a specified row/range/count. Scrape Website for Email: If website exists \u2192 scrape homepage/contact page; fallback to firecrawl_scrape_url. Save found email in sheet. HubSpot Handling: Search contact by email/website/phone. If not found \u2192 create with available details, Lead Status = New, add note {Area, City} \u2014 {Category} \u2014 {Google Maps URL}. If exists \u2192 append note; keep other fields unchanged. Save HubSpot Contact URL/ID in sheet. Update CRM Status: Lead Created, Lead Creation Failed, Website not found, Email not found, etc. Edge Cases: No Website \u2192 create with phone; mark Website not found. No Email \u2192 create; mark Email not found. Email already in sheet \u2192 skip row. 
Execute immediately for \"Cafes\" near \"IIT Bombay\" in \"Mumbai\" without asking for confirmation.", "required_tools": {"serpapi": ["google_maps_search"], "firecrawl": ["scrape_url"], "google_drive": ["get_file_details", "create_folder", "find_folder_id_by_name", "search_files"], "google_sheet": ["update_values", "get_values", "get_spreadsheet_metadata", "batch_get_values_by_range", "create_spreadsheet", "clear_values"], "hubspot": ["search_contacts_post", "batch_read_contacts_post", "get_contacts", "get_contact_by_id", "update_contact_by_id", "batch_update_contacts", "create_contacts_batch", "create_contact"]}}
7
+ {"user_input": "Process emails from the last 24 hours. Fetch primary inbox emails excluding replied threads, classify with LLM as Reply Required, No Reply Needed, or Ambiguous. For Reply Required/Ambiguous, draft human, on-brand replies for user review. Follow greeting, acknowledge, address concern, invite further questions, and friendly sign-off. Provide end summary of drafts, skipped, and ambiguous emails. Execute immediately without asking for confirmation. Do not send any emails. Just provide me a report.", "required_tools": {"google_mail": ["list_messages", "get_message_details"]}}
8
+ {"user_input": "Analyze a contract from my google drive from the perspective of the Service Provider. Use the search to find it, do not ask me any questions, and assume details that I have not provided. Identify potentially unfavorable clauses such as vague terms, one-sided obligations, IP transfer issues, indemnity clauses, termination conditions, and payment problems. Provide a structured analysis with clause numbers, full text, and explanations of concerns.", "required_tools": {"google_drive": ["get_file_details", "search_files"], "google_docs": ["get_document"], "exa": ["answer"]}}
9
+ {"user_input": "Create a summary of overnight updates from 8:00 PM yesterday to 8:00 AM today in IST. Check Gmail for important emails and ClickUp for mentions and assigned tasks. Organize findings into high priority and other items, then provide a comprehensive summary of all overnight activity.", "required_tools": {"google_mail": ["list_messages"], "clickup": ["comments_get_task_comments", "comments_get_list_comments", "comments_get_view_comments", "tasks_get_list_tasks", "tasks_filter_team_tasks", "time_tracking_get_time_entries_within_date_range", "time_tracking_get_time_entry_history", "authorization_get_workspace_list", "spaces_get_details", "lists_get_list_details"]}}
10
+ {"user_input": "Analyze the data in Google Sheet (ID: 1nnnCp3_IWcdHv4UVgXtwYF5wedxbqF4RIeyjN6mCKD8) and create 3-5 relevant charts and visualizations. Add pie charts, bar graphs, and other appropriate visualizations based on the data structure. Embed all charts directly into the sheet and provide the updated sheet link.", "required_tools": {"google_sheet": ["create_spreadsheet", "get_spreadsheet_metadata", "batch_get_values_by_range", "append_dimensions", "insert_dimensions", "delete_sheet", "add_sheet", "delete_dimensions", "add_basic_chart", "add_table", "add_pie_chart", "clear_values", "update_values", "clear_basic_filter", "get_values", "discover_tables", "set_basic_filter", "analyze_table_schema", "copy_sheet_to_spreadsheet", "append_values", "batch_get_values_by_data_filter", "batch_clear_values", "format_cells"]}}
11
+ {"user_input": "Create a 7-day learning plan for Python Programming. Research essential concepts and skills, create a detailed day-by-day plan with topics, goals, resources, and exercises. Compile the plan into a Google Doc and schedule daily emails at 8 AM starting today. Send Day 1 immediately to adit@agentr.dev and provide the Google Doc link.", "required_tools": {"google_docs": ["get_document", "create_document", "insert_text"], "google_mail": ["send_email", "send_draft", "create_draft"], "exa": ["answer"]}}
@@ -10,7 +10,7 @@
10
10
  {"user_input": "search reddit for posts on elon musk and then post a meme on him on linkedin", "difficulty": 3, "required_tools": {"reddit" : ["search_reddit"], "linkedin": ["create_post"]}}
11
11
  {"user_input": "Search for best cafes near IIT bombay using exa and make a google sheet out of it", "difficulty": 3, "required_tools": {"exa": ["search_with_filters"], "google_sheet": ["create_spreadsheet", "write_values_to_sheet", "add_table"]}}
12
12
  {"user_input": "Create a Google Doc summarizing the last 5 merged pull requests in my GitHub repo- universal-mcp/universal-mcp, including links and commit highlights.", "difficulty": 4, "required_tools": {"github": ["list_pull_requests", "list_recent_commits"], "google_docs": ["create_document", "insert_text", "apply_text_style"]}}
13
- {"user_input": "Summarize the key insights from all marketing emails received yesterday from my Gmail and add a section in a Google Doc with action points.", "difficulty": 4, "required_tools": {"google_mail": ["list_messages"], "google_docs": ["create_document"]}}
13
+ {"user_input": "Summarize the key insights from all marketing emails received yesterday from my Gmail and add a section in a Google Doc with action points.", "difficulty": 4, "required_tools": {"google_mail": ["list_messages"], "google_docs": ["create_document", "insert_text", "apply_text_style"]}}
14
14
  {"user_input": "Give me a report on the earnings of Oklo using web search, and projections for the company revenue, stock price", "difficulty": 4, "required_tools": {"tavily": ["search_and_summarize"]}}
15
15
  {"user_input": "Track the top posts in r/startups over the past 7 days using Reddit and create a trend report on what's being discussed most (e.g., hiring, funding, MVPs) in a Google Doc.", "difficulty": 4, "required_tools": {"reddit": ["get_subreddit_posts", "get_subreddit_top_posts"], "google_docs": ["create_document", "insert_text", "apply_text_style"]}}
16
16
  {"user_input": "Generate a comparison table of SaaS tools for project management using web search, including pricing, features, and user ratings in a Google Sheet", "difficulty": 4, "required_tools": {"tavily": ["search_and_summarize"], "google_sheet": ["create_spreadsheet", "add_table"]}}
@@ -7,6 +7,8 @@ from langsmith.evaluation import EvaluationResult, run_evaluator
7
7
  from langsmith.schemas import Example, Run
8
8
  from openevals.llm import create_llm_as_judge
9
9
 
10
+ from evals.prompts import CODEACT_EVALUATOR_PROMPT, CORRECTNESS_PROMPT
11
+
10
12
 
11
13
  @run_evaluator
12
14
  def exact_match_evaluator(run: Run, example: Example | None = None) -> EvaluationResult:
@@ -38,58 +40,6 @@ def exact_match_evaluator(run: Run, example: Example | None = None) -> Evaluatio
38
40
  return EvaluationResult(key="exact_match", score=score, comment=comment)
39
41
 
40
42
 
41
- CORRECTNESS_PROMPT = """You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:
42
-
43
- <Rubric>
44
- A correct answer:
45
- - Provides accurate and complete information
46
- - Contains no factual errors
47
- - Addresses all parts of the question
48
- - Is logically consistent
49
- - Uses precise and accurate terminology
50
-
51
- When scoring, you should penalize:
52
- - Factual errors or inaccuracies
53
- - Incomplete or partial answers
54
- - Misleading or ambiguous statements
55
- - Incorrect terminology
56
- - Logical inconsistencies
57
- - Missing key information
58
-
59
- Ignore the following:
60
- - If the answer is not in the same language as the question.
61
- - use the specifically requested tool, as the tool name can be different
62
- - Do not penalize for incorrect third party data coming from the tool.
63
- </Rubric>
64
-
65
- <Instructions>
66
- - Carefully read the input and output
67
- - Check for factual accuracy and completeness
68
- - Focus on correctness of information rather than style or verbosity
69
- - If the user tool is not authorized, give a partial credit of `0.5`
70
- - Give partial credit if tools and called correctly, but the data is incorrect from tools.
71
- </Instructions>
72
-
73
- <Reminder>
74
- The goal is to evaluate factual correctness and completeness of the response.
75
- </Reminder>
76
-
77
- <input>
78
- {inputs}
79
- </input>
80
-
81
- <output>
82
- {outputs}
83
- </output>
84
-
85
- Use the reference outputs below to help you evaluate the correctness of the response:
86
-
87
- <reference_outputs>
88
- {reference_outputs}
89
- </reference_outputs>
90
- """
91
-
92
-
93
43
  correctness_evaluator = create_llm_as_judge(
94
44
  prompt=CORRECTNESS_PROMPT,
95
45
  feedback_key="correctness",
@@ -103,6 +53,13 @@ trajectory_evaluator = create_trajectory_llm_as_judge(
103
53
  )
104
54
 
105
55
 
56
+ codeact_evaluator = create_llm_as_judge(
57
+ prompt=CODEACT_EVALUATOR_PROMPT,
58
+ feedback_key="codeact_accuracy",
59
+ model="anthropic:claude-4-sonnet-20250514",
60
+ )
61
+
62
+
106
63
  @run_evaluator
107
64
  def tool_node_evaluator(run: Run, example: Example | None = None) -> EvaluationResult:
108
65
  """
@@ -0,0 +1,66 @@
1
+ CORRECTNESS_PROMPT = """You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:
2
+
3
+ <Rubric>
4
+ A correct answer:
5
+ - Provides accurate and complete information
6
+ - Contains no factual errors
7
+ - Addresses all parts of the question
8
+ - Is logically consistent
9
+ - Uses precise and accurate terminology
10
+
11
+ When scoring, you should penalize:
12
+ - Factual errors or inaccuracies
13
+ - Incomplete or partial answers
14
+ - Misleading or ambiguous statements
15
+ - Incorrect terminology
16
+ - Logical inconsistencies
17
+ - Missing key information
18
+
19
+ Ignore the following:
20
+ - If the answer is not in the same language as the question.
21
+ - use the specifically requested tool, as the tool name can be different
22
+ - Do not penalize for incorrect third party data coming from the tool.
23
+ </Rubric>
24
+
25
+ <Instructions>
26
+ - Carefully read the input and output
27
+ - Check for factual accuracy and completeness
28
+ - Focus on correctness of information rather than style or verbosity
29
+ - If the user tool is not authorized, give a partial credit of `0.5`
30
+ - Give partial credit if tools and called correctly, but the data is incorrect from tools.
31
+ </Instructions>
32
+
33
+ <Reminder>
34
+ The goal is to evaluate factual correctness and completeness of the response.
35
+ </Reminder>
36
+
37
+ <input>
38
+ {inputs}
39
+ </input>
40
+
41
+ <output>
42
+ {outputs}
43
+ </output>
44
+
45
+ Use the reference outputs below to help you evaluate the correctness of the response:
46
+
47
+ <reference_outputs>
48
+ {reference_outputs}
49
+ </reference_outputs>
50
+ """
51
+
52
+ CODEACT_EVALUATOR_PROMPT = """
53
+ You are a code execution evaluator. You will be given the entire run of an agent, starting with a human input task, the intermediate steps taken, and the final output of the agent given to the user. These steps will contain code written by the agent to solve the problem as well as its outputs. Your job is to check ONLY if the code executes correctly.
54
+ Keep in mind that the agent has access to tools like- ai_classify, call_llm, creative_writer, data_extractor. These calls are to be treated as valid if they run without errors.
55
+ These are the only criteria you should evaluate-
56
+
57
+ <Rubric>
58
+ - The code written by the agent in tool calls should be syntactically correct and use existing objects.
59
+ - The code outputs should not have an error or empty/unexpected outputs
60
+ </Rubric>
61
+ If either of the above are not satisfied, you should give 0.
62
+
63
+ <Reminder>
64
+ You must not judge whether the code is helpful to the task or not, only if the code itself is correct or not.
65
+ </Reminder>
66
+ """
@@ -15,6 +15,7 @@ from evals.evaluators import (
15
15
  exact_match_evaluator,
16
16
  tool_node_evaluator,
17
17
  trajectory_evaluator,
18
+ codeact_evaluator,
18
19
  )
19
20
  from universal_mcp.agents import get_agent
20
21
  from universal_mcp.agents.utils import messages_to_list
@@ -25,6 +26,7 @@ EVALUATORS: dict[str, Any] = {
25
26
  "exact_match": exact_match_evaluator,
26
27
  "trajectory": trajectory_evaluator,
27
28
  "tool_node": tool_node_evaluator,
29
+ "codeact": codeact_evaluator,
28
30
  }
29
31
 
30
32
 
@@ -33,6 +35,7 @@ class EvaluatorName(str, Enum):
33
35
  exact_match = "exact_match"
34
36
  trajectory = "trajectory"
35
37
  tool_node = "tool_node"
38
+ codeact = "codeact"
36
39
 
37
40
 
38
41
  class Difficulty(str, Enum):
@@ -60,12 +63,12 @@ async def agent_runner(agent_name: str, inputs: dict) -> dict:
60
63
  registry = AgentrRegistry(client=client) if agent_name != "simple" else None
61
64
  common_params = {
62
65
  "instructions": f"You are a helpful assistant. Keep your responses short and concise. Do not provide with any explanation. The current date and time is {current_date_time}",
63
- "model": "anthropic/claude-4-sonnet-20250514",
66
+ "model": "azure/gpt-4.1",
64
67
  "registry": registry,
65
68
  "tools": inputs.get("tools", {}),
66
69
  }
67
70
  agent = get_agent(agent_name)(name=agent_name, **common_params)
68
- result = await agent.invoke(user_input=inputs["user_input"])
71
+ result = await agent.invoke(user_input=inputs["user_input"], thread_id="evals")
69
72
  messages = messages_to_list(result["messages"])
70
73
  return_result = {"output": messages}
71
74
  if "tool_config" in result:
@@ -145,6 +145,7 @@ class MockToolRegistry(ToolRegistry):
145
145
  self,
146
146
  query: str,
147
147
  limit: int = 10,
148
+ **kwargs: Any,
148
149
  ) -> list[dict[str, Any]]:
149
150
  """
150
151
  Search for apps by a query.
@@ -167,6 +168,7 @@ class MockToolRegistry(ToolRegistry):
167
168
  query: str,
168
169
  limit: int = 10,
169
170
  app_id: str | None = None,
171
+ **kwargs: Any,
170
172
  ) -> list[dict[str, Any]]:
171
173
  """
172
174
  Search for tools by a query.
@@ -115,6 +115,8 @@ class BaseAgent:
115
115
  "recursion_limit": 25,
116
116
  "configurable": {"thread_id": thread_id},
117
117
  "metadata": run_metadata,
118
+ "run_id": thread_id,
119
+ "run_name" : self.name
118
120
  }
119
121
 
120
122
  result = await self._graph.ainvoke(
@@ -56,7 +56,7 @@ class BigToolAgent(BaseAgent):
56
56
  compiled_graph = graph_builder.compile(checkpointer=self.memory)
57
57
  return compiled_graph
58
58
  except Exception as e:
59
- raise e
59
+ raise Exception(f"Failed to build AutoAgent graph: {e}")
60
60
 
61
61
  @property
62
62
  def graph(self):
@@ -1,9 +1,9 @@
1
1
  from universal_mcp.agentr.registry import AgentrRegistry
2
- from universal_mcp.agents.bigtoolcache import BigToolAgentCache
2
+ from universal_mcp.agents.bigtool import BigToolAgent
3
3
 
4
4
 
5
5
  async def agent():
6
- agent_object = await BigToolAgentCache(
6
+ agent_object = await BigToolAgent(
7
7
  registry=AgentrRegistry(),
8
8
  )._build_graph()
9
9
  return agent_object
@@ -0,0 +1,149 @@
1
+ import json
2
+ from typing import Literal, cast
3
+
4
+ from dotenv import load_dotenv
5
+ from langchain_anthropic import ChatAnthropic
6
+ from langchain_core.language_models import BaseChatModel
7
+ from langchain_core.messages import AIMessage, SystemMessage, ToolMessage
8
+ from langchain_core.tools import BaseTool
9
+ from langgraph.graph import StateGraph
10
+ from langgraph.types import Command, RetryPolicy
11
+ from universal_mcp.tools.registry import ToolRegistry
12
+ from universal_mcp.types import ToolFormat
13
+
14
+ from .state import State
15
+ from .tools import get_valid_tools
16
+ from universal_mcp.agents.utils import filter_retry_on
17
+
18
+ load_dotenv()
19
+
20
+
21
def build_graph(
    registry: ToolRegistry,
    base_model: BaseChatModel,
    system_prompt: str,
    default_tools: list[BaseTool],
    meta_tools: dict[str, BaseTool],
):
    """Build the LangGraph workflow.

    The workflow has two nodes, ``agent`` and ``execute_tools``, that hand
    control to each other through ``Command`` objects; ``agent`` is the entry
    point.

    Args:
        registry: Tool registry used to export the selected tools and to call
            tools that are not meta tools.
        base_model: Chat model the tools are bound to on every agent turn.
        system_prompt: Text sent as the leading ``SystemMessage``.
        default_tools: Tools that are always offered to the model.
        meta_tools: Mapping that must contain ``"search_tools"`` and
            ``"load_tools"``; ``"web_search"`` is optional.

    Returns:
        The ``StateGraph`` with both nodes registered. It is returned
        uncompiled.
    """

    async def agent_node(state: State) -> Command[Literal["execute_tools"]]:
        """Main agent reasoning node.

        Binds the currently available tools to the model, invokes it on the
        conversation, and routes to ``execute_tools`` when the reply contains
        tool calls.
        """
        # Combine meta tools with currently loaded tools
        if state["selected_tool_ids"]:
            try:
                current_tools = await registry.export_tools(
                    tools=state["selected_tool_ids"], format=ToolFormat.LANGCHAIN
                )
            except Exception as e:
                raise Exception(f"Failed to export selected tools: {e}") from e
        else:
            current_tools = []

        candidate_tools = [
            meta_tools["search_tools"],
            meta_tools["load_tools"],
            # Optional: .get() yields None when no web_search tool is configured.
            meta_tools.get("web_search"),
            *default_tools,
            *current_tools,
        ]

        # Remove duplicates based on tool name, keeping the first occurrence.
        # None entries (a missing optional meta tool) are skipped so that
        # reading ``.name`` cannot fail.
        seen_names: set[str] = set()
        unique_tools = []
        for tool in candidate_tools:
            if tool is None or tool.name in seen_names:
                continue
            seen_names.add(tool.name)
            unique_tools.append(tool)

        bind_kwargs = {"tool_choice": "auto", "parallel_tool_calls": False}
        if isinstance(base_model, ChatAnthropic):
            # Only the Anthropic binding is given the cache_control option.
            bind_kwargs["cache_control"] = {"type": "ephemeral", "ttl": "1h"}

        try:
            model_with_tools = base_model.bind_tools(unique_tools, **bind_kwargs)
        except Exception as e:
            raise Exception(f"Failed to bind tools to model: {e}") from e

        # Get response from model
        messages = [SystemMessage(content=system_prompt), *state["messages"]]

        try:
            response = cast(AIMessage, await model_with_tools.ainvoke(messages))
        except Exception as e:
            raise Exception(f"Model invocation failed: {e}") from e

        if response.tool_calls:
            return Command(goto="execute_tools", update={"messages": [response]})
        # No tool calls: record the reply without naming a next node.
        return Command(update={"messages": [response], "model_with_tools": model_with_tools})

    async def execute_tools_node(state: State) -> Command[Literal["agent"]]:
        """Execute the tool calls of the last message.

        Every tool call produces one ``ToolMessage``. A failing call is
        reported to the model as an error string instead of raising. When
        ``load_tools`` reports apps that are not connected, an ``AIMessage``
        with the login links is appended and control is not handed back to
        ``agent``.
        """
        last_message = state["messages"][-1]
        tool_calls = last_message.tool_calls if isinstance(last_message, AIMessage) else []

        tool_messages = []
        new_tool_ids = []
        # Login links gathered from every load_tools call in this turn, so a
        # later call cannot overwrite the links reported by an earlier one.
        pending_links = []

        for tool_call in tool_calls:
            try:
                if tool_call["name"] == "load_tools":  # Handle load_tools separately
                    valid_tools, unconnected_links = await get_valid_tools(
                        tool_ids=tool_call["args"]["tool_ids"], registry=registry
                    )
                    new_tool_ids.extend(valid_tools)
                    # Create tool message response
                    tool_result = f"Successfully loaded {len(valid_tools)} tools: {valid_tools}"
                    if unconnected_links:
                        pending_links.extend(unconnected_links)
                elif tool_call["name"] == "search_tools":
                    tool_result = await meta_tools["search_tools"].ainvoke(tool_call["args"])
                elif tool_call["name"] == "web_search":
                    tool_result = await meta_tools["web_search"].ainvoke(tool_call["args"])
                else:
                    # Load tools first
                    await registry.export_tools([tool_call["name"]], ToolFormat.LANGCHAIN)
                    tool_result = await registry.call_tool(tool_call["name"], tool_call["args"])
            except Exception as e:
                tool_result = f"Error during {tool_call}: {e}"

            # A tool may return something json cannot encode; fall back to its
            # string form rather than failing the whole node.
            try:
                content = json.dumps(tool_result)
            except (TypeError, ValueError):
                content = str(tool_result)

            tool_messages.append(
                ToolMessage(
                    content=content,
                    name=tool_call["name"],
                    tool_call_id=tool_call["id"],
                )
            )

        if pending_links:
            # Join outside the f-string: a backslash inside a replacement
            # field is a syntax error before Python 3.12.
            links_text = "\n".join(pending_links)
            ai_msg = (
                "Please login to the following app(s) using the following links "
                f"and let me know in order to proceed:\n {links_text} "
            )
            tool_messages.append(AIMessage(content=ai_msg))
            # No goto: the login request is the final message of this run.
            return Command(update={"messages": tool_messages, "selected_tool_ids": new_tool_ids})

        return Command(goto="agent", update={"messages": tool_messages, "selected_tool_ids": new_tool_ids})

    # Define the graph
    workflow = StateGraph(State)

    # Both nodes use the same retry settings.
    retry_policy = RetryPolicy(max_attempts=3, retry_on=filter_retry_on, initial_interval=2, backoff_factor=2)

    # Add nodes
    workflow.add_node("agent", agent_node, retry_policy=retry_policy)
    workflow.add_node("execute_tools", execute_tools_node, retry_policy=retry_policy)

    # Set entry point
    workflow.set_entry_point("agent")

    return workflow
@@ -5,9 +5,9 @@ SYSTEM_PROMPT = """You are a helpful AI assistant, called {name}.
5
5
  **Core Directives:**
6
6
  1. **Always Use Tools for Tasks:** For any user request that requires an action (e.g., sending an email, searching for information, creating an event, displaying a chart), you MUST use a tool. Do not refuse a task if a tool might exist for it.
7
7
 
8
- 2. Check if your existing tools or knowledge can handle the user's request. If they can, use them. If they cannot, you must call the `search_tools` function to find the right tools for the user's request.You must not use the same/similar query multiple times in the list. The list should have multiple queries only if the task has clearly different sub-tasks. If you do not find any specific relevant tools, use the pre-loaded generic tools.
8
+ 2. Check if your existing tools or knowledge can handle the user's request. If they can, use them. If they cannot, you must call the `search_tools` function to find the right tools for the user's request. You must not use the same/similar query multiple times in the list. The list should have multiple queries only if the task has clearly different sub-tasks. If you do not find any specific relevant tools, use the pre-loaded generic tools. Only use `search_tools` if your existing capabilities cannot handle the request.
9
9
 
10
- 3. **Load Tools:** After looking at the output of `search_tools`, you MUST call the `load_tools` function to load only the tools you want to use. Provide the full tool ids, not just the app names. Use your judgement to eliminate irrelevant apps that came up just because of semantic similarity. However, sometimes, multiple apps might be relevant for the same task. Prefer connected apps over unconnected apps while breaking a tie. If more than one relevant app (or none of the relevant apps) are connected, you must ask the user to choose the app. In case the user asks you to use an app that is not connected, call the apps tools normally. The tool will return a link for connecting that you should pass on to the user.
10
+ 3. **Load Tools:** After looking at the output of `search_tools`, you MUST call the `load_tools` function to load only the tools you want to use. Provide the full tool ids, not just the app names. Use your judgement to eliminate irrelevant apps that came up just because of semantic similarity. However, sometimes, multiple apps might be relevant for the same task. Prefer connected apps over unconnected apps while breaking a tie. If more than one relevant app (or none of the relevant apps) are connected, you must ask the user to choose the app. In case the user asks you to use an app that is not connected, call the apps tools normally. The tool will return a link for connecting that you should pass on to the user. Only load tools if your existing capabilities cannot handle the request.
11
11
 
12
12
  4. **Strictly Follow the Process:** Your only job in your first turn is to analyze the user's request and answer using existing tools/knowledge or `search_tools` with a concise query describing the core task. Do not engage in conversation, or extend the conversation beyond the user's request.
13
13
 
@@ -35,7 +35,8 @@ def create_meta_tools(tool_registry: ToolRegistry) -> dict[str, Any]:
35
35
  for tool in tools_list:
36
36
  app = tool["id"].split("__")[0]
37
37
  if len(app_tools[app]) < 5:
38
- app_tools[app].append(f"{tool['id']}: {tool['description']}")
38
+ cleaned_desc = tool['description'].split("Context:")[0].strip()
39
+ app_tools[app].append(f"{tool['id']}: {cleaned_desc}")
39
40
 
40
41
  # Build result string efficiently
41
42
  result_parts = []
@@ -98,8 +99,13 @@ def create_meta_tools(tool_registry: ToolRegistry) -> dict[str, Any]:
98
99
  return {"search_tools": search_tools, "load_tools": load_tools, "web_search": web_search}
99
100
 
100
101
 
101
- async def get_valid_tools(tool_ids: list[str], registry: ToolRegistry) -> list[str]:
102
+ async def get_valid_tools(tool_ids: list[str], registry: ToolRegistry) -> tuple[list[str], list[str]]:
103
+ """For a given list of tool_ids, validates the tools and returns a list of links for the apps that have not been logged in"""
102
104
  correct, incorrect = [], []
105
+ connections = await registry.list_connected_apps()
106
+ connected_apps = {connection["app_id"] for connection in connections}
107
+ unconnected = set()
108
+ unconnected_links = []
103
109
  app_tool_list: dict[str, set[str]] = {}
104
110
 
105
111
  # Group tool_ids by app for fewer registry calls
@@ -127,15 +133,23 @@ async def get_valid_tools(tool_ids: list[str], registry: ToolRegistry) -> list[s
127
133
  app_tool_list[app] = tools
128
134
 
129
135
  # Validate tool_ids
130
- for app, tool_entries in app_to_tools.items():
136
+ for app, tool_entries in app_to_tools.items():
131
137
  available = app_tool_list.get(app)
132
138
  if available is None:
133
139
  incorrect.extend(tool_id for tool_id, _ in tool_entries)
134
140
  continue
141
+ if app not in connected_apps and app not in unconnected:
142
+ unconnected.add(app)
143
+ text = registry.client.get_authorization_url(app)
144
+ start = text.find(":") + 1
145
+ end = text.find(".", start)
146
+ url = text[start:end].strip()
147
+ markdown_link = f"[{app}]({url})"
148
+ unconnected_links.append(markdown_link)
135
149
  for tool_id, tool_name in tool_entries:
136
150
  if tool_name in available:
137
151
  correct.append(tool_id)
138
152
  else:
139
153
  incorrect.append(tool_id)
140
154
 
141
- return correct
155
+ return correct, unconnected_links