tooluniverse 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,369 @@
+ """
+ Tool Metadata Generation Pipeline
+ Generates comprehensive metadata for a list of tools by extracting details from their configuration files.
+ """
+
+ def compose(arguments, tooluniverse, call_tool):
+     """
+     Main composition function for tool metadata generation.
+
+     Args:
+         arguments (dict): Input arguments containing a list of tool config JSONs, plus a tool_type_mappings dict for non-API tools (e.g., {'Databases': ['XMLTool']})
+         tooluniverse: ToolUniverse instance
+         call_tool: Function to call other tools
+
+     Returns:
+         list: List of tool metadata dictionaries (JSON-compatible)
+     """
+     import json
+     import warnings
+     from collections import Counter
+
+     def _parse_agent_output(output, tool_name="Unknown Tool"):
+         """Helper to parse varied agent outputs (JSON string, wrapped dict) into a dict."""
+         if isinstance(output, str):
+             try:
+                 return json.loads(output)
+             except json.JSONDecodeError:
+                 print(f"Failed to parse JSON string from {tool_name}; received: {output[:200]}")
+                 return {}  # Return an empty dict on failure to prevent a crash
+
+         if isinstance(output, dict) and 'success' in output and 'result' in output:
+             # Handle wrapped output like {'success': True, 'result': '{...}'}
+             inner_result = output.get('result')
+             if isinstance(inner_result, str) and inner_result.strip():
+                 try:
+                     return json.loads(inner_result)
+                 except json.JSONDecodeError:
+                     print(f"Failed to parse inner result JSON from {tool_name}; using empty metadata.")
+                     return {}
+             elif isinstance(inner_result, dict):
+                 return inner_result  # Result is already a dict
+             else:
+                 return {}  # No valid inner result
+
+         return {}
+
+     DEFAULT_TOOL_TYPE_MAPPINGS = {
+         "Embedding Store": ["EmbeddingDatabase"],
+         "Database": ["XMLTool", "DatasetTool"],
+         "Scientific Software Package": ["PackageTool"],
+         "AI Agent": ["AgenticTool"],
+         "ML Model": ["ADMETAITool", "AlphaFoldRESTTool", "boltz2_docking", "compute_depmap24q2_gene_correlations", "run_compass_prediction", "run_pinnacle_ppi_retrieval", "run_transcriptformer_embedding_retrieval", "get_abstract_from_patent_app_number", "get_claims_from_patent_app_number", "get_full_text_from_patent_app_number"],
+         "Human Expert Feedback": ["mcp_auto_loader_human_expert", "consult_human_expert", "get_expert_response", "get_expert_status", "list_pending_expert_requests", "submit_expert_response"],
+         "MCP": ["MCPAutoLoaderTool", "MCPClientTool", "MCPProxyTool"],
+         "Compositional Tool": ["ComposeTool"],
+         "Tool Finder Tool": ["ToolFinderEmbedding", "ToolFinderLLM", "ToolFinderKeyword"],
+         "Special Tool": ["Finish", "CallAgent"]
+     }
+
+     # Step 0: Parse inputs and set up variables
+     tool_configs = arguments.get('tool_configs', [])
+     tool_type_mappings = arguments.get('tool_type_mappings', {})
+     add_existing_tooluniverse_labels = arguments.get('add_existing_tooluniverse_labels', True)
+     max_new_tooluniverse_labels = arguments.get('max_new_tooluniverse_labels', 0)
+
+     # Merge tool type mappings with defaults, prioritizing user-provided mappings
+     for key, value in DEFAULT_TOOL_TYPE_MAPPINGS.items():
+         if key not in tool_type_mappings:
+             tool_type_mappings[key] = value
+     warnings.warn("Augmenting your provided tool_type_mappings with the default tool_type_mappings to ensure compatibility with existing ToolUniverse tools. The default tool_type_mappings are:\n" + json.dumps(DEFAULT_TOOL_TYPE_MAPPINGS, indent=4))
+
+     # Add existing ToolUniverse labels if specified
+     tool_labels_set = set()
+     if add_existing_tooluniverse_labels:
+         # Load existing standardized tool metadata (a list of dicts, each containing a 'tags' field).
+         # Use importlib.resources rather than absolute paths so this works inside the installed package.
+         try:
+             try:
+                 from importlib import resources as importlib_resources  # Py3.9+
+             except ImportError:  # pragma: no cover
+                 import importlib_resources  # type: ignore
+
+             # Access the JSON file inside the package (tooluniverse/website_data/v3_standardized_tags.json)
+             json_path = importlib_resources.files('tooluniverse.website_data').joinpath('v3_standardized_tags.json')
+             with json_path.open('r', encoding='utf-8') as f:
+                 existing_metadata_list = json.load(f)
+
+             if isinstance(existing_metadata_list, list):
+                 for item in existing_metadata_list:
+                     if isinstance(item, dict):
+                         tags = item.get('tags', [])
+                         if isinstance(tags, list):
+                             for tag in tags:
+                                 if isinstance(tag, str) and tag.strip():
+                                     tool_labels_set.add(tag.strip())
+         except Exception as e:  # Fail gracefully; downstream logic just proceeds without enrichment
+             print(f"Failed to load existing ToolUniverse labels: {e}")
+
+     if not tool_configs:
+         return []
+
+     # Step 1: Generate detailed metadata for each tool
+     all_tool_metadata = []
+     for tool_config in tool_configs:
+         tool_config_str = json.dumps(tool_config)
+         try:
+             metadata_params = {
+                 'tool_config': tool_config_str,
+                 'tool_type_mappings': tool_type_mappings
+             }
+             generated_metadata = {}
+             for _ in range(5):  # Retry up to 5 times
+                 raw_output = call_tool('ToolMetadataGenerator', metadata_params)
+                 generated_metadata = _parse_agent_output(raw_output, 'ToolMetadataGenerator')
+                 if generated_metadata:  # Stop once a non-empty result is returned
+                     break
+
+             # Attempt to enrich the tags using LabelGenerator
+             try:
+                 # Prepare inputs for LabelGenerator
+                 tool_name = tool_config.get('name') or generated_metadata.get('name') or ''
+                 tool_description = tool_config.get('description') or generated_metadata.get('description') or ''
+                 # The parameter schema may be nested under parameter -> properties
+                 param_properties = tool_config.get('parameter', {}).get('properties', {})
+
+                 # Convert the parameters to a compact JSON-like string representation
+                 def _stringify_params(props):
+                     parts = []
+                     for k, v in props.items():
+                         if isinstance(v, dict):
+                             type_val = v.get('type', 'unknown')
+                             desc_val = v.get('description', '')
+                             parts.append(f"\"{k}\": {{ 'type': '{type_val}', 'description': '{desc_val}' }}")
+                         else:
+                             parts.append(f"\"{k}\": " + repr(v))
+                     return '{' + ', '.join(parts) + '}'
+
+                 tool_parameters_str = _stringify_params(param_properties)
+                 category = tool_config.get('category') or tool_config.get('type') or generated_metadata.get('category') or ''
+
+                 label_params = {
+                     'tool_name': tool_name,
+                     'tool_description': tool_description,
+                     'tool_parameters': tool_parameters_str,
+                     'category': category,
+                     'existing_labels': json.dumps(list(tool_labels_set))
+                 }
+                 label_result = call_tool('LabelGenerator', label_params)
+                 label_result = _parse_agent_output(label_result, 'LabelGenerator')
+
+                 # label_result may be a dict or a JSON string; extract the labels list
+                 labels = []
+                 if isinstance(label_result, dict):
+                     labels = label_result.get('labels', [])
+                 # Replace the tags with the generated labels, if any
+                 if labels:
+                     generated_metadata['tags'] = labels
+             except Exception as tag_exc:
+                 print(f"Label generation failed for tool {tool_config.get('name', 'N/A')}: {tag_exc}")
+
+             all_tool_metadata.append(generated_metadata)
+         except Exception as e:
+             print(f"Failed to generate metadata for tool {tool_config.get('name', 'N/A')}: {e}")
+             # Append an error object so the output stays aligned with the input
+             all_tool_metadata.append({
+                 'error': f"Metadata generation failed for {tool_config.get('name', 'N/A')}",
+                 'details': str(e)
+             })
+
+     # Step 2: Validate against the schema template
+     validated_metadata = []
+     schema_template = {
+         "id": "",
+         "name": "",
+         "description": "",
+         "detailed_description": "",
+         "toolType": "api",
+         "tags": [],
+         "category": "",
+         "lab": "",
+         "source": "",
+         "version": "v1.0.0",
+         "reviewed": False,
+         "isValidated": False,
+         "usageStats": "0 uses",
+         "capabilities": [],
+         "limitations": [],
+         "parameters": {},
+         "inputSchema": {},
+         "exampleInput": {},
+         "apiEndpoints": []
+     }
+
+     for metadata in all_tool_metadata:
+         if 'error' in metadata:
+             validated_metadata.append(metadata)
+             continue
+
+         validated_item = {}
+         for key, default_value in schema_template.items():
+             value = metadata.get(key, default_value)
+             if not isinstance(value, type(default_value)):
+                 # Gracefully handle simple type mismatches, or reset to the default
+                 if isinstance(default_value, list) and not isinstance(value, list):
+                     value = []
+                 elif isinstance(default_value, dict) and not isinstance(value, dict):
+                     value = {}
+                 elif isinstance(default_value, str) and not isinstance(value, str):
+                     value = str(value) if value is not None else ""
+                 elif isinstance(default_value, bool) and not isinstance(value, bool):
+                     value = bool(value)
+                 else:
+                     value = default_value  # Fall back to the default if the type is complex/unexpected
+             validated_item[key] = value
+         validated_metadata.append(validated_item)
+
+     all_tool_metadata = validated_metadata
+
+     # Step 3: Standardize sources using ToolMetadataStandardizer
+     try:
+         source_list = []
+         for tool in all_tool_metadata:
+             if 'error' not in tool and tool.get('source'):
+                 source_list.append(tool.get('source'))
+
+         if source_list:
+             standardizer_params = {'metadata_list': list(set(source_list))}
+             standardized_sources_map = call_tool('ToolMetadataStandardizer', standardizer_params)
+             standardized_sources_map = _parse_agent_output(standardized_sources_map, 'ToolMetadataStandardizer')
+             print("Standardized sources mapping:", standardized_sources_map)
+
+             # Create a reverse map (raw source -> canonical source) for easy lookup
+             source_to_standard_map = {}
+             for standard_name, raw_names in standardized_sources_map.items():
+                 for raw_name in raw_names:
+                     source_to_standard_map[raw_name] = standard_name
+
+             # Update the source in each metadata object
+             for tool_metadata in all_tool_metadata:
+                 if 'error' not in tool_metadata:
+                     original_source = tool_metadata.get('source')
+                     if original_source in source_to_standard_map:
+                         tool_metadata['source'] = source_to_standard_map[original_source]
+     except Exception as e:
+         print(f"An error occurred during source standardization: {e}")
+
+     # Step 4: Standardize tags, with optional extra passes to meet the label limit
+     try:
+         all_raw_tags = []
+         for tool in all_tool_metadata:
+             if 'error' not in tool and isinstance(tool.get('tags'), list):
+                 all_raw_tags.extend(tool.get('tags', []))
+
+         # Filter out pre-existing labels before standardization
+         tags_to_standardize = [tag for tag in set(all_raw_tags) if tag not in tool_labels_set]
+
+         if max_new_tooluniverse_labels <= 0:
+             # If no new labels are allowed, skip standardization and simply drop the new tags
+             for tool_metadata in all_tool_metadata:
+                 if 'error' not in tool_metadata and isinstance(tool_metadata.get('tags'), list):
+                     original_tags = tool_metadata.get('tags', [])
+                     filtered_tags = [tag for tag in original_tags if tag in tool_labels_set]
+                     tool_metadata['tags'] = sorted(set(filtered_tags))
+             return all_tool_metadata  # Return early; no further processing is needed
+
+         tag_to_standard_map = {}
+         if tags_to_standardize:
+             # Iteratively standardize tags for up to 5 passes to meet the label limit
+             current_tags_to_standardize = list(set(tags_to_standardize))
+             # This map stores the final standardized version of each original raw tag
+             tag_to_standard_map = {tag: tag for tag in tags_to_standardize}
+
+             for i in range(5):  # Loop for up to 5 standardization passes
+                 num_tags = len(current_tags_to_standardize)
+
+                 # If the number of tags is within the limit, no more standardization is needed
+                 if max_new_tooluniverse_labels > 0 and num_tags <= max_new_tooluniverse_labels:
+                     print(f"Tag count ({num_tags}) is within the limit ({max_new_tooluniverse_labels}). Stopping standardization.")
+                     break
+
+                 print(f"Pass {i+1}: Standardizing {num_tags} tags.")
+
+                 # Set the limit for the standardizer tool; fall back to a high default
+                 # when max_new_tooluniverse_labels is not set
+                 limit = max_new_tooluniverse_labels if max_new_tooluniverse_labels > 0 else 150
+
+                 standardizer_params = {
+                     'metadata_list': current_tags_to_standardize,
+                     'limit': limit
+                 }
+
+                 print(f"Pass {i+1} input tags:", current_tags_to_standardize)
+
+                 # Call the standardizer tool and parse the output, with retries
+                 pass_output_map = {}
+                 for _ in range(5):  # Retry up to 5 times
+                     raw_output = call_tool('ToolMetadataStandardizer', standardizer_params)
+                     pass_output_map = _parse_agent_output(raw_output, 'ToolMetadataStandardizer')
+                     if pass_output_map:  # Stop once a non-empty result is returned
+                         break
+
+                 print(f"Pass {i+1} standardized tags mapping:", pass_output_map)
+
+                 # Create a reverse map for the current pass (input tag -> standardized tag)
+                 pass_reverse_map = {}
+                 for standard_tag, raw_tags_in_pass in pass_output_map.items():
+                     for raw_tag in raw_tags_in_pass:
+                         pass_reverse_map[raw_tag] = standard_tag
+
+                 # Chain the new standardization into the final mapping: for each original tag,
+                 # check whether its current standard form was re-mapped in this pass
+                 for original_tag, current_standard_tag in tag_to_standard_map.items():
+                     if current_standard_tag in pass_reverse_map:
+                         tag_to_standard_map[original_tag] = pass_reverse_map[current_standard_tag]
+
+                 # The tags for the next pass are the canonical keys produced by this pass
+                 current_tags_to_standardize = sorted(pass_output_map.keys())
+
+                 # An empty map means no further consolidation is possible
+                 if not current_tags_to_standardize:
+                     print("No further tag consolidation possible. Stopping.")
+                     break
+
+         # Update the tags in each metadata object using the final mapping. For each original
+         # tag, use its standardized version if available, otherwise keep the original; this
+         # correctly handles tags already in tool_labels_set, which were never standardized.
+         for tool_metadata in all_tool_metadata:
+             if 'error' not in tool_metadata and isinstance(tool_metadata.get('tags'), list):
+                 original_tags = tool_metadata.get('tags', [])
+                 standardized_tags = {tag_to_standard_map.get(tag, tag) for tag in original_tags}
+                 tool_metadata['tags'] = sorted(standardized_tags)
+
+     except Exception as e:
+         print(f"An error occurred during tag standardization: {e}")
+
+     # Step 5: Remove tags that occur only once across the entire dataset,
+     # but only for tags that are new (not pre-existing in ToolUniverse)
+     try:
+         # Flatten the list of all new tags from all tools, ignoring error entries
+         all_new_tags_flat = [
+             tag
+             for tool_metadata in all_tool_metadata
+             if 'error' not in tool_metadata and isinstance(tool_metadata.get('tags'), list)
+             for tag in tool_metadata.get('tags', [])
+             if tag not in tool_labels_set
+         ]
+
+         if all_new_tags_flat:
+             # Count the frequency of each new tag
+             new_tag_counts = Counter(all_new_tags_flat)
+
+             # Identify the new tags that appear more than once
+             new_tags_to_keep = {tag for tag, count in new_tag_counts.items() if count > 1}
+
+             # Filter the tags in each tool's metadata
+             for tool_metadata in all_tool_metadata:
+                 if 'error' not in tool_metadata and isinstance(tool_metadata.get('tags'), list):
+                     original_tags = tool_metadata.get('tags', [])
+                     # Keep all pre-existing tags, plus new tags that appear more than once
+                     filtered_tags = [
+                         tag for tag in original_tags
+                         if tag in tool_labels_set or tag in new_tags_to_keep
+                     ]
+                     tool_metadata['tags'] = sorted(set(filtered_tags))
+
+     except Exception as e:
+         print(f"An error occurred during single-occurrence tag removal: {e}")
+
+     return all_tool_metadata
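
The compose pipeline above is driven entirely through the `call_tool` callback, so its control flow can be exercised without a live model. A minimal sketch, assuming `compose` is importable from this module and using a hypothetical `stub_call_tool` that returns shape-correct outputs for the three agents defined later in this diff (all names and values below are made up):

```python
import json

def stub_call_tool(tool_name, params):
    """Hypothetical stand-in for the LLM-backed agents; returns fixed, shape-correct outputs."""
    if tool_name == 'ToolMetadataGenerator':
        return json.dumps({"name": "demo_tool", "description": "Example tool",
                           "source": "PubChem", "category": "RESTTool"})
    if tool_name == 'LabelGenerator':
        # Wrapped form, to exercise the {'success': ..., 'result': ...} branch
        return {"success": True,
                "result": json.dumps({"labels": ["cheminformatics", "compound-lookup"]})}
    if tool_name == 'ToolMetadataStandardizer':
        # Identity mapping: every raw string becomes its own canonical form
        return {value: [value] for value in params.get('metadata_list', [])}
    return {}

arguments = {
    "tool_configs": [{
        "name": "demo_tool",
        "type": "RESTTool",
        "description": "Example tool",
        "parameter": {"properties": {"query": {"type": "string",
                                               "description": "Search term"}}},
    }],
    "add_existing_tooluniverse_labels": False,  # skip the packaged tags file
    "max_new_tooluniverse_labels": 10,
}

metadata = compose(arguments, tooluniverse=None, call_tool=stub_call_tool)
# Note: with a single tool, Step 5 prunes its single-occurrence new tags,
# so metadata[0]['tags'] ends up empty in this sketch.
print(json.dumps(metadata, indent=2))
```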
@@ -1077,12 +1077,13 @@
  "type": "AgenticTool",
  "name": "LabelGenerator",
  "description": "Generates relevant keyword labels for tools based on their name, description, parameters, and category. Creates a comprehensive list of tags for tool discovery and categorization.",
- "prompt": "You are an expert in tool categorization and keyword generation. Your task is to generate relevant, descriptive labels/keywords for a tool that will help with discovery and organization.\n\n## TOOL INFORMATION\n- **Tool Name**: {tool_name}\n- **Description**: {tool_description}\n- **Parameters**: {tool_parameters}\n- **Category**: {category}\n\n## LABEL GENERATION INSTRUCTIONS\nAnalyze the tool information and generate a comprehensive list of relevant labels/keywords that:\n\n1. **Capture Tool Functionality**: What the tool does, its purpose\n2. **Describe Input/Output Types**: Data types, formats, domains\n3. **Indicate Use Cases**: When and where this tool would be useful\n4. **Reference Technical Domains**: Scientific fields, technologies, methodologies\n5. **Include Semantic Variations**: Synonyms, related terms, alternative descriptions\n\n## LABELING GUIDELINES\n- Generate 8-15 relevant labels\n- Use lowercase, hyphenated format (e.g., 'protein-analysis', 'molecular-weight')\n- Include both specific and general terms\n- Avoid overly generic terms like 'tool', 'utility', 'helper'\n- Include domain-specific terminology when applicable\n- Consider both technical and user-friendly terms\n\n## EXAMPLES\nFor a protein sequence analyzer:\n- Technical: ['protein-analysis', 'sequence-processing', 'bioinformatics', 'amino-acid-composition']\n- Functional: ['molecular-biology', 'protein-characterization', 'structural-analysis']\n- Use-case: ['research-tool', 'computational-biology', 'biochemistry']\n\nReturn a JSON object with the following structure:\n```json\n{\n \"labels\": [\"keyword1\", \"keyword2\", \"keyword3\", ...],\n \"rationale\": \"Brief explanation of label selection strategy\"\n}\n```",
+ "prompt": "You are an expert in tool categorization and keyword generation. Your task is to generate relevant, descriptive labels/keywords for a tool that will help with discovery and organization.\n\n## TOOL INFORMATION\n- **Tool Name**: {tool_name}\n- **Description**: {tool_description}\n- **Parameters**: {tool_parameters}\n- **Category**: {category}\n\n## EXISTING LABELS\n{existing_labels}\n\n## LABEL GENERATION INSTRUCTIONS\nAnalyze the tool information and generate a comprehensive list of relevant labels/keywords that:\n\n1. **Capture Tool Functionality**: What the tool does, its purpose\n2. **Describe Input/Output Types**: Data types, formats, domains\n3. **Indicate Use Cases**: When and where this tool would be useful\n4. **Reference Technical Domains**: Scientific fields, technologies, methodologies\n5. **Include Semantic Variations**: Synonyms, related terms, alternative descriptions\n\n## LABELING GUIDELINES\n- Generate 8-15 relevant labels\n- Use lowercase, hyphenated format (e.g., 'protein-analysis', 'molecular-weight')\n- Reuse existing labels (if provided) when the existing labels are a good fit. It is okay to generate new labels if the existing ones are not suitable, or if there is another label you think is fitting that is not in the existing labels set.\n- Include both specific and general terms\n- Avoid overly generic terms like 'tool', 'utility', 'helper'\n- Include domain-specific terminology when applicable\n- Consider both technical and user-friendly terms\n\n## EXAMPLES\nFor a protein sequence analyzer:\n- Technical: ['protein-analysis', 'sequence-processing', 'bioinformatics', 'amino-acid-composition']\n- Functional: ['molecular-biology', 'protein-characterization', 'structural-analysis']\n- Use-case: ['research-tool', 'computational-biology', 'biochemistry']\n\nReturn a JSON object with the following structure:\n```json\n{\n \"labels\": [\"keyword1\", \"keyword2\", \"keyword3\", ...],\n \"rationale\": \"Brief explanation of label selection strategy\"\n}\n```",
  "input_arguments": [
  "tool_name",
  "tool_description",
  "tool_parameters",
- "category"
+ "category",
+ "existing_labels"
  ],
  "parameter": {
  "type": "object",
@@ -1106,6 +1107,11 @@
  "type": "string",
  "description": "The general category or domain the tool belongs to",
  "required": true
+ },
+ "existing_labels": {
+ "type": "string",
+ "description": "JSON array string of existing labels to consider reusing (optional)",
+ "required": false
  }
  },
  "required": [
@@ -1124,33 +1130,142 @@
  }
  },
  {
- "type": "AgenticTool",
- "name": "call_agentic_human",
- "description": "Produces a concise, practical answer that emulates how a well-informed human would respond to the question.",
- "prompt": "You are not an AI. You are a knowledgeable human expert speaking naturally. Read the user's question carefully and respond as a thoughtful person would, giving a clear, direct, and context-aware answer. Do not mention AI, ChatGPT, or models. If the question involves uncertainty or opinion, share a reasoned perspective in a human tone.\n\nQuestion: {question}\n\nAnswer:",
- "input_arguments": [
- "question"
- ],
- "parameter": {
- "type": "object",
- "properties": {
- "question": {
- "type": "string",
- "description": "The user's question to be answered in a human-like manner.",
- "required": true
+ "type": "AgenticTool",
+ "name": "call_agentic_human",
+ "description": "Produces a concise, practical answer that emulates how a well-informed human would respond to the question.",
+ "prompt": "You are not an AI. You are a knowledgeable human expert speaking naturally. Read the user's question carefully and respond as a thoughtful person would, giving a clear, direct, and context-aware answer. Do not mention AI, ChatGPT, or models. If the question involves uncertainty or opinion, share a reasoned perspective in a human tone.\n\nQuestion: {question}\n\nAnswer:",
+ "input_arguments": [
+ "question"
+ ],
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "question": {
+ "type": "string",
+ "description": "The user's question to be answered in a human-like manner.",
+ "required": true
+ }
+ },
+ "required": [
+ "question"
+ ]
+ },
+ "configs": {
+ "api_type": "CHATGPT",
+ "model_id": "o4-mini-0416",
+ "temperature": 0.7,
+ "max_new_tokens": 1024,
+ "return_json": false
  }
+ },
+ {
+ "type": "AgenticTool",
+ "name": "ToolMetadataGenerator",
+ "description": "Generates a JSON structure with the metadata of a tool in ToolUniverse, given the JSON configuration of the tool.",
+ "prompt": "You are an expert in processing ToolUniverse tool configurations. Your task is to extract and generate key metadata from a given tool's JSON configuration and return it as a new, structured JSON object.\n\n**Input Tool Configuration:**\n```json\n{tool_config}\n```\n\n**Tool Type Mappings (for simplifying toolType):**\n```json\n{tool_type_mappings}\n```\n\n**Instructions:**\nFrom the input configuration, generate a new JSON object with the specified structure. All fields enclosed in '<','>' are placeholders for instructions; you should generate a specific value for the tool based on its configuration. Fields not in brackets should use the default values provided.\n\n**Output JSON Structure:**\n```json\n{\n \"id\": \"<generate a new uuid>\",\n \"name\": \"<extract from tool_config.name>\",\n \"description\": \"<extract from tool_config.description and slightly summarize it if it is too long>\",\n \"detailed_description\": \"<extract from tool_config.description>\",\n \"toolType\": \"<if tool_config.type or tool_config.name appears in tool_type_mappings dict in one of the lists (among the dict's values), extract the corresponding key and set it as the simplified toolType. otherwise, set toolType to be 'API' (the default)>\",\n \"tags\": [],\n \"category\": \"<extract from tool_config.type>\",\n \"lab\": \"Zitnik Lab\",\n \"source\": \"<extract the name of the database, package, model, or write 'Agentic'>\",\n \"version\": \"v1.0.0\",\n \"reviewed\": true,\n \"isValidated\": true,\n \"usageStats\": \"100+ uses\",\n \"capabilities\": [\n \"<list capabilities strictly derivable from tool_config>\"\n ],\n \"limitations\": [\n \"May require refinement\"\n ],\n \"parameters\": {<for each parameter key include an object with type and description>},\n \"inputSchema\": <echo tool_config.parameter exactly>,\n \"exampleInput\": <JSON object with example values for each parameter>,\n \"apiEndpoints\": [\n {\n \"method\": \"MCP\",\n \"url\": \"https://tooluniversemcpserver.onrender.com/mcp/\"\n }\n ]\n}\n```\n\nReturn ONLY the final JSON object with no extra commentary.",
+ "input_arguments": [
+ "tool_config",
+ "tool_type_mappings"
+ ],
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "tool_config": {
+ "type": "string",
+ "description": "JSON string of the tool configuration to extract metadata from",
+ "required": true
+ },
+ "tool_type_mappings": {
+ "type": "object",
+ "description": "A mapping from a simplified toolType to a list of tool_config.type that fall under the toolType (e.g., {'Databases': ['XMLTool']})",
+ "required": false
+ }
+ },
+ "required": [
+ "tool_config"
+ ]
  },
- "required": [
- "question"
- ]
+ "configs": {
+ "api_type": "CHATGPT",
+ "model_id": "o4-mini-0416",
+ "temperature": 0.7,
+ "max_new_tokens": 8192,
+ "return_json": true,
+ "return_metadata": false
+ }
  },
- "configs": {
- "api_type": "CHATGPT",
- "model_id": "o4-mini-0416",
- "temperature": 0.7,
- "max_new_tokens": 1024,
- "return_json": false
- }
+ {
+ "type": "AgenticTool",
+ "name": "ToolMetadataStandardizer",
+ "description": "Standardizes and groups semantically equivalent metadata strings (e.g., sources, tags) into canonical forms for consistent downstream usage.",
+ "prompt": "You are an expert in metadata normalization and canonicalization. Given a list of raw metadata strings (sources, tags, categories, etc.), produce a JSON object that maps a SINGLE canonical (standardized) string to the list of ALL raw variants from the input that correspond to that canonical form.\n\nINPUT LIST (raw values):\n{metadata_list}\n\nOPTIONAL LIMIT:\n{limit}\n\nTASK:\nReturn ONLY a JSON object (no markdown, no explanations) of the form:\n{\n \"canonical_value_1\": [\"variant_a\", \"variant_b\"],\n \"canonical_value_2\": [\"variant_c\"],\n ...\n}\n\n**LIMIT CONSTRAINT:**\nIf a `limit` is provided, you MUST group terms more aggressively to ensure the number of canonical keys in the output JSON does not exceed the limit. Every raw string must still be mapped to one of the canonical strings. **However, this aggressive grouping must be balanced. Avoid creating overly broad, uninformative categories (e.g., 'data', 'science', 'metadata'). The canonical labels must still clearly distinguish between different technical capabilities and scientific fields.**\n\n**STANDARDIZATION RULES (apply in order):**\n\n**Part 1: Grammatical & Syntactic Normalization**\n1. Trim whitespace; collapse internal repeated whitespace to a single space.\n2. Case fold (lowercase) for comparison, but canonical output SHOULD use a clean, title or widely-recognized uppercase style for well-known acronyms (retain ALLCAPS for <=5 letter well-known biomedical / data acronyms like NCBI, FDA, NIH, EMA, WHO, API). For general words use lowercase-hyphen style (e.g., \"gene-expression\").\n3. Remove surrounding quotes and trailing punctuation (periods, commas, semicolons).\n4. Replace underscores, spaces, and consecutive separators with a single hyphen (e.g., 'Gene Expression', 'gene_expression' -> 'gene-expression').\n5. Treat hyphen and space variants as equivalent (protein-analysis == protein analysis).\n6. Singular vs plural: treat plural forms as the same (e.g., \"dataset\" and \"datasets\"). Use singular in canonical unless plural is the widely accepted form (e.g., 'omics').\n7. Common stop punctuation (&, /, :) removed unless they encode a standard acronym combination. For constructs like 'R&D' keep as 'R-and-d'.\n8. Strip leading articles (the, a, an) unless part of proper noun (e.g., 'The Cancer Genome Atlas' -> keep).\n9. Collapse obvious expansions to standard acronyms when unambiguous (\"national center for biotechnology information\" -> NCBI, \"food and drug administration\" -> FDA).\n10. If a term is already a concise, recognized proper noun or database name (e.g., 'DrugBank', 'ChEMBL', 'PubChem'), keep its conventional casing as canonical and group all variants to it.\n\n**Part 2: Semantic & Hierarchical Grouping (MOST IMPORTANT)**\n*After applying grammatical normalization, perform the following semantic groupings to create more general, reusable labels.*\n11. **Generalize Specific Terms:** This is the most critical rule. Collapse specific sub-topics into a broader, more general parent category. The goal is to make labels applicable across multiple tools. For example, group 'bioinformatics-ontology' and 'bioinformatics-library-overview' under the single canonical label 'bioinformatics'. **Crucially, do not over-generalize to the point of losing meaning. A category like 'bioinformatics' is good, but a category like 'science' is too broad and uninformative.**\n12. **Hierarchy Collapse:** If terms represent a clear parent-child relationship (e.g., 'genomics' and 'gene-expression-analysis'), group them under the more general parent term ('genomics').\n13. **Synonym & Function Grouping:** Group clear synonyms or terms describing the same function (e.g., 'visualization', 'plotting', 'charting') under a single canonical term (e.g., 'data-visualization').\n14. **Prioritize Broad Concepts:** When choosing a canonical key for a semantic group, always select the most general and widely understood term. For example, prefer 'protein-analysis' over 'protein-folding-simulation'.\n\n**Part 3: Final Output Formatting**\n15. Always include the original raw variants EXACTLY as they appeared in the input list (before any normalization) inside the variant arrays (deduplicate within each list, preserving original order of first appearance).\n16. The canonical key MUST be a clean, user-presentable string (no surrounding whitespace, no trailing punctuation).\n17. Every input value must appear in exactly one array in the output.\n\n**CANONICAL KEY SELECTION HEURISTICS (when multiple variants map to same group):**\n- Prefer the most general, high-level concept (e.g., 'bioinformatics' over 'sequence-alignment').\n- Prefer widely recognized product / database / organization name with correct branding (DrugBank, PubChem, UniProt, Ensembl).\n- Else prefer the shortest unambiguous normalized form.\n- Else use hyphenated lowercase normalized form.\n\nOUTPUT REQUIREMENTS:\n- Pure JSON object.\n- Keys: canonical strings.\n- Values: arrays of raw variants (length >=1).\n- Do NOT return commentary, explanations, or fields other than the mapping.\n\nIf input list is empty, return {}.",
+ "input_arguments": [
+ "metadata_list",
+ "limit"
+ ],
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "metadata_list": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "List of raw metadata strings (e.g., sources, tags) to standardize and group.",
+ "required": true
+ },
+ "limit": {
+ "type": "integer",
+ "description": "If provided, the maximum number of canonical strings to return. The LLM will group terms more aggressively to meet this limit, ensuring all raw strings are mapped.",
+ "required": false
+ }
+ },
+ "required": [
+ "metadata_list"
+ ]
+ },
+ "configs": {
+ "api_type": "CHATGPT",
+ "model_id": "o4-mini-0416",
+ "temperature": 0.7,
+ "max_new_tokens": 13192,
+ "return_json": true
+ }
+ },
+ {
+ "type": "AgenticTool",
+ "name": "ToolRelationshipDetector",
+ "description": "Analyzes a primary tool against a list of other tools to identify meaningful, directional data flow compatibilities for scientific workflows. Returns a list of compatible pairs with direction and rationale.",
+ "prompt": "You are an expert in tool composition and scientific workflow design. Your task is to determine the directional data flow compatibility between a primary tool (Tool A) and each tool in a provided list of other tools. Directional compatibility means that the output of one tool can be meaningfully and frequently used as an input to another tool as part of a logical scientific discovery process.\n\n**Your Task:**\n1. For each tool in the list of 'Other Tools', independently evaluate its pairwise relationship with Tool A.\n2. Identify only the pairs where the output of one tool is **frequently and logically** used as input to the other and could feasibly be part of a scientific workflow.\n3. For each such meaningful pair found, determine the data flow direction (`A->B`, `B->A`, or `both`) and provide a 10-15 word rationale for your choice referencing the inputs and outputs of the tools.\n\n**Primary Tool (Tool A):**\n{tool_a}\n\n**List of Other Tools:**\n{other_tools}\n\n**Output Format:**\nReturn ONLY a valid JSON object containing a single key, `relationships`, which holds a list of JSON objects. Each object in the list represents a meaningful relationship. If no meaningful relationships are found, return an empty list.\n\n```json\n{\n \"relationships\": [\n {\n \"tool_b_name\": \"<Name of the tool from the list>\",\n \"direction\": \"<'A->B'|'B->A'|'both'>\",\n \"rationale\": \"<10-15 word rationale>\"\n }\n ]\n}\n```",
+ "input_arguments": [
+ "tool_a",
+ "other_tools"
+ ],
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "tool_a": {
+ "type": "string",
+ "description": "JSON string for the primary tool configuration (Tool A).",
+ "required": true
+ },
+ "other_tools": {
+ "type": "string",
+ "description": "JSON string of a list of other tool configurations to compare against Tool A.",
+ "required": true
+ }
+ },
+ "required": [
+ "tool_a",
+ "other_tools"
+ ]
+ },
+ "configs": {
+ "api_type": "CHATGPT",
+ "model_id": "o4-mini-0416",
+ "temperature": 0.2,
+ "max_new_tokens": 8192,
+ "return_json": true
+ }
  }
-
- ]
+ ]
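
The ToolMetadataStandardizer contract (per its prompt) is canonical key → list of raw variants, with every raw input appearing in exactly one variant list; the compose script inverts this mapping before rewriting sources and tags. A sketch of consuming such a result, with a fabricated example mapping:

```python
# Example standardizer output (fabricated for illustration only)
standardized = {
    "gene-expression": ["Gene Expression", "gene_expression", "gene-expressions"],
    "NCBI": ["national center for biotechnology information", "NCBI"],
}

# Invert to raw -> canonical, mirroring source_to_standard_map in the compose script
raw_to_canonical = {raw: canonical
                    for canonical, raws in standardized.items()
                    for raw in raws}

# Sanity-check the "every input appears exactly once" contract
inputs = ["Gene Expression", "gene_expression", "gene-expressions",
          "national center for biotechnology information", "NCBI"]
assert sorted(raw_to_canonical) == sorted(inputs)
print(raw_to_canonical["gene_expression"])  # -> gene-expression
```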