tooluniverse 0.2.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tooluniverse might be problematic.
- tooluniverse/__init__.py +340 -4
- tooluniverse/admetai_tool.py +84 -0
- tooluniverse/agentic_tool.py +563 -0
- tooluniverse/alphafold_tool.py +96 -0
- tooluniverse/base_tool.py +129 -6
- tooluniverse/boltz_tool.py +207 -0
- tooluniverse/chem_tool.py +192 -0
- tooluniverse/compose_scripts/__init__.py +1 -0
- tooluniverse/compose_scripts/biomarker_discovery.py +293 -0
- tooluniverse/compose_scripts/comprehensive_drug_discovery.py +186 -0
- tooluniverse/compose_scripts/drug_safety_analyzer.py +89 -0
- tooluniverse/compose_scripts/literature_tool.py +34 -0
- tooluniverse/compose_scripts/output_summarizer.py +279 -0
- tooluniverse/compose_scripts/tool_description_optimizer.py +681 -0
- tooluniverse/compose_scripts/tool_discover.py +705 -0
- tooluniverse/compose_scripts/tool_graph_composer.py +448 -0
- tooluniverse/compose_tool.py +371 -0
- tooluniverse/ctg_tool.py +1002 -0
- tooluniverse/custom_tool.py +81 -0
- tooluniverse/dailymed_tool.py +108 -0
- tooluniverse/data/admetai_tools.json +155 -0
- tooluniverse/data/agentic_tools.json +1156 -0
- tooluniverse/data/alphafold_tools.json +87 -0
- tooluniverse/data/boltz_tools.json +9 -0
- tooluniverse/data/chembl_tools.json +16 -0
- tooluniverse/data/clait_tools.json +108 -0
- tooluniverse/data/clinicaltrials_gov_tools.json +326 -0
- tooluniverse/data/compose_tools.json +202 -0
- tooluniverse/data/dailymed_tools.json +70 -0
- tooluniverse/data/dataset_tools.json +646 -0
- tooluniverse/data/disease_target_score_tools.json +712 -0
- tooluniverse/data/efo_tools.json +17 -0
- tooluniverse/data/embedding_tools.json +319 -0
- tooluniverse/data/enrichr_tools.json +31 -0
- tooluniverse/data/europe_pmc_tools.json +22 -0
- tooluniverse/data/expert_feedback_tools.json +10 -0
- tooluniverse/data/fda_drug_adverse_event_tools.json +491 -0
- tooluniverse/data/fda_drug_labeling_tools.json +1 -1
- tooluniverse/data/fda_drugs_with_brand_generic_names_for_tool.py +76929 -148860
- tooluniverse/data/finder_tools.json +209 -0
- tooluniverse/data/gene_ontology_tools.json +113 -0
- tooluniverse/data/gwas_tools.json +1082 -0
- tooluniverse/data/hpa_tools.json +333 -0
- tooluniverse/data/humanbase_tools.json +47 -0
- tooluniverse/data/idmap_tools.json +74 -0
- tooluniverse/data/mcp_client_tools_example.json +113 -0
- tooluniverse/data/mcpautoloadertool_defaults.json +28 -0
- tooluniverse/data/medlineplus_tools.json +141 -0
- tooluniverse/data/monarch_tools.json +1 -1
- tooluniverse/data/openalex_tools.json +36 -0
- tooluniverse/data/opentarget_tools.json +1 -1
- tooluniverse/data/output_summarization_tools.json +101 -0
- tooluniverse/data/packages/bioinformatics_core_tools.json +1756 -0
- tooluniverse/data/packages/categorized_tools.txt +206 -0
- tooluniverse/data/packages/cheminformatics_tools.json +347 -0
- tooluniverse/data/packages/earth_sciences_tools.json +74 -0
- tooluniverse/data/packages/genomics_tools.json +776 -0
- tooluniverse/data/packages/image_processing_tools.json +38 -0
- tooluniverse/data/packages/machine_learning_tools.json +789 -0
- tooluniverse/data/packages/neuroscience_tools.json +62 -0
- tooluniverse/data/packages/original_tools.txt +0 -0
- tooluniverse/data/packages/physics_astronomy_tools.json +62 -0
- tooluniverse/data/packages/scientific_computing_tools.json +560 -0
- tooluniverse/data/packages/single_cell_tools.json +453 -0
- tooluniverse/data/packages/software_tools.json +4954 -0
- tooluniverse/data/packages/structural_biology_tools.json +396 -0
- tooluniverse/data/packages/visualization_tools.json +399 -0
- tooluniverse/data/pubchem_tools.json +215 -0
- tooluniverse/data/pubtator_tools.json +68 -0
- tooluniverse/data/rcsb_pdb_tools.json +1332 -0
- tooluniverse/data/reactome_tools.json +19 -0
- tooluniverse/data/semantic_scholar_tools.json +26 -0
- tooluniverse/data/special_tools.json +2 -25
- tooluniverse/data/tool_composition_tools.json +88 -0
- tooluniverse/data/toolfinderkeyword_defaults.json +34 -0
- tooluniverse/data/txagent_client_tools.json +9 -0
- tooluniverse/data/uniprot_tools.json +211 -0
- tooluniverse/data/url_fetch_tools.json +94 -0
- tooluniverse/data/uspto_downloader_tools.json +9 -0
- tooluniverse/data/uspto_tools.json +811 -0
- tooluniverse/data/xml_tools.json +3275 -0
- tooluniverse/dataset_tool.py +296 -0
- tooluniverse/default_config.py +165 -0
- tooluniverse/efo_tool.py +42 -0
- tooluniverse/embedding_database.py +630 -0
- tooluniverse/embedding_sync.py +396 -0
- tooluniverse/enrichr_tool.py +266 -0
- tooluniverse/europe_pmc_tool.py +52 -0
- tooluniverse/execute_function.py +1775 -95
- tooluniverse/extended_hooks.py +444 -0
- tooluniverse/gene_ontology_tool.py +194 -0
- tooluniverse/graphql_tool.py +158 -36
- tooluniverse/gwas_tool.py +358 -0
- tooluniverse/hpa_tool.py +1645 -0
- tooluniverse/humanbase_tool.py +389 -0
- tooluniverse/logging_config.py +254 -0
- tooluniverse/mcp_client_tool.py +764 -0
- tooluniverse/mcp_integration.py +413 -0
- tooluniverse/mcp_tool_registry.py +925 -0
- tooluniverse/medlineplus_tool.py +337 -0
- tooluniverse/openalex_tool.py +228 -0
- tooluniverse/openfda_adv_tool.py +283 -0
- tooluniverse/openfda_tool.py +393 -160
- tooluniverse/output_hook.py +1122 -0
- tooluniverse/package_tool.py +195 -0
- tooluniverse/pubchem_tool.py +158 -0
- tooluniverse/pubtator_tool.py +168 -0
- tooluniverse/rcsb_pdb_tool.py +38 -0
- tooluniverse/reactome_tool.py +108 -0
- tooluniverse/remote/boltz/boltz_mcp_server.py +50 -0
- tooluniverse/remote/depmap_24q2/depmap_24q2_mcp_tool.py +442 -0
- tooluniverse/remote/expert_feedback/human_expert_mcp_tools.py +2013 -0
- tooluniverse/remote/expert_feedback/simple_test.py +23 -0
- tooluniverse/remote/expert_feedback/start_web_interface.py +188 -0
- tooluniverse/remote/expert_feedback/web_only_interface.py +0 -0
- tooluniverse/remote/expert_feedback_mcp/human_expert_mcp_server.py +1611 -0
- tooluniverse/remote/expert_feedback_mcp/simple_test.py +34 -0
- tooluniverse/remote/expert_feedback_mcp/start_web_interface.py +91 -0
- tooluniverse/remote/immune_compass/compass_tool.py +327 -0
- tooluniverse/remote/pinnacle/pinnacle_tool.py +328 -0
- tooluniverse/remote/transcriptformer/transcriptformer_tool.py +586 -0
- tooluniverse/remote/uspto_downloader/uspto_downloader_mcp_server.py +61 -0
- tooluniverse/remote/uspto_downloader/uspto_downloader_tool.py +120 -0
- tooluniverse/remote_tool.py +99 -0
- tooluniverse/restful_tool.py +53 -30
- tooluniverse/scripts/generate_tool_graph.py +408 -0
- tooluniverse/scripts/visualize_tool_graph.py +829 -0
- tooluniverse/semantic_scholar_tool.py +62 -0
- tooluniverse/smcp.py +2452 -0
- tooluniverse/smcp_server.py +975 -0
- tooluniverse/test/mcp_server_test.py +0 -0
- tooluniverse/test/test_admetai_tool.py +370 -0
- tooluniverse/test/test_agentic_tool.py +129 -0
- tooluniverse/test/test_alphafold_tool.py +71 -0
- tooluniverse/test/test_chem_tool.py +37 -0
- tooluniverse/test/test_compose_lieraturereview.py +63 -0
- tooluniverse/test/test_compose_tool.py +448 -0
- tooluniverse/test/test_dailymed.py +69 -0
- tooluniverse/test/test_dataset_tool.py +200 -0
- tooluniverse/test/test_disease_target_score.py +56 -0
- tooluniverse/test/test_drugbank_filter_examples.py +179 -0
- tooluniverse/test/test_efo.py +31 -0
- tooluniverse/test/test_enrichr_tool.py +21 -0
- tooluniverse/test/test_europe_pmc_tool.py +20 -0
- tooluniverse/test/test_fda_adv.py +95 -0
- tooluniverse/test/test_fda_drug_labeling.py +91 -0
- tooluniverse/test/test_gene_ontology_tools.py +66 -0
- tooluniverse/test/test_gwas_tool.py +139 -0
- tooluniverse/test/test_hpa.py +625 -0
- tooluniverse/test/test_humanbase_tool.py +20 -0
- tooluniverse/test/test_idmap_tools.py +61 -0
- tooluniverse/test/test_mcp_server.py +211 -0
- tooluniverse/test/test_mcp_tool.py +247 -0
- tooluniverse/test/test_medlineplus.py +220 -0
- tooluniverse/test/test_openalex_tool.py +32 -0
- tooluniverse/test/test_opentargets.py +28 -0
- tooluniverse/test/test_pubchem_tool.py +116 -0
- tooluniverse/test/test_pubtator_tool.py +37 -0
- tooluniverse/test/test_rcsb_pdb_tool.py +86 -0
- tooluniverse/test/test_reactome.py +54 -0
- tooluniverse/test/test_semantic_scholar_tool.py +24 -0
- tooluniverse/test/test_software_tools.py +147 -0
- tooluniverse/test/test_tool_description_optimizer.py +49 -0
- tooluniverse/test/test_tool_finder.py +26 -0
- tooluniverse/test/test_tool_finder_llm.py +252 -0
- tooluniverse/test/test_tools_find.py +195 -0
- tooluniverse/test/test_uniprot_tools.py +74 -0
- tooluniverse/test/test_uspto_tool.py +72 -0
- tooluniverse/test/test_xml_tool.py +113 -0
- tooluniverse/tool_finder_embedding.py +267 -0
- tooluniverse/tool_finder_keyword.py +693 -0
- tooluniverse/tool_finder_llm.py +699 -0
- tooluniverse/tool_graph_web_ui.py +955 -0
- tooluniverse/tool_registry.py +416 -0
- tooluniverse/uniprot_tool.py +155 -0
- tooluniverse/url_tool.py +253 -0
- tooluniverse/uspto_tool.py +240 -0
- tooluniverse/utils.py +369 -41
- tooluniverse/xml_tool.py +369 -0
- tooluniverse-1.0.0.dist-info/METADATA +377 -0
- tooluniverse-1.0.0.dist-info/RECORD +186 -0
- tooluniverse-1.0.0.dist-info/entry_points.txt +9 -0
- tooluniverse/generate_mcp_tools.py +0 -113
- tooluniverse/mcp_server.py +0 -3340
- tooluniverse-0.2.0.dist-info/METADATA +0 -139
- tooluniverse-0.2.0.dist-info/RECORD +0 -21
- tooluniverse-0.2.0.dist-info/entry_points.txt +0 -4
- {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.0.dist-info}/WHEEL +0 -0
- {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.0.dist-info}/top_level.txt +0 -0
tooluniverse/embedding_sync.py

@@ -0,0 +1,396 @@
+"""
+Embedding Sync Tool for ToolUniverse
+
+Synchronize embedding databases with HuggingFace Hub for sharing and collaboration.
+Supports uploading local databases to HuggingFace and downloading databases from HuggingFace.
+"""
+
+import os
+import json
+import shutil
+from pathlib import Path
+from typing import Dict
+from datetime import datetime
+
+try:
+    from huggingface_hub import HfApi, upload_folder, snapshot_download
+    from huggingface_hub.utils import HfHubHTTPError
+except ImportError:
+    raise ImportError(
+        "huggingface_hub is required. Install with: pip install huggingface_hub"
+    )
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+from .logging_config import get_logger
+
+
+@register_tool("EmbeddingSync")
+class EmbeddingSync(BaseTool):
+    """
+    Sync embedding databases with HuggingFace Hub.
+    Supports uploading local databases and downloading shared databases.
+    """
+
+    def __init__(self, tool_config):
+        super().__init__(tool_config)
+        self.logger = get_logger("EmbeddingSync")
+
+        # HuggingFace configuration
+        hf_config = tool_config.get("configs", {}).get("huggingface_config", {})
+        self.hf_token = hf_config.get("token") or os.getenv("HF_TOKEN")
+        self.hf_endpoint = hf_config.get("endpoint", "https://huggingface.co")
+
+        if not self.hf_token:
+            self.logger.warning(
+                "HuggingFace token not found. Some operations may fail."
+            )
+
+        # Initialize HF API
+        self.hf_api = HfApi(endpoint=self.hf_endpoint, token=self.hf_token)
+
+        # Storage configuration
+        storage_config = tool_config.get("configs", {}).get("storage_config", {})
+        self.data_dir = Path(storage_config.get("data_dir", "./data/embeddings"))
+        self.export_dir = Path(storage_config.get("export_dir", "./exports"))
+
+        # Ensure directories exist
+        self.data_dir.mkdir(parents=True, exist_ok=True)
+        self.export_dir.mkdir(parents=True, exist_ok=True)
+
+    def run(self, arguments):
+        """Main entry point for the tool"""
+        action = arguments.get("action")
+
+        if action == "upload":
+            return self._upload_to_huggingface(arguments)
+        elif action == "download":
+            return self._download_from_huggingface(arguments)
+        else:
+            return {"error": f"Unknown action: {action}"}
+
+    def _upload_to_huggingface(self, arguments):
+        """Upload local database to HuggingFace Hub"""
+        database_name = arguments.get("database_name")
+        repository = arguments.get("repository")
+        description = arguments.get("description", "")
+        private = arguments.get("private", False)
+        commit_message = arguments.get(
+            "commit_message", f"Upload {database_name} database"
+        )
+
+        if not database_name:
+            return {"error": "database_name is required"}
+        if not repository:
+            return {"error": "repository is required (format: username/repo-name)"}
+        if not self.hf_token:
+            return {"error": "HuggingFace token required for upload operations"}
+
+        try:
+            # Check if local database exists
+            db_path = self.data_dir / "embeddings.db"
+            index_path = self.data_dir / f"{database_name}.faiss"
+
+            if not db_path.exists():
+                return {"error": "Local embeddings database not found"}
+            if not index_path.exists():
+                return {
+                    "error": f"FAISS index for database '{database_name}' not found"
+                }
+
+            # Create export directory for this upload
+            export_path = (
+                self.export_dir
+                / f"{database_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+            )
+            export_path.mkdir(parents=True, exist_ok=True)
+
+            # Copy database files to export directory
+            shutil.copy2(db_path, export_path / "embeddings.db")
+            shutil.copy2(index_path, export_path / f"{database_name}.faiss")
+
+            # Create database info file
+            db_info = self._get_database_info(database_name)
+            if not db_info:
+                return {
+                    "error": f"Database '{database_name}' not found in local storage"
+                }
+
+            info_file = {
+                "database_name": database_name,
+                "description": description,
+                "embedding_model": db_info.get("embedding_model"),
+                "embedding_dimensions": db_info.get("embedding_dimensions"),
+                "document_count": db_info.get("document_count"),
+                "created_at": db_info.get("created_at"),
+                "uploaded_at": datetime.now().isoformat(),
+                "version": "1.0.0",
+                "format": "tooluniverse_embedding_db",
+            }
+
+            with open(export_path / "database_info.json", "w") as f:
+                json.dump(info_file, f, indent=2)
+
+            # Create README file
+            readme_content = self._generate_readme(database_name, description, db_info)
+            with open(export_path / "README.md", "w") as f:
+                f.write(readme_content)
+
+            # Create repository if it doesn't exist
+            try:
+                self.hf_api.repo_info(repository, repo_type="dataset")
+                self.logger.info(f"Repository {repository} already exists")
+            except HfHubHTTPError:
+                self.logger.info(f"Creating new repository: {repository}")
+                self.hf_api.create_repo(
+                    repo_id=repository, repo_type="dataset", private=private
+                )
+
+            # Upload files to HuggingFace
+            self.logger.info(f"Uploading database to {repository}")
+            upload_folder(
+                folder_path=str(export_path),
+                repo_id=repository,
+                repo_type="dataset",
+                token=self.hf_token,
+                commit_message=commit_message,
+            )
+
+            # Clean up export directory
+            shutil.rmtree(export_path)
+
+            return {
+                "status": "success",
+                "database_name": database_name,
+                "repository": repository,
+                "document_count": db_info.get("document_count"),
+                "upload_url": f"{self.hf_endpoint}/datasets/{repository}",
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error uploading to HuggingFace: {str(e)}")
+            return {"error": f"Failed to upload: {str(e)}"}
+
+    def _download_from_huggingface(self, arguments):
+        """Download database from HuggingFace Hub"""
+        repository = arguments.get("repository")
+        local_name = arguments.get("local_name")
+        overwrite = arguments.get("overwrite", False)
+
+        if not repository:
+            return {"error": "repository is required (format: username/repo-name)"}
+        if not local_name:
+            local_name = repository.split("/")[-1]  # Use repo name as default
+
+        try:
+            # Check if local database already exists
+            if self._local_database_exists(local_name) and not overwrite:
+                return {
+                    "error": f"Local database '{local_name}' already exists. Use overwrite=true to replace."
+                }
+
+            # Download repository to temporary directory
+            temp_dir = (
+                self.export_dir / f"download_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+            )
+
+            self.logger.info(f"Downloading database from {repository}")
+            snapshot_download(
+                repo_id=repository,
+                repo_type="dataset",
+                local_dir=str(temp_dir),
+                token=self.hf_token,
+            )
+
+            # Verify required files exist
+            db_file = temp_dir / "embeddings.db"
+            info_file = temp_dir / "database_info.json"
+
+            if not db_file.exists():
+                shutil.rmtree(temp_dir)
+                return {"error": "Downloaded repository does not contain embeddings.db"}
+
+            if not info_file.exists():
+                shutil.rmtree(temp_dir)
+                return {
+                    "error": "Downloaded repository does not contain database_info.json"
+                }
+
+            # Load database info
+            with open(info_file) as f:
+                db_info = json.load(f)
+
+            original_name = db_info.get("database_name")
+            faiss_file = temp_dir / f"{original_name}.faiss"
+
+            if not faiss_file.exists():
+                shutil.rmtree(temp_dir)
+                return {
+                    "error": f"FAISS index file {original_name}.faiss not found in download"
+                }
+
+            # Copy files to local storage with new name
+            local_db_path = self.data_dir / "embeddings.db"
+            local_index_path = self.data_dir / f"{local_name}.faiss"
+
+            # Handle database file (merge or replace)
+            if local_db_path.exists() and not overwrite:
+                # Merge databases (simplified approach - copy tables)
+                self._merge_databases(
+                    str(db_file), str(local_db_path), original_name, local_name
+                )
+            else:
+                shutil.copy2(db_file, local_db_path)
+                self._rename_database_in_db(
+                    str(local_db_path), original_name, local_name
+                )
+
+            # Copy FAISS index
+            shutil.copy2(faiss_file, local_index_path)
+
+            # Clean up
+            shutil.rmtree(temp_dir)
+
+            return {
+                "status": "success",
+                "repository": repository,
+                "local_name": local_name,
+                "document_count": db_info.get("document_count"),
+                "embedding_model": db_info.get("embedding_model"),
+                "downloaded_at": datetime.now().isoformat(),
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error downloading from HuggingFace: {str(e)}")
+            # Clean up on error
+            if "temp_dir" in locals() and temp_dir.exists():
+                shutil.rmtree(temp_dir)
+            return {"error": f"Failed to download: {str(e)}"}
+
+    def _get_database_info(self, database_name: str) -> Dict:
+        """Get database information from local SQLite"""
+        import sqlite3
+
+        db_path = self.data_dir / "embeddings.db"
+        if not db_path.exists():
+            return {}
+
+        try:
+            with sqlite3.connect(db_path) as conn:
+                cursor = conn.execute(
+                    """
+                    SELECT name, description, embedding_model, embedding_dimensions, document_count, created_at
+                    FROM databases WHERE name = ?
+                    """,
+                    (database_name,),
+                )
+                row = cursor.fetchone()
+                if row:
+                    return {
+                        "name": row[0],
+                        "description": row[1],
+                        "embedding_model": row[2],
+                        "embedding_dimensions": row[3],
+                        "document_count": row[4],
+                        "created_at": row[5],
+                    }
+        except Exception as e:
+            self.logger.error(f"Error getting database info: {str(e)}")
+
+        return {}
+
+    def _local_database_exists(self, database_name: str) -> bool:
+        """Check if database exists locally"""
+        return bool(self._get_database_info(database_name))
+
+    def _generate_readme(
+        self, database_name: str, description: str, db_info: Dict
+    ) -> str:
+        """Generate README content for HuggingFace repository"""
+        return f"""# {database_name} - Embedding Database
+
+## Description
+{description or 'Embedding database created with ToolUniverse'}
+
+## Database Information
+- **Documents**: {db_info.get('document_count', 'Unknown')}
+- **Embedding Model**: {db_info.get('embedding_model', 'Unknown')}
+- **Dimensions**: {db_info.get('embedding_dimensions', 'Unknown')}
+- **Created**: {db_info.get('created_at', 'Unknown')}
+
+## Usage
+
+To use this database in ToolUniverse:
+
+```python
+from src.tooluniverse.execute_function import ToolUniverse
+
+# Download and load the database
+tu = ToolUniverse()
+sync = tu.init_tool("EmbeddingSync")
+
+# Download from HuggingFace
+sync.run({{
+    "action": "download",
+    "repository": "username/repo-name",
+    "local_name": "{database_name}"
+}})
+
+# Search the database
+db = tu.init_tool("EmbeddingDatabaseSearch")
+results = db.run({{
+    "database_name": "{database_name}",
+    "query": "your search query",
+    "top_k": 5
+}})
+```
+
+## Format
+This database uses the ToolUniverse embedding database format with FAISS vector index and SQLite metadata storage.
+"""
+
+    def _merge_databases(
+        self, source_db: str, target_db: str, source_name: str, target_name: str
+    ):
+        """Merge source database into target database (simplified implementation)"""
+        import sqlite3
+
+        # This is a simplified merge - in practice, you'd want more sophisticated handling
+        with sqlite3.connect(source_db) as source_conn:
+            with sqlite3.connect(target_db) as target_conn:
+                # Copy database record
+                source_conn.execute(
+                    "UPDATE databases SET name = ? WHERE name = ?",
+                    (target_name, source_name),
+                )
+
+                # Copy all records (simplified)
+                target_conn.execute("ATTACH DATABASE ? AS source_db", (source_db,))
+                target_conn.execute(
+                    """
+                    INSERT OR REPLACE INTO databases
+                    SELECT * FROM source_db.databases WHERE name = ?
+                    """,
+                    (target_name,),
+                )
+                target_conn.execute(
+                    """
+                    INSERT INTO documents
+                    SELECT * FROM source_db.documents WHERE database_name = ?
+                    """,
+                    (target_name,),
+                )
+                target_conn.execute("DETACH DATABASE source_db")
+
+    def _rename_database_in_db(self, db_path: str, old_name: str, new_name: str):
+        """Rename database in SQLite file"""
+        import sqlite3
+
+        with sqlite3.connect(db_path) as conn:
+            conn.execute(
+                "UPDATE databases SET name = ? WHERE name = ?", (new_name, old_name)
+            )
+            conn.execute(
+                "UPDATE documents SET database_name = ? WHERE database_name = ?",
+                (new_name, old_name),
+            )
tooluniverse/enrichr_tool.py

@@ -0,0 +1,266 @@
+import json
+import requests
+import urllib.parse
+import networkx as nx
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+
+@register_tool("EnrichrTool")
+class EnrichrTool(BaseTool):
+    """
+    Tool to perform gene enrichment analysis using Enrichr.
+    """
+
+    def __init__(self, tool_config):
+        super().__init__(tool_config)
+        # Constants
+        self.enrichr_url = "https://maayanlab.cloud/Enrichr/addList"
+        self.enrichment_url = "https://maayanlab.cloud/Enrichr/enrich"
+
+    def run(self, arguments):
+        """Main entry point for the tool."""
+        genes = arguments.get("gene_list")
+        libs = arguments.get(
+            "libs",
+            [
+                "WikiPathways_2024_Human",
+                "Reactome_Pathways_2024",
+                "MSigDB_Hallmark_2020",
+                "GO_Molecular_Function_2023",
+                "GO_Biological_Process_2023",
+            ],
+        )
+        return self.enrichr_api(genes, libs)
+
+    def get_official_gene_name(self, gene_name):
+        """
+        Retrieve the official gene symbol for a given gene name or synonym using the MyGene.info API.
+
+        Parameters:
+            gene_name (str): The gene name or synonym to query.
+
+        Returns:
+            str: The official gene symbol if found; otherwise, raises an Exception.
+        """
+        # URL-encode the gene_name to handle special characters
+        encoded_gene_name = urllib.parse.quote(gene_name)
+        url = f"https://mygene.info/v3/query?q={encoded_gene_name}&fields=symbol,alias&species=human"
+
+        response = requests.get(url)
+        if response.status_code != 200:
+            return f"Error querying MyGene.info API: {response.status_code}"
+
+        data = response.json()
+        hits = data.get("hits", [])
+        if not hits:
+            return f"No data found for: {gene_name}. Please check the gene name and try again."
+
+        # Attempt to find an exact match in the official symbol or among aliases.
+        for hit in hits:
+            symbol = hit.get("symbol", "")
+            if symbol.upper() == gene_name.upper():
+                print(
+                    f"[enrichr_api] Using the official gene name: '{symbol}' instead of {gene_name}",
+                    flush=True,
+                )
+                return symbol
+            aliases = hit.get("alias", [])
+            if any(gene_name.upper() == alias.upper() for alias in aliases):
+                print(
+                    f"[enrichr_api] Using the official gene name: '{symbol}' instead of {gene_name}",
+                    flush=True,
+                )
+                return symbol
+
+        # If no exact match is found, return the symbol of the top hit.
+        top_hit = hits[0]
+        symbol = top_hit.get("symbol", None)
+        if symbol:
+            print(
+                f"[enrichr_api] Using the official gene name: '{symbol}' instead of {gene_name}",
+                flush=True,
+            )
+            return symbol
+        else:
+            return f"No official gene symbol found for: {gene_name}. Please ensure it is correct."
+
+    def submit_gene_list(self, gene_list):
+        """
+        Submit the gene list to Enrichr and return the user list ID.
+
+        Parameters:
+            gene_list (str): Newline-separated string of gene names.
+
+        Returns:
+            str: The user list ID from Enrichr.
+        """
+        payload = {
+            "list": (None, gene_list),
+            "description": (None, f"Gene list for {gene_list}"),
+        }
+        response = requests.post(self.enrichr_url, files=payload)
+
+        if not response.ok:
+            return "Error submitting gene list to Enrichr"
+
+        return json.loads(response.text)["userListId"]
+
+    def get_enrichment_results(self, user_list_id, library):
+        """
+        Fetch enrichment results for a specific library.
+
+        Parameters:
+            user_list_id (str): The user list ID from Enrichr.
+            library (str): The name of the enrichment library.
+
+        Returns:
+            dict: The enrichment results.
+        """
+        query_string = f"?userListId={user_list_id}&backgroundType={library}"
+        response = requests.get(self.enrichment_url + query_string)
+
+        if not response.ok:
+            return f"Error fetching enrichment results for {library}"
+
+        return json.loads(response.text)
+
+    def build_graph(self, genes, enrichment_results):
+        """
+        Initialize and build the graph with gene nodes and enriched terms.
+
+        Parameters:
+            genes (list): List of gene names.
+            enrichment_results (dict): Dictionary of enrichment results by library.
+
+        Returns:
+            networkx.Graph: The constructed graph.
+        """
+        G = nx.Graph()
+
+        # Add gene nodes
+        for gene in genes:
+            G.add_node(gene, type="gene")
+
+        # Add enriched terms and edges
+        for library, results in enrichment_results.items():
+            for term in results:
+                term_name = term[1]
+                associated_genes = term[5]
+                G.add_node(term_name, type="term", library=library)
+
+                for gene in associated_genes:
+                    if gene in genes:
+                        G.add_edge(gene, term_name, weight=round(term[4], 2))
+
+        return G
+
+    def rank_paths_by_weight(self, G, source, target):
+        """
+        Find and rank paths between source and target based on total edge weight.
+
+        Parameters:
+            G (networkx.Graph): The graph to search.
+            source (str): The source node.
+            target (str): The target node.
+
+        Returns:
+            list: List of tuples (path, weight) sorted by weight descending.
+        """
+        all_paths = list(nx.all_simple_paths(G, source=source, target=target))
+        path_weights = []
+
+        for path in all_paths:
+            total_weight = sum(
+                G[path[i]][path[i + 1]].get("weight", 1) for i in range(len(path) - 1)
+            )
+            path_weights.append((path, total_weight))
+
+        return sorted(path_weights, key=lambda x: x[1], reverse=True)
+
+    def rank_paths_to_term(self, G, gene, term):
+        """
+        Find and rank paths from each gene to a specified term based on total edge weight.
+
+        Parameters:
+            G (networkx.Graph): The graph to search.
+            gene (str): The source gene.
+            term (str): The target term.
+
+        Returns:
+            list or None: List of tuples (path, weight) sorted by weight descending, or None if no paths.
+        """
+        all_paths = list(nx.all_simple_paths(G, source=gene, target=term))
+        path_weights = []
+
+        for path in all_paths:
+            total_weight = sum(
+                G[path[i]][path[i + 1]].get("weight", 1) for i in range(len(path) - 1)
+            )
+            path_weights.append((path, total_weight))
+
+        if len(path_weights) != 0:
+            return sorted(path_weights, key=lambda x: x[1], reverse=True)
+        return None
+
+    def enrichr_api(self, genes, libs):
+        """
+        Main API function to perform gene enrichment analysis.
+
+        Parameters:
+            genes (list): List of gene names.
+            libs (list): List of enrichment libraries to use.
+
+        Returns:
+            tuple: (connected_path, connections) dictionaries.
+        """
+        # Convert each gene to its official name and log the result
+        genes = [self.get_official_gene_name(gene) for gene in genes]
+        print("Official gene names:", genes)
+
+        # Ensure at least two genes are provided for path ranking
+        if len(genes) < 2:
+            raise ValueError(
+                "At least two genes are required to rank paths between genes."
+            )
+
+        # Prepare the gene list for Enrichr submission
+        gene_list_str = "\n".join(genes)
+        user_list_id = self.submit_gene_list(gene_list_str)
+
+        # Retrieve enrichment results for each specified library
+        enrichment_results = {}
+        for library in libs:
+            results = self.get_enrichment_results(user_list_id, library)
+            # Safely get the top 5 results; if the library key isn't found, default to an empty list
+            enrichment_results[library] = results.get(library, [])[:5]
+
+        # Build the graph from the gene list and enrichment results
+        G = self.build_graph(genes, enrichment_results)
+
+        # Rank paths from the first gene to the second
+        ranked_paths = self.rank_paths_by_weight(G, genes[0], genes[1])
+        connected_path = {}
+        for path, weight in ranked_paths:
+            connected_path[f"Path: {path}"] = f"Total Weight: {weight}"
+
+        # Compute connectivity data for each gene and graph node
+        connections = {}
+        for gene in genes:
+            for term in G.nodes:
+                paths_to_term = self.rank_paths_to_term(G, gene, term)
+                if paths_to_term is not None:
+                    connections[f"Connectivity: {gene} - {term}"] = paths_to_term
+
+        # Check for empty outputs and print helper messages
+        if not connected_path:
+            print(
+                f"[Enrichr] No ranked paths were found between the gene pair {genes}."
+            )
+
+        if not connections:
+            print(
+                f"[Enrichr] No connection between genes and terms in the enriched graph of {genes}."
+            )
+
+        return connected_path, connections
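`EnrichrTool.run` needs only a `gene_list`; `libs` falls back to the five default libraries shown in `run`, and `enrichr_api` raises a `ValueError` unless at least two genes are supplied. A minimal sketch of a direct call; the gene symbols are placeholders, and passing an empty config dict assumes `BaseTool.__init__` requires nothing more:

```python
from tooluniverse.enrichr_tool import EnrichrTool

# Empty config: __init__ itself only sets the two Enrichr endpoint URLs.
tool = EnrichrTool({})

# At least two genes are required so paths can be ranked between the pair.
connected_path, connections = tool.run({
    "gene_list": ["TP53", "MDM2"],  # placeholder gene symbols
    # "libs": [...],  # optional; defaults to the five libraries in run()
})

# Highest-weight gene-term-gene paths between the first two genes.
for path, weight in list(connected_path.items())[:3]:
    print(path, "->", weight)
```

Edge weights come from `round(term[4], 2)` in `build_graph`, so heavier paths run through shared enriched terms with higher Enrichr scores.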