tooluniverse 0.2.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tooluniverse might be problematic. Click here for more details.
- tooluniverse/__init__.py +340 -4
- tooluniverse/admetai_tool.py +84 -0
- tooluniverse/agentic_tool.py +563 -0
- tooluniverse/alphafold_tool.py +96 -0
- tooluniverse/base_tool.py +129 -6
- tooluniverse/boltz_tool.py +207 -0
- tooluniverse/chem_tool.py +192 -0
- tooluniverse/compose_scripts/__init__.py +1 -0
- tooluniverse/compose_scripts/biomarker_discovery.py +293 -0
- tooluniverse/compose_scripts/comprehensive_drug_discovery.py +186 -0
- tooluniverse/compose_scripts/drug_safety_analyzer.py +89 -0
- tooluniverse/compose_scripts/literature_tool.py +34 -0
- tooluniverse/compose_scripts/output_summarizer.py +279 -0
- tooluniverse/compose_scripts/tool_description_optimizer.py +681 -0
- tooluniverse/compose_scripts/tool_discover.py +705 -0
- tooluniverse/compose_scripts/tool_graph_composer.py +448 -0
- tooluniverse/compose_tool.py +371 -0
- tooluniverse/ctg_tool.py +1002 -0
- tooluniverse/custom_tool.py +81 -0
- tooluniverse/dailymed_tool.py +108 -0
- tooluniverse/data/admetai_tools.json +155 -0
- tooluniverse/data/adverse_event_tools.json +108 -0
- tooluniverse/data/agentic_tools.json +1156 -0
- tooluniverse/data/alphafold_tools.json +87 -0
- tooluniverse/data/boltz_tools.json +9 -0
- tooluniverse/data/chembl_tools.json +16 -0
- tooluniverse/data/clinicaltrials_gov_tools.json +326 -0
- tooluniverse/data/compose_tools.json +202 -0
- tooluniverse/data/dailymed_tools.json +70 -0
- tooluniverse/data/dataset_tools.json +646 -0
- tooluniverse/data/disease_target_score_tools.json +712 -0
- tooluniverse/data/efo_tools.json +17 -0
- tooluniverse/data/embedding_tools.json +319 -0
- tooluniverse/data/enrichr_tools.json +31 -0
- tooluniverse/data/europe_pmc_tools.json +22 -0
- tooluniverse/data/expert_feedback_tools.json +10 -0
- tooluniverse/data/fda_drug_adverse_event_tools.json +491 -0
- tooluniverse/data/fda_drug_labeling_tools.json +1 -1
- tooluniverse/data/fda_drugs_with_brand_generic_names_for_tool.py +76929 -148860
- tooluniverse/data/finder_tools.json +209 -0
- tooluniverse/data/gene_ontology_tools.json +113 -0
- tooluniverse/data/gwas_tools.json +1082 -0
- tooluniverse/data/hpa_tools.json +333 -0
- tooluniverse/data/humanbase_tools.json +47 -0
- tooluniverse/data/idmap_tools.json +74 -0
- tooluniverse/data/mcp_client_tools_example.json +113 -0
- tooluniverse/data/mcpautoloadertool_defaults.json +28 -0
- tooluniverse/data/medlineplus_tools.json +141 -0
- tooluniverse/data/monarch_tools.json +1 -1
- tooluniverse/data/openalex_tools.json +36 -0
- tooluniverse/data/opentarget_tools.json +1 -1
- tooluniverse/data/output_summarization_tools.json +101 -0
- tooluniverse/data/packages/bioinformatics_core_tools.json +1756 -0
- tooluniverse/data/packages/categorized_tools.txt +206 -0
- tooluniverse/data/packages/cheminformatics_tools.json +347 -0
- tooluniverse/data/packages/earth_sciences_tools.json +74 -0
- tooluniverse/data/packages/genomics_tools.json +776 -0
- tooluniverse/data/packages/image_processing_tools.json +38 -0
- tooluniverse/data/packages/machine_learning_tools.json +789 -0
- tooluniverse/data/packages/neuroscience_tools.json +62 -0
- tooluniverse/data/packages/original_tools.txt +0 -0
- tooluniverse/data/packages/physics_astronomy_tools.json +62 -0
- tooluniverse/data/packages/scientific_computing_tools.json +560 -0
- tooluniverse/data/packages/single_cell_tools.json +453 -0
- tooluniverse/data/packages/structural_biology_tools.json +396 -0
- tooluniverse/data/packages/visualization_tools.json +399 -0
- tooluniverse/data/pubchem_tools.json +215 -0
- tooluniverse/data/pubtator_tools.json +68 -0
- tooluniverse/data/rcsb_pdb_tools.json +1332 -0
- tooluniverse/data/reactome_tools.json +19 -0
- tooluniverse/data/semantic_scholar_tools.json +26 -0
- tooluniverse/data/special_tools.json +2 -25
- tooluniverse/data/tool_composition_tools.json +88 -0
- tooluniverse/data/toolfinderkeyword_defaults.json +34 -0
- tooluniverse/data/txagent_client_tools.json +9 -0
- tooluniverse/data/uniprot_tools.json +211 -0
- tooluniverse/data/url_fetch_tools.json +94 -0
- tooluniverse/data/uspto_downloader_tools.json +9 -0
- tooluniverse/data/uspto_tools.json +811 -0
- tooluniverse/data/xml_tools.json +3275 -0
- tooluniverse/dataset_tool.py +296 -0
- tooluniverse/default_config.py +165 -0
- tooluniverse/efo_tool.py +42 -0
- tooluniverse/embedding_database.py +630 -0
- tooluniverse/embedding_sync.py +396 -0
- tooluniverse/enrichr_tool.py +266 -0
- tooluniverse/europe_pmc_tool.py +52 -0
- tooluniverse/execute_function.py +1775 -95
- tooluniverse/extended_hooks.py +444 -0
- tooluniverse/gene_ontology_tool.py +194 -0
- tooluniverse/graphql_tool.py +158 -36
- tooluniverse/gwas_tool.py +358 -0
- tooluniverse/hpa_tool.py +1645 -0
- tooluniverse/humanbase_tool.py +389 -0
- tooluniverse/logging_config.py +254 -0
- tooluniverse/mcp_client_tool.py +764 -0
- tooluniverse/mcp_integration.py +413 -0
- tooluniverse/mcp_tool_registry.py +925 -0
- tooluniverse/medlineplus_tool.py +337 -0
- tooluniverse/openalex_tool.py +228 -0
- tooluniverse/openfda_adv_tool.py +283 -0
- tooluniverse/openfda_tool.py +393 -160
- tooluniverse/output_hook.py +1122 -0
- tooluniverse/package_tool.py +195 -0
- tooluniverse/pubchem_tool.py +158 -0
- tooluniverse/pubtator_tool.py +168 -0
- tooluniverse/rcsb_pdb_tool.py +38 -0
- tooluniverse/reactome_tool.py +108 -0
- tooluniverse/remote/boltz/boltz_mcp_server.py +50 -0
- tooluniverse/remote/depmap_24q2/depmap_24q2_mcp_tool.py +442 -0
- tooluniverse/remote/expert_feedback/human_expert_mcp_tools.py +2013 -0
- tooluniverse/remote/expert_feedback/simple_test.py +23 -0
- tooluniverse/remote/expert_feedback/start_web_interface.py +188 -0
- tooluniverse/remote/expert_feedback/web_only_interface.py +0 -0
- tooluniverse/remote/immune_compass/compass_tool.py +327 -0
- tooluniverse/remote/pinnacle/pinnacle_tool.py +328 -0
- tooluniverse/remote/transcriptformer/transcriptformer_tool.py +586 -0
- tooluniverse/remote/uspto_downloader/uspto_downloader_mcp_server.py +61 -0
- tooluniverse/remote/uspto_downloader/uspto_downloader_tool.py +120 -0
- tooluniverse/remote_tool.py +99 -0
- tooluniverse/restful_tool.py +53 -30
- tooluniverse/scripts/generate_tool_graph.py +408 -0
- tooluniverse/scripts/visualize_tool_graph.py +829 -0
- tooluniverse/semantic_scholar_tool.py +62 -0
- tooluniverse/smcp.py +2452 -0
- tooluniverse/smcp_server.py +975 -0
- tooluniverse/test/mcp_server_test.py +0 -0
- tooluniverse/test/test_admetai_tool.py +370 -0
- tooluniverse/test/test_agentic_tool.py +129 -0
- tooluniverse/test/test_alphafold_tool.py +71 -0
- tooluniverse/test/test_chem_tool.py +37 -0
- tooluniverse/test/test_compose_lieraturereview.py +63 -0
- tooluniverse/test/test_compose_tool.py +448 -0
- tooluniverse/test/test_dailymed.py +69 -0
- tooluniverse/test/test_dataset_tool.py +200 -0
- tooluniverse/test/test_disease_target_score.py +56 -0
- tooluniverse/test/test_drugbank_filter_examples.py +179 -0
- tooluniverse/test/test_efo.py +31 -0
- tooluniverse/test/test_enrichr_tool.py +21 -0
- tooluniverse/test/test_europe_pmc_tool.py +20 -0
- tooluniverse/test/test_fda_adv.py +95 -0
- tooluniverse/test/test_fda_drug_labeling.py +91 -0
- tooluniverse/test/test_gene_ontology_tools.py +66 -0
- tooluniverse/test/test_gwas_tool.py +139 -0
- tooluniverse/test/test_hpa.py +625 -0
- tooluniverse/test/test_humanbase_tool.py +20 -0
- tooluniverse/test/test_idmap_tools.py +61 -0
- tooluniverse/test/test_mcp_server.py +211 -0
- tooluniverse/test/test_mcp_tool.py +247 -0
- tooluniverse/test/test_medlineplus.py +220 -0
- tooluniverse/test/test_openalex_tool.py +32 -0
- tooluniverse/test/test_opentargets.py +28 -0
- tooluniverse/test/test_pubchem_tool.py +116 -0
- tooluniverse/test/test_pubtator_tool.py +37 -0
- tooluniverse/test/test_rcsb_pdb_tool.py +86 -0
- tooluniverse/test/test_reactome.py +54 -0
- tooluniverse/test/test_semantic_scholar_tool.py +24 -0
- tooluniverse/test/test_software_tools.py +147 -0
- tooluniverse/test/test_tool_description_optimizer.py +49 -0
- tooluniverse/test/test_tool_finder.py +26 -0
- tooluniverse/test/test_tool_finder_llm.py +252 -0
- tooluniverse/test/test_tools_find.py +195 -0
- tooluniverse/test/test_uniprot_tools.py +74 -0
- tooluniverse/test/test_uspto_tool.py +72 -0
- tooluniverse/test/test_xml_tool.py +113 -0
- tooluniverse/tool_finder_embedding.py +267 -0
- tooluniverse/tool_finder_keyword.py +693 -0
- tooluniverse/tool_finder_llm.py +699 -0
- tooluniverse/tool_graph_web_ui.py +955 -0
- tooluniverse/tool_registry.py +416 -0
- tooluniverse/uniprot_tool.py +155 -0
- tooluniverse/url_tool.py +253 -0
- tooluniverse/uspto_tool.py +240 -0
- tooluniverse/utils.py +369 -41
- tooluniverse/xml_tool.py +369 -0
- tooluniverse-1.0.1.dist-info/METADATA +387 -0
- tooluniverse-1.0.1.dist-info/RECORD +182 -0
- tooluniverse-1.0.1.dist-info/entry_points.txt +9 -0
- tooluniverse/generate_mcp_tools.py +0 -113
- tooluniverse/mcp_server.py +0 -3340
- tooluniverse-0.2.0.dist-info/METADATA +0 -139
- tooluniverse-0.2.0.dist-info/RECORD +0 -21
- tooluniverse-0.2.0.dist-info/entry_points.txt +0 -4
- {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.1.dist-info}/WHEEL +0 -0
- {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,586 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Transcriptformer Gene Embedding Tool - MCP Server
|
|
3
|
+
|
|
4
|
+
This module provides an MCP (Model Context Protocol) server for retrieving
|
|
5
|
+
pre-computed gene embeddings from the Transcriptformer model. Transcriptformer
|
|
6
|
+
is a transformer-based architecture trained on single-cell RNA sequencing data
|
|
7
|
+
to learn contextualized gene representations that capture cell-type-specific
|
|
8
|
+
and disease-state-specific expression patterns.
|
|
9
|
+
|
|
10
|
+
The tool provides access to disease-specific embedding stores that enable:
|
|
11
|
+
- Gene similarity analysis in specific cellular contexts
|
|
12
|
+
- Biomarker discovery and validation
|
|
13
|
+
- Pathway analysis and functional annotation
|
|
14
|
+
- Drug target identification and prioritization
|
|
15
|
+
- Precision medicine applications
|
|
16
|
+
- Systems biology research
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from fastmcp import FastMCP
|
|
20
|
+
import os
|
|
21
|
+
import asyncio
|
|
22
|
+
import uuid
|
|
23
|
+
import gzip
|
|
24
|
+
import json
|
|
25
|
+
import numpy as np
|
|
26
|
+
from typing import Union, List, Dict, Tuple, Optional, Any
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Initialize MCP Server for Transcriptformer gene embedding retrieval
|
|
30
|
+
server = FastMCP("Transcriptformer SMCP Server")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TranscriptformerEmbeddingTool:
    """
    Retrieve pre-computed Transcriptformer gene embeddings from an on-disk store.

    The store lives under
    ``<data path>/transcriptformer_embedding/embedding_store`` and contains one
    sub-directory per disease. Each disease directory holds a
    ``metadata.json.gz`` file plus one ``<group>.npy`` matrix per
    cell type / disease state combination; row ``i`` of a matrix belongs to the
    i-th entry of ``ensembl_ids_ordered`` in the metadata.

    Disease, state and cell-type names are matched after normalization
    (lowercase, spaces replaced by underscores). Genes can be requested by
    symbol or by Ensembl ID.

    Attributes:
        base_dir: Path of the embedding store root.
        available_diseases: Normalized names of the disease directories found.
        metadata_cache: Per-disease metadata, filled lazily by
            ``_load_metadata``.
    """

    def __init__(self, data_dir: Optional[str] = None):
        """
        Locate the embedding store and discover the available disease stores.

        Args:
            data_dir: Root data directory. When ``None`` the
                ``TRANSCRIPTFORMER_DATA_PATH`` environment variable is used,
                falling back to ``/root/PrismDB``.

        Raises:
            FileNotFoundError: If the embedding store directory does not exist.
        """
        if data_dir is None:
            transcriptformer_data_path = os.getenv(
                "TRANSCRIPTFORMER_DATA_PATH", "/root/PrismDB"
            )
        else:
            transcriptformer_data_path = data_dir
        self.base_dir = os.path.join(
            transcriptformer_data_path, "transcriptformer_embedding", "embedding_store"
        )

        if not os.path.exists(self.base_dir):
            raise FileNotFoundError(
                f"Transcriptformer embedding store directory not found at {self.base_dir}. Please check your TRANSCRIPTFORMER_DATA_PATH."
            )

        # Map each normalized disease name back to the directory that really
        # exists on disk. Callers always use the normalized name, so without
        # this mapping a directory such as "Breast Cancer" would be advertised
        # as "breast_cancer" but could never be opened.
        self._disease_dirs: Dict[str, str] = {}
        for entry in os.listdir(self.base_dir):
            if not os.path.isdir(os.path.join(self.base_dir, entry)):
                continue
            key = self._normalize(entry)
            # A directory that is already in normalized form wins over a
            # differently spelled sibling that normalizes to the same key.
            if key not in self._disease_dirs or entry == key:
                self._disease_dirs[key] = entry
        self.available_diseases: List[str] = list(self._disease_dirs)

        # Metadata is loaded lazily, once per disease.
        self.metadata_cache: Dict[str, Dict[str, Any]] = {}

        print(
            f"Transcriptformer tool initialized with {len(self.available_diseases)} disease contexts: {self.available_diseases}"
        )

    @staticmethod
    def _normalize(name: str) -> str:
        """Return *name* lowercased with spaces replaced by underscores."""
        return name.lower().replace(" ", "_")

    def _load_metadata(self, disease: str) -> Dict[str, Any]:
        """
        Load (and cache) the metadata of one disease embedding store.

        Args:
            disease: Normalized disease name, i.e. an entry of
                ``available_diseases``.

        Returns:
            The cached metadata dictionary with these keys:
                - store_path: directory of the disease store
                - ensembl_ids_ordered: Ensembl IDs in matrix row order
                - gene_to_idx: Ensembl ID -> matrix row index
                - symbol_to_ensembl: gene symbol -> Ensembl ID
                - available_symbols: sorted gene symbols
                - groups_meta: group metadata keyed by normalized group name
                - available_cell_types: sorted normalized cell types
                - available_states: sorted normalized disease states

        Raises:
            FileNotFoundError: If the disease is unknown or its
                ``metadata.json.gz`` file is missing.
        """
        if disease in self.metadata_cache:
            return self.metadata_cache[disease]

        if disease not in self.available_diseases:
            raise FileNotFoundError(
                f"Disease '{disease}' is not available. Please choose from available diseases: {self.available_diseases}"
            )

        # Resolve the real directory name; fall back to the normalized name
        # when no mapping is available (e.g. the instance was built without
        # running __init__).
        dir_name = getattr(self, "_disease_dirs", {}).get(
            disease, disease.replace(" ", "_")
        )
        store_path = os.path.join(self.base_dir, dir_name)
        metadata_path = os.path.join(store_path, "metadata.json.gz")

        if not os.path.exists(metadata_path):
            raise FileNotFoundError(
                f"Metadata file not found at: {metadata_path}. Please ensure embedding store is properly prepared."
            )

        print(
            f"Loading Transcriptformer metadata from embedding store: {os.path.basename(store_path)}..."
        )
        with gzip.open(metadata_path, "rt", encoding="utf-8") as f:
            metadata = json.load(f)

        ensembl_ids = metadata["ensembl_ids_ordered"]
        symbol_map = metadata["gene_map_symbol_to_ensembl"]
        groups = metadata["groups"]

        cached_data: Dict[str, Any] = {
            "store_path": store_path,
            "ensembl_ids_ordered": ensembl_ids,
            "gene_to_idx": {gene: i for i, gene in enumerate(ensembl_ids)},
            "symbol_to_ensembl": symbol_map,
            "available_symbols": sorted(symbol_map),
            "groups_meta": {self._normalize(k): v for k, v in groups.items()},
            "available_cell_types": sorted(
                {self._normalize(details["cell_type"]) for details in groups.values()}
            ),
            "available_states": sorted(
                {
                    self._normalize(details["disease_state"])
                    for details in groups.values()
                }
            ),
        }
        self.metadata_cache[disease] = cached_data

        print(
            f"Metadata loaded successfully: {len(cached_data['available_symbols'])} genes, "
            f"{len(cached_data['available_cell_types'])} cell types, "
            f"{len(cached_data['available_states'])} disease states."
        )

        return cached_data

    def get_embedding_for_context(
        self,
        state: str,
        cell_type: str,
        gene_names: Union[List[str], None],
        disease: str,
    ) -> Tuple[Optional[Dict[str, np.ndarray]], List[str]]:
        """
        Return gene embeddings for one disease / state / cell-type context.

        Args:
            state: Disease state; normalized before matching.
            cell_type: Cell type; normalized before matching.
            gene_names: Gene symbols and/or Ensembl IDs (identifiers starting
                with ``ENSG``). Matching is case-insensitive; result keys keep
                the spelling that was passed in. ``None`` requests every gene
                of the store, keyed by symbol when one exists and by Ensembl
                ID otherwise.
            disease: Disease store name; normalized before matching.

        Returns:
            ``(embeddings, context_info)``. ``embeddings`` maps each resolved
            gene to a float32 vector, or is ``None`` when the state, cell type
            or context combination is unknown, the matrix file is missing, or
            none of the requested genes is available. ``context_info`` is a
            list of human-readable status and error messages.

        Raises:
            FileNotFoundError: Propagated from ``_load_metadata`` when the
                disease store or its metadata file does not exist.
        """
        disease = self._normalize(disease)
        state = self._normalize(state)
        cell_type = self._normalize(cell_type)

        metadata = self._load_metadata(disease)
        gene_to_idx = metadata["gene_to_idx"]
        symbol_to_ensembl = metadata["symbol_to_ensembl"]

        context_info: List[str] = []
        requested: Dict[str, str] = {}  # result key -> Ensembl ID
        invalid_genes: List[str] = []

        if state not in metadata["available_states"]:
            context_info.append(
                f"Invalid disease state '{state}'. Available states for {disease}: {metadata['available_states']}"
            )
            return None, context_info

        if cell_type not in metadata["available_cell_types"]:
            context_info.append(
                f"Invalid cell type '{cell_type}'. Available cell types for {disease}: {metadata['available_cell_types']}"
            )
            return None, context_info

        if gene_names is None:
            print(
                f"Loading complete gene embedding set for context: {disease} - {state} - {cell_type}"
            )
            # Build the reverse lookup once instead of scanning the symbol map
            # for every Ensembl ID. setdefault keeps the first symbol seen for
            # an ID, which is what a linear scan of the map would return.
            ensembl_to_symbol: Dict[str, str] = {}
            for symbol, ens_id in symbol_to_ensembl.items():
                ensembl_to_symbol.setdefault(ens_id, symbol)

            for ensembl_id in metadata["ensembl_ids_ordered"]:
                gene_symbol = ensembl_to_symbol.get(ensembl_id)
                requested[gene_symbol if gene_symbol else ensembl_id] = ensembl_id
        else:
            for gene_name in gene_names:
                candidate = gene_name.upper()
                if candidate.startswith("ENSG"):
                    ensembl_id = candidate
                else:
                    ensembl_id = symbol_to_ensembl.get(candidate)

                # Only queue genes that really have a matrix row. An unknown
                # Ensembl ID must not be queued, otherwise an all-invalid
                # request slips past the emptiness check below and is
                # reported as a success with zero embeddings.
                if ensembl_id and ensembl_id in gene_to_idx:
                    requested[gene_name] = ensembl_id
                else:
                    invalid_genes.append(gene_name)

            if invalid_genes:
                context_info.append(
                    f"Invalid or unavailable gene identifiers: {invalid_genes}"
                )
                context_info.append(
                    f"Please use valid gene symbols or Ensembl IDs from the {disease} dataset."
                )

        if not requested:
            return None, context_info

        # Group key format: <cell_type>_<state>.
        # NOTE(review): parentheses are stripped here but not from the
        # groups_meta keys or the available cell types/states — confirm this
        # matches how the store names its groups and .npy files.
        group_key = (
            f"{cell_type}_{state}".replace(" ", "_").replace("(", "").replace(")", "")
        )

        if group_key not in metadata["groups_meta"]:
            available_keys = list(metadata["groups_meta"].keys())
            context_info.append(
                f"Context combination not available: state='{state}', cell_type='{cell_type}'"
            )
            context_info.append(
                f"Available context combinations for {disease}: {available_keys}"
            )
            return None, context_info

        npy_path = os.path.join(metadata["store_path"], f"{group_key}.npy")
        if not os.path.exists(npy_path):
            context_info.append(
                f"Embedding matrix file not found for context '{group_key}' at {npy_path}"
            )
            return None, context_info

        print(f"Loading embedding matrix for context: {group_key}")
        embedding_matrix = np.load(npy_path)

        # Row lookup; every queued ID is known to be in gene_to_idx.
        final_embeddings: Dict[str, np.ndarray] = {
            gene_name: embedding_matrix[gene_to_idx[ensembl_id]].astype(np.float32)
            for gene_name, ensembl_id in requested.items()
        }

        context_info.append(
            f"Successfully retrieved {len(final_embeddings)} gene embeddings for context: {disease} - {state} - {cell_type}"
        )
        if final_embeddings:
            embedding_dim = next(iter(final_embeddings.values())).shape[0]
            context_info.append(
                f"Embedding dimensionality: {embedding_dim} features per gene"
            )

        return final_embeddings, context_info
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
@server.tool()
|
|
355
|
+
async def run_transcriptformer_embedding_retrieval(
|
|
356
|
+
state: str,
|
|
357
|
+
cell_type: str,
|
|
358
|
+
gene_names: List[str],
|
|
359
|
+
disease: str,
|
|
360
|
+
data_dir: Optional[str] = None,
|
|
361
|
+
):
|
|
362
|
+
"""
|
|
363
|
+
MCP Tool: Retrieves contextualized gene embeddings from Transcriptformer models.
|
|
364
|
+
|
|
365
|
+
This tool provides access to pre-computed Transcriptformer embeddings that capture
|
|
366
|
+
gene expression patterns learned from single-cell RNA sequencing data. The embeddings
|
|
367
|
+
are contextualized for specific combinations of disease states and cell types,
|
|
368
|
+
enabling precise analysis of gene behavior in relevant biological contexts.
|
|
369
|
+
|
|
370
|
+
Scientific Background:
|
|
371
|
+
- Transcriptformer uses transformer architecture to learn gene representations
|
|
372
|
+
- Embeddings capture cell-type-specific and disease-state-specific expression patterns
|
|
373
|
+
- Model trained on large-scale single-cell RNA-seq datasets
|
|
374
|
+
- Dense vector representations enable similarity analysis and downstream ML applications
|
|
375
|
+
|
|
376
|
+
Applications:
|
|
377
|
+
- Gene similarity analysis and functional annotation
|
|
378
|
+
- Biomarker discovery and validation in disease contexts
|
|
379
|
+
- Pathway analysis and systems biology research
|
|
380
|
+
- Drug target identification and prioritization
|
|
381
|
+
- Precision medicine and personalized therapeutics
|
|
382
|
+
- Co-expression network analysis
|
|
383
|
+
|
|
384
|
+
Technical Details:
|
|
385
|
+
- Embeddings stored as compressed numpy matrices for efficient access
|
|
386
|
+
- On-demand loading minimizes memory usage
|
|
387
|
+
- Supports both gene symbols and Ensembl ID inputs
|
|
388
|
+
- Float32 precision for optimal balance of accuracy and efficiency
|
|
389
|
+
|
|
390
|
+
Args:
|
|
391
|
+
state (str): Disease state context for embedding retrieval. Examples:
|
|
392
|
+
- 'control': Healthy/normal condition
|
|
393
|
+
- 'disease': Disease-affected state
|
|
394
|
+
- 'treated': Post-treatment condition
|
|
395
|
+
- 'untreated': Pre-treatment condition
|
|
396
|
+
Must match available states in the disease-specific store.
|
|
397
|
+
|
|
398
|
+
cell_type (str): Cell type context for embeddings. Examples:
|
|
399
|
+
- 'b_cell': B lymphocytes
|
|
400
|
+
- 't_cell': T lymphocytes
|
|
401
|
+
- 'macrophage': Tissue macrophages
|
|
402
|
+
- 'epithelial_cell': Epithelial cells
|
|
403
|
+
- 'fibroblast': Connective tissue fibroblasts
|
|
404
|
+
Must match available cell types in the disease store.
|
|
405
|
+
|
|
406
|
+
gene_names (List[str]): Gene identifiers for embedding retrieval:
|
|
407
|
+
- Gene symbols: ['TP53', 'BRCA1', 'EGFR', 'MYC']
|
|
408
|
+
- Ensembl IDs: ['ENSG00000141510', 'ENSG00000139618']
|
|
409
|
+
- Mixed formats supported
|
|
410
|
+
- Empty list retrieves all available genes
|
|
411
|
+
|
|
412
|
+
disease (str): Disease/dataset identifier. Examples:
|
|
413
|
+
- 'breast_cancer': Breast cancer scRNA-seq data
|
|
414
|
+
- 'lung_cancer': Lung cancer contexts
|
|
415
|
+
- 'diabetes': Diabetes-related datasets
|
|
416
|
+
- 'alzheimer': Alzheimer's disease contexts
|
|
417
|
+
Must match available disease stores.
|
|
418
|
+
|
|
419
|
+
Returns:
|
|
420
|
+
dict: Comprehensive embedding retrieval results containing:
|
|
421
|
+
- 'embeddings' (dict, optional): Gene-to-embedding mapping where:
|
|
422
|
+
* Keys: Gene identifiers (symbols or Ensembl IDs as provided)
|
|
423
|
+
* Values: Embedding vectors as lists of float32 values
|
|
424
|
+
Only present when embeddings are successfully retrieved.
|
|
425
|
+
- 'context_info' (list): Detailed retrieval information including:
|
|
426
|
+
* Validation results and parameter processing
|
|
427
|
+
* Number of genes processed and embedding dimensions
|
|
428
|
+
* Warnings about invalid gene identifiers
|
|
429
|
+
* Context combination availability
|
|
430
|
+
- 'error' (str, optional): Error description if retrieval failed
|
|
431
|
+
|
|
432
|
+
Example Usage:
|
|
433
|
+
# Retrieve specific cancer-related genes in breast cancer B cells
|
|
434
|
+
result = await run_transcriptformer_embedding_retrieval(
|
|
435
|
+
state="disease",
|
|
436
|
+
cell_type="b_cell",
|
|
437
|
+
gene_names=["TP53", "BRCA1", "EGFR", "MYC"],
|
|
438
|
+
disease="breast_cancer"
|
|
439
|
+
)
|
|
440
|
+
|
|
441
|
+
# Get all gene embeddings for control hepatocytes
|
|
442
|
+
result = await run_transcriptformer_embedding_retrieval(
|
|
443
|
+
state="control",
|
|
444
|
+
cell_type="hepatocyte",
|
|
445
|
+
gene_names=[],
|
|
446
|
+
disease="liver_disease"
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
# Mixed gene identifier formats
|
|
450
|
+
result = await run_transcriptformer_embedding_retrieval(
|
|
451
|
+
state="treated",
|
|
452
|
+
cell_type="t_cell",
|
|
453
|
+
gene_names=["CD8A", "ENSG00000153563", "IFNG"],
|
|
454
|
+
disease="immunotherapy_response"
|
|
455
|
+
)
|
|
456
|
+
"""
|
|
457
|
+
|
|
458
|
+
# Generate unique request ID for tracking and logging
|
|
459
|
+
request_id = str(uuid.uuid4())[:8]
|
|
460
|
+
print(
|
|
461
|
+
f"[{request_id}] Received Transcriptformer embedding retrieval request for {disease} - {state} - {cell_type}"
|
|
462
|
+
)
|
|
463
|
+
|
|
464
|
+
# Set default data directory if not provided
|
|
465
|
+
if data_dir is None:
|
|
466
|
+
data_dir = os.getenv("TRANSCRIPTFORMER_DATA_PATH", "/root/PrismDB")
|
|
467
|
+
|
|
468
|
+
# Initialize global Transcriptformer tool instance for MCP server
|
|
469
|
+
# This instance will be used by the MCP tool function to serve embedding requests
|
|
470
|
+
try:
|
|
471
|
+
transcriptformer_tool = TranscriptformerEmbeddingTool(data_dir=data_dir)
|
|
472
|
+
print("Transcriptformer tool instance created and ready for MCP server")
|
|
473
|
+
except Exception as e:
|
|
474
|
+
print(f"Error creating Transcriptformer tool: {str(e)}")
|
|
475
|
+
print(
|
|
476
|
+
"Please ensure TRANSCRIPTFORMER_DATA_PATH is correctly set and embedding stores exist."
|
|
477
|
+
)
|
|
478
|
+
raise e
|
|
479
|
+
|
|
480
|
+
try:
|
|
481
|
+
# Brief async pause to allow for proper request handling
|
|
482
|
+
await asyncio.sleep(0.1)
|
|
483
|
+
|
|
484
|
+
# Validate input parameters
|
|
485
|
+
if not disease or not disease.strip():
|
|
486
|
+
raise ValueError(
|
|
487
|
+
"Disease parameter cannot be empty. Please specify a valid disease identifier."
|
|
488
|
+
)
|
|
489
|
+
if not state or not state.strip():
|
|
490
|
+
raise ValueError(
|
|
491
|
+
"State parameter cannot be empty. Please specify a valid disease state."
|
|
492
|
+
)
|
|
493
|
+
if not cell_type or not cell_type.strip():
|
|
494
|
+
raise ValueError(
|
|
495
|
+
"Cell type parameter cannot be empty. Please specify a valid cell type."
|
|
496
|
+
)
|
|
497
|
+
|
|
498
|
+
print(
|
|
499
|
+
f"[{request_id}] Processing embedding retrieval for {len(gene_names) if gene_names else 'all'} genes"
|
|
500
|
+
)
|
|
501
|
+
|
|
502
|
+
# Execute Transcriptformer embedding retrieval
|
|
503
|
+
embeddings, context_info = transcriptformer_tool.get_embedding_for_context(
|
|
504
|
+
state=state.strip(),
|
|
505
|
+
cell_type=cell_type.strip(),
|
|
506
|
+
gene_names=gene_names if gene_names else None,
|
|
507
|
+
disease=disease.strip(),
|
|
508
|
+
)
|
|
509
|
+
|
|
510
|
+
# Handle retrieval failure
|
|
511
|
+
if embeddings is None:
|
|
512
|
+
print(
|
|
513
|
+
f"[{request_id}] Embedding retrieval failed for context: {disease} - {state} - {cell_type}"
|
|
514
|
+
)
|
|
515
|
+
return {
|
|
516
|
+
"error": "Failed to retrieve Transcriptformer embeddings for specified context",
|
|
517
|
+
"context_info": context_info
|
|
518
|
+
+ [
|
|
519
|
+
"Please verify disease, state, and cell type parameters.",
|
|
520
|
+
"Check available contexts using the tool's metadata.",
|
|
521
|
+
],
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
# Convert numpy arrays to JSON-serializable lists
|
|
525
|
+
# This enables downstream processing and API compatibility
|
|
526
|
+
serializable_embeddings = {}
|
|
527
|
+
for gene_name, embedding_vector in embeddings.items():
|
|
528
|
+
serializable_embeddings[gene_name] = embedding_vector.tolist()
|
|
529
|
+
|
|
530
|
+
# Log successful completion with key metrics
|
|
531
|
+
num_genes = len(serializable_embeddings)
|
|
532
|
+
embedding_dim = (
|
|
533
|
+
len(next(iter(serializable_embeddings.values())))
|
|
534
|
+
if serializable_embeddings
|
|
535
|
+
else 0
|
|
536
|
+
)
|
|
537
|
+
print(
|
|
538
|
+
f"[{request_id}] Transcriptformer embedding retrieval completed: {num_genes} genes, {embedding_dim}D embeddings"
|
|
539
|
+
)
|
|
540
|
+
|
|
541
|
+
return {
|
|
542
|
+
"embeddings": serializable_embeddings,
|
|
543
|
+
"context_info": context_info
|
|
544
|
+
+ [
|
|
545
|
+
f"Embedding retrieval completed for {num_genes} genes.",
|
|
546
|
+
f"Context: {disease} - {state} - {cell_type}",
|
|
547
|
+
f"Embedding dimensionality: {embedding_dim} features per gene.",
|
|
548
|
+
],
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
except ValueError as e:
|
|
552
|
+
error_message = (
|
|
553
|
+
f"Transcriptformer embedding retrieval validation error: {str(e)}"
|
|
554
|
+
)
|
|
555
|
+
print(f"[{request_id}] {error_message}")
|
|
556
|
+
return {
|
|
557
|
+
"error": error_message,
|
|
558
|
+
"context_info": ["Please verify input parameters and available contexts."],
|
|
559
|
+
}
|
|
560
|
+
except Exception as e:
|
|
561
|
+
error_message = (
|
|
562
|
+
f"Unexpected error during Transcriptformer embedding retrieval: {str(e)}"
|
|
563
|
+
)
|
|
564
|
+
print(f"[{request_id}] {error_message}")
|
|
565
|
+
return {
|
|
566
|
+
"error": error_message,
|
|
567
|
+
"context_info": [
|
|
568
|
+
"Internal server error occurred during embedding retrieval."
|
|
569
|
+
],
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
if __name__ == "__main__":
    # Startup banner, emitted one line per print() call in this exact order
    # before the server starts handling requests.
    startup_banner = (
        "Starting MCP server for Transcriptformer Gene Embedding Tool...",
        "Model: Transcriptformer (Transformer-based gene representation learning)",
        "Application: Contextualized gene embedding retrieval from single-cell data",
        "Features: Disease-specific and cell-type-specific gene representations",
        "Server: FastMCP with streamable HTTP transport",
        "Port: 7000 (configured for biomedical embedding services)",
        "Timeout: Extended for large embedding matrix operations",
    )
    for banner_line in startup_banner:
        print(banner_line)

    # Hand control to the MCP server: streamable HTTP transport, listening on
    # every interface ("0.0.0.0") at port 7000, in stateless HTTP mode.
    # NOTE(review): the banner advertises an extended timeout, but no timeout
    # argument is passed here - confirm where (or whether) it is configured.
    server.run(
        transport="streamable-http",
        host="0.0.0.0",
        port=7000,
        stateless_http=True,
    )
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from fastmcp import FastMCP
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
from .uspto_downloader_tool import USPTOPatentDocumentDownloader
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
# Read the tool config dicts from the JSON file that lives in the same
# directory as this module.
try:
    with open(
        os.path.join(os.path.dirname(__file__), "uspto_downloader_client_tools.json"),
        "r",
        # JSON is UTF-8 by specification; pin the encoding so decoding does
        # not depend on the platform's locale default (e.g. cp1252 on Windows).
        encoding="utf-8",
    ) as f:
        uspto_downloader_tools = json.load(f)
except FileNotFoundError as e:
    # Without the config file no tool can be built: report the problem
    # (wrapped in ANSI red escape codes) and abort with exit status 1.
    print(f"\033[91mError: {e}\033[0m")
    print(
        f"\033[91mIs uspto_downloader_client_tools.json in the parent directory of {__file__}?\033[0m"
    )
    sys.exit(1)

# MCP server object that the tool functions below register themselves on.
server = FastMCP("Your MCP Server", stateless_http=True)

# One downloader instance per configured tool, keyed by the config's "name"
# entry; the tool functions look their downloader up in this dict.
agents = {}
for tool_config in uspto_downloader_tools:
    agents[tool_config["name"]] = USPTOPatentDocumentDownloader(tool_config=tool_config)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@server.tool()
def download_abst(query: dict):
    """Retrieve the abstract of a patent application by its application number.
    Args:
        "query" dict: A dictionary containing the application number under the key "applicationNumberText".
    Returns:
        dict: A dictionary containing the abstract text under the 'result' key or an error message under the 'error' key if the document could not be retrieved.
    """
    # NOTE: the docstring wording is kept verbatim on purpose - FastMCP
    # publishes a tool's docstring as its client-facing description.
    # Look up the downloader configured for abstracts and delegate to it;
    # a missing configuration entry surfaces as a KeyError.
    abstract_downloader = agents["get_abstract_from_patent_app_number"]
    return abstract_downloader.run(query)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@server.tool()
def download_claims(query: dict):
    """Retrieve the claims of a patent application by its application number.
    Args:
        "query" dict: A dictionary containing the application number under the key "applicationNumberText".
    Returns:
        dict: A dictionary containing the claims text under the 'result' key or an error message under the 'error' key if the document could not be retrieved.
    """
    # NOTE: the docstring wording is kept verbatim on purpose - FastMCP
    # publishes a tool's docstring as its client-facing description.
    # Look up the downloader configured for claims and delegate to it;
    # a missing configuration entry surfaces as a KeyError.
    claims_downloader = agents["get_claims_from_patent_app_number"]
    return claims_downloader.run(query)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@server.tool()
def download_full_text(query: dict):
    """Retrieve the full text of a patent application by its application number.
    Args:
        "query" dict: A dictionary containing the application number under the key "applicationNumberText".
    Returns:
        dict: A dictionary containing the full text under the 'result' key or an error message under the 'error' key if the document could not be retrieved.
    """
    # NOTE: the docstring wording is kept verbatim on purpose - FastMCP
    # publishes a tool's docstring as its client-facing description.
    # Look up the downloader configured for full text and delegate to it;
    # a missing configuration entry surfaces as a KeyError.
    full_text_downloader = agents["get_full_text_from_patent_app_number"]
    return full_text_downloader.run(query)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
if __name__ == "__main__":
    # Run only when executed as a script: serve the registered tools over
    # streamable HTTP, listening on every interface ("0.0.0.0") at port 8081.
    server.run(
        transport="streamable-http",
        host="0.0.0.0",
        port=8081,
    )
|