tooluniverse 0.2.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tooluniverse might be problematic. Click here for more details.

Files changed (186) hide show
  1. tooluniverse/__init__.py +340 -4
  2. tooluniverse/admetai_tool.py +84 -0
  3. tooluniverse/agentic_tool.py +563 -0
  4. tooluniverse/alphafold_tool.py +96 -0
  5. tooluniverse/base_tool.py +129 -6
  6. tooluniverse/boltz_tool.py +207 -0
  7. tooluniverse/chem_tool.py +192 -0
  8. tooluniverse/compose_scripts/__init__.py +1 -0
  9. tooluniverse/compose_scripts/biomarker_discovery.py +293 -0
  10. tooluniverse/compose_scripts/comprehensive_drug_discovery.py +186 -0
  11. tooluniverse/compose_scripts/drug_safety_analyzer.py +89 -0
  12. tooluniverse/compose_scripts/literature_tool.py +34 -0
  13. tooluniverse/compose_scripts/output_summarizer.py +279 -0
  14. tooluniverse/compose_scripts/tool_description_optimizer.py +681 -0
  15. tooluniverse/compose_scripts/tool_discover.py +705 -0
  16. tooluniverse/compose_scripts/tool_graph_composer.py +448 -0
  17. tooluniverse/compose_tool.py +371 -0
  18. tooluniverse/ctg_tool.py +1002 -0
  19. tooluniverse/custom_tool.py +81 -0
  20. tooluniverse/dailymed_tool.py +108 -0
  21. tooluniverse/data/admetai_tools.json +155 -0
  22. tooluniverse/data/adverse_event_tools.json +108 -0
  23. tooluniverse/data/agentic_tools.json +1156 -0
  24. tooluniverse/data/alphafold_tools.json +87 -0
  25. tooluniverse/data/boltz_tools.json +9 -0
  26. tooluniverse/data/chembl_tools.json +16 -0
  27. tooluniverse/data/clinicaltrials_gov_tools.json +326 -0
  28. tooluniverse/data/compose_tools.json +202 -0
  29. tooluniverse/data/dailymed_tools.json +70 -0
  30. tooluniverse/data/dataset_tools.json +646 -0
  31. tooluniverse/data/disease_target_score_tools.json +712 -0
  32. tooluniverse/data/efo_tools.json +17 -0
  33. tooluniverse/data/embedding_tools.json +319 -0
  34. tooluniverse/data/enrichr_tools.json +31 -0
  35. tooluniverse/data/europe_pmc_tools.json +22 -0
  36. tooluniverse/data/expert_feedback_tools.json +10 -0
  37. tooluniverse/data/fda_drug_adverse_event_tools.json +491 -0
  38. tooluniverse/data/fda_drug_labeling_tools.json +1 -1
  39. tooluniverse/data/fda_drugs_with_brand_generic_names_for_tool.py +76929 -148860
  40. tooluniverse/data/finder_tools.json +209 -0
  41. tooluniverse/data/gene_ontology_tools.json +113 -0
  42. tooluniverse/data/gwas_tools.json +1082 -0
  43. tooluniverse/data/hpa_tools.json +333 -0
  44. tooluniverse/data/humanbase_tools.json +47 -0
  45. tooluniverse/data/idmap_tools.json +74 -0
  46. tooluniverse/data/mcp_client_tools_example.json +113 -0
  47. tooluniverse/data/mcpautoloadertool_defaults.json +28 -0
  48. tooluniverse/data/medlineplus_tools.json +141 -0
  49. tooluniverse/data/monarch_tools.json +1 -1
  50. tooluniverse/data/openalex_tools.json +36 -0
  51. tooluniverse/data/opentarget_tools.json +1 -1
  52. tooluniverse/data/output_summarization_tools.json +101 -0
  53. tooluniverse/data/packages/bioinformatics_core_tools.json +1756 -0
  54. tooluniverse/data/packages/categorized_tools.txt +206 -0
  55. tooluniverse/data/packages/cheminformatics_tools.json +347 -0
  56. tooluniverse/data/packages/earth_sciences_tools.json +74 -0
  57. tooluniverse/data/packages/genomics_tools.json +776 -0
  58. tooluniverse/data/packages/image_processing_tools.json +38 -0
  59. tooluniverse/data/packages/machine_learning_tools.json +789 -0
  60. tooluniverse/data/packages/neuroscience_tools.json +62 -0
  61. tooluniverse/data/packages/original_tools.txt +0 -0
  62. tooluniverse/data/packages/physics_astronomy_tools.json +62 -0
  63. tooluniverse/data/packages/scientific_computing_tools.json +560 -0
  64. tooluniverse/data/packages/single_cell_tools.json +453 -0
  65. tooluniverse/data/packages/structural_biology_tools.json +396 -0
  66. tooluniverse/data/packages/visualization_tools.json +399 -0
  67. tooluniverse/data/pubchem_tools.json +215 -0
  68. tooluniverse/data/pubtator_tools.json +68 -0
  69. tooluniverse/data/rcsb_pdb_tools.json +1332 -0
  70. tooluniverse/data/reactome_tools.json +19 -0
  71. tooluniverse/data/semantic_scholar_tools.json +26 -0
  72. tooluniverse/data/special_tools.json +2 -25
  73. tooluniverse/data/tool_composition_tools.json +88 -0
  74. tooluniverse/data/toolfinderkeyword_defaults.json +34 -0
  75. tooluniverse/data/txagent_client_tools.json +9 -0
  76. tooluniverse/data/uniprot_tools.json +211 -0
  77. tooluniverse/data/url_fetch_tools.json +94 -0
  78. tooluniverse/data/uspto_downloader_tools.json +9 -0
  79. tooluniverse/data/uspto_tools.json +811 -0
  80. tooluniverse/data/xml_tools.json +3275 -0
  81. tooluniverse/dataset_tool.py +296 -0
  82. tooluniverse/default_config.py +165 -0
  83. tooluniverse/efo_tool.py +42 -0
  84. tooluniverse/embedding_database.py +630 -0
  85. tooluniverse/embedding_sync.py +396 -0
  86. tooluniverse/enrichr_tool.py +266 -0
  87. tooluniverse/europe_pmc_tool.py +52 -0
  88. tooluniverse/execute_function.py +1775 -95
  89. tooluniverse/extended_hooks.py +444 -0
  90. tooluniverse/gene_ontology_tool.py +194 -0
  91. tooluniverse/graphql_tool.py +158 -36
  92. tooluniverse/gwas_tool.py +358 -0
  93. tooluniverse/hpa_tool.py +1645 -0
  94. tooluniverse/humanbase_tool.py +389 -0
  95. tooluniverse/logging_config.py +254 -0
  96. tooluniverse/mcp_client_tool.py +764 -0
  97. tooluniverse/mcp_integration.py +413 -0
  98. tooluniverse/mcp_tool_registry.py +925 -0
  99. tooluniverse/medlineplus_tool.py +337 -0
  100. tooluniverse/openalex_tool.py +228 -0
  101. tooluniverse/openfda_adv_tool.py +283 -0
  102. tooluniverse/openfda_tool.py +393 -160
  103. tooluniverse/output_hook.py +1122 -0
  104. tooluniverse/package_tool.py +195 -0
  105. tooluniverse/pubchem_tool.py +158 -0
  106. tooluniverse/pubtator_tool.py +168 -0
  107. tooluniverse/rcsb_pdb_tool.py +38 -0
  108. tooluniverse/reactome_tool.py +108 -0
  109. tooluniverse/remote/boltz/boltz_mcp_server.py +50 -0
  110. tooluniverse/remote/depmap_24q2/depmap_24q2_mcp_tool.py +442 -0
  111. tooluniverse/remote/expert_feedback/human_expert_mcp_tools.py +2013 -0
  112. tooluniverse/remote/expert_feedback/simple_test.py +23 -0
  113. tooluniverse/remote/expert_feedback/start_web_interface.py +188 -0
  114. tooluniverse/remote/expert_feedback/web_only_interface.py +0 -0
  115. tooluniverse/remote/immune_compass/compass_tool.py +327 -0
  116. tooluniverse/remote/pinnacle/pinnacle_tool.py +328 -0
  117. tooluniverse/remote/transcriptformer/transcriptformer_tool.py +586 -0
  118. tooluniverse/remote/uspto_downloader/uspto_downloader_mcp_server.py +61 -0
  119. tooluniverse/remote/uspto_downloader/uspto_downloader_tool.py +120 -0
  120. tooluniverse/remote_tool.py +99 -0
  121. tooluniverse/restful_tool.py +53 -30
  122. tooluniverse/scripts/generate_tool_graph.py +408 -0
  123. tooluniverse/scripts/visualize_tool_graph.py +829 -0
  124. tooluniverse/semantic_scholar_tool.py +62 -0
  125. tooluniverse/smcp.py +2452 -0
  126. tooluniverse/smcp_server.py +975 -0
  127. tooluniverse/test/mcp_server_test.py +0 -0
  128. tooluniverse/test/test_admetai_tool.py +370 -0
  129. tooluniverse/test/test_agentic_tool.py +129 -0
  130. tooluniverse/test/test_alphafold_tool.py +71 -0
  131. tooluniverse/test/test_chem_tool.py +37 -0
  132. tooluniverse/test/test_compose_lieraturereview.py +63 -0
  133. tooluniverse/test/test_compose_tool.py +448 -0
  134. tooluniverse/test/test_dailymed.py +69 -0
  135. tooluniverse/test/test_dataset_tool.py +200 -0
  136. tooluniverse/test/test_disease_target_score.py +56 -0
  137. tooluniverse/test/test_drugbank_filter_examples.py +179 -0
  138. tooluniverse/test/test_efo.py +31 -0
  139. tooluniverse/test/test_enrichr_tool.py +21 -0
  140. tooluniverse/test/test_europe_pmc_tool.py +20 -0
  141. tooluniverse/test/test_fda_adv.py +95 -0
  142. tooluniverse/test/test_fda_drug_labeling.py +91 -0
  143. tooluniverse/test/test_gene_ontology_tools.py +66 -0
  144. tooluniverse/test/test_gwas_tool.py +139 -0
  145. tooluniverse/test/test_hpa.py +625 -0
  146. tooluniverse/test/test_humanbase_tool.py +20 -0
  147. tooluniverse/test/test_idmap_tools.py +61 -0
  148. tooluniverse/test/test_mcp_server.py +211 -0
  149. tooluniverse/test/test_mcp_tool.py +247 -0
  150. tooluniverse/test/test_medlineplus.py +220 -0
  151. tooluniverse/test/test_openalex_tool.py +32 -0
  152. tooluniverse/test/test_opentargets.py +28 -0
  153. tooluniverse/test/test_pubchem_tool.py +116 -0
  154. tooluniverse/test/test_pubtator_tool.py +37 -0
  155. tooluniverse/test/test_rcsb_pdb_tool.py +86 -0
  156. tooluniverse/test/test_reactome.py +54 -0
  157. tooluniverse/test/test_semantic_scholar_tool.py +24 -0
  158. tooluniverse/test/test_software_tools.py +147 -0
  159. tooluniverse/test/test_tool_description_optimizer.py +49 -0
  160. tooluniverse/test/test_tool_finder.py +26 -0
  161. tooluniverse/test/test_tool_finder_llm.py +252 -0
  162. tooluniverse/test/test_tools_find.py +195 -0
  163. tooluniverse/test/test_uniprot_tools.py +74 -0
  164. tooluniverse/test/test_uspto_tool.py +72 -0
  165. tooluniverse/test/test_xml_tool.py +113 -0
  166. tooluniverse/tool_finder_embedding.py +267 -0
  167. tooluniverse/tool_finder_keyword.py +693 -0
  168. tooluniverse/tool_finder_llm.py +699 -0
  169. tooluniverse/tool_graph_web_ui.py +955 -0
  170. tooluniverse/tool_registry.py +416 -0
  171. tooluniverse/uniprot_tool.py +155 -0
  172. tooluniverse/url_tool.py +253 -0
  173. tooluniverse/uspto_tool.py +240 -0
  174. tooluniverse/utils.py +369 -41
  175. tooluniverse/xml_tool.py +369 -0
  176. tooluniverse-1.0.1.dist-info/METADATA +387 -0
  177. tooluniverse-1.0.1.dist-info/RECORD +182 -0
  178. tooluniverse-1.0.1.dist-info/entry_points.txt +9 -0
  179. tooluniverse/generate_mcp_tools.py +0 -113
  180. tooluniverse/mcp_server.py +0 -3340
  181. tooluniverse-0.2.0.dist-info/METADATA +0 -139
  182. tooluniverse-0.2.0.dist-info/RECORD +0 -21
  183. tooluniverse-0.2.0.dist-info/entry_points.txt +0 -4
  184. {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.1.dist-info}/WHEEL +0 -0
  185. {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.1.dist-info}/licenses/LICENSE +0 -0
  186. {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,586 @@
1
+ """
2
+ Transcriptformer Gene Embedding Tool - MCP Server
3
+
4
+ This module provides an MCP (Model Context Protocol) server for retrieving
5
+ pre-computed gene embeddings from the Transcriptformer model. Transcriptformer
6
+ is a transformer-based architecture trained on single-cell RNA sequencing data
7
+ to learn contextualized gene representations that capture cell-type-specific
8
+ and disease-state-specific expression patterns.
9
+
10
+ The tool provides access to disease-specific embedding stores that enable:
11
+ - Gene similarity analysis in specific cellular contexts
12
+ - Biomarker discovery and validation
13
+ - Pathway analysis and functional annotation
14
+ - Drug target identification and prioritization
15
+ - Precision medicine applications
16
+ - Systems biology research
17
+ """
18
+
19
+ from fastmcp import FastMCP
20
+ import os
21
+ import asyncio
22
+ import uuid
23
+ import gzip
24
+ import json
25
+ import numpy as np
26
+ from typing import Union, List, Dict, Tuple, Optional, Any
27
+
28
+
29
# Module-level FastMCP server instance for Transcriptformer gene embedding
# retrieval; it is referenced by the @server.tool() decorator and by the
# server.run(...) call in the __main__ block of this module.
server = FastMCP("Transcriptformer SMCP Server")
31
+
32
+
33
class TranscriptformerEmbeddingTool:
    """
    Retrieve pre-computed, contextualized gene embeddings from on-disk stores.

    The tool expects the following layout below the data directory::

        <data_dir>/transcriptformer_embedding/embedding_store/<disease>/
            metadata.json.gz          # gene order, symbol map, group info
            <cell_type>_<state>.npy   # one embedding matrix per context

    It provides functionality to:
    - Discover the available disease-specific embedding stores
    - Load and cache the metadata of each store
    - Retrieve embeddings for a (disease, state, cell type) context, for
      either a list of gene symbols / Ensembl IDs or for every gene

    Disease, state and cell-type identifiers are matched after normalization
    (lowercase, spaces replaced by underscores).
    """

    @staticmethod
    def _normalize(value: str) -> str:
        """Return *value* lowercased with spaces replaced by underscores."""
        return value.lower().replace(" ", "_")

    def __init__(self, data_dir: Optional[str] = None):
        """
        Initialize the tool by discovering the available disease stores.

        Args:
            data_dir (Optional[str]): Root data directory. When None, the
                TRANSCRIPTFORMER_DATA_PATH environment variable is used, with
                "/root/PrismDB" as the fallback.

        Raises:
            FileNotFoundError: If the embedding store base directory cannot be found.
        """
        # Construct path to embedding stores
        if data_dir is None:
            transcriptformer_data_path = os.getenv(
                "TRANSCRIPTFORMER_DATA_PATH", "/root/PrismDB"
            )
        else:
            transcriptformer_data_path = data_dir
        self.base_dir = os.path.join(
            transcriptformer_data_path, "transcriptformer_embedding", "embedding_store"
        )

        # Validate base directory exists
        if not os.path.exists(self.base_dir):
            raise FileNotFoundError(
                f"Transcriptformer embedding store directory not found at {self.base_dir}. Please check your TRANSCRIPTFORMER_DATA_PATH."
            )

        # Discover available disease-specific embedding stores
        store_dirs = [
            d
            for d in os.listdir(self.base_dir)
            if os.path.isdir(os.path.join(self.base_dir, d))
        ]
        self.available_diseases: List[str] = [self._normalize(d) for d in store_dirs]

        # Remember the real directory name behind each normalized identifier.
        # Directories are advertised under their normalized name, so the
        # on-disk spelling (case, spaces) must be kept to be able to open them.
        self._disease_dirs: Dict[str, str] = {}
        for d in store_dirs:
            self._disease_dirs.setdefault(self._normalize(d), d)

        # Metadata is cached per disease to avoid re-reading metadata.json.gz
        self.metadata_cache: Dict[str, Dict[str, Any]] = {}

        print(
            f"Transcriptformer tool initialized with {len(self.available_diseases)} disease contexts: {self.available_diseases}"
        )

    def _load_metadata(self, disease: str) -> Dict:
        """
        Load and cache metadata for a specific disease embedding store.

        Args:
            disease (str): Disease identifier (normalized to lowercase with underscores).

        Returns:
            Dict: Cached metadata dictionary containing:
                - store_path: Path to disease-specific embedding store
                - ensembl_ids_ordered: Ordered list of Ensembl gene IDs
                - gene_to_idx: Mapping from Ensembl IDs to matrix row indices
                - symbol_to_ensembl: Mapping from gene symbols to Ensembl IDs
                - ensembl_to_symbol: Reverse mapping (first symbol wins)
                - available_symbols: Sorted list of available gene symbols
                - groups_meta: Group metadata keyed by normalized group name
                - available_cell_types: Sorted list of normalized cell types
                - available_states: Sorted list of normalized disease states

        Raises:
            FileNotFoundError: If disease is not available or metadata file is missing.
        """
        # Return cached metadata if already loaded
        if disease in self.metadata_cache:
            return self.metadata_cache[disease]

        # Validate disease availability
        if disease not in self.available_diseases:
            raise FileNotFoundError(
                f"Disease '{disease}' is not available. Please choose from available diseases: {self.available_diseases}"
            )

        # Resolve the real directory name; fall back to the normalized name
        # for instances whose directory map was not populated by __init__.
        dir_name = getattr(self, "_disease_dirs", {}).get(
            disease, disease.replace(" ", "_")
        )
        store_path = os.path.join(self.base_dir, dir_name)
        metadata_path = os.path.join(store_path, "metadata.json.gz")

        # Validate metadata file exists
        if not os.path.exists(metadata_path):
            raise FileNotFoundError(
                f"Metadata file not found at: {metadata_path}. Please ensure embedding store is properly prepared."
            )

        # Load compressed metadata file
        print(
            f"Loading Transcriptformer metadata from embedding store: {os.path.basename(store_path)}..."
        )
        with gzip.open(metadata_path, "rt", encoding="utf-8") as f:
            metadata = json.load(f)

        ensembl_ids = metadata["ensembl_ids_ordered"]
        symbol_to_ensembl = metadata["gene_map_symbol_to_ensembl"]
        groups = metadata["groups"]

        # Process and cache metadata with normalized keys
        self.metadata_cache[disease] = {
            "store_path": store_path,
            "ensembl_ids_ordered": ensembl_ids,
            "gene_to_idx": {gene: i for i, gene in enumerate(ensembl_ids)},
            "symbol_to_ensembl": symbol_to_ensembl,
            "ensembl_to_symbol": self._build_reverse_symbol_map(symbol_to_ensembl),
            "available_symbols": sorted(symbol_to_ensembl),
            "groups_meta": {self._normalize(k): v for k, v in groups.items()},
            "available_cell_types": sorted(
                {self._normalize(details["cell_type"]) for details in groups.values()}
            ),
            "available_states": sorted(
                {
                    self._normalize(details["disease_state"])
                    for details in groups.values()
                }
            ),
        }

        cached_data = self.metadata_cache[disease]
        print(
            f"Metadata loaded successfully: {len(cached_data['available_symbols'])} genes, "
            f"{len(cached_data['available_cell_types'])} cell types, "
            f"{len(cached_data['available_states'])} disease states."
        )

        return cached_data

    @staticmethod
    def _build_reverse_symbol_map(symbol_to_ensembl: Dict[str, str]) -> Dict[str, str]:
        """Map each Ensembl ID to the first symbol that points at it."""
        reverse: Dict[str, str] = {}
        for symbol, ensembl_id in symbol_to_ensembl.items():
            # setdefault keeps the first symbol in mapping order, matching a
            # linear "first match" scan of the symbol map.
            reverse.setdefault(ensembl_id, symbol)
        return reverse

    def get_embedding_for_context(
        self,
        state: str,
        cell_type: str,
        gene_names: Union[List[str], None],
        disease: str,
    ) -> Tuple[Optional[Dict[str, np.ndarray]], List[str]]:
        """
        Retrieve gene embeddings for one (disease, state, cell type) context.

        Args:
            state (str): Disease state context (e.g., 'control', 'disease').
                Normalized internally (lowercase, underscores for spaces).
            cell_type (str): Cell type context (e.g., 'b_cell', 'macrophage').
                Normalized internally (lowercase, underscores for spaces).
            gene_names (Union[List[str], None]): Gene symbols (e.g., 'TP53') or
                Ensembl IDs (e.g., 'ENSG00000141510'). Identifiers starting
                with 'ENSG' (case-insensitive) are treated as Ensembl IDs; all
                others are looked up as symbols, upper-cased first and then
                as given. If None, embeddings for all genes are returned.
            disease (str): Disease store identifier; normalized internally.

        Returns:
            Tuple[Optional[Dict[str, np.ndarray]], List[str]]: A tuple containing:
                - Dictionary mapping gene names (as supplied, or symbol /
                  Ensembl ID when gene_names is None) to float32 vectors;
                  None when the context is invalid or no requested gene is
                  available
                - List of context information and error messages

        Raises:
            FileNotFoundError: If the disease store or its metadata is missing.
        """
        # Normalize input parameters for consistent matching
        disease = self._normalize(disease)
        state = self._normalize(state)
        cell_type = self._normalize(cell_type)

        # Load metadata for the specified disease
        metadata = self._load_metadata(disease)
        gene_to_idx = metadata["gene_to_idx"]

        context_info: List[str] = []
        requested: Dict[str, str] = {}  # output key -> Ensembl ID
        invalid_genes: List[str] = []

        # Validate disease state parameter
        if state not in metadata["available_states"]:
            context_info.append(
                f"Invalid disease state '{state}'. Available states for {disease}: {metadata['available_states']}"
            )
            return None, context_info

        # Validate cell type parameter
        if cell_type not in metadata["available_cell_types"]:
            context_info.append(
                f"Invalid cell type '{cell_type}'. Available cell types for {disease}: {metadata['available_cell_types']}"
            )
            return None, context_info

        # Process gene names parameter (None means retrieve all genes)
        if gene_names is None:
            print(
                f"Loading complete gene embedding set for context: {disease} - {state} - {cell_type}"
            )

            # A reverse map built once replaces a scan of the whole symbol
            # map for every Ensembl ID. Metadata cached without the reverse
            # map gets it added here.
            ensembl_to_symbol = metadata.get("ensembl_to_symbol")
            if ensembl_to_symbol is None:
                ensembl_to_symbol = self._build_reverse_symbol_map(
                    metadata["symbol_to_ensembl"]
                )
                metadata["ensembl_to_symbol"] = ensembl_to_symbol

            # Use gene symbol as key if available, otherwise use Ensembl ID
            for ensembl_id in metadata["ensembl_ids_ordered"]:
                gene_symbol = ensembl_to_symbol.get(ensembl_id)
                requested[gene_symbol if gene_symbol else ensembl_id] = ensembl_id
        else:
            symbol_to_ensembl = metadata["symbol_to_ensembl"]
            for gene_name in gene_names:
                upper_name = gene_name.upper()

                if upper_name.startswith("ENSG"):
                    # An Ensembl ID absent from the store is invalid and must
                    # not be queued; otherwise an all-invalid request would
                    # be reported as a success with zero embeddings.
                    ensembl_id = upper_name if upper_name in gene_to_idx else None
                else:
                    # Try the upper-cased symbol first, then the symbol as
                    # given so mixed-case symbols (e.g. 'C1orf43') resolve.
                    ensembl_id = symbol_to_ensembl.get(
                        upper_name
                    ) or symbol_to_ensembl.get(gene_name)

                if ensembl_id:
                    requested[gene_name] = ensembl_id
                else:
                    invalid_genes.append(gene_name)

            # Report invalid gene identifiers
            if invalid_genes:
                context_info.append(
                    f"Invalid or unavailable gene identifiers: {invalid_genes}"
                )
                context_info.append(
                    f"Please use valid gene symbols or Ensembl IDs from the {disease} dataset."
                )

        # Check if any valid genes were found
        if not requested:
            return None, context_info

        # Construct group key for embedding matrix lookup
        # Format: celltype_diseasestate (normalized, parentheses removed)
        group_key = (
            f"{cell_type}_{state}".replace(" ", "_").replace("(", "").replace(")", "")
        )

        # Validate that the requested context combination exists
        if group_key not in metadata["groups_meta"]:
            available_keys = list(metadata["groups_meta"].keys())
            context_info.append(
                f"Context combination not available: state='{state}', cell_type='{cell_type}'"
            )
            context_info.append(
                f"Available context combinations for {disease}: {available_keys}"
            )
            return None, context_info

        # Load embedding matrix on-demand from the context's .npy file
        npy_path = os.path.join(metadata["store_path"], f"{group_key}.npy")
        if not os.path.exists(npy_path):
            context_info.append(
                f"Embedding matrix file not found for context '{group_key}' at {npy_path}"
            )
            return None, context_info

        print(f"Loading embedding matrix for context: {group_key}")
        embedding_matrix = np.load(npy_path)

        # Extract embeddings for requested genes; rows are indexed by the
        # position of the Ensembl ID in ensembl_ids_ordered.
        final_embeddings: Dict[str, np.ndarray] = {}
        for gene_name, ensembl_id in requested.items():
            gene_idx = gene_to_idx.get(ensembl_id)
            if gene_idx is not None:
                final_embeddings[gene_name] = embedding_matrix[gene_idx].astype(
                    np.float32
                )

        # Add success information to context
        context_info.append(
            f"Successfully retrieved {len(final_embeddings)} gene embeddings for context: {disease} - {state} - {cell_type}"
        )
        if len(final_embeddings) > 0:
            embedding_dim = final_embeddings[next(iter(final_embeddings))].shape[0]
            context_info.append(
                f"Embedding dimensionality: {embedding_dim} features per gene"
            )

        return final_embeddings, context_info
+
353
+
354
@server.tool()
async def run_transcriptformer_embedding_retrieval(
    state: str,
    cell_type: str,
    gene_names: List[str],
    disease: str,
    data_dir: Optional[str] = None,
):
    """
    MCP Tool: Retrieves contextualized gene embeddings from Transcriptformer models.

    This tool provides access to pre-computed Transcriptformer embeddings that capture
    gene expression patterns learned from single-cell RNA sequencing data. The embeddings
    are contextualized for specific combinations of disease states and cell types,
    enabling precise analysis of gene behavior in relevant biological contexts.

    Scientific Background:
    - Transcriptformer uses transformer architecture to learn gene representations
    - Embeddings capture cell-type-specific and disease-state-specific expression patterns
    - Model trained on large-scale single-cell RNA-seq datasets
    - Dense vector representations enable similarity analysis and downstream ML applications

    Applications:
    - Gene similarity analysis and functional annotation
    - Biomarker discovery and validation in disease contexts
    - Pathway analysis and systems biology research
    - Drug target identification and prioritization
    - Precision medicine and personalized therapeutics
    - Co-expression network analysis

    Technical Details:
    - Embeddings stored as one NumPy (.npy) matrix per context
    - Matrices are loaded on demand for the requested context only
    - Supports both gene symbols and Ensembl ID inputs
    - Vectors are returned as float32 values converted to lists

    Args:
        state (str): Disease state context for embedding retrieval. Examples:
            - 'control': Healthy/normal condition
            - 'disease': Disease-affected state
            - 'treated': Post-treatment condition
            - 'untreated': Pre-treatment condition
            Must match available states in the disease-specific store.

        cell_type (str): Cell type context for embeddings. Examples:
            - 'b_cell': B lymphocytes
            - 't_cell': T lymphocytes
            - 'macrophage': Tissue macrophages
            - 'epithelial_cell': Epithelial cells
            - 'fibroblast': Connective tissue fibroblasts
            Must match available cell types in the disease store.

        gene_names (List[str]): Gene identifiers for embedding retrieval:
            - Gene symbols: ['TP53', 'BRCA1', 'EGFR', 'MYC']
            - Ensembl IDs: ['ENSG00000141510', 'ENSG00000139618']
            - Mixed formats supported
            - Empty list retrieves all available genes

        disease (str): Disease/dataset identifier. Examples:
            - 'breast_cancer': Breast cancer scRNA-seq data
            - 'lung_cancer': Lung cancer contexts
            - 'diabetes': Diabetes-related datasets
            - 'alzheimer': Alzheimer's disease contexts
            Must match available disease stores.

        data_dir (Optional[str]): Root data directory holding the embedding
            stores. Defaults to the TRANSCRIPTFORMER_DATA_PATH environment
            variable, falling back to "/root/PrismDB".

    Returns:
        dict: Comprehensive embedding retrieval results containing:
            - 'embeddings' (dict, optional): Gene-to-embedding mapping where:
                * Keys: Gene identifiers (symbols or Ensembl IDs as provided)
                * Values: Embedding vectors as lists of float32 values
              Only present when embeddings are successfully retrieved.
            - 'context_info' (list): Detailed retrieval information including:
                * Validation results and parameter processing
                * Number of genes processed and embedding dimensions
                * Warnings about invalid gene identifiers
                * Context combination availability
            - 'error' (str, optional): Error description if retrieval failed

    Raises:
        Exception: Whatever the embedding tool raises while being created
            (e.g. FileNotFoundError when the embedding store directory is
            missing) is logged and re-raised unchanged.

    Example Usage:
        # Retrieve specific cancer-related genes in breast cancer B cells
        result = await run_transcriptformer_embedding_retrieval(
            state="disease",
            cell_type="b_cell",
            gene_names=["TP53", "BRCA1", "EGFR", "MYC"],
            disease="breast_cancer"
        )

        # Get all gene embeddings for control hepatocytes
        result = await run_transcriptformer_embedding_retrieval(
            state="control",
            cell_type="hepatocyte",
            gene_names=[],
            disease="liver_disease"
        )

        # Mixed gene identifier formats
        result = await run_transcriptformer_embedding_retrieval(
            state="treated",
            cell_type="t_cell",
            gene_names=["CD8A", "ENSG00000153563", "IFNG"],
            disease="immunotherapy_response"
        )
    """

    # Generate unique request ID for tracking and logging
    request_id = str(uuid.uuid4())[:8]
    print(
        f"[{request_id}] Received Transcriptformer embedding retrieval request for {disease} - {state} - {cell_type}"
    )

    # Set default data directory if not provided
    if data_dir is None:
        data_dir = os.getenv("TRANSCRIPTFORMER_DATA_PATH", "/root/PrismDB")

    # Create the embedding tool for this request. A new instance is built on
    # every call, so its metadata cache only lives for the current request.
    try:
        transcriptformer_tool = TranscriptformerEmbeddingTool(data_dir=data_dir)
        print("Transcriptformer tool instance created and ready for MCP server")
    except Exception as e:
        print(f"Error creating Transcriptformer tool: {str(e)}")
        print(
            "Please ensure TRANSCRIPTFORMER_DATA_PATH is correctly set and embedding stores exist."
        )
        # Bare raise re-raises the active exception with its traceback intact
        raise

    try:
        # Brief async pause to allow for proper request handling
        await asyncio.sleep(0.1)

        # Validate input parameters
        if not disease or not disease.strip():
            raise ValueError(
                "Disease parameter cannot be empty. Please specify a valid disease identifier."
            )
        if not state or not state.strip():
            raise ValueError(
                "State parameter cannot be empty. Please specify a valid disease state."
            )
        if not cell_type or not cell_type.strip():
            raise ValueError(
                "Cell type parameter cannot be empty. Please specify a valid cell type."
            )

        print(
            f"[{request_id}] Processing embedding retrieval for {len(gene_names) if gene_names else 'all'} genes"
        )

        # Execute Transcriptformer embedding retrieval; an empty gene list is
        # forwarded as None, which the tool interprets as "all genes".
        embeddings, context_info = transcriptformer_tool.get_embedding_for_context(
            state=state.strip(),
            cell_type=cell_type.strip(),
            gene_names=gene_names if gene_names else None,
            disease=disease.strip(),
        )

        # Handle retrieval failure
        if embeddings is None:
            print(
                f"[{request_id}] Embedding retrieval failed for context: {disease} - {state} - {cell_type}"
            )
            return {
                "error": "Failed to retrieve Transcriptformer embeddings for specified context",
                "context_info": context_info
                + [
                    "Please verify disease, state, and cell type parameters.",
                    "Check available contexts using the tool's metadata.",
                ],
            }

        # Convert numpy arrays to JSON-serializable lists
        serializable_embeddings = {
            gene_name: embedding_vector.tolist()
            for gene_name, embedding_vector in embeddings.items()
        }

        # Log successful completion with key metrics
        num_genes = len(serializable_embeddings)
        embedding_dim = (
            len(next(iter(serializable_embeddings.values())))
            if serializable_embeddings
            else 0
        )
        print(
            f"[{request_id}] Transcriptformer embedding retrieval completed: {num_genes} genes, {embedding_dim}D embeddings"
        )

        return {
            "embeddings": serializable_embeddings,
            "context_info": context_info
            + [
                f"Embedding retrieval completed for {num_genes} genes.",
                f"Context: {disease} - {state} - {cell_type}",
                f"Embedding dimensionality: {embedding_dim} features per gene.",
            ],
        }

    except ValueError as e:
        error_message = (
            f"Transcriptformer embedding retrieval validation error: {str(e)}"
        )
        print(f"[{request_id}] {error_message}")
        return {
            "error": error_message,
            "context_info": ["Please verify input parameters and available contexts."],
        }
    except FileNotFoundError as e:
        # Raised by the embedding tool for an unknown disease or a missing
        # metadata file: a request/data-store problem, not an internal
        # failure, so it gets its own message instead of the generic one.
        error_message = f"Transcriptformer embedding store not available: {str(e)}"
        print(f"[{request_id}] {error_message}")
        return {
            "error": error_message,
            "context_info": [
                "Please verify the disease identifier and that its embedding store is properly prepared."
            ],
        }
    except Exception as e:
        error_message = (
            f"Unexpected error during Transcriptformer embedding retrieval: {str(e)}"
        )
        print(f"[{request_id}] {error_message}")
        return {
            "error": error_message,
            "context_info": [
                "Internal server error occurred during embedding retrieval."
            ],
        }
571
+
572
+
573
if __name__ == "__main__":
    # Startup banner, printed line by line before the server takes over.
    startup_banner = (
        "Starting MCP server for Transcriptformer Gene Embedding Tool...",
        "Model: Transcriptformer (Transformer-based gene representation learning)",
        "Application: Contextualized gene embedding retrieval from single-cell data",
        "Features: Disease-specific and cell-type-specific gene representations",
        "Server: FastMCP with streamable HTTP transport",
        "Port: 7000 (configured for biomedical embedding services)",
        "Timeout: Extended for large embedding matrix operations",
    )
    for banner_line in startup_banner:
        print(banner_line)

    # Serve the Transcriptformer tools over streamable HTTP on all interfaces.
    # NOTE(review): the banner mentions an extended timeout, but no timeout
    # argument is passed here; confirm where (or whether) it is configured.
    run_options = {
        "transport": "streamable-http",
        "host": "0.0.0.0",
        "port": 7000,
        "stateless_http": True,
    }
    server.run(**run_options)
@@ -0,0 +1,61 @@
1
+ from fastmcp import FastMCP
2
+ import sys
3
+ import os
4
+ from .uspto_downloader_tool import USPTOPatentDocumentDownloader
5
+ import json
6
+
7
# Load the tool configuration dicts from the JSON file that sits in the same
# directory as this module.
_config_path = os.path.join(
    os.path.dirname(__file__), "uspto_downloader_client_tools.json"
)
try:
    # Pin the encoding: without it open() falls back to the platform locale
    # encoding, which can mis-decode non-ASCII JSON on non-UTF-8 systems.
    with open(_config_path, "r", encoding="utf-8") as f:
        uspto_downloader_tools = json.load(f)
except FileNotFoundError as e:
    # Report the missing config in red and abort: the server cannot register
    # any tools without it.
    print(f"\033[91mError: {e}\033[0m")
    print(
        f"\033[91mIs uspto_downloader_client_tools.json in the parent directory of {__file__}?\033[0m"
    )
    sys.exit(1)
20
+
21
# MCP server instance that the tool functions below are registered on.
server = FastMCP("Your MCP Server", stateless_http=True)

# Registry of downloader instances, keyed by the "name" field of each tool
# config entry loaded from the JSON file.
agents = {}
for tool_config in uspto_downloader_tools:
    downloader = USPTOPatentDocumentDownloader(tool_config=tool_config)
    agents[tool_config["name"]] = downloader
25
+
26
+
27
@server.tool()
def download_abst(query: dict):
    """Fetch the abstract of a patent application identified by its application number.

    Args:
        query: Dictionary holding the application number under the key
            "applicationNumberText".

    Returns:
        dict: The abstract text under the 'result' key, or an error message
        under the 'error' key if the document could not be retrieved.
    """
    # Delegate to the downloader registered for abstract retrieval.
    abstract_downloader = agents["get_abstract_from_patent_app_number"]
    return abstract_downloader.run(query)
36
+
37
+
38
@server.tool()
def download_claims(query: dict):
    """Fetch the claims of a patent application identified by its application number.

    Args:
        query: Dictionary holding the application number under the key
            "applicationNumberText".

    Returns:
        dict: The claims text under the 'result' key, or an error message
        under the 'error' key if the document could not be retrieved.
    """
    # Delegate to the downloader registered for claims retrieval.
    claims_downloader = agents["get_claims_from_patent_app_number"]
    return claims_downloader.run(query)
47
+
48
+
49
@server.tool()
def download_full_text(query: dict):
    """Fetch the full text of a patent application identified by its application number.

    Args:
        query: Dictionary holding the application number under the key
            "applicationNumberText".

    Returns:
        dict: The full text under the 'result' key, or an error message
        under the 'error' key if the document could not be retrieved.
    """
    # Delegate to the downloader registered for full-text retrieval.
    full_text_downloader = agents["get_full_text_from_patent_app_number"]
    return full_text_downloader.run(query)
58
+
59
+
60
if __name__ == "__main__":
    # Serve the registered USPTO tools over streamable HTTP, listening on all
    # interfaces at port 8081.
    server.run(
        transport="streamable-http",
        host="0.0.0.0",
        port=8081,
    )