tooluniverse 0.2.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tooluniverse might be problematic. Click here for more details.

Files changed (190) hide show
  1. tooluniverse/__init__.py +340 -4
  2. tooluniverse/admetai_tool.py +84 -0
  3. tooluniverse/agentic_tool.py +563 -0
  4. tooluniverse/alphafold_tool.py +96 -0
  5. tooluniverse/base_tool.py +129 -6
  6. tooluniverse/boltz_tool.py +207 -0
  7. tooluniverse/chem_tool.py +192 -0
  8. tooluniverse/compose_scripts/__init__.py +1 -0
  9. tooluniverse/compose_scripts/biomarker_discovery.py +293 -0
  10. tooluniverse/compose_scripts/comprehensive_drug_discovery.py +186 -0
  11. tooluniverse/compose_scripts/drug_safety_analyzer.py +89 -0
  12. tooluniverse/compose_scripts/literature_tool.py +34 -0
  13. tooluniverse/compose_scripts/output_summarizer.py +279 -0
  14. tooluniverse/compose_scripts/tool_description_optimizer.py +681 -0
  15. tooluniverse/compose_scripts/tool_discover.py +705 -0
  16. tooluniverse/compose_scripts/tool_graph_composer.py +448 -0
  17. tooluniverse/compose_tool.py +371 -0
  18. tooluniverse/ctg_tool.py +1002 -0
  19. tooluniverse/custom_tool.py +81 -0
  20. tooluniverse/dailymed_tool.py +108 -0
  21. tooluniverse/data/admetai_tools.json +155 -0
  22. tooluniverse/data/agentic_tools.json +1156 -0
  23. tooluniverse/data/alphafold_tools.json +87 -0
  24. tooluniverse/data/boltz_tools.json +9 -0
  25. tooluniverse/data/chembl_tools.json +16 -0
  26. tooluniverse/data/clait_tools.json +108 -0
  27. tooluniverse/data/clinicaltrials_gov_tools.json +326 -0
  28. tooluniverse/data/compose_tools.json +202 -0
  29. tooluniverse/data/dailymed_tools.json +70 -0
  30. tooluniverse/data/dataset_tools.json +646 -0
  31. tooluniverse/data/disease_target_score_tools.json +712 -0
  32. tooluniverse/data/efo_tools.json +17 -0
  33. tooluniverse/data/embedding_tools.json +319 -0
  34. tooluniverse/data/enrichr_tools.json +31 -0
  35. tooluniverse/data/europe_pmc_tools.json +22 -0
  36. tooluniverse/data/expert_feedback_tools.json +10 -0
  37. tooluniverse/data/fda_drug_adverse_event_tools.json +491 -0
  38. tooluniverse/data/fda_drug_labeling_tools.json +1 -1
  39. tooluniverse/data/fda_drugs_with_brand_generic_names_for_tool.py +76929 -148860
  40. tooluniverse/data/finder_tools.json +209 -0
  41. tooluniverse/data/gene_ontology_tools.json +113 -0
  42. tooluniverse/data/gwas_tools.json +1082 -0
  43. tooluniverse/data/hpa_tools.json +333 -0
  44. tooluniverse/data/humanbase_tools.json +47 -0
  45. tooluniverse/data/idmap_tools.json +74 -0
  46. tooluniverse/data/mcp_client_tools_example.json +113 -0
  47. tooluniverse/data/mcpautoloadertool_defaults.json +28 -0
  48. tooluniverse/data/medlineplus_tools.json +141 -0
  49. tooluniverse/data/monarch_tools.json +1 -1
  50. tooluniverse/data/openalex_tools.json +36 -0
  51. tooluniverse/data/opentarget_tools.json +1 -1
  52. tooluniverse/data/output_summarization_tools.json +101 -0
  53. tooluniverse/data/packages/bioinformatics_core_tools.json +1756 -0
  54. tooluniverse/data/packages/categorized_tools.txt +206 -0
  55. tooluniverse/data/packages/cheminformatics_tools.json +347 -0
  56. tooluniverse/data/packages/earth_sciences_tools.json +74 -0
  57. tooluniverse/data/packages/genomics_tools.json +776 -0
  58. tooluniverse/data/packages/image_processing_tools.json +38 -0
  59. tooluniverse/data/packages/machine_learning_tools.json +789 -0
  60. tooluniverse/data/packages/neuroscience_tools.json +62 -0
  61. tooluniverse/data/packages/original_tools.txt +0 -0
  62. tooluniverse/data/packages/physics_astronomy_tools.json +62 -0
  63. tooluniverse/data/packages/scientific_computing_tools.json +560 -0
  64. tooluniverse/data/packages/single_cell_tools.json +453 -0
  65. tooluniverse/data/packages/software_tools.json +4954 -0
  66. tooluniverse/data/packages/structural_biology_tools.json +396 -0
  67. tooluniverse/data/packages/visualization_tools.json +399 -0
  68. tooluniverse/data/pubchem_tools.json +215 -0
  69. tooluniverse/data/pubtator_tools.json +68 -0
  70. tooluniverse/data/rcsb_pdb_tools.json +1332 -0
  71. tooluniverse/data/reactome_tools.json +19 -0
  72. tooluniverse/data/semantic_scholar_tools.json +26 -0
  73. tooluniverse/data/special_tools.json +2 -25
  74. tooluniverse/data/tool_composition_tools.json +88 -0
  75. tooluniverse/data/toolfinderkeyword_defaults.json +34 -0
  76. tooluniverse/data/txagent_client_tools.json +9 -0
  77. tooluniverse/data/uniprot_tools.json +211 -0
  78. tooluniverse/data/url_fetch_tools.json +94 -0
  79. tooluniverse/data/uspto_downloader_tools.json +9 -0
  80. tooluniverse/data/uspto_tools.json +811 -0
  81. tooluniverse/data/xml_tools.json +3275 -0
  82. tooluniverse/dataset_tool.py +296 -0
  83. tooluniverse/default_config.py +165 -0
  84. tooluniverse/efo_tool.py +42 -0
  85. tooluniverse/embedding_database.py +630 -0
  86. tooluniverse/embedding_sync.py +396 -0
  87. tooluniverse/enrichr_tool.py +266 -0
  88. tooluniverse/europe_pmc_tool.py +52 -0
  89. tooluniverse/execute_function.py +1775 -95
  90. tooluniverse/extended_hooks.py +444 -0
  91. tooluniverse/gene_ontology_tool.py +194 -0
  92. tooluniverse/graphql_tool.py +158 -36
  93. tooluniverse/gwas_tool.py +358 -0
  94. tooluniverse/hpa_tool.py +1645 -0
  95. tooluniverse/humanbase_tool.py +389 -0
  96. tooluniverse/logging_config.py +254 -0
  97. tooluniverse/mcp_client_tool.py +764 -0
  98. tooluniverse/mcp_integration.py +413 -0
  99. tooluniverse/mcp_tool_registry.py +925 -0
  100. tooluniverse/medlineplus_tool.py +337 -0
  101. tooluniverse/openalex_tool.py +228 -0
  102. tooluniverse/openfda_adv_tool.py +283 -0
  103. tooluniverse/openfda_tool.py +393 -160
  104. tooluniverse/output_hook.py +1122 -0
  105. tooluniverse/package_tool.py +195 -0
  106. tooluniverse/pubchem_tool.py +158 -0
  107. tooluniverse/pubtator_tool.py +168 -0
  108. tooluniverse/rcsb_pdb_tool.py +38 -0
  109. tooluniverse/reactome_tool.py +108 -0
  110. tooluniverse/remote/boltz/boltz_mcp_server.py +50 -0
  111. tooluniverse/remote/depmap_24q2/depmap_24q2_mcp_tool.py +442 -0
  112. tooluniverse/remote/expert_feedback/human_expert_mcp_tools.py +2013 -0
  113. tooluniverse/remote/expert_feedback/simple_test.py +23 -0
  114. tooluniverse/remote/expert_feedback/start_web_interface.py +188 -0
  115. tooluniverse/remote/expert_feedback/web_only_interface.py +0 -0
  116. tooluniverse/remote/expert_feedback_mcp/human_expert_mcp_server.py +1611 -0
  117. tooluniverse/remote/expert_feedback_mcp/simple_test.py +34 -0
  118. tooluniverse/remote/expert_feedback_mcp/start_web_interface.py +91 -0
  119. tooluniverse/remote/immune_compass/compass_tool.py +327 -0
  120. tooluniverse/remote/pinnacle/pinnacle_tool.py +328 -0
  121. tooluniverse/remote/transcriptformer/transcriptformer_tool.py +586 -0
  122. tooluniverse/remote/uspto_downloader/uspto_downloader_mcp_server.py +61 -0
  123. tooluniverse/remote/uspto_downloader/uspto_downloader_tool.py +120 -0
  124. tooluniverse/remote_tool.py +99 -0
  125. tooluniverse/restful_tool.py +53 -30
  126. tooluniverse/scripts/generate_tool_graph.py +408 -0
  127. tooluniverse/scripts/visualize_tool_graph.py +829 -0
  128. tooluniverse/semantic_scholar_tool.py +62 -0
  129. tooluniverse/smcp.py +2452 -0
  130. tooluniverse/smcp_server.py +975 -0
  131. tooluniverse/test/mcp_server_test.py +0 -0
  132. tooluniverse/test/test_admetai_tool.py +370 -0
  133. tooluniverse/test/test_agentic_tool.py +129 -0
  134. tooluniverse/test/test_alphafold_tool.py +71 -0
  135. tooluniverse/test/test_chem_tool.py +37 -0
  136. tooluniverse/test/test_compose_lieraturereview.py +63 -0
  137. tooluniverse/test/test_compose_tool.py +448 -0
  138. tooluniverse/test/test_dailymed.py +69 -0
  139. tooluniverse/test/test_dataset_tool.py +200 -0
  140. tooluniverse/test/test_disease_target_score.py +56 -0
  141. tooluniverse/test/test_drugbank_filter_examples.py +179 -0
  142. tooluniverse/test/test_efo.py +31 -0
  143. tooluniverse/test/test_enrichr_tool.py +21 -0
  144. tooluniverse/test/test_europe_pmc_tool.py +20 -0
  145. tooluniverse/test/test_fda_adv.py +95 -0
  146. tooluniverse/test/test_fda_drug_labeling.py +91 -0
  147. tooluniverse/test/test_gene_ontology_tools.py +66 -0
  148. tooluniverse/test/test_gwas_tool.py +139 -0
  149. tooluniverse/test/test_hpa.py +625 -0
  150. tooluniverse/test/test_humanbase_tool.py +20 -0
  151. tooluniverse/test/test_idmap_tools.py +61 -0
  152. tooluniverse/test/test_mcp_server.py +211 -0
  153. tooluniverse/test/test_mcp_tool.py +247 -0
  154. tooluniverse/test/test_medlineplus.py +220 -0
  155. tooluniverse/test/test_openalex_tool.py +32 -0
  156. tooluniverse/test/test_opentargets.py +28 -0
  157. tooluniverse/test/test_pubchem_tool.py +116 -0
  158. tooluniverse/test/test_pubtator_tool.py +37 -0
  159. tooluniverse/test/test_rcsb_pdb_tool.py +86 -0
  160. tooluniverse/test/test_reactome.py +54 -0
  161. tooluniverse/test/test_semantic_scholar_tool.py +24 -0
  162. tooluniverse/test/test_software_tools.py +147 -0
  163. tooluniverse/test/test_tool_description_optimizer.py +49 -0
  164. tooluniverse/test/test_tool_finder.py +26 -0
  165. tooluniverse/test/test_tool_finder_llm.py +252 -0
  166. tooluniverse/test/test_tools_find.py +195 -0
  167. tooluniverse/test/test_uniprot_tools.py +74 -0
  168. tooluniverse/test/test_uspto_tool.py +72 -0
  169. tooluniverse/test/test_xml_tool.py +113 -0
  170. tooluniverse/tool_finder_embedding.py +267 -0
  171. tooluniverse/tool_finder_keyword.py +693 -0
  172. tooluniverse/tool_finder_llm.py +699 -0
  173. tooluniverse/tool_graph_web_ui.py +955 -0
  174. tooluniverse/tool_registry.py +416 -0
  175. tooluniverse/uniprot_tool.py +155 -0
  176. tooluniverse/url_tool.py +253 -0
  177. tooluniverse/uspto_tool.py +240 -0
  178. tooluniverse/utils.py +369 -41
  179. tooluniverse/xml_tool.py +369 -0
  180. tooluniverse-1.0.0.dist-info/METADATA +377 -0
  181. tooluniverse-1.0.0.dist-info/RECORD +186 -0
  182. tooluniverse-1.0.0.dist-info/entry_points.txt +9 -0
  183. tooluniverse/generate_mcp_tools.py +0 -113
  184. tooluniverse/mcp_server.py +0 -3340
  185. tooluniverse-0.2.0.dist-info/METADATA +0 -139
  186. tooluniverse-0.2.0.dist-info/RECORD +0 -21
  187. tooluniverse-0.2.0.dist-info/entry_points.txt +0 -4
  188. {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.0.dist-info}/WHEEL +0 -0
  189. {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.0.dist-info}/licenses/LICENSE +0 -0
  190. {tooluniverse-0.2.0.dist-info → tooluniverse-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1645 @@
1
+ # hpa_tool.py
2
+
3
+ import requests
4
+ import xml.etree.ElementTree as ET
5
+ from typing import Dict, Any, List
6
+ from .base_tool import BaseTool
7
+ from .tool_registry import register_tool
8
+
9
# Root of the Human Protein Atlas site; every endpoint below is built from it.
HPA_BASE = "https://www.proteinatlas.org"
# Search/download endpoint used by the column-based search tools.
HPA_SEARCH_API = HPA_BASE + "/api/search_download.php"
# Per-gene endpoints; fill in with ``.format(ensembl_id=...)``.
HPA_JSON_API_TEMPLATE = HPA_BASE + "/{ensembl_id}.json"
HPA_XML_API_TEMPLATE = HPA_BASE + "/{ensembl_id}.xml"
13
+
14
+ # --- Base Tool Classes ---
15
+
16
+
17
@register_tool("HPASearchApiTool")
class HPASearchApiTool(BaseTool):
    """
    Base class for tools that query HPA's search_download.php endpoint.

    Subclasses call ``_make_api_request`` with a search term and a
    comma-separated list of HPA column identifiers.
    """

    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.timeout = 30  # passed as ``timeout=`` to requests.get
        self.base_url = HPA_SEARCH_API

    def _make_api_request(
        self, search_term: str, columns: str, format_type: str = "json"
    ) -> Dict[str, Any]:
        """Query the HPA search API and return the decoded payload.

        Args:
            search_term: Value sent as the ``search`` query parameter.
            columns: Comma-separated column identifiers (``columns`` parameter).
            format_type: Value sent as ``format``; ``"json"`` triggers JSON
                decoding, anything else returns the raw body.

        Returns:
            * On success with ``format_type == "json"``: the decoded JSON
              **list** (note: this does not match the ``Dict`` annotation,
              which is kept unchanged for interface compatibility).
            * On success with any other format: ``{"tsv_data": <body text>}``.
            * On any failure: a dict with an ``"error"`` key (plus ``"detail"``
              or ``"content"`` holding the response body where available).
        """
        params = {
            "search": search_term,
            "format": format_type,
            "columns": columns,
            "compress": "no",
        }

        # Transport errors are handled on their own so that JSON decoding
        # problems below are never reported as a failed request.
        try:
            resp = requests.get(self.base_url, params=params, timeout=self.timeout)
        except requests.RequestException as e:
            return {"error": f"HPA API request failed: {str(e)}"}

        if resp.status_code == 404:
            return {"error": f"No data found for gene '{search_term}'"}
        if resp.status_code != 200:
            return {
                "error": f"HPA API request failed, HTTP {resp.status_code}",
                "detail": resp.text,
            }

        if format_type != "json":
            return {"tsv_data": resp.text}

        # Recent versions of requests raise requests.exceptions.JSONDecodeError
        # here, which is both a RequestException and a ValueError; catching
        # ValueError in a dedicated block keeps the parse-error branch
        # reachable regardless of the installed version.
        try:
            data = resp.json()
        except ValueError as e:
            return {
                "error": f"Failed to parse HPA response data: {str(e)}",
                "content": resp.text,
            }

        # Ensure we always return a list for consistency
        if not isinstance(data, list):
            return {"error": "API did not return expected list format"}
        return data
66
+
67
+
68
@register_tool("HPAJsonApiTool")
class HPAJsonApiTool(BaseTool):
    """
    Base class for tools that fetch HPA's per-gene /{ensembl_id}.json document.
    """

    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.timeout = 30  # passed as ``timeout=`` to requests.get
        self.base_url_template = HPA_JSON_API_TEMPLATE

    def _make_api_request(self, ensembl_id: str) -> Dict[str, Any]:
        """Fetch and decode the HPA JSON document for one gene.

        Args:
            ensembl_id: Identifier substituted into the URL template.

        Returns:
            The decoded JSON payload on success, otherwise a dict with an
            ``"error"`` key (plus ``"detail"`` or ``"content"`` holding the
            response body where available).
        """
        url = self.base_url_template.format(ensembl_id=ensembl_id)

        # Only the network call lives in this try block; see below for parsing.
        try:
            resp = requests.get(url, timeout=self.timeout)
        except requests.RequestException as e:
            return {"error": f"HPA JSON API request failed: {str(e)}"}

        if resp.status_code == 404:
            return {"error": f"No data found for Ensembl ID '{ensembl_id}'"}
        if resp.status_code != 200:
            return {
                "error": f"HPA JSON API request failed, HTTP {resp.status_code}",
                "detail": resp.text,
            }

        # Recent versions of requests raise a JSONDecodeError that is also a
        # RequestException; a separate ValueError handler guarantees decode
        # failures are reported as parse errors, with the body attached.
        try:
            return resp.json()
        except ValueError as e:
            return {
                "error": f"Failed to parse HPA JSON response: {str(e)}",
                "content": resp.text,
            }
102
+
103
+
104
@register_tool("HPAXmlApiTool")
class HPAXmlApiTool(BaseTool):
    """
    Base class for tools that fetch HPA's per-gene /{ensembl_id}.xml document.
    """

    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.timeout = 45  # passed as ``timeout=`` to requests.get
        self.base_url_template = HPA_XML_API_TEMPLATE

    def _make_api_request(self, ensembl_id: str) -> ET.Element:
        """Fetch the HPA XML document for one gene and return its root element.

        Args:
            ensembl_id: Identifier substituted into the URL template.

        Returns:
            The root ``Element`` of the parsed response body.

        Raises:
            Exception: If the request fails, the server answers with a
                non-200 status, or the body is not well-formed XML. Transport
                and parse failures are chained to the underlying error.
        """
        url = self.base_url_template.format(ensembl_id=ensembl_id)

        try:
            resp = requests.get(url, timeout=self.timeout)
        except requests.RequestException as e:
            raise Exception(f"HPA XML API request failed: {str(e)}") from e

        # Status checks sit outside any try block: these exceptions are meant
        # to reach the caller untouched.
        if resp.status_code == 404:
            raise Exception(f"No XML data found for Ensembl ID '{ensembl_id}'")
        if resp.status_code != 200:
            raise Exception(f"HPA XML API request failed, HTTP {resp.status_code}")

        # NOTE(review): xml.etree.ElementTree is documented as not hardened
        # against maliciously constructed XML; acceptable only as long as the
        # HPA endpoint is trusted.
        try:
            return ET.fromstring(resp.content)
        except ET.ParseError as e:
            raise Exception(f"Failed to parse HPA XML response: {str(e)}") from e
131
+
132
+
133
+ # --- New Enhanced Tools Based on Your Optimization Plan ---
134
+
135
+
136
@register_tool("HPAGetRnaExpressionBySourceTool")
class HPAGetRnaExpressionBySourceTool(HPASearchApiTool):
    """
    Get the RNA expression of a gene in one biological source.

    The source is identified by a ``source_type`` (tissue, blood, brain,
    single_cell) and a ``source_name`` within that type. The tool requests
    the matching "specific nTPM" column from the HPA search API and picks the
    entry for the requested source out of the returned mapping.
    """

    def __init__(self, tool_config):
        super().__init__(tool_config)
        # source_type -> HPA API column identifier
        self.source_column_mappings = {
            "tissue": "rnatsm",  # RNA tissue specific nTPM
            "blood": "rnablm",  # RNA blood lineage specific nTPM
            "brain": "rnabrm",  # RNA brain region specific nTPM
            "single_cell": "rnascm",  # RNA single cell type specific nTPM
        }

        # source_type -> field name expected in each API response record
        self.api_response_fields = {
            "tissue": "RNA tissue specific nTPM",
            "blood": "RNA blood lineage specific nTPM",
            "brain": "RNA brain region specific nTPM",
            "single_cell": "RNA single cell type specific nTPM",
        }

        # source_type -> {accepted source_name -> aliases searched for in the
        # keys of the API response mapping}
        self.source_name_mappings = {
            "tissue": {
                "adipose_tissue": ["adipose tissue", "fat"],
                "adrenal_gland": ["adrenal gland", "adrenal"],
                "appendix": ["appendix"],
                "bone_marrow": ["bone marrow"],
                "brain": ["brain", "cerebral cortex"],
                "breast": ["breast"],
                "bronchus": ["bronchus"],
                "cerebellum": ["cerebellum"],
                "cerebral_cortex": ["cerebral cortex", "brain"],
                "cervix": ["cervix"],
                "choroid_plexus": ["choroid plexus"],
                "colon": ["colon"],
                "duodenum": ["duodenum"],
                "endometrium": ["endometrium"],
                "epididymis": ["epididymis"],
                "esophagus": ["esophagus"],
                "fallopian_tube": ["fallopian tube"],
                "gallbladder": ["gallbladder"],
                "heart_muscle": ["heart muscle", "heart"],
                "hippocampal_formation": ["hippocampus", "hippocampal formation"],
                "hypothalamus": ["hypothalamus"],
                "kidney": ["kidney"],
                "liver": ["liver"],
                "lung": ["lung"],
                "lymph_node": ["lymph node"],
                "nasopharynx": ["nasopharynx"],
                "oral_mucosa": ["oral mucosa"],
                "ovary": ["ovary"],
                "pancreas": ["pancreas"],
                "parathyroid_gland": ["parathyroid gland"],
                "pituitary_gland": ["pituitary gland"],
                "placenta": ["placenta"],
                "prostate": ["prostate"],
                "rectum": ["rectum"],
                "retina": ["retina"],
                "salivary_gland": ["salivary gland"],
                "seminal_vesicle": ["seminal vesicle"],
                "skeletal_muscle": ["skeletal muscle"],
                "skin": ["skin"],
                "small_intestine": ["small intestine"],
                "smooth_muscle": ["smooth muscle"],
                "soft_tissue": ["soft tissue"],
                "spleen": ["spleen"],
                "stomach": ["stomach"],
                "testis": ["testis"],
                "thymus": ["thymus"],
                "thyroid_gland": ["thyroid gland"],
                "tongue": ["tongue"],
                "tonsil": ["tonsil"],
                "urinary_bladder": ["urinary bladder"],
                "vagina": ["vagina"],
            },
            "blood": {
                "t_cell": ["t-cell", "t cell"],
                "b_cell": ["b-cell", "b cell"],
                "nk_cell": ["nk-cell", "nk cell", "natural killer"],
                "monocyte": ["monocyte"],
                "neutrophil": ["neutrophil"],
                "eosinophil": ["eosinophil"],
                "basophil": ["basophil"],
                "dendritic_cell": ["dendritic cell"],
            },
            "brain": {
                "cerebellum": ["cerebellum"],
                "cerebral_cortex": ["cerebral cortex", "cortex"],
                "hippocampus": ["hippocampus", "hippocampal formation"],
                "hypothalamus": ["hypothalamus"],
                "amygdala": ["amygdala"],
                "brainstem": ["brainstem", "brain stem"],
                "thalamus": ["thalamus"],
            },
            "single_cell": {
                "t_cell": ["t-cell", "t cell"],
                "b_cell": ["b-cell", "b cell"],
                "hepatocyte": ["hepatocyte"],
                "neuron": ["neuron"],
                "astrocyte": ["astrocyte"],
                "fibroblast": ["fibroblast"],
            },
        }

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Look up the expression of ``gene_name`` in one source.

        Args:
            arguments: Mapping with the required keys ``gene_name``,
                ``source_type`` and ``source_name``. Type and name are
                normalised to lower case; spaces and hyphens in the name
                become underscores.

        Returns:
            A result dict with the expression value, a coarse expression
            level and the sources present in the response, or a dict with an
            ``"error"`` key when validation or retrieval fails.
        """
        gene_name = arguments.get("gene_name")
        # ``or ""`` covers an explicit None, which would otherwise raise
        # AttributeError on ``.lower()`` before any validation message.
        source_type = str(arguments.get("source_type") or "").strip().lower()
        source_name = (
            str(arguments.get("source_name") or "")
            .strip()
            .lower()
            .replace(" ", "_")
            .replace("-", "_")
        )

        if not gene_name:
            return {"error": "Parameter 'gene_name' is required"}
        if not source_type:
            return {"error": "Parameter 'source_type' is required"}
        if not source_name:
            return {"error": "Parameter 'source_name' is required"}

        # Validate source type
        if source_type not in self.source_column_mappings:
            available_types = ", ".join(self.source_column_mappings.keys())
            return {
                "error": f"Invalid source_type '{source_type}'. Available types: {available_types}"
            }

        # Validate source name, suggesting close alternatives on failure
        known_sources = self.source_name_mappings[source_type]
        if source_name not in known_sources:
            available_sources = list(known_sources.keys())
            similar_sources = self._suggest_similar_sources(
                source_name, available_sources
            )
            error_msg = (
                f"Invalid source_name '{source_name}' for source_type '{source_type}'. "
            )
            if similar_sources:
                error_msg += f"Similar options: {similar_sources[:3]}. "
            error_msg += (
                f"All available sources for '{source_type}': {available_sources}"
            )
            return {"error": error_msg}

        try:
            api_column = self.source_column_mappings[source_type]
            response_data = self._make_api_request(gene_name, f"g,gs,{api_column}")

            # Failures come back as a dict carrying an "error" key.
            if isinstance(response_data, dict) and "error" in response_data:
                return response_data

            if not response_data:
                return {
                    "gene_name": gene_name,
                    "source_type": source_type,
                    "source_name": source_name,
                    "expression_value": "N/A",
                    "status": "Gene not found",
                }

            # Only the first search hit is used
            gene_data = response_data[0]

            expression_value = "N/A"
            available_sources = []
            expression_data = gene_data.get(self.api_response_fields[source_type])

            if expression_data and isinstance(expression_data, dict):
                available_sources = list(expression_data.keys())
                matched_key = self._match_source_key(
                    expression_data, known_sources[source_name], source_name
                )
                if matched_key is not None:
                    expression_value = expression_data[matched_key]

            return {
                "gene_name": gene_data.get("Gene", gene_name),
                "gene_synonym": gene_data.get("Gene synonym", ""),
                "source_type": source_type,
                "source_name": source_name,
                "expression_value": expression_value,
                "expression_level": self._categorize_expression(expression_value),
                "expression_unit": "nTPM",
                "column_queried": api_column,
                "available_sources": available_sources[:10],
                "total_available_sources": len(available_sources),
                "status": (
                    "success"
                    if expression_value != "N/A"
                    else "no_expression_data_for_source"
                ),
            }

        except Exception as e:
            # Tool boundary: report any unexpected failure as an error result
            return {
                "error": f"Failed to retrieve RNA expression data: {str(e)}",
                "gene_name": gene_name,
                "source_type": source_type,
                "source_name": source_name,
            }

    @staticmethod
    def _suggest_similar_sources(source_name: str, available_sources: List[str]) -> List[str]:
        """Return the valid source names that loosely resemble ``source_name``.

        A candidate qualifies when it and the input contain one another
        (with or without underscores), or when any single word of the input
        and the candidate contain one another.
        """
        compact_input = source_name.replace("_", "")
        keywords = source_name.replace("_", " ").split()

        suggestions = []
        for candidate in available_sources:
            compact_candidate = candidate.replace("_", "")
            if (
                source_name in candidate
                or candidate in source_name
                or compact_input in compact_candidate
                or compact_candidate in compact_input
                or any(kw in candidate or candidate in kw for kw in keywords)
            ):
                suggestions.append(candidate)
        return suggestions

    @staticmethod
    def _match_source_key(expression_data: Dict[str, Any], possible_names: List[str], source_name: str):
        """Return the key of ``expression_data`` for the requested source, or None.

        Pass 1 accepts a key when it and one of the aliases contain one
        another. Pass 2 accepts a key only when *every* word of
        ``source_name`` occurs in it as a whole token, so that e.g. ``t_cell``
        can no longer pick up "neutrophil" through the single letter "t", nor
        ``soft_tissue`` pick up "adipose tissue" through the word "tissue".
        """
        aliases = [name.lower() for name in possible_names]
        for source_key in expression_data:
            key_lower = source_key.lower()
            if any(alias in key_lower or key_lower in alias for alias in aliases):
                return source_key

        wanted = {word for word in source_name.split("_") if word}
        if not wanted:
            return None
        for source_key in expression_data:
            # Split the key on anything that is not a letter or digit
            key_tokens = set(
                "".join(ch if ch.isalnum() else " " for ch in source_key.lower()).split()
            )
            if wanted <= key_tokens:
                return source_key
        return None

    @staticmethod
    def _categorize_expression(expression_value: Any) -> str:
        """Map an expression value onto a coarse level label.

        Values that cannot be converted to ``float`` (including the "N/A"
        placeholder) yield "unknown".
        """
        try:
            val = float(expression_value)
        except (ValueError, TypeError):
            return "unknown"

        if val > 50:
            return "very high"
        if val > 10:
            return "high"
        if val > 1:
            return "medium"
        if val > 0.1:
            return "low"
        return "very low"
423
+
424
+
425
@register_tool("HPAGetSubcellularLocationTool")
class HPAGetSubcellularLocationTool(HPASearchApiTool):
    """
    Report the annotated subcellular locations of a protein.
    Queries the search API with the scml (main location) and scal
    (additional location) columns.
    """

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return main and additional locations for ``arguments["gene_name"]``.

        Only the first search hit is used. Failures are reported as a dict
        with an ``"error"`` key.
        """
        gene_name = arguments.get("gene_name")
        if not gene_name:
            return {"error": "Parameter 'gene_name' is required"}

        # Ask only for the columns needed for location data
        result = self._make_api_request(gene_name, "g,gs,scml,scal")
        if "error" in result:
            return result
        if not result:
            return {"error": "No subcellular location data found"}

        record = result[0]
        primary = self._as_location_list(record.get("Subcellular main location", ""))
        secondary = self._as_location_list(
            record.get("Subcellular additional location", "")
        )

        return {
            "gene_name": record.get("Gene", gene_name),
            "gene_synonym": record.get("Gene synonym", ""),
            "main_locations": primary,
            "additional_locations": secondary,
            "total_locations": len(primary) + len(secondary),
            "location_summary": self._generate_location_summary(primary, secondary),
        }

    @staticmethod
    def _as_location_list(raw: Any) -> List[str]:
        """Normalise a location field: lists pass through unchanged,
        strings are split on ';' (blank pieces dropped), anything else
        becomes an empty list."""
        if isinstance(raw, list):
            return raw
        if isinstance(raw, str):
            return [piece.strip() for piece in raw.split(";") if piece.strip()]
        return []

    def _generate_location_summary(
        self, main_locs: List[str], add_locs: List[str]
    ) -> str:
        """Build a one-line, human-readable summary of both location lists."""
        if not (main_locs or add_locs):
            return "No subcellular location data available"

        labelled = (("Primary", main_locs), ("Additional", add_locs))
        return "; ".join(
            f"{label}: {', '.join(locs)}" for label, locs in labelled if locs
        )
500
+
501
+
502
+ # --- Existing Tools (Updated with improvements) ---
503
+
504
+
505
@register_tool("HPASearchGenesTool")
class HPASearchGenesTool(HPASearchApiTool):
    """
    Search HPA with a free-text query and list the matching genes with
    their Ensembl IDs and synonyms.
    """

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return every gene matching ``arguments["search_query"]``.

        The result holds the query, the number of matches and one entry per
        gene; failures are reported as a dict with an ``"error"`` key.
        """
        search_query = arguments.get("search_query")
        if not search_query:
            return {"error": "Parameter 'search_query' is required"}

        # Columns: g = gene name, gs = gene synonym, eg = Ensembl ID
        result = self._make_api_request(search_query, "g,gs,eg")
        if "error" in result:
            return result
        if not (result and isinstance(result, list)):
            return {"error": f"No matching genes found for query '{search_query}'"}

        genes = [
            {
                "gene_name": entry.get("Gene"),
                "ensembl_id": entry.get("Ensembl"),
                "gene_synonyms": self._parse_synonyms(entry.get("Gene synonym", "")),
            }
            for entry in result
        ]
        return {
            "search_query": search_query,
            "match_count": len(genes),
            "genes": genes,
        }

    @staticmethod
    def _parse_synonyms(raw: Any) -> List[str]:
        """Turn the synonym field into a list: a string is split on ', ',
        a list is returned as is, anything else gives an empty list."""
        if isinstance(raw, list):
            return raw
        if isinstance(raw, str) and raw:
            return raw.split(", ")
        return []
550
+
551
+
552
+ @register_tool("HPAGetComparativeExpressionTool")
553
+ class HPAGetComparativeExpressionTool(HPASearchApiTool):
554
+ """
555
+ Compare gene expression levels in specific cell lines and healthy tissues.
556
+ Get expression data for comparison by gene name and cell line name.
557
+ """
558
+
559
+ def __init__(self, tool_config):
560
+ super().__init__(tool_config)
561
+ # Mapping of common cell lines to their column identifiers
562
+ self.cell_line_columns = {
563
+ "ishikawa": "cell_RNA_ishikawa_heraklio",
564
+ "hela": "cell_RNA_hela",
565
+ "mcf7": "cell_RNA_mcf7",
566
+ "a549": "cell_RNA_a549",
567
+ "hepg2": "cell_RNA_hepg2",
568
+ "jurkat": "cell_RNA_jurkat",
569
+ "pc3": "cell_RNA_pc3",
570
+ "rh30": "cell_RNA_rh30",
571
+ "siha": "cell_RNA_siha",
572
+ "u251": "cell_RNA_u251",
573
+ }
574
+
575
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Compare a gene's expression in one cell line with healthy tissue.

    Args:
        arguments: Tool arguments. ``gene_name`` and ``cell_line`` are both
            required. ``cell_line`` is matched case-insensitively (and with
            surrounding whitespace ignored) against the keys of
            ``self.cell_line_columns``.

    Returns:
        A dict describing the gene, the cell-line value, the healthy-tissue
        value and a textual comparison, or ``{"error": ...}`` when an
        argument is missing/unsupported, a request reports an error, or no
        data comes back.
    """
    gene_name = arguments.get("gene_name")
    # ``arguments.get("cell_line", "")`` still yields None when the caller
    # passes the key explicitly with a null value, so normalise defensively
    # instead of calling ``.lower()`` on whatever arrives.
    raw_cell_line = arguments.get("cell_line")
    cell_line = "" if raw_cell_line is None else str(raw_cell_line).strip().lower()

    if not gene_name:
        return {"error": "Parameter 'gene_name' is required"}
    if not cell_line:
        return {"error": "Parameter 'cell_line' is required"}

    # Enhanced validation with intelligent recommendations
    cell_column = self.cell_line_columns.get(cell_line)
    if not cell_column:
        available_lines = list(self.cell_line_columns)
        # A supported name is "similar" when either string contains the other.
        similar_lines = [
            valid_line
            for valid_line in available_lines
            if cell_line in valid_line or valid_line in cell_line
        ]

        error_msg = f"Unsupported cell_line '{cell_line}'. "
        if similar_lines:
            error_msg += f"Similar options: {similar_lines}. "
        error_msg += f"All supported cell lines: {available_lines}"
        return {"error": error_msg}

    # Request expression data for the cell line
    cell_result = self._make_api_request(gene_name, f"g,gs,{cell_column}")
    if "error" in cell_result:
        return cell_result

    # Request expression data for healthy tissues
    tissue_result = self._make_api_request(gene_name, "g,gs,rnatsm")
    if "error" in tissue_result:
        return tissue_result

    if not cell_result or not tissue_result:
        return {"error": "No expression data found"}

    # Only the first matching gene record of each response is used.
    cell_data = cell_result[0] if isinstance(cell_result, list) else {}
    tissue_data = tissue_result[0] if isinstance(tissue_result, list) else {}

    return {
        "gene_name": gene_name,
        "gene_symbol": cell_data.get("Gene", gene_name),
        "gene_synonym": cell_data.get("Gene synonym", ""),
        "cell_line": cell_line,
        "cell_line_expression": cell_data.get(cell_column, "N/A"),
        "healthy_tissue_expression": tissue_data.get(
            "RNA tissue specific nTPM", "N/A"
        ),
        "expression_unit": "nTPM (normalized Transcripts Per Million)",
        "comparison_summary": self._generate_comparison_summary(
            cell_data.get(cell_column), tissue_data.get("RNA tissue specific nTPM")
        ),
    }
641
+
642
def _generate_comparison_summary(self, cell_expr, tissue_expr) -> str:
    """Generate expression level comparison summary.

    Args:
        cell_expr: Expression value for the cell line (number or numeric
            string); ``None``, ``""`` and ``"N/A"`` mean "no data".
        tissue_expr: Expression value for healthy tissue, same conventions.

    Returns:
        A sentence saying which side is more than twice the other, that the
        levels are similar, that data is insufficient, or that the values
        could not be interpreted as numbers.
    """

    def _as_number(raw):
        # A numeric zero is a real measurement, not a missing value, so it
        # must not be filtered out by a plain truthiness test.
        if isinstance(raw, (int, float)) and not isinstance(raw, bool):
            return float(raw)
        if not raw or raw == "N/A":
            return None
        return float(raw)

    try:
        cell_val = _as_number(cell_expr)
        tissue_val = _as_number(tissue_expr)
    except (TypeError, ValueError):
        # Non-numeric payloads (free text, nested structures, ...).
        return "Failed to calculate expression level comparison"

    if cell_val is None or tissue_val is None:
        return "Insufficient data for comparison"

    if cell_val > tissue_val * 2:
        return f"Expression significantly higher in cell line ({cell_val:.2f} vs {tissue_val:.2f})"
    if tissue_val > cell_val * 2:
        return f"Expression significantly higher in healthy tissues ({tissue_val:.2f} vs {cell_val:.2f})"
    return f"Expression levels similar (cell line: {cell_val:.2f}, healthy tissues: {tissue_val:.2f})"
661
+
662
+
663
@register_tool("HPAGetDiseaseExpressionTool")
class HPAGetDiseaseExpressionTool(HPASearchApiTool):
    """
    Get expression data for a gene in specific diseases and tissues.
    Get related expression information by gene name, tissue type, and disease name.

    Only the cancer types listed in ``self.cancer_columns`` are supported.
    """

    def __init__(self, tool_config):
        super().__init__(tool_config)
        # Mapping of common cancer types to their column identifiers
        self.cancer_columns = {
            "brain_cancer": "cancer_RNA_brain_cancer",
            "breast_cancer": "cancer_RNA_breast_cancer",
            "colon_cancer": "cancer_RNA_colon_cancer",
            "lung_cancer": "cancer_RNA_lung_cancer",
            "liver_cancer": "cancer_RNA_liver_cancer",
            "prostate_cancer": "cancer_RNA_prostate_cancer",
            "kidney_cancer": "cancer_RNA_kidney_cancer",
            "pancreatic_cancer": "cancer_RNA_pancreatic_cancer",
            "stomach_cancer": "cancer_RNA_stomach_cancer",
            "ovarian_cancer": "cancer_RNA_ovarian_cancer",
        }

    @staticmethod
    def _normalize_term(value) -> str:
        """Lower-case *value* and join its words with single underscores."""
        if value is None:
            return ""
        text = str(value).lower().replace("_", " ").replace("-", " ")
        return "_".join(text.split())

    def _match_cancer_column(self, tissue_key: str, disease_key: str):
        """Resolve normalised tissue/disease terms to a cancer column.

        The tissue-qualified term (``<tissue>_<disease>``) is tried before the
        bare disease term, and exact key matches are tried before substring
        matches. A substring match is accepted only when it identifies exactly
        one supported cancer; otherwise a generic term such as ``cancer``
        would silently resolve to whichever key happens to be listed first.

        Returns:
            The column identifier, or ``None`` when nothing matches
            unambiguously.
        """
        terms = []
        if tissue_key:
            terms.append(f"{tissue_key}_{disease_key}")
        terms.append(disease_key)

        for term in terms:
            if term in self.cancer_columns:
                return self.cancer_columns[term]

        for term in terms:
            hits = [
                column for key, column in self.cancer_columns.items() if term in key
            ]
            if len(hits) == 1:
                return hits[0]
        return None

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch disease and healthy expression values for one gene.

        Args:
            arguments: ``gene_name`` and ``disease_name`` are required;
                ``tissue_type`` is optional and, when given, is combined with
                the disease name to select the cancer type.

        Returns:
            A dict with the gene identity, both expression values and a
            textual comparison, or ``{"error": ...}``.
        """
        gene_name = arguments.get("gene_name")
        # ``or ""`` also covers keys that are present but explicitly None.
        tissue_type = str(arguments.get("tissue_type") or "").strip().lower()
        disease_name = str(arguments.get("disease_name") or "").strip().lower()

        if not gene_name:
            return {"error": "Parameter 'gene_name' is required"}
        if not disease_name:
            return {"error": "Parameter 'disease_name' is required"}

        cancer_column = self._match_cancer_column(
            self._normalize_term(tissue_type), self._normalize_term(disease_name)
        )

        if not cancer_column:
            available_diseases = [k.replace("_", " ") for k in self.cancer_columns]

            # Suggest every supported disease sharing a keyword with the input.
            disease_keywords = self._normalize_term(disease_name).split("_")
            similar_diseases = [
                valid_disease
                for valid_disease in available_diseases
                if any(
                    keyword and (keyword in valid_disease or valid_disease in keyword)
                    for keyword in disease_keywords
                )
            ]

            error_msg = f"Unsupported disease_name '{disease_name}'. "
            if similar_diseases:
                error_msg += f"Similar options: {similar_diseases[:3]}. "
            error_msg += f"All supported diseases: {available_diseases}"
            return {"error": error_msg}

        # Build request columns
        columns = f"g,gs,{cancer_column},rnatsm"
        result = self._make_api_request(gene_name, columns)

        if "error" in result:
            return result

        if not result:
            return {"error": "No expression data found"}

        # Extract the first matching gene data
        gene_data = result[0] if isinstance(result, list) else {}

        return {
            "gene_name": gene_name,
            "gene_symbol": gene_data.get("Gene", gene_name),
            "gene_synonym": gene_data.get("Gene synonym", ""),
            "tissue_type": tissue_type or "Not specified",
            "disease_name": disease_name,
            "disease_expression": gene_data.get(cancer_column, "N/A"),
            "healthy_expression": gene_data.get("RNA tissue specific nTPM", "N/A"),
            "expression_unit": "nTPM (normalized Transcripts Per Million)",
            "disease_vs_healthy": self._compare_disease_healthy(
                gene_data.get(cancer_column), gene_data.get("RNA tissue specific nTPM")
            ),
        }

    def _compare_disease_healthy(self, disease_expr, healthy_expr) -> str:
        """Compare expression difference between disease and healthy state.

        ``None``, ``""`` and ``"N/A"`` are treated as missing; a numeric zero
        is a valid value. Up- or down-regulation is reported when the ratio of
        the two values exceeds a factor of two.
        """

        def _as_number(raw):
            if isinstance(raw, (int, float)) and not isinstance(raw, bool):
                return float(raw)
            if not raw or raw == "N/A":
                return None
            return float(raw)

        try:
            disease_val = _as_number(disease_expr)
            healthy_val = _as_number(healthy_expr)
        except (TypeError, ValueError):
            return "Failed to calculate expression difference"

        if disease_val is None or healthy_val is None:
            return "Insufficient data for comparison"

        if disease_val <= 0 and healthy_val <= 0:
            # 0/0 has no meaningful fold change.
            return "Expression not detected in either disease or healthy state"

        fold_change = disease_val / healthy_val if healthy_val > 0 else float("inf")

        if fold_change > 2:
            return f"Disease state expression upregulated {fold_change:.2f} fold"
        if fold_change < 0.5:
            # Invert via the raw values so a zero disease value reports an
            # infinite down-regulation instead of dividing by zero.
            inverse = healthy_val / disease_val if disease_val > 0 else float("inf")
            return f"Disease state expression downregulated {inverse:.2f} fold"
        return f"Expression level relatively stable (fold change: {fold_change:.2f})"
782
+
783
+
784
@register_tool("HPAGetBiologicalProcessTool")
class HPAGetBiologicalProcessTool(HPASearchApiTool):
    """
    Look up the biological processes annotated for a gene.

    The full process list is returned together with the subset that mentions
    one of the predefined processes of interest in ``self.target_processes``.
    """

    def __init__(self, tool_config):
        super().__init__(tool_config)
        # Processes of interest that are highlighted in the result
        self.target_processes = [
            "Apoptosis",
            "Biological rhythms",
            "Cell cycle",
            "Host-virus interaction",
            "Necrosis",
            "Transcription",
            "Transcription regulation",
        ]

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return the biological processes of ``gene_name``.

        ``filter_processes`` (default ``True``) controls whether the
        predefined target processes are searched for in the process list.
        """
        gene_name = arguments.get("gene_name")
        if not gene_name:
            return {"error": "Parameter 'gene_name' is required"}
        want_targets = arguments.get("filter_processes", True)

        # Columns: gene, gene synonym, biological process
        response = self._make_api_request(gene_name, "g,gs,upbp")
        if "error" in response:
            return response
        if not response:
            return {"error": "No gene data found"}

        # Only the first matching record is considered.
        record = response[0] if isinstance(response, list) else {}
        raw_processes = record.get("Biological process", "")

        # The field may arrive as a list or as a ";"/","-separated string.
        if not raw_processes or raw_processes == "N/A":
            processes = []
        elif isinstance(raw_processes, list):
            processes = raw_processes
        elif isinstance(raw_processes, str):
            pieces = raw_processes.replace(";", ",").split(",")
            processes = [piece.strip() for piece in pieces if piece.strip()]
        else:
            processes = []

        matches = []
        if want_targets:
            matches = [
                {"target_process": wanted, "full_description": described}
                for described in processes
                for wanted in self.target_processes
                if wanted.lower() in described.lower()
            ]

        return {
            "gene_name": gene_name,
            "gene_symbol": record.get("Gene", gene_name),
            "gene_synonym": record.get("Gene synonym", ""),
            "biological_processes": processes,
            "target_processes_found": matches,
            "target_process_names": [match["target_process"] for match in matches],
            "total_processes": len(processes),
            "target_processes_count": len(matches),
        }
870
+
871
+
872
@register_tool("HPAGetCancerPrognosticsTool")
class HPAGetCancerPrognosticsTool(HPAJsonApiTool):
    """
    Report in which cancers a gene is flagged as prognostic.
    Reads the per-gene JSON record and collects its "Cancer prognostics" fields.
    """

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Summarise the prognostic entries found for ``ensembl_id``."""
        ensembl_id = arguments.get("ensembl_id")
        if not ensembl_id:
            return {"error": "Parameter 'ensembl_id' is required"}

        record = self._make_api_request(ensembl_id)
        if "error" in record:
            return record

        # Keep only dict-valued "Cancer prognostics ..." fields that are
        # marked as prognostic; the cancer type is the remainder of the key.
        flagged = [
            {
                "cancer_type": field.replace("Cancer prognostics - ", "").strip(),
                "prognostic_type": details.get("prognostic type", "Unknown"),
                "p_value": details.get("p_val", "N/A"),
                "is_prognostic": details.get("is_prognostic", False),
            }
            for field, details in record.items()
            if field.startswith("Cancer prognostics")
            and isinstance(details, dict)
            and details.get("is_prognostic")
        ]

        if flagged:
            summary = flagged
        else:
            summary = "No significant prognostic value found in the analyzed cancers."

        return {
            "ensembl_id": ensembl_id,
            "gene": record.get("Gene", "Unknown"),
            "gene_synonym": record.get("Gene synonym", ""),
            "prognostic_cancers_count": len(flagged),
            "prognostic_summary": summary,
            "note": "Prognostic value indicates whether high/low expression of this gene correlates with patient survival in specific cancer types.",
        }
914
+
915
+
916
@register_tool("HPAGetProteinInteractionsTool")
class HPAGetProteinInteractionsTool(HPASearchApiTool):
    """
    Get protein-protein interaction partners for a gene.
    Uses search API to retrieve interaction data.
    """

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return the interaction partners listed for ``gene_name``.

        Returns:
            A dict with the gene identity, the list of interactors and their
            count, or ``{"error": ...}`` when the argument is missing, the
            request reports an error, or no record comes back.
        """
        gene_name = arguments.get("gene_name")
        if not gene_name:
            return {"error": "Parameter 'gene_name' is required"}

        # Use 'ppi' column to retrieve protein-protein interactions
        columns = "g,gs,ppi"
        result = self._make_api_request(gene_name, columns)

        if "error" in result:
            return result

        if not result or not isinstance(result, list):
            return {"error": f"No interaction data found for gene '{gene_name}'"}

        gene_data = result[0]
        raw_interactions = gene_data.get("Protein-protein interaction", "")

        if not raw_interactions or raw_interactions == "N/A":
            return {
                "gene": gene_data.get("Gene", gene_name),
                "gene_synonym": gene_data.get("Gene synonym", ""),
                "interactions": "No interaction data found.",
                "interactor_count": 0,
                "interactors": [],
            }

        # Accept both a pre-split list and a ";"/","-separated string, the same
        # two shapes the biological-process tool handles for its own field;
        # calling ``.replace`` on a list would raise AttributeError.
        if isinstance(raw_interactions, (list, tuple)):
            candidates = [str(item) for item in raw_interactions]
        else:
            candidates = str(raw_interactions).replace(";", ",").split(",")
        interactors = [name.strip() for name in candidates if name.strip()]

        return {
            "gene": gene_data.get("Gene", gene_name),
            "gene_synonym": gene_data.get("Gene synonym", ""),
            "interactor_count": len(interactors),
            "interactors": interactors,
            "interaction_summary": f"Found {len(interactors)} protein interaction partners",
        }
964
+
965
+
966
@register_tool("HPAGetRnaExpressionByTissueTool")
class HPAGetRnaExpressionByTissueTool(HPAJsonApiTool):
    """
    Query RNA expression levels for a gene in specific tissues.
    More precise than general tissue expression queries.
    """

    @staticmethod
    def _match_tissue(query, available_tissues):
        """Return the available tissue name that best matches *query*.

        An exact case-insensitive match wins; otherwise the first tissue for
        which one name contains the other is used. Blank or non-string queries
        never match (an empty string is a substring of every name, so it
        would otherwise match the first tissue).
        """
        if not isinstance(query, str) or not query.strip():
            return None
        wanted = query.strip().lower()

        for candidate in available_tissues:
            if candidate.lower() == wanted:
                return candidate
        for candidate in available_tissues:
            lowered = candidate.lower()
            if lowered and (wanted in lowered or lowered in wanted):
                return candidate
        return None

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Look up expression values for each requested tissue.

        Args:
            arguments: ``ensembl_id`` (required) and ``tissue_names``
                (required, non-empty list of tissue names).

        Returns:
            A dict keyed per queried tissue with the matched tissue, its
            value and a coarse level, plus a sample of the tissue names
            available for the gene; or ``{"error": ...}``.
        """
        ensembl_id = arguments.get("ensembl_id")
        tissue_names = arguments.get("tissue_names", [])

        if not ensembl_id:
            return {"error": "Parameter 'ensembl_id' is required"}
        if not tissue_names or not isinstance(tissue_names, list):
            # Provide helpful tissue name examples
            example_tissues = [
                "brain",
                "liver",
                "heart",
                "kidney",
                "lung",
                "pancreas",
                "skin",
                "muscle",
            ]
            return {
                "error": f"Parameter 'tissue_names' is required and must be a list. Example: {example_tissues}"
            }

        data = self._make_api_request(ensembl_id)
        if "error" in data:
            return data

        # Get RNA tissue expression data
        rna_data = data.get("RNA tissue specific nTPM", {})
        if not isinstance(rna_data, dict):
            return {"error": "No RNA tissue expression data available for this gene"}

        # Only string keys can take part in name matching.
        available_tissues = [name for name in rna_data if isinstance(name, str)]

        expression_results = {}
        for tissue in tissue_names:
            # Non-string entries are reported under their string form so they
            # can still serve as dictionary keys.
            label = tissue if isinstance(tissue, str) else str(tissue)
            found_tissue = self._match_tissue(tissue, available_tissues)

            if found_tissue is not None:
                expression_results[label] = {
                    "matched_tissue": found_tissue,
                    "expression_value": rna_data[found_tissue],
                    "expression_level": self._categorize_expression(
                        rna_data[found_tissue]
                    ),
                }
            else:
                expression_results[label] = {
                    "matched_tissue": "Not found",
                    "expression_value": "N/A",
                    "expression_level": "No data",
                }

        return {
            "ensembl_id": ensembl_id,
            "gene": data.get("Gene", "Unknown"),
            "gene_synonym": data.get("Gene synonym", ""),
            "expression_unit": "nTPM (normalized Transcripts Per Million)",
            "queried_tissues": tissue_names,
            "tissue_expression": expression_results,
            # Slicing already copes with lists shorter than ten entries.
            "available_tissues_sample": available_tissues[:10],
            "total_available_tissues": len(available_tissues),
        }

    def _categorize_expression(self, expr_value) -> str:
        """Categorize expression level.

        Thresholds: >50 "Very high", >10 "High", >1 "Medium", >0.1 "Low",
        otherwise "Very low"; values that cannot be converted to ``float``
        yield "Unknown".
        """
        try:
            val = float(expr_value)
        except (ValueError, TypeError):
            return "Unknown"

        if val > 50:
            return "Very high"
        if val > 10:
            return "High"
        if val > 1:
            return "Medium"
        if val > 0.1:
            return "Low"
        return "Very low"
1064
+
1065
+
1066
@register_tool("HPAGetContextualBiologicalProcessTool")
class HPAGetContextualBiologicalProcessTool(BaseTool):
    """
    Analyze a gene's biological processes in the context of specific tissue or cell line.
    Enhanced with intelligent context validation and recommendation.
    """

    def __init__(self, tool_config):
        super().__init__(tool_config)
        # Define all valid context options
        self.valid_contexts = {
            "tissues": [
                "adipose_tissue",
                "adrenal_gland",
                "appendix",
                "bone_marrow",
                "brain",
                "breast",
                "bronchus",
                "cerebellum",
                "cerebral_cortex",
                "cervix",
                "colon",
                "duodenum",
                "endometrium",
                "esophagus",
                "gallbladder",
                "heart_muscle",
                "kidney",
                "liver",
                "lung",
                "lymph_node",
                "ovary",
                "pancreas",
                "placenta",
                "prostate",
                "rectum",
                "salivary_gland",
                "skeletal_muscle",
                "skin",
                "small_intestine",
                "spleen",
                "stomach",
                "testis",
                "thymus",
                "thyroid_gland",
                "urinary_bladder",
                "vagina",
            ],
            "cell_lines": [
                "hela",
                "mcf7",
                "a549",
                "hepg2",
                "jurkat",
                "pc3",
                "rh30",
                "siha",
                "u251",
            ],
            "blood_cells": [
                "t_cell",
                "b_cell",
                "nk_cell",
                "monocyte",
                "neutrophil",
                "eosinophil",
            ],
            "brain_regions": [
                "cerebellum",
                "cerebral_cortex",
                "hippocampus",
                "hypothalamus",
                "amygdala",
            ],
        }

    @staticmethod
    def _normalize_context(name) -> str:
        """Lower-case *name* and turn spaces and hyphens into underscores."""
        return str(name).strip().lower().replace(" ", "_").replace("-", "_")

    def _validate_context(self, context_name: str) -> Dict[str, Any]:
        """Validate context_name and provide intelligent recommendations.

        Returns:
            ``{"valid": True, "category": ...}`` when the normalised name is
            listed in ``self.valid_contexts``; otherwise a dict with
            ``"valid": False``, up to five similar names and the lists used
            to build the error message.
        """
        context_lower = self._normalize_context(context_name)

        for category, contexts in self.valid_contexts.items():
            if context_lower in contexts:
                return {"valid": True, "category": category}

        # Some contexts are listed in two categories; dict.fromkeys removes
        # the duplicates while keeping first-seen order so they are neither
        # suggested twice nor counted twice.
        all_valid = list(
            dict.fromkeys(
                context
                for contexts in self.valid_contexts.values()
                for context in contexts
            )
        )

        # Find similar contexts (fuzzy matching). Empty keywords are dropped:
        # "" is a substring of everything and would match every context.
        context_keywords = [word for word in context_lower.split("_") if word]
        similar_contexts = [
            valid_context
            for valid_context in all_valid
            if any(
                keyword in valid_context or valid_context in keyword
                for keyword in context_keywords
            )
        ]

        return {
            "valid": False,
            "input": context_name,
            "similar_suggestions": similar_contexts[:5],  # Top 5 suggestions
            "all_tissues": self.valid_contexts["tissues"][:10],  # First 10 tissues
            "all_cell_lines": self.valid_contexts["cell_lines"],
            "total_available": len(all_valid),
        }

    def _find_tissue_expression(self, rna_data: Dict[str, Any], context_key: str):
        """Return the value in *rna_data* whose key matches *context_key*.

        Keys are normalised the same way as the context name, so differences
        in case, spaces, hyphens and underscores do not prevent a match. An
        exact match wins over a substring match. Returns ``"N/A"`` when
        nothing matches.
        """
        normalized = {
            self._normalize_context(tissue_key): value
            for tissue_key, value in rna_data.items()
        }
        if context_key in normalized:
            return normalized[context_key]
        for tissue_key, value in normalized.items():
            if tissue_key and (context_key in tissue_key or tissue_key in context_key):
                return value
        return "N/A"

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Relate a gene's biological processes to its expression in a context.

        Args:
            arguments: ``gene_name`` and ``context_name`` are required; the
                context must be one of the names in ``self.valid_contexts``
                (case, spaces and hyphens are normalised).

        Returns:
            A dict with the gene identity, the expression value and level in
            the context, up to ten biological processes and a generated
            conclusion, or ``{"error": ...}``.
        """
        gene_name = arguments.get("gene_name")
        context_name = arguments.get("context_name")

        if not gene_name:
            return {"error": "Parameter 'gene_name' is required"}
        if not context_name:
            return {"error": "Parameter 'context_name' is required"}

        # Validate context_name and provide recommendations if invalid
        validation = self._validate_context(context_name)
        if not validation["valid"]:
            error_msg = f"Invalid context_name '{validation['input']}'. "
            if validation["similar_suggestions"]:
                error_msg += f"Similar options: {validation['similar_suggestions']}. "
            error_msg += f"Available tissues: {validation['all_tissues']}... "
            error_msg += f"Available cell lines: {validation['all_cell_lines']}. "
            error_msg += f"Total {validation['total_available']} contexts available."
            return {"error": error_msg}

        # Every lookup below uses the same normalised form that passed
        # validation, not the raw user spelling.
        context_key = self._normalize_context(context_name)

        try:
            # Step 1: Get gene basic info and Ensembl ID
            search_api = HPASearchApiTool({})
            search_result = search_api._make_api_request(gene_name, "g,gs,eg,upbp")

            if not search_result or "error" in search_result:
                return {"error": f"Could not find gene information for '{gene_name}'"}

            gene_data = (
                search_result[0] if isinstance(search_result, list) else search_result
            )
            ensembl_id = gene_data.get("Ensembl", "")

            if not ensembl_id:
                return {"error": f"Could not find Ensembl ID for gene '{gene_name}'"}

            # Step 2: Get biological processes
            biological_processes = gene_data.get("Biological process", "")
            processes_list = []
            if biological_processes and biological_processes != "N/A":
                if isinstance(biological_processes, list):
                    processes_list = biological_processes
                elif isinstance(biological_processes, str):
                    processes_list = [
                        p.strip()
                        for p in biological_processes.replace(";", ",").split(",")
                        if p.strip()
                    ]

            # Step 3: Get expression in context
            json_api = HPAJsonApiTool({})
            json_data = json_api._make_api_request(ensembl_id)

            expression_value = "N/A"
            context_type = (
                validation["category"].replace("_", " ").rstrip("s")
            )  # "tissues" -> "tissue"

            if json_data and "error" not in json_data:
                rna_data = json_data.get("RNA tissue specific nTPM")
                if rna_data and isinstance(rna_data, dict):
                    expression_value = self._find_tissue_expression(
                        rna_data, context_key
                    )

            # If not found in tissues and it's a cell line, try cell line data
            if expression_value == "N/A" and validation["category"] == "cell_lines":
                context_type = "cell line"
                # All validated cell lines use the ``cell_RNA_<name>`` column
                # pattern, so the column is derived rather than looked up in
                # a partial hand-written table.
                cell_column = f"cell_RNA_{context_key}"
                cell_result = search_api._make_api_request(
                    gene_name, f"g,{cell_column}"
                )
                if (
                    cell_result
                    and isinstance(cell_result, list)
                    and "error" not in cell_result
                ):
                    expression_value = cell_result[0].get(cell_column, "N/A")

            # Categorize expression level; a missing value counts as 0.
            try:
                expr_val = float(expression_value) if expression_value != "N/A" else 0
                if expr_val > 10:
                    expression_level = "highly expressed"
                elif expr_val > 1:
                    expression_level = "moderately expressed"
                elif expr_val > 0.1:
                    expression_level = "expressed at low level"
                else:
                    expression_level = "not expressed or very low"
            except (ValueError, TypeError):
                expression_level = "expression level unclear"

            # Generate contextual conclusion
            relevance = (
                "may be functionally relevant"
                if "expressed" in expression_level and "not" not in expression_level
                else "is likely not functionally relevant"
            )

            conclusion = f"Gene {gene_name} is involved in {len(processes_list)} biological processes. It is {expression_level} in {context_name} ({expression_value} nTPM), suggesting its functional roles {relevance} in this {context_type} context."

            return {
                "gene": gene_data.get("Gene", gene_name),
                "gene_synonym": gene_data.get("Gene synonym", ""),
                "ensembl_id": ensembl_id,
                "context": context_name,
                "context_type": context_type,
                "context_category": validation["category"],
                "expression_in_context": f"{expression_value} nTPM",
                "expression_level": expression_level,
                "total_biological_processes": len(processes_list),
                "biological_processes": processes_list[:10],
                "contextual_conclusion": conclusion,
                "functional_relevance": relevance,
            }

        except Exception as e:
            # Tool boundary: report any failure as an error payload instead
            # of propagating it to the caller.
            return {"error": f"Failed to perform contextual analysis: {str(e)}"}
1305
+
1306
+
1307
+ # --- Keep existing comprehensive gene details tool for images ---
1308
+
1309
+
1310
+ @register_tool("HPAGetGenePageDetailsTool")
1311
+ class HPAGetGenePageDetailsTool(HPAXmlApiTool):
1312
+ """
1313
+ Get detailed information about a gene page, including images, protein expression, antibody data, etc.
1314
+ Get the most comprehensive data by parsing HPA's single gene XML endpoint.
1315
+ Enhanced version with improved image extraction and comprehensive data parsing based on optimization plan.
1316
+ """
1317
+
1318
def __init__(self, tool_config):
    """Initialise the tool; all setup is delegated to the parent class."""
    super().__init__(tool_config)
1320
+
1321
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Fetch the gene XML for ``ensembl_id`` and return its parsed details.

    The optional ``include_images``, ``include_antibodies`` and
    ``include_expression`` flags (all defaulting to ``True``) select which
    sections are parsed. Any exception raised while fetching or parsing is
    reported as ``{"error": <message>}``.
    """
    gene_id = arguments.get("ensembl_id")
    section_flags = [
        arguments.get(flag, True)
        for flag in ("include_images", "include_antibodies", "include_expression")
    ]

    if not gene_id:
        return {"error": "Parameter 'ensembl_id' is required"}

    try:
        xml_root = self._make_api_request(gene_id)
        return self._parse_gene_xml(xml_root, gene_id, *section_flags)
    except Exception as exc:
        return {"error": str(exc)}
1338
+
1339
def _parse_gene_xml(
    self,
    root: ET.Element,
    ensembl_id: str,
    include_images: bool,
    include_antibodies: bool,
    include_expression: bool,
) -> Dict[str, Any]:
    """Build the gene-details dict from the parsed XML tree.

    Basic fields are read from the first ``entry`` element found below
    *root*; the image, antibody and expression sections are added only when
    the corresponding flag is true. A ``summary`` with the sizes of those
    sections is always included.
    """
    details = {
        "ensembl_id": ensembl_id,
        "gene_name": "",
        "gene_description": "",
        "chromosome_location": "",
        "uniprot_ids": [],
        "summary": {},
    }

    entry = root.find(".//entry")
    if entry is not None:
        name_node = entry.find("name")
        if name_node is not None:
            details["gene_name"] = name_node.text or ""

        # Synonyms without text are skipped.
        details["gene_synonyms"] = [
            node.text for node in entry.findall("synonym") if node.text
        ]

        # Uniprot IDs are the SWISSPROT cross-references of the identifier.
        identifier_node = entry.find("identifier")
        if identifier_node is not None:
            details["uniprot_ids"].extend(
                ref.get("id", "")
                for ref in identifier_node.findall("xref")
                if ref.get("db") == "Uniprot/SWISSPROT"
            )

        classes_node = entry.find("proteinClasses")
        class_names = []
        if classes_node is not None:
            for class_node in classes_node.findall("proteinClass"):
                label = class_node.get("name", "")
                if label:
                    class_names.append(label)
        details["protein_classes"] = class_names

    if include_images:
        details["ihc_images"] = self._extract_ihc_images(root)
        details["if_images"] = self._extract_if_images(root)

    if include_antibodies:
        details["antibodies"] = self._extract_antibodies(root)

    if include_expression:
        details["expression_summary"] = self._extract_expression_summary(root)
        details["tissue_expression"] = self._extract_tissue_expression(root)
        details["cell_line_expression"] = self._extract_cell_line_expression(root)

    # Sections that were not requested count as empty.
    counted_sections = (
        ("total_antibodies", "antibodies"),
        ("total_ihc_images", "ihc_images"),
        ("total_if_images", "if_images"),
        ("tissues_with_expression", "tissue_expression"),
        ("cell_lines_with_expression", "cell_line_expression"),
    )
    details["summary"] = {
        label: len(details.get(section, [])) for label, section in counted_sections
    }

    return details
1414
+
1415
def _extract_ihc_images(self, root: ET.Element) -> List[Dict[str, Any]]:
    """Collect the selected images found under ``tissueExpression`` elements.

    An image is reported only when its ``imageType`` attribute is
    ``"selected"`` and it has both a ``tissue`` and an ``imageUrl`` child.
    """
    collected = []

    for section in root.findall(".//tissueExpression"):
        for image_node in section.findall(".//image"):
            if image_node.get("imageType", "") != "selected":
                continue

            tissue_node = image_node.find("tissue")
            url_node = image_node.find("imageUrl")
            if tissue_node is None or url_node is None:
                continue

            collected.append(
                {
                    "image_type": "Immunohistochemistry",
                    "tissue_name": tissue_node.text or "",
                    "organ": tissue_node.get("organ", ""),
                    "ontology_terms": tissue_node.get("ontologyTerms", ""),
                    "image_url": url_node.text or "",
                    "selected": True,
                }
            )

    return collected
1446
+
1447
def _extract_if_images(self, root: ET.Element) -> List[Dict[str, Any]]:
    """Collect the selected images found under ``subcellularExpression`` elements.

    An image is reported only when its ``imageType`` attribute is
    ``"selected"`` and it has both a ``location`` and an ``imageUrl`` child.
    """
    collected = []

    for section in root.findall(".//subcellularExpression"):
        for image_node in section.findall(".//image"):
            if image_node.get("imageType", "") != "selected":
                continue

            location_node = image_node.find("location")
            url_node = image_node.find("imageUrl")
            if location_node is None or url_node is None:
                continue

            collected.append(
                {
                    "image_type": "Immunofluorescence",
                    "subcellular_location": location_node.text or "",
                    "image_url": url_node.text or "",
                    "selected": True,
                }
            )

    return collected
1474
+
1475
def _extract_antibodies(self, root: ET.Element) -> List[Dict[str, Any]]:
    """Extract antibody references from the ``tissueExpression`` sections.

    An element contributes an antibody ID when its tag contains "antibody"
    or it carries an ``antibody`` attribute; the attribute value is preferred
    over the element text. IDs are reported once each, in document order.

    Returns:
        One dict per distinct antibody ID. When no ID is found a single
        generic placeholder entry (``"HPA_antibody"``) is returned, so the
        list is never empty and a length of one does not by itself prove
        that an antibody was referenced.
    """
    # A dict is used as an insertion-ordered set: iterating a plain ``set``
    # of strings would make the output order vary between interpreter runs.
    seen_ids = {}

    for tissue_expr in root.findall(".//tissueExpression"):
        for elem in tissue_expr.iter():
            tag = elem.tag if isinstance(elem.tag, str) else ""
            if "antibody" in tag.lower() or elem.get("antibody"):
                # Strip so that the indentation-only text of a container
                # element is not mistaken for an ID.
                antibody_id = (elem.get("antibody") or elem.text or "").strip()
                if antibody_id:
                    seen_ids.setdefault(antibody_id, None)

    antibodies_data = [
        {
            "antibody_id": antibody_id,
            "source": "HPA",
            "applications": ["IHC", "IF"],
            "validation_status": "Available",
        }
        for antibody_id in seen_ids
    ]

    # If no specific antibody IDs found, create a placeholder
    if not antibodies_data:
        antibodies_data.append(
            {
                "antibody_id": "HPA_antibody",
                "source": "HPA",
                "applications": ["IHC", "IF"],
                "validation_status": "Available",
            }
        )

    return antibodies_data
1513
+
1514
def _extract_expression_summary(self, root: ET.Element) -> Dict[str, Any]:
    """Read the predicted location and the two expression summary texts.

    Only ``predicted_location``, ``tissue_expression_summary`` and
    ``subcellular_expression_summary`` are ever filled in; the remaining keys
    are returned with their empty defaults.
    """
    overview = {
        "tissue_specificity": "",
        "subcellular_location": [],
        "protein_class": [],
        "predicted_location": "",
        "tissue_expression_summary": "",
        "subcellular_expression_summary": "",
    }

    location_node = root.find(".//predictedLocation")
    if location_node is not None:
        overview["predicted_location"] = location_node.text or ""

    # For each section, use the <summary> child of its first occurrence.
    summary_sources = (
        (".//tissueExpression", "tissue_expression_summary"),
        (".//subcellularExpression", "subcellular_expression_summary"),
    )
    for section_path, target_key in summary_sources:
        section_node = root.find(section_path)
        if section_node is None:
            continue
        text_node = section_node.find("summary")
        if text_node is not None:
            overview[target_key] = text_node.text or ""

    return overview
1547
+
1548
def _extract_tissue_expression(self, root: ET.Element) -> List[Dict[str, Any]]:
    """List per-tissue expression records found under ``tissueExpression``.

    Each ``data`` element below a ``tissueExpression`` node that has a
    direct ``tissue`` child yields one record. ``data`` elements without a
    ``tissue`` child are ignored.

    Args:
        root: Parsed XML tree to search.

    Returns:
        List of dicts with ``tissue_name`` (tissue text), ``organ`` (the
        tissue's ``organ`` attribute, "" if absent) and ``expression_level``
        ("<type>: <text>" built from the ``level`` child, or "" when there
        is no ``level`` child).
    """
    records: List[Dict[str, Any]] = []

    for section in root.findall(".//tissueExpression"):
        for entry in section.findall(".//data"):
            tissue = entry.find("tissue")
            if tissue is None:
                continue

            level = entry.find("level")
            level_text = ""
            if level is not None:
                level_text = ": ".join((level.get("type", ""), level.text or ""))

            records.append(
                {
                    "tissue_name": tissue.text or "",
                    "organ": tissue.get("organ", ""),
                    "expression_level": level_text,
                }
            )

    return records
1573
+
1574
def _extract_cell_line_expression(self, root: ET.Element) -> List[Dict[str, Any]]:
    """Extract per-cell-line records from ``subcellularExpression`` sections.

    For every ``data`` element below a ``subcellularExpression`` node that
    has a direct ``cellLine`` child, the other direct children of that
    ``data`` element are recorded as the cell line's expression data.

    Args:
        root: Parsed XML tree to search.

    Returns:
        List of dicts with ``cell_line_name`` (the ``name`` attribute of
        ``cellLine``, falling back to its text) and ``expression_data``, a
        list of ``{"type": tag, "value": stripped text, "attributes": {...}}``
        dicts, one per sibling element. Cell lines with no sibling data are
        omitted.
    """
    cell_line_data: List[Dict[str, Any]] = []

    for subcell_expr in root.findall(".//subcellularExpression"):
        for data_elem in subcell_expr.findall(".//data"):
            cell_line_elem = data_elem.find("cellLine")
            if cell_line_elem is None:
                continue

            # Previously this list was created empty and never filled, so the
            # emptiness guard below rejected every cell line and the method
            # always returned [].
            # NOTE(review): siblings are captured generically because the
            # exact HPA schema for them is not visible here - confirm the
            # desired record shape.
            expression_data = [
                {
                    "type": child.tag,
                    "value": (child.text or "").strip(),
                    "attributes": dict(child.attrib),
                }
                for child in data_elem
                if child.tag != "cellLine"
            ]
            if not expression_data:
                continue

            cell_line_data.append(
                {
                    "cell_line_name": cell_line_elem.get("name", "")
                    or (cell_line_elem.text or ""),
                    "expression_data": expression_data,
                }
            )

    return cell_line_data
1593
+
1594
+
1595
+ # --- Legacy/Compatibility Tools ---
1596
+
1597
+
1598
@register_tool("HPAGetGeneJSONTool")
class HPAGetGeneJSONTool(HPAJsonApiTool):
    """
    Legacy-compatible tool: basic gene information looked up by Ensembl Gene ID.

    The lookup goes through ``self._make_api_request`` (the JSON API path,
    per the original note, rather than the search API) and the result is
    trimmed to a fixed set of fields.
    """

    # Fields copied from the API payload into the response, in output order.
    _RESPONSE_FIELDS = (
        "Gene",
        "Gene synonym",
        "Uniprot",
        "Biological process",
        "RNA tissue specific nTPM",
    )

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return the selected gene fields for ``arguments["ensembl_id"]``.

        Returns an ``{"error": ...}`` dict when ``ensembl_id`` is missing or
        empty, and passes the API result through unchanged when it contains
        an ``"error"`` key. Fields absent from the payload default to "".
        """
        gene_id = arguments.get("ensembl_id")
        if not gene_id:
            return {"error": "Parameter 'ensembl_id' is required"}

        payload = self._make_api_request(gene_id)
        if "error" in payload:
            return payload

        # Keep the legacy response layout: the requested ID first, then the
        # selected payload fields.
        response: Dict[str, Any] = {"Ensembl": gene_id}
        for field_name in self._RESPONSE_FIELDS:
            response[field_name] = payload.get(field_name, "")
        return response
1625
+
1626
+
1627
@register_tool("HPAGetGeneXMLTool")
class HPAGetGeneXMLTool(HPASearchApiTool):
    """
    Legacy tool: returns gene data in TSV format (offered as an alternative to XML).
    """

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch TSV data for ``arguments["ensembl_id"]``.

        Returns an ``{"error": ...}`` dict when ``ensembl_id`` is missing or
        empty, passes the API result through unchanged when it contains an
        ``"error"`` key, and otherwise returns ``{"tsv_data": ...}`` (""
        when the API result has no ``tsv_data`` entry).
        """
        gene_id = arguments.get("ensembl_id")
        if not gene_id:
            return {"error": "Parameter 'ensembl_id' is required"}

        # Column codes requested from the API.
        # NOTE(review): their meaning is defined by the HPA API, not visible here.
        requested_columns = "g,gs,up,upbp,rnatsm,cell_RNA_a549,cell_RNA_hela"
        api_result = self._make_api_request(
            gene_id, requested_columns, format_type="tsv"
        )

        if "error" in api_result:
            return api_result

        tsv_payload = api_result.get("tsv_data", "")
        return {"tsv_data": tsv_payload}