spacr 0.2.1__py3-none-any.whl → 0.2.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. spacr/gui.py +2 -1
  2. spacr/gui_elements.py +2 -7
  3. spacr/resources/icons/abort.png +0 -0
  4. spacr/resources/icons/classify.png +0 -0
  5. spacr/resources/icons/make_masks.png +0 -0
  6. spacr/resources/icons/mask.png +0 -0
  7. spacr/resources/icons/measure.png +0 -0
  8. spacr/resources/icons/recruitment.png +0 -0
  9. spacr/resources/icons/regression.png +0 -0
  10. spacr/resources/icons/run.png +0 -0
  11. spacr/resources/icons/umap.png +0 -0
  12. {spacr-0.2.1.dist-info → spacr-0.2.21.dist-info}/METADATA +1 -1
  13. spacr-0.2.21.dist-info/RECORD +56 -0
  14. spacr/alpha.py +0 -807
  15. spacr/annotate_app.py +0 -670
  16. spacr/annotate_app_v2.py +0 -670
  17. spacr/app_make_masks_v2.py +0 -686
  18. spacr/classify_app.py +0 -201
  19. spacr/cli.py +0 -41
  20. spacr/foldseek.py +0 -779
  21. spacr/get_alfafold_structures.py +0 -72
  22. spacr/gui_2.py +0 -157
  23. spacr/gui_annotate.py +0 -145
  24. spacr/gui_classify_app.py +0 -201
  25. spacr/gui_make_masks_app.py +0 -927
  26. spacr/gui_make_masks_app_v2.py +0 -688
  27. spacr/gui_mask_app.py +0 -249
  28. spacr/gui_measure_app.py +0 -246
  29. spacr/gui_run.py +0 -58
  30. spacr/gui_sim_app.py +0 -0
  31. spacr/gui_wrappers.py +0 -149
  32. spacr/icons/abort.png +0 -0
  33. spacr/icons/abort.svg +0 -1
  34. spacr/icons/download.png +0 -0
  35. spacr/icons/download.svg +0 -1
  36. spacr/icons/download_for_offline_100dp_E8EAED_FILL0_wght100_GRAD-25_opsz48.png +0 -0
  37. spacr/icons/download_for_offline_100dp_E8EAED_FILL0_wght100_GRAD-25_opsz48.svg +0 -1
  38. spacr/icons/logo_spacr.png +0 -0
  39. spacr/icons/make_masks.png +0 -0
  40. spacr/icons/make_masks.svg +0 -1
  41. spacr/icons/map_barcodes.png +0 -0
  42. spacr/icons/map_barcodes.svg +0 -1
  43. spacr/icons/mask.png +0 -0
  44. spacr/icons/mask.svg +0 -1
  45. spacr/icons/measure.png +0 -0
  46. spacr/icons/measure.svg +0 -1
  47. spacr/icons/play_circle_100dp_E8EAED_FILL0_wght100_GRAD-25_opsz48.png +0 -0
  48. spacr/icons/play_circle_100dp_E8EAED_FILL0_wght100_GRAD-25_opsz48.svg +0 -1
  49. spacr/icons/run.png +0 -0
  50. spacr/icons/run.svg +0 -1
  51. spacr/icons/sequencing.png +0 -0
  52. spacr/icons/sequencing.svg +0 -1
  53. spacr/icons/settings.png +0 -0
  54. spacr/icons/settings.svg +0 -1
  55. spacr/icons/settings_100dp_E8EAED_FILL0_wght100_GRAD-25_opsz48.png +0 -0
  56. spacr/icons/settings_100dp_E8EAED_FILL0_wght100_GRAD-25_opsz48.svg +0 -1
  57. spacr/icons/stop_circle_100dp_E8EAED_FILL0_wght100_GRAD-25_opsz48.png +0 -0
  58. spacr/icons/stop_circle_100dp_E8EAED_FILL0_wght100_GRAD-25_opsz48.svg +0 -1
  59. spacr/icons/theater_comedy_100dp_E8EAED_FILL0_wght100_GRAD200_opsz48.png +0 -0
  60. spacr/icons/theater_comedy_100dp_E8EAED_FILL0_wght100_GRAD200_opsz48.svg +0 -1
  61. spacr/make_masks_app.py +0 -929
  62. spacr/make_masks_app_v2.py +0 -688
  63. spacr/mask_app.py +0 -249
  64. spacr/measure_app.py +0 -246
  65. spacr/models/cp/toxo_plaque_cyto_e25000_X1120_Y1120.CP_model +0 -0
  66. spacr/models/cp/toxo_plaque_cyto_e25000_X1120_Y1120.CP_model_settings.csv +0 -23
  67. spacr/models/cp/toxo_pv_lumen.CP_model +0 -0
  68. spacr/old_code.py +0 -358
  69. spacr/resources/icons/abort.svg +0 -1
  70. spacr/resources/icons/annotate.svg +0 -1
  71. spacr/resources/icons/classify.svg +0 -1
  72. spacr/resources/icons/download.svg +0 -1
  73. spacr/resources/icons/icon.psd +0 -0
  74. spacr/resources/icons/make_masks.svg +0 -1
  75. spacr/resources/icons/map_barcodes.svg +0 -1
  76. spacr/resources/icons/mask.svg +0 -1
  77. spacr/resources/icons/measure.svg +0 -1
  78. spacr/resources/icons/run.svg +0 -1
  79. spacr/resources/icons/run_2.png +0 -0
  80. spacr/resources/icons/run_2.svg +0 -1
  81. spacr/resources/icons/sequencing.svg +0 -1
  82. spacr/resources/icons/settings.svg +0 -1
  83. spacr/resources/icons/train_cellpose.svg +0 -1
  84. spacr/test_gui.py +0 -0
  85. spacr-0.2.1.dist-info/RECORD +0 -126
  86. /spacr/resources/icons/{cellpose.png → cellpose_all.png} +0 -0
  87. {spacr-0.2.1.dist-info → spacr-0.2.21.dist-info}/LICENSE +0 -0
  88. {spacr-0.2.1.dist-info → spacr-0.2.21.dist-info}/WHEEL +0 -0
  89. {spacr-0.2.1.dist-info → spacr-0.2.21.dist-info}/entry_points.txt +0 -0
  90. {spacr-0.2.1.dist-info → spacr-0.2.21.dist-info}/top_level.txt +0 -0
spacr/foldseek.py DELETED
@@ -1,779 +0,0 @@
-import os, shutil, subprocess, tarfile, requests
-import numpy as np
-import pandas as pd
-from scipy.stats import fisher_exact
-from statsmodels.stats.multitest import multipletests
-from concurrent.futures import ProcessPoolExecutor, as_completed
-import seaborn as sns
-import matplotlib.pyplot as plt
-from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
-
-def run_command(command):
-    print(f"Executing: {command}")
-    result = subprocess.run(command, shell=True, capture_output=True, text=True)
-    if result.returncode != 0:
-        print(f"Error running command: {command}")
-        print(result.stdout)
-        print(result.stderr)
-        return False
-    return True
-
-def add_headers_and_save_csv(input_tsv_path, output_csv_path, results_dir):
-
-    headers = [
-        'query', 'target', 'fident', 'alnlen', 'mismatch', 'gapopen',
-        'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits'
-    ]
-
-    # Rename the aln_tmscore file to have a .tsv extension if it doesn't already
-    input_tsv_path = f"{results_dir}/aln_tmscore"
-    if not input_tsv_path.endswith('.tsv'):
-        os.rename(input_tsv_path, input_tsv_path + '.tsv')
-        input_tsv_path += '.tsv'
-
-    # Read the TSV file into a DataFrame
-    df = pd.read_csv(input_tsv_path, sep='\t', header=None)
-
-    # Assign headers to the DataFrame
-    df.columns = headers
-
-    # Save the DataFrame as a CSV file
-    df.to_csv(output_csv_path, index=False)
-    print(f"File saved as {output_csv_path}")
-
-def generate_database(path, base_dir, mode='file'):
-    structures_dir = f'{base_dir}/structures'
-    os.makedirs(structures_dir, exist_ok=True)
-
-    if mode == 'tar':
-        if os.path.exists(structures_dir) and not os.listdir(structures_dir):
-            if not os.path.exists(path):
-                print(f"Structure tar file {path} not found.")
-            else:
-                tar = tarfile.open(path)
-                tar.extractall(path=structures_dir)
-                tar.close()
-        if not run_command(f"foldseek createdb {structures_dir} {structures_dir}/structures_db"):
-            raise Exception("Failed to create structures database.")
-
-    if mode == 'file':
-        if os.path.exists(structures_dir) and not os.listdir(structures_dir):
-            if not os.path.exists(path):
-                print(f"Structure folder {path} not found.")
-            else:
-                for file in os.listdir(path):
-                    file_path = os.path.join(path, file)
-                    new_path = os.path.join(structures_dir, file)
-                    #print(path)
-                    #print(structures_dir)
-                    shutil.copy(file_path, new_path)
-
-        if not run_command(f"foldseek createdb {structures_dir} {structures_dir}/structures_db"):
-            raise Exception("Failed to create structures database.")
-    return structures_dir
-
-def align_to_database(structure_fldr_path, base_dir='/home/carruthers/foldseek', cores=25):
-
-    databases_dir = f'{base_dir}/foldseek_databases'
-    results_dir = f'{base_dir}/results'
-    tmp_dir = f'{base_dir}/tmp'
-
-    os.makedirs(databases_dir, exist_ok=True)
-    os.makedirs(results_dir, exist_ok=True)
-    os.makedirs(tmp_dir, exist_ok=True)
-
-    # Check and download PDB database if not exists
-    pdb_db_path = os.path.join(databases_dir, "pdb")
-    if not os.path.exists(pdb_db_path):
-        print("Downloading PDB database...")
-        if not run_command(f"foldseek databases PDB {pdb_db_path} {tmp_dir}"):
-            raise Exception("Failed to download PDB database.")
-
-    # Check and download AlphaFold database if not exists
-    afdb_db_path = os.path.join(databases_dir, "afdb")
-    if not os.path.exists(afdb_db_path):
-        print("Downloading AlphaFold database...")
-        if not run_command(f"foldseek databases Alphafold/Proteome {afdb_db_path} {tmp_dir}"):
-            raise Exception("Failed to download AlphaFold database.")
-
-    structures_dir = generate_database(structure_fldr_path, base_dir, mode='file')
-
-    for i, targetDB in enumerate([pdb_db_path, afdb_db_path]):
-
-        if i == 0:
-            results_dir = os.path.join(base_dir, 'results', "pdb")
-            os.makedirs(results_dir, exist_ok=True)
-            print("Running Foldseek on PDB...")
-        if i == 1:
-            results_dir = os.path.join(base_dir, 'results', "afdb")
-            os.makedirs(results_dir, exist_ok=True)
-            print("Running Foldseek on AFdb...")
-
-        aln_tmscore = f"{results_dir}/aln_tmscore"
-        aln_tmscore_tsv = f"{results_dir}/aln_tmscore.tsv"
-
-        queryDB = f"{structures_dir}/structures_db"
-        targetDB = pdb_db_path
-        aln = f"{results_dir}/results"
-
-        if not run_command(f"foldseek search {queryDB} {targetDB} {aln} {tmp_dir} -a --threads {cores}"):
-            raise Exception("Foldseek search against PDB failed.")
-
-        if not run_command(f"foldseek aln2tmscore {queryDB} {targetDB} {aln} {aln_tmscore} --threads {cores}"):
-            raise Exception("Foldseek aln2tmscore against PDB failed.")
-
-
-        output_format = "query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits"
-
-        if not run_command(f"foldseek createtsv {queryDB} {targetDB} {aln} {aln_tmscore} {aln_tmscore_tsv} --format-output {output_format}"):
-            raise Exception("Foldseek createtsv against PDB failed.")
-
-        input_tsv_path = f"{results_dir}/aln_tmscore"
-        output_csv_path = f"{results_dir}/aln_tmscore.csv"
-
-        # Call the function with the path to your TSV file and the output CSV file path
-        add_headers_and_save_csv(input_tsv_path, output_csv_path, results_dir)
-
-def check_uniprot_structure(uniprot_id):
-    import requests
-    base_url = "https://www.ebi.ac.uk/proteins/api/proteins"
-    headers = {"Accept": "application/json"}
-    response = requests.get(f"{base_url}/{uniprot_id}", headers=headers)
-    if response.status_code == 200:
-        data = response.json()
-        print(data) # Print the whole JSON to examine its structure
-    else:
-        print(f"Failed to retrieve data for {uniprot_id}: {response.status_code}")
-
-def get_ec_numbers(data):
-    try:
-        # Navigate through the nested structure with checks at each step
-        protein_info = data.get('protein', {})
-        recommended_name = protein_info.get('recommendedName', {})
-        ec_numbers = recommended_name.get('ecNumber', [])
-
-        # Extract the 'value' field from each EC number entry
-        return ", ".join(ec['value'] for ec in ec_numbers if 'value' in ec)
-    except Exception as e:
-        print(f"Failed to extract EC numbers: {str(e)}")
-        return ""
-
-def process_protein_data(data, verbose=False):
-    if data is None:
-        return None
-
-    uniprot_id = data.get('accession')
-    protein_data = {}
-    protein_data[uniprot_id] = {
-        'UniProt ID': uniprot_id,
-        'Entry Name': data.get('id'),
-        'Organism': next((name['value'] for name in data.get('organism', {}).get('names', []) if name['type'] == 'scientific'), None),
-        #'Taxonomic Lineage': ", ".join(data.get('organism', {}).get('lineage', [])),
-        'Taxonomy ID': data.get('organism', {}).get('taxonomy'),
-        'Sequence Length': data.get('sequence', {}).get('length'),
-        #'EC Number': ", ".join([ec['value'] for ec in data.get('protein', {}).get('recommendedName', {}).get('ecNumber', [])]),
-        'EC Number': get_ec_numbers(data),
-        'Function': "; ".join([func['text'][0]['value'] for func in data.get('comments', []) if func['type'] == 'FUNCTION']),
-        'Recommended Name': data.get('protein', {}).get('recommendedName', {}).get('fullName', {}).get('value', ''),
-        'Alternative Names': "; ".join([alt['fullName']['value'] for alt in data.get('protein', {}).get('alternativeName', [])]),
-        'GO Biological Process': [],
-        'GO Cellular Component': [],
-        'GO Molecular Function': [],
-        'GO IDs': [],
-        'KEGG': [],
-        'OrthoDB': [],
-        'Sequence': data.get('sequence', {}).get('sequence', ''),
-        'Family and Domains': {},
-        'Catalytic Activity': "; ".join([cat['reaction']['name'] for cat in data.get('comments', []) if cat['type'] == 'CATALYTIC_ACTIVITY']),
-        'Cofactor': "; ".join([cof['cofactors'][0]['name'] for cof in data.get('comments', []) if cof['type'] == 'COFACTOR']),
-        'Enzyme Regulation': "; ".join([reg['text'][0]['value'] for reg in data.get('comments', []) if reg['type'] == 'ENZYME_REGULATION']),
-        'Disease Association': "; ".join([dis['text'][0]['value'] for dis in data.get('comments', []) if dis['type'] == 'DISEASE']),
-        'Interaction Partners': "; ".join([inter['id'] for inter in data.get('dbReferences', []) if inter['type'] == 'InterPro'])
-    }
-
-    # Subcellular Location processing
-    protein_data[uniprot_id].update({
-        'sub_loc_Intermembrane': "",
-        'sub_loc_Topological_Domain': "",
-        'sub_loc_Subcellular_Location': "",
-        'sub_loc_Transmembrane': ""
-    })
-
-    for loc in data.get('comments', []):
-        if loc['type'] == 'SUBCELLULAR_LOCATION':
-            for component in loc.get('locations', []):
-                if 'topology' in component:
-                    protein_data[uniprot_id]['sub_loc_Topological_Domain'] += component['topology']['value'] + "; "
-                if 'orientation' in component:
-                    protein_data[uniprot_id]['sub_loc_Intermembrane'] += component['orientation']['value'] + "; "
-                if 'location' in component:
-                    protein_data[uniprot_id]['sub_loc_Subcellular_Location'] += component['location']['value'] + "; "
-                if 'subcellularLocation' in component:
-                    protein_data[uniprot_id]['sub_loc_Transmembrane'] += component['subcellularLocation']['value'] + "; "
-
-    # Initialize PTM/Processing details
-    ptms = set(ptm['type'] for ptm in data.get('features', []) if ptm['category'] == 'PTM')
-    for ptm in ptms:
-        protein_data[uniprot_id][ptm] = []
-
-    # Process each PTM type
-    for ptm in data.get('features', []):
-        if ptm['category'] == 'PTM' and ptm['type'] in protein_data[uniprot_id]:
-            ptm_description = ptm.get('description', '')
-            ptm_details = f"{ptm_description} (positions {ptm.get('begin')} to {ptm.get('end')})"
-            protein_data[uniprot_id][ptm['type']].append(ptm_details)
-
-    # Gene Ontology Annotations
-    for go in data.get('dbReferences', []):
-        if go['type'] == 'GO' and 'properties' in go:
-            go_term = go['properties']['term']
-            if go_term.startswith('P:'):
-                protein_data[uniprot_id]['GO Biological Process'].append(go_term[2:])
-            elif go_term.startswith('C:'):
-                protein_data[uniprot_id]['GO Cellular Component'].append(go_term[2:])
-            elif go_term.startswith('F:'):
-                protein_data[uniprot_id]['GO Molecular Function'].append(go_term[2:])
-            protein_data[uniprot_id]['GO IDs'].append(go['id'])
-
-    # External sources
-    for xref in data.get('dbReferences', []):
-        if xref['type'] == 'KEGG':
-            protein_data[uniprot_id]['KEGG'].append(xref['id'])
-        elif xref['type'] == 'OrthoDB':
-            protein_data[uniprot_id]['OrthoDB'].append(xref['id'])
-
-    # Initialize Family and Domains from 'features'
-    for feature in data.get('features', []):
-        if feature['type'] in ['DOMAIN', 'MOTIF', 'REGION']:
-            domain_key = f"{feature['type']} {feature.get('description', 'N/A')}"
-            if domain_key not in protein_data[uniprot_id]:
-                protein_data[uniprot_id][domain_key] = f"Positions {feature.get('begin')} to {feature.get('end')}"
-    if verbose:
-        print(protein_data)
-    return protein_data
-
-def fetch_data_for_uniprot_id(uniprot_id):
-    """ Fetch data for a single UniProt ID from the UniProt API. """
-    base_url = "https://www.ebi.ac.uk/proteins/api/proteins"
-    headers = {"Accept": "application/json"}
-    request_url = f"{base_url}/{uniprot_id}"
-    response = requests.get(request_url, headers=headers)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        print(f"Failed to retrieve data for {uniprot_id}: {response.status_code}")
-        return None
-
-def fetch_and_aggregate_functional_data(uniprot_ids, num_workers=4):
-    """
-    Fetch and process functional data for a list of UniProt IDs using multiple processes.
-    """
-    # Create a process pool to fetch data asynchronously
-    with ProcessPoolExecutor(max_workers=num_workers) as executor:
-        # Map each UniProt ID to a future object responsible for fetching and processing its data
-        future_to_uniprot = {executor.submit(fetch_data_for_uniprot_id, uid): uid for uid in uniprot_ids}
-
-        # Dictionary to hold processed protein data
-        protein_data = {}
-
-        # Collect results as they are completed
-        for future in as_completed(future_to_uniprot):
-            data = future.result()
-            if data:
-                processed_data = process_protein_data(data)
-                if processed_data:
-                    # Each key in processed_data should be a UniProt ID and the value a dictionary of attributes
-                    protein_data.update(processed_data) # Merge the processed data into the main dictionary
-
-    # Convert the accumulated dictionary into a pandas DataFrame
-    df = pd.DataFrame.from_dict(protein_data, orient='index')
-
-    return df
-
-def get_unique_uniprot_ids(mapping):
-    # Extract all UniProt IDs from the mapping
-    all_uniprot_ids = set(mapping.values()) # This gets all the unique values (UniProt IDs)
-    return list(all_uniprot_ids)
-
-def pdb_to_uniprot(pdb_chain_map = {}):
-
-    import re, time, json, zlib, requests
-    from xml.etree import ElementTree
-    from urllib.parse import urlparse, parse_qs, urlencode
-    from requests.adapters import HTTPAdapter, Retry
-
-    POLLING_INTERVAL = 3
-    API_URL = "https://rest.uniprot.org"
-    retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
-    session = requests.Session()
-    session.mount("https://", HTTPAdapter(max_retries=retries))
-
-    # The maximum number of IDs we can submit in one request
-    MAX_IDS_PER_REQUEST = 90000
-
-    def check_response(response):
-        try:
-            response.raise_for_status()
-        except requests.HTTPError:
-            print(response.json())
-            raise
-
-    def submit_id_mapping(from_db, to_db, ids):
-        request = requests.post(
-            f"{API_URL}/idmapping/run",
-            data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
-        )
-        check_response(request)
-        return request.json()["jobId"]
-
-    def get_next_link(headers):
-        re_next_link = re.compile(r'<(.+)>; rel="next"')
-        if "Link" in headers:
-            match = re_next_link.match(headers["Link"])
-            if match:
-                return match.group(1)
-
-    def check_id_mapping_results_ready(job_id):
-        while True:
-            request = session.get(f"{API_URL}/idmapping/status/{job_id}")
-            check_response(request)
-            j = request.json()
-            if "jobStatus" in j:
-                if j["jobStatus"] == "RUNNING":
-                    print(f"Retrying in {POLLING_INTERVAL}s")
-                    time.sleep(POLLING_INTERVAL)
-                else:
-                    raise Exception(j["jobStatus"])
-            else:
-                return bool(j["results"] or j["failedIds"])
-
-    def get_batch(batch_response, file_format, compressed):
-        batch_url = get_next_link(batch_response.headers)
-        while batch_url:
-            batch_response = session.get(batch_url)
-            batch_response.raise_for_status()
-            yield decode_results(batch_response, file_format, compressed)
-            batch_url = get_next_link(batch_response.headers)
-
-    def combine_batches(all_results, batch_results, file_format):
-        if file_format == "json":
-            for key in ("results", "failedIds"):
-                if key in batch_results and batch_results[key]:
-                    all_results[key] += batch_results[key]
-        elif file_format == "tsv":
-            return all_results + batch_results[1:]
-        else:
-            return all_results + batch_results
-        return all_results
-
-    def get_id_mapping_results_link(job_id):
-        url = f"{API_URL}/idmapping/details/{job_id}"
-        request = session.get(url)
-        check_response(request)
-        return request.json()["redirectURL"]
-
-    def decode_results(response, file_format, compressed):
-        if compressed:
-            decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
-            if file_format == "json":
-                j = json.loads(decompressed.decode("utf-8"))
-                return j
-            elif file_format == "tsv":
-                return [line for line in decompressed.decode("utf-8").split("\n") if line]
-            elif file_format == "xlsx":
-                return [decompressed]
-            elif file_format == "xml":
-                return [decompressed.decode("utf-8")]
-            else:
-                return decompressed.decode("utf-8")
-        elif file_format == "json":
-            return response.json()
-        elif file_format == "tsv":
-            return [line for line in response.text.split("\n") if line]
-        elif file_format == "xlsx":
-            return [response.content]
-        elif file_format == "xml":
-            return [response.text]
-        return response.text
-
-    def get_xml_namespace(element):
-        m = re.match(r"\{(.*)\}", element.tag)
-        return m.groups()[0] if m else ""
-
-    def merge_xml_results(xml_results):
-        merged_root = ElementTree.fromstring(xml_results[0])
-        for result in xml_results[1:]:
-            root = ElementTree.fromstring(result)
-            for child in root.findall("{http://uniprot.org/uniprot}entry"):
-                merged_root.insert(-1, child)
-        ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
-        return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)
-
-    def print_progress_batches(batch_index, size, total):
-        n_fetched = min((batch_index + 1) * size, total)
-        print(f"Fetched: {n_fetched} / {total}")
-
-    def get_id_mapping_results_search(url):
-        parsed = urlparse(url)
-        query = parse_qs(parsed.query)
-        file_format = query["format"][0] if "format" in query else "json"
-        if "size" in query:
-            size = int(query["size"][0])
-        else:
-            size = 500
-            query["size"] = size
-        compressed = (
-            query["compressed"][0].lower() == "true" if "compressed" in query else False
-        )
-        parsed = parsed._replace(query=urlencode(query, doseq=True))
-        url = parsed.geturl()
-        request = session.get(url)
-        check_response(request)
-        results = decode_results(request, file_format, compressed)
-        total = int(request.headers["x-total-results"])
-        print_progress_batches(0, size, total)
-        for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
-            results = combine_batches(results, batch, file_format)
-            print_progress_batches(i, size, total)
-        if file_format == "xml":
-            return merge_xml_results(results)
-        return results
-
-    def get_id_mapping_results_stream(url):
-        if "/stream/" not in url:
-            url = url.replace("/results/", "/results/stream/")
-        request = session.get(url)
-        check_response(request)
-        parsed = urlparse(url)
-        query = parse_qs(parsed.query)
-        file_format = query["format"][0] if "format" in query else "json"
-        compressed = (
-            query["compressed"][0].lower() == "true" if "compressed" in query else False
-        )
-        return decode_results(request, file_format, compressed)
-
-    def extract_uniprot_names(results):
-        uniprot_mapping = {}
-        for result in results.get('results', []):
-            pdb_name = result['from']
-            #print(result['to'])
-            #time.sleep(1)
-            uniprot_name = result['to'].get('primaryAccession', '') #uniProtkbId
-            if uniprot_name:
-                uniprot_mapping[pdb_name] = uniprot_name
-        return uniprot_mapping
-
-    def chunks(lst, n):
-        """Yield successive n-sized chunks from lst."""
-        for i in range(0, len(lst), n):
-            yield lst[i:i + n]
-
-    uniprot_names = {}
-    formatted_ids = [f"{pdb_id}:{chain}" for pdb_id, chain in pdb_chain_map.items()]
-
-    # Iterate over each chunk of formatted IDs and submit separate jobs
-    for formatted_ids_chunk in chunks(formatted_ids, MAX_IDS_PER_REQUEST):
-        #print('chunk',formatted_ids_chunk)
-        job_id = submit_id_mapping("PDB", "UniProtKB", formatted_ids_chunk)
-        #accession, UniProtKB
-        if check_id_mapping_results_ready(job_id):
-            link = get_id_mapping_results_link(job_id)
-            results = get_id_mapping_results_search(link)
-            uniprot_names.update(extract_uniprot_names(results))
-    return uniprot_names
-
-def functionally_annotate_foldseek_hits(csv_file_path, num_workers=25, limit=None, threshold=None):
-
-    foldseek_df = pd.read_csv(csv_file_path)
-
-    if not threshold is None:
-        foldseek_df = foldseek_df[foldseek_df['evalue'] < threshold]
-
-    if not limit is None:
-        foldseek_df = foldseek_df.sample(n=limit)
-
-    # Extract PDB IDs and chain and convert them to uppercase
-    foldseek_df['target_pdbID'] = foldseek_df['target'].str.split('-').str[0].str.upper()
-    foldseek_df['chain'] = foldseek_df['target'].str.split('_').str[-1]
-    unique_pdb_ids = dict(zip(foldseek_df['target_pdbID'], foldseek_df['chain']))
-
-    print(f'Found {len(unique_pdb_ids)} unique target proteins')
-
-    # Fetch UniProt mapping for the unique PDB IDs
-    unique_pdb_mapping = pdb_to_uniprot(unique_pdb_ids)
-    #print(unique_pdb_mapping)
-
-    # Map the target PDB IDs and chains to UniProt IDs using the unique_pdb_mapping
-    foldseek_df['target_uniprotID'] = foldseek_df.apply(
-        lambda row: unique_pdb_mapping.get(f"{row['target_pdbID']}:{row['chain']}", pd.NA),
-        axis=1
-    )
-
-    #display(foldseek_df)
-    #display(unique_pdb_mapping)
-    unique_pdb_ids = get_unique_uniprot_ids(unique_pdb_mapping)
-    #print(unique_pdb_ids)
-    target_metadata_df = fetch_and_aggregate_functional_data(unique_pdb_ids, num_workers=20)
-    #display(target_metadata_df)
-    merged_df = pd.merge(foldseek_df, target_metadata_df, left_on='target_uniprotID', right_on='UniProt ID')
-    return merged_df
-
-def _analyze_group(args):
-    group, total, feature_columns, query = args
-    results = []
-    group_total = group.shape[0]
-    for feature in feature_columns:
-        try:
-            all_features = set(group[feature].explode().dropna().unique())
-        except TypeError:
-            all_features = set(group[feature].dropna().apply(lambda x: x if isinstance(x, list) else [x]).explode().unique())
-
-        for specific_feature in all_features:
-            observed_present = group[feature].apply(lambda x: specific_feature in x if isinstance(x, list) else specific_feature == x).sum()
-            observed_absent = group_total - observed_present
-            expected_present = group[feature].apply(lambda x: specific_feature in x if isinstance(x, list) else specific_feature == x).sum()
-            expected_absent = total - expected_present
-
-            contingency_table = [[observed_present, observed_absent], [expected_present, expected_absent]]
-            odds_ratio, p_value = fisher_exact(contingency_table, 'greater')
-
-            results.append({
-                'query': query,
-                'feature': specific_feature,
-                'p_value': p_value,
-                'category': feature
-            })
-    return results
-
-def perform_enrichment_analysis(df, num_workers=4):
-
-    exclude_columns = [
-        'query', 'target', 'fident', 'alnlen', 'mismatch', 'gapopen', 'qstart', 'qend',
-        'tstart', 'tend', 'evalue', 'bits', 'target_pdbID', 'target_uniprotID', 'UniProt ID',
-        'Entry Name', 'Organism', 'Taxonomy ID', 'Sequence Length', 'Sequence', 'EC Number', 'Function',
-        'Recommended Name', 'Alternative Names'
-    ]
-    feature_columns = df.columns.difference(exclude_columns)
-    total = df.shape[0]
-
-    with ProcessPoolExecutor(max_workers=num_workers) as executor:
-        future_to_group = {executor.submit(_analyze_group, (group, total, feature_columns, query)): query for query, group in df.groupby('query')}
-        results = []
-        for future in as_completed(future_to_group):
-            results.extend(future.result())
-
-    results_df = pd.DataFrame(results)
-    correction_method = 'fdr_bh'
-    p_adjust = multipletests(results_df['p_value'], method=correction_method)
-    results_df['adjusted_p_value'] = p_adjust[1]
-
-    return results_df
-
-def compare_features(enrichment_results, verbose=False):
-
-    # Check feature matches
-    def check_feature_match(row):
-        category = row['category']
-        feature = row['feature']
-        # Check if the category column exists in the DataFrame
-        if category in protein_data_df.columns:
-            # Flatten the list if it's not already a scalar
-            values = row[category]
-            if verbose:
-                print(f'category:{category}, feature:{feature}, values:{values}')
-            if isinstance(values, list) or isinstance(values, np.ndarray):
-                if any(pd.isna(values)):
-                    return np.nan
-                else:
-                    # Check if the feature is within the list of values
-                    return 1 if feature in values else 0
-            else:
-                # Direct comparison if it's scalar
-                if pd.isna(values):
-                    return np.nan
-                return 1 if feature == values else 0
-        else:
-            print(f'Could not find {category} in columns')
-            return np.nan
-
-    # Assuming the format 'something-UniProtID' in the 'query' column
-    enrichment_results['UniProt ID'] = enrichment_results['query'].str.split('-').str[1]
-
-    # Get unique UniProt IDs
-    uniprot_ids = enrichment_results['UniProt ID'].unique().tolist()
-
-    # Fetch data for these UniProt IDs
-    protein_data_df = fetch_and_aggregate_functional_data(uniprot_ids)
-
-    # Assuming the fetched protein_data_df is indexed by 'UniProt ID', merge it
-    comparison_df = pd.merge(enrichment_results, protein_data_df, on='UniProt ID', how='left')
-
-    # Filter significant features
-    significant_features = comparison_df[comparison_df['adjusted_p_value'] < 0.05]
-
-    # Apply the checking function to each row
-    significant_features['comparison'] = significant_features.apply(check_feature_match, axis=1)
-
-    return significant_features
-
-def calculate_feature_metrics(comparison_df):
-    # Drop rows where comparison is NaN
-    filtered_df = comparison_df.dropna(subset=['comparison'])
-
-    # Convert 'comparison' to integer for metrics calculation
-    filtered_df['comparison'] = filtered_df['comparison'].astype(int)
-
-    # Initialize dictionary to store metrics by category and feature
-    metrics = []
-
-    # Group by category and feature for detailed metrics
-    grouped = filtered_df.groupby(['category', 'feature'])
-    for (category, feature), group in grouped:
-        # True labels are 'comparison', predictions assume 1 if 'comparison' > 0 (already true for 1 and 0)
-        true_labels = group['comparison']
-        pred_labels = (group['comparison'] > 0).astype(int) # Prediction: 1 if comparison > 0, else 0
-
-        # Calculating precision, recall, F1-score, and accuracy
-        precision = precision_score(true_labels, pred_labels, zero_division=0)
-        recall = recall_score(true_labels, pred_labels, zero_division=0)
-        f1 = f1_score(true_labels, pred_labels, zero_division=0)
-        accuracy = accuracy_score(true_labels, pred_labels)
-
-        # Append results to metrics list
-        metrics.append({
-            'category': category,
-            'feature': feature,
-            'precision': precision,
-            'recall': recall,
-            'f1_score': f1,
-            'accuracy': accuracy
-        })
-
-    # Convert list of metrics to DataFrame
-    metrics_df = pd.DataFrame(metrics)
-
-    return metrics_df
-
-def visualize_heatmap(data, pivot_index, pivot_columns, values):
-    # Pivoting the data for heatmap
-    heatmap_data = data.pivot_table(index=pivot_index, columns=pivot_columns, values=values, aggfunc='first')
-
-    # Create a figure and axes object
-    fig, ax = plt.subplots(figsize=(10, 8))
-
-    # Create the heatmap on the specified axes
-    sns.heatmap(heatmap_data, annot=True, cmap='viridis', fmt=".2g", linewidths=.5, ax=ax)
-
-    ax.set_title('Heatmap of Enriched Features Across Queries')
-    ax.set_ylabel('Query')
-    ax.set_xlabel('Feature')
-
-    # Return the figure object
-    return fig
-
-def visualize_bar_chart(data):
-    # Counting occurrences of significant features
-    feature_counts = data['feature'].value_counts().reset_index()
-    feature_counts.columns = ['feature', 'counts']
-
-    # Create a figure and axes object
-    fig, ax = plt.subplots(figsize=(12, 8))
-
-    # Create the bar plot on the specified axes
-    bar_plot = sns.barplot(x='counts', y='feature', data=feature_counts.head(20), ax=ax)
-
-    # Optional: set color palette manually if needed
-    #bar_plot.set_palette(sns.color_palette("viridis", n_colors=20))
-
-    ax.set_title('Top Enriched Features Across All Queries')
-    ax.set_xlabel('Counts of Significant Enrichment')
-    ax.set_ylabel('Features')
-
-    # Properly setting the x-ticks and rotating them
-    ax.set_xticks(ax.get_xticks()) # This ensures the ticks are explicitly set
-    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
-
-    # Return the figure object
-    return fig
-
-def visualize_dot_plot(data):
-    # Adjusting data for visualization
-    data['-log10(p_value)'] = -np.log10(data['adjusted_p_value'])
-
-    # Create a figure object
-    fig, ax = plt.subplots(figsize=(10, 8))
-
-    # Create the plot on the specified axes
-    sns.scatterplot(data=data, x='feature', y='query', size='-log10(p_value)',
-                    legend=None, sizes=(20, 200), hue='category', ax=ax)
-
-    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
-    ax.set_title('Dot Plot of Feature Enrichment Across Queries')
-    ax.set_xlabel('Feature')
-    ax.set_ylabel('Query')
-    ax.grid(True)
-
-    # Return the figure object
-    return fig
-
-def analyze_results(foldseek_csv_path, base_dir):
-
-    results = functionally_annotate_foldseek_hits(foldseek_csv_path, limit=None, threshold=None)
-    #display(results)
-
-    enrichment_results = perform_enrichment_analysis(results, num_workers=25)
-    filtered_results = enrichment_results[enrichment_results['adjusted_p_value'] < 0.05]
-    filtered_results = filtered_results[filtered_results['feature'].str.strip().astype(bool)]
-    #display(filtered_results)
-
-    fldr = os.path.dirname(foldseek_csv_path)
-
-    heatmap_path = os.path.join(fldr, 'heatmap.pdf')
-    bar_path = os.path.join(fldr, 'bar.pdf')
-    dot_path = os.path.join(fldr, 'dot.pdf')
-
-    heatmap_fig = visualize_heatmap(filtered_results, 'query', 'feature', 'adjusted_p_value')
-    bar_fig = visualize_bar_chart(filtered_results)
-    dot_fig = visualize_dot_plot(filtered_results)
-
-    heatmap_fig.savefig(heatmap_path, bbox_inches='tight')
-    bar_fig.savefig(bar_path, bbox_inches='tight')
-    dot_fig.savefig(dot_path, bbox_inches='tight')
-
-    comparison_results = compare_features(filtered_results)
-    #display(comparison_results)
-    feature_metrics_results = calculate_feature_metrics(comparison_results)
-    #display(feature_metrics_results)
-
-    fldr = os.path.dirname(foldseek_csv_path)
-
-    merged_path = os.path.join(fldr, 'merged.csv')
-    enrichment_path = os.path.join(fldr, 'enrichment.csv')
-    comparison_path = os.path.join(fldr, 'comparison.csv')
-
-    results.to_csv(merged_path, index=False)
-    filtered_results.to_csv(enrichment_path, index=False)
-    comparison_results.to_csv(comparison_path, index=False)
-
-    print(f'saved to results to {merged_path}')
-    print(f'saved to enrichment results to {enrichment_path}')
-    print(f'saved to comparison results to {comparison_path}')
-
-    #display(functional_data_df)
-
-# Set up directories
-#structure_fldr_path = "/home/carruthers/Downloads/ME49_proteome/cif"
-#base_dir='/home/carruthers/foldseek/me49'
-
-#align_to_database(structure_fldr_path, base_dir, cores=25)
-#foldseek_csv_path = f'{base_dir}/results/pdb/aln_tmscore.csv'
-#analyze_results(foldseek_csv_path, base_dir)
-
-# Set up directories
-#structure_fldr_path = "/home/carruthers/Downloads/GT1_proteome/cif"
-#base_dir='/home/carruthers/foldseek/gt1'
-
-#align_to_database(structure_fldr_path, base_dir, cores=25)
-#foldseek_csv_path = f'{base_dir}/results/pdb/aln_tmscore.csv'
-#analyze_results(foldseek_csv_path, base_dir)
-