pgatk 0.0.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. pgatk/__init__.py +1 -0
  2. pgatk/cgenomes/__init__.py +0 -0
  3. pgatk/cgenomes/cbioportal_downloader.py +424 -0
  4. pgatk/cgenomes/cgenomes_proteindb.py +1214 -0
  5. pgatk/cgenomes/cosmic_downloader.py +228 -0
  6. pgatk/cgenomes/models.py +12 -0
  7. pgatk/cli.py +73 -0
  8. pgatk/clinvar/__init__.py +0 -0
  9. pgatk/clinvar/chromosome_mapper.py +100 -0
  10. pgatk/clinvar/clinvar_service.py +910 -0
  11. pgatk/clinvar/data_downloader.py +201 -0
  12. pgatk/commands/__init__.py +0 -0
  13. pgatk/commands/blast_get_position.py +30 -0
  14. pgatk/commands/cbioportal_downloader.py +42 -0
  15. pgatk/commands/cbioportal_to_proteindb.py +103 -0
  16. pgatk/commands/clinvar_to_proteindb.py +47 -0
  17. pgatk/commands/cosmic_downloader.py +48 -0
  18. pgatk/commands/cosmic_to_proteindb.py +60 -0
  19. pgatk/commands/digest_mutant_protein.py +36 -0
  20. pgatk/commands/dnaseq_to_proteindb.py +80 -0
  21. pgatk/commands/ensembl_database.py +28 -0
  22. pgatk/commands/ensembl_downloader.py +150 -0
  23. pgatk/commands/gencode_downloader.py +71 -0
  24. pgatk/commands/gnomad_vcf_downloader.py +79 -0
  25. pgatk/commands/map_peptide2genome.py +30 -0
  26. pgatk/commands/ncbi_downloader.py +103 -0
  27. pgatk/commands/proteindb_decoy.py +106 -0
  28. pgatk/commands/threeframe_translation.py +31 -0
  29. pgatk/commands/utils.py +27 -0
  30. pgatk/commands/validate_peptides.py +49 -0
  31. pgatk/commands/vcf_to_proteindb.py +117 -0
  32. pgatk/config/__init__.py +0 -0
  33. pgatk/config/assemblies_conf.json +49114 -0
  34. pgatk/config/cbioportal_config.yaml +26 -0
  35. pgatk/config/clinvar_config.yaml +41 -0
  36. pgatk/config/cosmic_config.yaml +41 -0
  37. pgatk/config/ensembl_config.yaml +30 -0
  38. pgatk/config/ensembl_downloader_config.yaml +35 -0
  39. pgatk/config/protein_decoy.yaml +22 -0
  40. pgatk/config/registry.py +38 -0
  41. pgatk/db/__init__.py +0 -0
  42. pgatk/db/digest_mutant_protein.py +129 -0
  43. pgatk/db/map_peptide2genome.py +275 -0
  44. pgatk/ensembl/__init__.py +0 -0
  45. pgatk/ensembl/data_downloader.py +550 -0
  46. pgatk/ensembl/ensembl.py +1063 -0
  47. pgatk/ensembl/exceptions.py +13 -0
  48. pgatk/ensembl/models.py +8 -0
  49. pgatk/gnomad/__init__.py +0 -0
  50. pgatk/gnomad/data_downloader.py +363 -0
  51. pgatk/proteogenomics/__init__.py +0 -0
  52. pgatk/proteogenomics/blast_get_position.py +197 -0
  53. pgatk/proteogenomics/spectrumai.py +360 -0
  54. pgatk/proteomics/__init__.py +0 -0
  55. pgatk/proteomics/db/__init__.py +0 -0
  56. pgatk/proteomics/db/protein_database_decoy.py +462 -0
  57. pgatk/proteomics/models.py +48 -0
  58. pgatk/toolbox/__init__.py +0 -0
  59. pgatk/toolbox/exceptions.py +34 -0
  60. pgatk/toolbox/general.py +426 -0
  61. pgatk/toolbox/rest.py +39 -0
  62. pgatk/toolbox/vcf_utils.py +225 -0
  63. pgatk-0.0.27.dist-info/METADATA +423 -0
  64. pgatk-0.0.27.dist-info/RECORD +68 -0
  65. pgatk-0.0.27.dist-info/WHEEL +5 -0
  66. pgatk-0.0.27.dist-info/entry_points.txt +2 -0
  67. pgatk-0.0.27.dist-info/licenses/LICENSE.txt +201 -0
  68. pgatk-0.0.27.dist-info/top_level.txt +1 -0
pgatk/__init__.py ADDED
@@ -0,0 +1 @@
1
+ name = "pgatk"
File without changes
@@ -0,0 +1,424 @@
1
+ import csv
2
+ import os
3
+ from concurrent.futures import as_completed
4
+ from concurrent.futures.thread import ThreadPoolExecutor
5
+ from typing import Optional
6
+
7
+ import requests
8
+
9
+ from pgatk.toolbox.exceptions import AppException
10
+ from pgatk.toolbox.general import ParameterConfiguration, check_create_folders, clear_cache
11
+ from pgatk.toolbox.rest import call_api_raw
12
+
13
# Page size sent as the ``pageSize`` query parameter of the paginated cBioPortal requests.
_CBIO_PAGE_SIZE = 10_000

# Column names, in output order, of the MAF-style mutation table written by this module.
MAF_HEADER = [
    # Gene and genomic location
    "Hugo_Symbol",
    "Entrez_Gene_Id",
    "Center",
    "NCBI_Build",
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Strand",
    # Variant description
    "Consequence",
    "Variant_Classification",
    "Variant_Type",
    "Reference_Allele",
    "Tumor_Seq_Allele1",
    "Tumor_Seq_Allele2",
    "dbSNP_RS",
    "dbSNP_Val_Status",
    # Sample barcodes and matched-normal / validation alleles
    "Tumor_Sample_Barcode",
    "Matched_Norm_Sample_Barcode",
    "Match_Norm_Seq_Allele1",
    "Match_Norm_Seq_Allele2",
    "Tumor_Validation_Allele1",
    "Tumor_Validation_Allele2",
    "Match_Norm_Validation_Allele1",
    "Match_Norm_Validation_Allele2",
    # Status and sequencing metadata
    "Verification_Status",
    "Validation_Status",
    "Mutation_Status",
    "Sequencing_Phase",
    "Sequence_Source",
    "Validation_Method",
    "Score",
    "BAM_File",
    "Sequencer",
    # Read counts
    "t_ref_count",
    "t_alt_count",
    "n_ref_count",
    "n_alt_count",
    # Transcript / protein annotation
    "HGVSc",
    "HGVSp",
    "HGVSp_Short",
    "Transcript_ID",
    "RefSeq",
    "Protein_position",
    "Codons",
    "Hotspot",
]
30
+
31
+
32
def _json_or_raise(resp, label: str, log):
    """Decode the JSON body of an HTTP response, failing with a descriptive error.

    :param resp: response object exposing ``text``, ``status_code`` and ``json()``.
    :param label: short description of the request; used as the error-message prefix.
    :param log: logger passed by the callers; it is not used inside this function.
    :return: the decoded JSON payload.
    :raises ValueError: if the body is empty, or if it cannot be decoded as JSON
        (the decoder's exception is then attached as ``__cause__``).
    """
    if not resp.text:
        raise ValueError(
            f"{label}: server returned HTTP {resp.status_code} with an empty body"
        )
    try:
        return resp.json()
    except Exception as exc:
        # Quote only the beginning of the body so the message stays readable.
        snippet = resp.text[:300]
        raise ValueError(
            f"{label}: could not parse JSON (HTTP {resp.status_code}). Body: {snippet!r}"
        ) from exc
43
+
44
+
45
def _get_sample_ids(base_url: str, study_id: str, log) -> list:
    """Look up the sample IDs of a study through the sample-list endpoint.

    The ``<study_id>_sequenced`` list is tried first and ``<study_id>_all`` second;
    the first one answering HTTP 200 with a non-empty body is decoded and returned.

    :param base_url: root URL of the cBioPortal API.
    :param study_id: identifier of the study.
    :param log: logger used for debug and warning messages.
    :return: the decoded sample-ID payload, or an empty list if neither list is usable.
    """
    for suffix in ("sequenced", "all"):
        sample_list = f"{study_id}_{suffix}"
        endpoint = f"{base_url}/sample-lists/{sample_list}/sample-ids"
        log.debug("GET %s", endpoint)
        reply = requests.get(endpoint, headers={"Accept": "application/json"}, timeout=30)
        if reply.status_code != 200 or not reply.text:
            # This list is unavailable; try the next candidate.
            continue
        return _json_or_raise(reply, f"sample-ids ({sample_list})", log)

    log.warning("Could not retrieve sample list for study '%s'", study_id)
    return []
56
+
57
+
58
def _fetch_study_mutations(base_url: str, study_id: str, log) -> list:
    """Retrieve every mutation record of a study from the cBioPortal REST API.

    Steps: list the study's molecular profiles, pick the first one whose
    ``molecularAlterationType`` is ``MUTATION_EXTENDED``, collect the study's sample
    IDs, then page through ``mutations/fetch`` until a short or empty page arrives.

    :param base_url: root URL of the cBioPortal API.
    :param study_id: identifier of the study.
    :param log: logger used for progress messages.
    :return: list of mutation objects; empty when the study has no mutation profile
        or no samples.
    """
    listing_url = f"{base_url}/studies/{study_id}/molecular-profiles"
    log.debug("GET %s", listing_url)
    listing = requests.get(listing_url, headers={"Accept": "application/json"}, timeout=30)
    listing.raise_for_status()

    mutation_profile = next(
        (
            entry["molecularProfileId"]
            for entry in _json_or_raise(listing, "molecular-profiles", log)
            if entry.get("molecularAlterationType") == "MUTATION_EXTENDED"
        ),
        None,
    )
    if mutation_profile is None:
        log.warning("No MUTATION_EXTENDED profile found for study '%s'", study_id)
        return []
    log.info("Using molecular profile '%s'", mutation_profile)

    samples = _get_sample_ids(base_url, study_id, log)
    if not samples:
        log.warning("No samples found for study '%s'", study_id)
        return []
    log.info("Fetching mutations for %d samples ...", len(samples))

    # sampleListId is silently broken in the current API; use explicit identifiers.
    payload = {
        "sampleMolecularIdentifiers": [
            {"molecularProfileId": mutation_profile, "sampleId": sample}
            for sample in samples
        ]
    }

    collected = []
    page_no = 0
    has_more = True
    while has_more:
        fetch_url = (
            f"{base_url}/mutations/fetch"
            f"?molecularProfileId={mutation_profile}&projection=DETAILED"
            f"&pageSize={_CBIO_PAGE_SIZE}&pageNumber={page_no}"
        )
        log.debug("POST %s (page %d)", fetch_url, page_no)
        reply = requests.post(
            fetch_url,
            json=payload,
            headers={"Content-Type": "application/json", "Accept": "application/json"},
            timeout=120,
        )
        reply.raise_for_status()
        chunk = _json_or_raise(reply, f"mutations page {page_no}", log)
        if not chunk:
            break
        collected.extend(chunk)
        log.info(" Page %d: %d mutations (total: %d)", page_no, len(chunk), len(collected))
        # A page shorter than the requested size is the last one.
        has_more = len(chunk) >= _CBIO_PAGE_SIZE
        if has_more:
            page_no += 1
    return collected
112
+
113
+
114
def _mutation_to_row(m: dict) -> list:
    """Map a cBioPortal API mutation object to a MAF-compatible row list.

    The returned list has 45 string cells, one per MAF column in output order.
    Columns the API object does not provide are left empty. A key whose value is
    ``None`` (JSON ``null``) is treated exactly like a missing key, so it yields an
    empty cell (or the column default) instead of the literal text ``"None"``.

    :param m: mutation object as decoded from the API response.
    :return: list of 45 strings.
    """

    def cell(source: dict, key: str, default: str = "") -> str:
        # Render one field as text, mapping missing/None to the default.
        value = source.get(key)
        return default if value is None else str(value)

    gene = m.get("gene") or {}
    hugo = cell(gene, "hugoGeneSymbol")
    entrez = cell(gene, "entrezGeneId")
    chrom = cell(m, "chr")
    start = cell(m, "startPosition")
    end = cell(m, "endPosition")
    ref = cell(m, "referenceAllele")
    alt = cell(m, "variantAllele")
    varclass = cell(m, "mutationType")
    vartype = cell(m, "variantType")
    sample_id = cell(m, "sampleId")
    ncbi_build = cell(m, "ncbiBuild", "GRCh37")
    center = cell(m, "center")
    mut_status = cell(m, "mutationStatus", "Somatic")
    val_status = cell(m, "validationStatus")
    t_ref = cell(m, "tumorRefCount")
    t_alt = cell(m, "tumorAltCount")
    n_ref = cell(m, "normalRefCount")
    n_alt = cell(m, "normalAltCount")
    refseq = cell(m, "refseqMrnaId")
    hgvsc = cell(m, "hgvsc")
    pc = cell(m, "proteinChange")
    # Ensure the short protein change carries the "p." prefix exactly once.
    hgvsp_short = pc if pc.startswith("p.") else (f"p.{pc}" if pc else "")
    # Use RefSeq transcript ID; Ensembl IDs are not returned by the public API.
    transcript_id = refseq.split(".")[0] if refseq else ""
    protein_pos = cell(m, "proteinPosStart")

    return [
        # Hugo_Symbol .. Strand
        hugo, entrez, center, ncbi_build, chrom, start, end, "+",
        # Consequence .. Tumor_Sample_Barcode
        "", varclass, vartype, ref, ref, alt, "", "", sample_id,
        # Matched_Norm_Sample_Barcode .. Sequencer
        "", "", "", "", "", "", "", "", val_status, mut_status, "", "", "", "", "", "",
        # t_ref_count .. RefSeq
        t_ref, t_alt, n_ref, n_alt, hgvsc, "", hgvsp_short, transcript_id, refseq,
        # Protein_position, Codons, Hotspot
        protein_pos, "", "",
    ]
150
+
151
+
152
def _fetch_clinical_data(base_url: str, study_id: str, output_dir: str, log) -> Optional[str]:
    """Download all sample-level clinical attributes for a study into a TSV file.

    Calls ``GET /studies/{studyId}/clinical-data?clinicalDataType=SAMPLE`` with
    pagination, then pivots the attribute-per-row API response into a
    SAMPLE_ID x attribute-column TSV (``data_clinical_sample.txt`` inside
    ``output_dir``) that ``CancerGenomesService.get_value_per_sample`` can read.

    Sample IDs, attribute IDs and values that are ``None`` (JSON ``null``) are
    written as empty cells; other non-string values are converted with ``str``.

    :param base_url: root URL of the cBioPortal API.
    :param study_id: identifier of the study.
    :param output_dir: existing directory in which the TSV file is created.
    :param log: logger used for progress and warning messages.
    :return: path of the written file, or ``None`` if any page request answers with
        a non-200 status or no records are returned at all.
    """
    records: list[dict] = []
    page = 0
    while True:
        url = (
            f"{base_url}/studies/{study_id}/clinical-data"
            f"?clinicalDataType=SAMPLE&pageSize={_CBIO_PAGE_SIZE}&pageNumber={page}"
        )
        log.debug("GET %s", url)
        resp = requests.get(url, headers={"Accept": "application/json"}, timeout=60)
        if resp.status_code != 200:
            log.warning("Clinical data request failed (HTTP %d) for study '%s'", resp.status_code, study_id)
            return None
        if not resp.text:
            break
        batch = _json_or_raise(resp, f"clinical-data page {page}", log)
        if not batch:
            break
        records.extend(batch)
        log.debug(" Clinical page %d: %d records (total %d)", page, len(batch), len(records))
        if len(batch) < _CBIO_PAGE_SIZE:
            # A short page is the last one.
            break
        page += 1

    if not records:
        log.warning("No clinical data returned for study '%s'", study_id)
        return None

    def as_text(value) -> str:
        # str.join only accepts strings, so normalise None and non-string values.
        return "" if value is None else str(value)

    # Pivot: collect all attributes in insertion order, then build one row per sample.
    sample_attrs: dict[str, dict[str, str]] = {}
    attr_order: dict[str, None] = {}  # dict used as an insertion-ordered set
    for rec in records:
        sid = as_text(rec.get("sampleId"))
        attr = as_text(rec.get("clinicalAttributeId"))
        per_sample = sample_attrs.setdefault(sid, {})
        if attr:
            attr_order.setdefault(attr, None)
            per_sample[attr] = as_text(rec.get("value"))

    columns = list(attr_order)
    out_path = os.path.join(output_dir, "data_clinical_sample.txt")
    with open(out_path, "w", encoding="utf-8", newline="") as fh:
        fh.write("\t".join(["SAMPLE_ID"] + columns) + "\n")
        for sid, attrs in sample_attrs.items():
            row = [sid] + [attrs.get(a, "") for a in columns]
            fh.write("\t".join(row) + "\n")

    log.info("Wrote clinical data for %d samples to %s", len(sample_attrs), out_path)
    return out_path
210
+
211
+
212
class CbioPortalDownloadService(ParameterConfiguration):
    """Download mutation and clinical tables for cBioPortal studies via the REST API.

    Each setting is resolved in ``__init__``: a value in the pipeline parameters wins,
    then a value in the ``cbioportal_data_downloader`` section of the default
    parameters, then the built-in default. Constructing the service creates the
    output directory and fetches the study listing from the server.
    """

    CONFIG_KEY_DATA_DOWNLOADER = 'cbioportal_data_downloader'
    CONFIG_KEY_CBIOPORTAL_DOWNLOAD_URL = 'cbioportal_download_url'
    CONFIG_OUTPUT_DIRECTORY = 'output_directory'
    CONFIG_CBIOPORTAL_API = 'cbioportal_api'
    CONFIG_CBIOPORTAL_API_SERVER = 'base_url'
    CONFIG_CBIOPORTAL_API_CANCER_STUDIES = "cancer_studies"
    CONFIG_LIST_STUDIES = "list_studies"
    CONFIG_MULTITHREADING = "multithreading"
    PROTEINDB = 'proteindb'
    FILTER_INFO = 'filter_info'
    FILTER_COLUMN = 'filter_column'

    def __init__(self, config_data, pipeline_arguments):
        """
        Init the class with the specific parameters.
        :param config_data configuration file
        :param pipeline_arguments pipelines arguments
        """

        super(CbioPortalDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_data,
                                                        pipeline_arguments)

        # Raw text of the study listing; filled by get_cancer_studies() below.
        self._cbioportal_studies = None

        self._local_path_cbioportal = self._lookup_downloader_setting(
            self.CONFIG_OUTPUT_DIRECTORY, 'output_directory')
        self._list_studies = self._lookup_downloader_setting(self.CONFIG_LIST_STUDIES, [])
        self._multithreading = self._lookup_downloader_setting(self.CONFIG_MULTITHREADING, True)

        # The two API settings live one level deeper, under the 'cbioportal_api' sub-section.
        self._cbioportal_base_url = self._lookup_downloader_setting(
            self.CONFIG_CBIOPORTAL_API_SERVER, 'https://www.cbioportal.org/api',
            section=self.CONFIG_CBIOPORTAL_API)
        self._cancer_studies_command = self._lookup_downloader_setting(
            self.CONFIG_CBIOPORTAL_API_CANCER_STUDIES, 'studies',
            section=self.CONFIG_CBIOPORTAL_API)

        self.prepare_local_cbioportal_repository()
        self.get_cancer_studies()

    def _lookup_downloader_setting(self, key, fallback, section=None):
        """
        Resolve one setting: pipeline parameters first, then the downloader section of
        the default parameters (optionally inside the sub-section ``section``), else ``fallback``.
        :param key name of the setting
        :param fallback value returned when the setting is not configured anywhere
        :param section optional sub-section of the downloader section holding the key
        :return: the resolved value
        """
        pipeline = self.get_pipeline_parameters()
        if key in pipeline:
            return pipeline[key]

        defaults = self.get_default_parameters()
        if self.CONFIG_KEY_DATA_DOWNLOADER not in defaults:
            return fallback
        scope = defaults[self.CONFIG_KEY_DATA_DOWNLOADER]
        if section is not None:
            if section not in scope:
                return fallback
            scope = scope[section]
        return scope[key] if key in scope else fallback

    def prepare_local_cbioportal_repository(self):
        """Create the local root folder for downloaded studies if it does not exist yet."""
        self.get_logger().debug("Preparing local cbioportal repository, root folder - '{}'".format(
            self.get_local_path_root_cbioportal_repo()))
        check_create_folders([self.get_local_path_root_cbioportal_repo()])
        self.get_logger().debug(
            "Local path for cbioportal Release - '{}'".format(self.get_local_path_root_cbioportal_repo()))

    def get_local_path_root_cbioportal_repo(self):
        """Return the root folder under which one sub-folder per study is written."""
        return self._local_path_cbioportal

    def get_filter_options(self, variable, default_value):
        """
        Look up a filter option in the default parameters.
        :param variable name of the option; searched at the top level first, then under
            ``proteindb`` -> ``filter_info``
        :param default_value value returned when the option is found in neither place
        :return: the configured value or ``default_value``
        """
        return_value = default_value
        if variable in self.get_default_parameters():
            return_value = self.get_default_parameters()[variable]
        elif self.PROTEINDB in self.get_default_parameters() and \
                self.FILTER_INFO in self.get_default_parameters()[self.PROTEINDB] and \
                variable in self.get_default_parameters()[self.PROTEINDB][self.FILTER_INFO]:
            return_value = self.get_default_parameters()[self.PROTEINDB][self.FILTER_INFO][variable]
        return return_value

    def get_cancer_studies(self):
        """
        Fetch the study listing from the server, cache its raw text on the instance
        and return it.
        :return: response body of the studies endpoint as text
        """
        server = self._cbioportal_base_url
        endpoint = self._cancer_studies_command
        self._cbioportal_studies = call_api_raw(server + "/" + endpoint).text
        return self._cbioportal_studies

    def download_study(self, download_study, url_file_name=None):
        """
        This function will download a study from cBioPortal using the study ID
        :param download_study: Study to be downloaded, if the study is empty, None or exactly 'all',
        all the studies will be downloaded.
        :param url_file_name: optional file to which the path of every written mutation file is
        appended, one per line.
        :return: None
        """

        clear_cache()

        if not self._cbioportal_studies:
            self.get_cancer_studies()

        if url_file_name is None:
            self._download_requested(download_study, None)
            return
        with open(url_file_name, 'w', encoding='utf-8') as url_file:
            self._download_requested(download_study, url_file)

    @staticmethod
    def _is_all_studies_request(download_study):
        """Tell whether the request means "every study" rather than one study ID."""
        if download_study is None:
            return True
        if isinstance(download_study, str):
            # Exact comparison: a substring test would also match real study IDs
            # that merely contain the letters "all".
            return download_study.strip() in ('', 'all')
        # Non-string containers keep the previous membership semantics.
        return 'all' in download_study

    def _listed_study_ids(self):
        """Return the first column of the cached study listing, skipping the header and blank rows."""
        rows = csv.reader(self._cbioportal_studies.splitlines(), delimiter="\t")
        next(rows, None)  # first row is the header
        return [row[0] for row in rows if row and row[0]]

    def _download_requested(self, download_study, url_file):
        """Download one study or all listed studies, recording output paths in ``url_file`` when given."""
        if not self._is_all_studies_request(download_study):
            if not self.check_study_identifier(download_study):
                msg = "The following study accession '{}' is not present in cBioPortal Studies".format(download_study)
                self.get_logger().debug(msg)
                raise AppException(msg)
            self.download_one_study(download_study, url_file=url_file)
            return

        study_ids = self._listed_study_ids()
        if not self._multithreading:
            for study_id in study_ids:
                self.download_one_study(study_id, url_file=url_file)
            return

        # Pass url_file=None to workers — concurrent writes to the
        # shared handle would interleave/corrupt lines. The main
        # thread serializes the writes below.
        with ThreadPoolExecutor(max_workers=10, thread_name_prefix='Thread-Download') as executor:
            processes = [executor.submit(self.download_one_study, study_id) for study_id in study_ids]
        for task in as_completed(processes):
            result = task.result()
            print(result)
            if url_file is not None and result is not None:
                url_file.write(result + "\n")

    def download_one_study(self, download_study: str, url_file=None) -> Optional[str]:
        """
        Fetch the mutations of one study and write them to ``<root>/<study>/data_mutations.txt``,
        then fetch the study's sample-level clinical data into the same folder.
        :param download_study: identifier of the study
        :param url_file: optional open text handle; the output path is appended to it
        :return: path of the mutation file, or None if fetching failed or returned no mutations
        """
        log = self.get_logger()
        study_dir = os.path.join(self.get_local_path_root_cbioportal_repo(), download_study)
        check_create_folders([study_dir])
        out_path = os.path.join(study_dir, "data_mutations.txt")

        log.info("Fetching mutations for study '%s' via cBioPortal API ...", download_study)
        try:
            mutations = _fetch_study_mutations(self._cbioportal_base_url, download_study, log)
        except Exception as exc:
            # Best effort: a failing study is logged and skipped so a bulk run can continue.
            log.error("Failed to fetch mutations for study '%s': %s", download_study, exc)
            return None

        if not mutations:
            log.warning("No mutations returned for study '%s'", download_study)
            return None

        with open(out_path, "w", encoding="utf-8", newline="") as fh:
            fh.write("\t".join(MAF_HEADER) + "\n")
            for m in mutations:
                row = _mutation_to_row(m)
                fh.write("\t".join("" if x is None else str(x) for x in row) + "\n")

        log.info("Wrote %d mutations to %s", len(mutations), out_path)

        _fetch_clinical_data(self._cbioportal_base_url, download_study, study_dir, log)

        # Note: url_file writes intentionally happen in download_study() (single-threaded
        # main thread) after futures complete; the parameter is retained for the
        # single-study code path and to keep the legacy serial call sites unchanged.
        if url_file is not None:
            url_file.write(out_path + "\n")
        return out_path

    def check_study_identifier(self, download_study):
        """
        Tell whether the identifier occurs in the cached study listing.
        :param download_study: identifier to look for
        :return: True if the identifier is found
        """
        # NOTE(review): this is a substring test on the raw listing text, so a partial
        # identifier also passes — confirm the listing format before tightening it.
        return download_study in self._cbioportal_studies