mgnify-pipelines-toolkit 1.2.10__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic.

@@ -167,7 +167,8 @@ def main():
 matched_primers_list.append(cleaned_primer_name)
 
 res_df = pd.DataFrame.from_dict(res_dict)
- res_df.to_csv(f"./{sample}_primer_validation.tsv", sep="\t", index=False)
+ res_tsv_name = f"./{sample}_primer_validation.tsv"
+ res_df.to_csv(res_tsv_name, sep="\t", index=False) if not res_df.empty else open(res_tsv_name, "w").close()
 
 fwd_primers_fw.close()
 rev_primers_fw.close()
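
The one-line conditional added above is equivalent to the plain if/else below — a sketch reusing the hunk's own names (res_df, res_tsv_name), not code from the package: when the primer-validation DataFrame is empty, an empty placeholder TSV is still created, presumably so later steps always find the expected output file.

    # Sketch of the new behaviour, with names taken from the hunk above.
    if not res_df.empty:
        res_df.to_csv(res_tsv_name, sep="\t", index=False)  # write the validation table
    else:
        open(res_tsv_name, "w").close()  # leave an empty placeholder TSV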
@@ -1,7 +1,5 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
- import shutil
- from shutil import SameFileError
 
 # Copyright 2024-2025 EMBL - European Bioinformatics Institute
 #
@@ -16,25 +14,27 @@ from shutil import SameFileError
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
- import click
- from collections import defaultdict
 import glob
 import logging
+ import shutil
+ from collections import defaultdict
 from pathlib import Path
- from typing import Union, List
+ from shutil import SameFileError
+ from typing import List, Union
 
+ import click
 import pandas as pd
 
- from mgnify_pipelines_toolkit.constants.db_labels import TAXDB_LABELS, ASV_TAXDB_LABELS
+ from mgnify_pipelines_toolkit.constants.db_labels import ASV_TAXDB_LABELS, TAXDB_LABELS
 from mgnify_pipelines_toolkit.constants.tax_ranks import (
- _SILVA_TAX_RANKS,
 _PR2_TAX_RANKS,
+ _SILVA_TAX_RANKS,
 )
- from mgnify_pipelines_toolkit.schemas.schemas import (
- AmpliconPassedRunsSchema,
+ from mgnify_pipelines_toolkit.schemas.dataframes import (
 AmpliconNonINSDCPassedRunsSchema,
+ AmpliconPassedRunsSchema,
 PR2TaxonSchema,
+ TaxonSchema,
 validate_dataframe,
 )
 
@@ -46,9 +46,7 @@ def cli():
 pass
 
 
- def get_tax_file(
- run_acc: str, analyses_dir: Path, db_label: str
- ) -> Union[Path, List[Path]]:
+ def get_tax_file(run_acc: str, analyses_dir: Path, db_label: str) -> Union[Path, List[Path]]:
 """Takes path information for a particular analysis and db_label combo, and returns any existing files.
 
 :param run_acc: Run accession for the tax file that should be retrieved.
@@ -69,48 +67,32 @@ def get_tax_file(
 db_path = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}")
 
 if not db_path.exists():
- logging.debug(
- f"DB {db_path} doesn't exist for {run_acc}. Skipping"
- ) # or error?
+ logging.debug(f"DB {db_path} doesn't exist for {run_acc}. Skipping") # or error?
 return
 
 if db_label in TAXDB_LABELS:
- tax_file = Path(
- f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt"
- )
+ tax_file = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt")
 if not tax_file.exists():
- logging.error(
- f"DB path exists but file doesn't - exiting. Path: {tax_file}"
- )
+ logging.error(f"DB path exists but file doesn't - exiting. Path: {tax_file}")
 exit(1)
 
 file_size = tax_file.stat().st_size
- if (
- file_size == 0
- ): # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+ if file_size == 0: # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
 # so need to skip those. Should probably fix that at some point
- logging.debug(
- f"File {tax_file} exists but is empty, so will be skipping it."
- )
+ logging.debug(f"File {tax_file} exists but is empty, so will be skipping it.")
 tax_file = None
 elif db_label in ASV_TAXDB_LABELS:
 # ASV tax files could have up to two files, one for each amplified region (maximum two from the pipeline).
 # So will need to handle this differently to closed-reference files
- asv_tax_files = glob.glob(
- f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/*.txt"
- )
- asv_tax_files = [
- Path(file) for file in asv_tax_files if "concat" not in file
- ] # Have to filter out concatenated file if it exists
+ asv_tax_files = glob.glob(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/*.txt")
+ asv_tax_files = [Path(file) for file in asv_tax_files if "concat" not in file] # Have to filter out concatenated file if it exists
 
 tax_file = asv_tax_files
 
 return tax_file
 
 
- def parse_one_tax_file(
- run_acc: str, tax_file: Path, long_tax_ranks: list
- ) -> pd.DataFrame:
+ def parse_one_tax_file(run_acc: str, tax_file: Path, long_tax_ranks: list) -> pd.DataFrame:
 """Parses a taxonomy file, and returns it as a pandas DataFrame object.
 
 :param run_acc: Run accession of the taxonomy file that will be parsed.
@@ -134,9 +116,7 @@ def parse_one_tax_file(
 elif len(long_tax_ranks) == 9:
 validate_dataframe(res_df, PR2TaxonSchema, str(tax_file))
 
- res_df["full_taxon"] = res_df.iloc[:, 1:].apply(
- lambda x: ";".join(x).strip(";"), axis=1
- )
+ res_df["full_taxon"] = res_df.iloc[:, 1:].apply(lambda x: ";".join(x).strip(";"), axis=1)
 final_df = res_df.iloc[:, [0, -1]]
 final_df = final_df.set_index("full_taxon")
 final_df.columns = [run_acc]
@@ -144,9 +124,7 @@ def parse_one_tax_file(
 return final_df
 
 
- def generate_db_summary(
- db_label: str, tax_dfs: defaultdict[Path], output_prefix: str
- ) -> None:
+ def generate_db_summary(db_label: str, tax_dfs: defaultdict[Path], output_prefix: str) -> None:
 """Takes paired run accessions taxonomy dataframes in the form of a dictionary,
 and respective db_label, joins them together, and generates a study-wide summary
 in the form of a .tsv file.
@@ -185,7 +163,6 @@ def generate_db_summary(
 )
 
 elif db_label in ASV_TAXDB_LABELS:
-
 if "PR2" in db_label:
 long_tax_ranks = _PR2_TAX_RANKS
 else:
@@ -196,13 +173,9 @@ def generate_db_summary(
 for (
 run_acc,
 tax_df_asv_lst,
- ) in (
- tax_dfs.items()
- ): # each `tax_file` will be a list containing at most two files (one for each amp_region)
+ ) in tax_dfs.items(): # each `tax_file` will be a list containing at most two files (one for each amp_region)
 for tax_df in tax_df_asv_lst:
- amp_region = str(tax_df).split("_")[
- -5
- ] # there are a lot of underscores in these names... but it is consistent
+ amp_region = str(tax_df).split("_")[-5] # there are a lot of underscores in these names... but it is consistent
 # e.g. ERR4334351_16S-V3-V4_DADA2-SILVA_asv_krona_counts.txt
 amp_region_df = parse_one_tax_file(run_acc, tax_df, long_tax_ranks)
 amp_region_dict[amp_region].append(amp_region_df)
@@ -241,13 +214,9 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List
 
 temp_lst = summary_filename.split("_")
 if "asv_study_summary" in summary_filename:
- summary_db_label = "_".join(
- temp_lst[1:3]
- ) # For ASVs we need to include the amp_region in the label
+ summary_db_label = "_".join(temp_lst[1:3]) # For ASVs we need to include the amp_region in the label
 else:
- summary_db_label = temp_lst[
- 1
- ] # For closed reference, just the db_label is needed
+ summary_db_label = temp_lst[1] # For closed reference, just the db_label is needed
 
 summaries_dict[summary_db_label].append(summary_path)
 
@@ -273,18 +242,14 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List
 help="Input directory to where all the individual analyses subdirectories for summarising",
 type=click.Path(exists=True, path_type=Path, file_okay=False),
 )
- @click.option(
- "-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
- )
+ @click.option("-p", "--output_prefix", required=True, help="Prefix to summary files", type=str)
 @click.option(
 "--non_insdc",
 default=False,
 is_flag=True,
 help="If run accessions aren't INSDC-formatted",
 )
- def summarise_analyses(
- runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
- ) -> None:
+ def summarise_analyses(runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool) -> None:
 """Function that will take a file of pipeline-successful run accessions
 that should be used for the generation of the relevant db-specific
 study-level summary files. For ASV results, these will also be on a
@@ -302,16 +267,14 @@ def summarise_analyses(
 """
 runs_df = pd.read_csv(runs, names=["run", "status"])
 
+ # Run validation on the successful_runs .csv file
 if not non_insdc:
- AmpliconPassedRunsSchema(
- runs_df
- ) # Run validation on the successful_runs .csv file
+ AmpliconPassedRunsSchema(runs_df)
 else:
 AmpliconNonINSDCPassedRunsSchema(runs_df)
 
 all_db_labels = TAXDB_LABELS + ASV_TAXDB_LABELS
 for db_label in all_db_labels:
-
 tax_files = defaultdict(Path)
 for i in range(0, len(runs_df)):
 run_acc = runs_df.loc[i, "run"]
@@ -376,9 +339,7 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
 index_label="taxonomy",
 )
 elif len(summaries) == 1:
- logging.info(
- f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
- )
+ logging.info(f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}")
 try:
 shutil.copyfile(summaries[0], merged_summary_name)
 except SameFileError:
@@ -14,39 +14,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
- import click
- from functools import reduce
 import glob
 import logging
+ from functools import reduce
 from pathlib import Path
 from typing import Literal
 
+ import click
 import pandas as pd
 
- from mgnify_pipelines_toolkit.schemas.schemas import (
+ from mgnify_pipelines_toolkit.schemas.dataframes import (
+ AntismashStudySummarySchema,
+ AntismashSummarySchema,
 CompletedAnalysisSchema,
- TaxonSchema,
+ GOStudySummarySchema,
 GOSummarySchema,
+ InterProStudySummarySchema,
 InterProSummarySchema,
- KOSummarySchema,
- SanntisSummarySchema,
- AntismashSummarySchema,
- PFAMSummarySchema,
+ KEGGModulesStudySummarySchema,
 KEGGModulesSummarySchema,
- GOStudySummarySchema,
- InterProStudySummarySchema,
- TaxonomyStudySummarySchema,
 KOStudySummarySchema,
- SanntisStudySummarySchema,
- AntismashStudySummarySchema,
+ KOSummarySchema,
 PFAMStudySummarySchema,
- KEGGModulesStudySummarySchema,
+ PFAMSummarySchema,
+ SanntisStudySummarySchema,
+ SanntisSummarySchema,
+ TaxonomyStudySummarySchema,
+ TaxonSchema,
 validate_dataframe,
 )
 
- logging.basicConfig(
- level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
- )
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
 
 # Keys are the original column names in the input files,
 # values are the standardised column names used in the generated study summary files
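
A change worth noting in this hunk (and in the ones above): the schema imports now come from mgnify_pipelines_toolkit.schemas.dataframes rather than mgnify_pipelines_toolkit.schemas.schemas, so any downstream code that imported these schemas directly would need the new module path. An illustrative import under that assumption, using only names visible in the hunk above:

    # Illustrative only, not code from the package: the schema names are taken
    # from the hunk above, imported from the module path introduced in 1.3.0.
    from mgnify_pipelines_toolkit.schemas.dataframes import (
        GOSummarySchema,
        TaxonSchema,
        validate_dataframe,
    )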
@@ -173,9 +171,7 @@ def check_files_exist(file_list: list[Path]) -> None:
 """
 missing_files = [str(path) for path in file_list if not path.is_file()]
 if missing_files:
- raise FileNotFoundError(
- f"The following required files are missing: {', '.join(missing_files)}"
- )
+ raise FileNotFoundError(f"The following required files are missing: {', '.join(missing_files)}")
 
 
 def generate_taxonomy_summary(
@@ -206,9 +202,7 @@ def generate_taxonomy_summary(
 df = validate_dataframe(df, TaxonSchema, str(path))
 
 # Combine all taxonomic ranks in the classification into a single string
- df["full_taxon"] = (
- df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")
- )
+ df["full_taxon"] = df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")
 
 # Create a new DataFrame with taxonomy as index and count as the only column
 result = df[["Count", "full_taxon"]].set_index("full_taxon")
@@ -229,9 +223,7 @@ def generate_functional_summary(
 file_dict: dict[str, Path],
 column_names: dict[str, str],
 output_prefix: str,
- label: Literal[
- "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
- ],
+ label: Literal["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"],
 outdir: Path = None,
 allow_missing: bool = False,
 ) -> None:
@@ -292,9 +284,7 @@ def generate_functional_summary(
 check_files_exist(list(file_dict.values()))
 except FileNotFoundError as e:
 if allow_missing:
- logging.warning(
- f"One of the expected files is missing, but this is allowed for {label}."
- )
+ logging.warning(f"One of the expected files is missing, but this is allowed for {label}.")
 logging.warning(e)
 return
 raise
@@ -324,9 +314,7 @@ def generate_functional_summary(
 dfs.append(df)
 
 if not dfs:
- logging.warning(
- f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}."
- )
+ logging.warning(f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}.")
 return
 
 # Merge all dataframes on the renamed metadata columns
@@ -384,9 +372,7 @@ def generate_functional_summary(
 help="Directory for the output files, by default it will use the current working directory.",
 type=click.Path(exists=True, path_type=Path, file_okay=False),
 )
- def summarise_analyses(
- assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path
- ) -> None:
+ def summarise_analyses(assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path) -> None:
 """
 Generate study-level summaries for successfully proccessed assemblies.
 
@@ -405,16 +391,11 @@ def summarise_analyses(
 Construct file paths for each assembly given a subdirectory and filename template.
 Template must contain {acc} as a placeholder.
 """
- return {
- acc: study_dir / acc / subdir / filename_template.format(acc=acc)
- for acc in assembly_list
- }
+ return {acc: study_dir / acc / subdir / filename_template.format(acc=acc) for acc in assembly_list}
 
 logging.info("Start processing of assembly-level summaries.")
 
- logging.info(
- "Generating taxonomy summary from assembly-level summaries <accession>.krona.txt"
- )
+ logging.info("Generating taxonomy summary from assembly-level summaries <accession>.krona.txt")
 generate_taxonomy_summary(
 get_file_paths("taxonomy", "{acc}.krona.txt.gz"),
 f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}",
@@ -422,9 +403,7 @@ def summarise_analyses(
 )
 
 for summary_type, config in SUMMARY_TYPES_MAP.items():
- logging.info(
- f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz"
- )
+ logging.info(f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz")
 generate_functional_summary(
 get_file_paths(config["folder"], f"{{acc}}_{summary_type}_summary.tsv.gz"),
 config["column_names"],
@@ -469,9 +448,7 @@ def merge_summaries(study_dir: str, output_prefix: str) -> None:
 
 logging.info("Generating combined assembly-level summaries")
 logging.info("Parsing summary files for taxonomic classification")
- merge_taxonomy_summaries(
- get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}"
- )
+ merge_taxonomy_summaries(get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}")
 
 for summary_type, config in SUMMARY_TYPES_MAP.items():
 logging.info(f"Parsing summary files for {summary_type.capitalize()}.")
@@ -500,9 +477,7 @@ def merge_taxonomy_summaries(summary_files: list[str], output_file_name: str) ->
 sk__Eukaryota;k__Metazoa;p__Chordata;c__Mammalia;o__Primates 118 94
 """
 if not summary_files:
- raise FileNotFoundError(
- "The required taxonomic classification summary files are missing. Exiting."
- )
+ raise FileNotFoundError("The required taxonomic classification summary files are missing. Exiting.")
 
 summary_dfs = []
 for file in summary_files:
@@ -527,9 +502,7 @@ def merge_functional_summaries(
 summary_files: list[str],
 merge_keys: list[str],
 output_prefix: str,
- label: Literal[
- "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
- ],
+ label: Literal["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"],
 ) -> None:
 """
 Merge multiple functional study-level summary files into a single study-level summary.
@@ -580,9 +553,7 @@ def merge_functional_summaries(
 output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"
 
 if not summary_files:
- logging.warning(
- f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation."
- )
+ logging.warning(f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation.")
 return
 
 validation_schema = SUMMARY_TYPES_MAP[label]["study_schema"]
@@ -596,9 +567,7 @@ def merge_functional_summaries(
 if len(dfs) == 1:
 merged_df = dfs[0]
 else:
- merged_df = reduce(
- lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs
- )
+ merged_df = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs)
 
 # Identify non-key columns (i.e. counts)
 value_columns = [col for col in merged_df.columns if col not in merge_keys]
@@ -14,32 +14,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
- import shutil
- from shutil import SameFileError
-
- import click
- from collections import defaultdict
 import glob
 import logging
+ import shutil
+ from collections import defaultdict
 from pathlib import Path
- from typing import Union, List
+ from shutil import SameFileError
+ from typing import List, Union
 
+ import click
 import pandas as pd
 
 from mgnify_pipelines_toolkit.constants.db_labels import (
- RRAP_TAXDB_LABELS,
 RRAP_FUNCDB_LABELS,
+ RRAP_TAXDB_LABELS,
 )
 from mgnify_pipelines_toolkit.constants.tax_ranks import (
- _SILVA_TAX_RANKS,
 _MOTUS_TAX_RANKS,
+ _SILVA_TAX_RANKS,
 )
- from mgnify_pipelines_toolkit.schemas.schemas import (
- RawReadsPassedRunsSchema,
+ from mgnify_pipelines_toolkit.schemas.dataframes import (
+ FunctionProfileSchema,
+ MotusTaxonSchema,
 RawReadsNonINSDCPassedRunsSchema,
+ RawReadsPassedRunsSchema,
 TaxonSchema,
- MotusTaxonSchema,
- FunctionProfileSchema,
 validate_dataframe,
 )
 
@@ -51,9 +50,7 @@ def cli():
 pass
 
 
- def get_file(
- run_acc: str, analyses_dir: Path, db_label: str
- ) -> Union[Path, List[Path], None]:
+ def get_file(run_acc: str, analyses_dir: Path, db_label: str) -> Union[Path, List[Path], None]:
 """Takes path information for a particular analysis and db_label combo, and returns any existing files.
 
 :param run_acc: Run accession for the tax file that should be retrieved.
@@ -78,28 +75,18 @@ def get_file(
 db_path = Path(f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}")
 
 if not db_path.exists():
- logging.debug(
- f"DB {db_path} doesn't exist for {run_acc}. Skipping"
- ) # or error?
+ logging.debug(f"DB {db_path} doesn't exist for {run_acc}. Skipping") # or error?
 return
 
- analysis_file = Path(
- f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt.gz"
- )
+ analysis_file = Path(f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt.gz")
 if not analysis_file.exists():
- logging.error(
- f"DB path exists but file doesn't - exiting. Path: {analysis_file}"
- )
+ logging.error(f"DB path exists but file doesn't - exiting. Path: {analysis_file}")
 exit(1)
 
 file_size = analysis_file.stat().st_size
- if (
- file_size == 0
- ): # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+ if file_size == 0: # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
 # so need to skip those. Should probably fix that at some point
- logging.debug(
- f"File {analysis_file} exists but is empty, so will be skipping it."
- )
+ logging.debug(f"File {analysis_file} exists but is empty, so will be skipping it.")
 analysis_file = None
 
 return analysis_file
@@ -130,21 +117,13 @@ def parse_one_tax_file(run_acc: str, tax_file: Path, db_label: str) -> pd.DataFr
 str(tax_file),
 )
 
- res_df["full_taxon"] = [
- ";".join(r[tax_ranks]).strip(";") for _, r in res_df.iterrows()
- ]
- final_df = (
- res_df[["Count", "full_taxon"]]
- .set_index("full_taxon")
- .rename(columns={"Count": run_acc})
- )
+ res_df["full_taxon"] = [";".join(r[tax_ranks]).strip(";") for _, r in res_df.iterrows()]
+ final_df = res_df[["Count", "full_taxon"]].set_index("full_taxon").rename(columns={"Count": run_acc})
 
 return final_df
 
 
- def parse_one_func_file(
- run_acc: str, func_file: Path, db_label: str
- ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+ def parse_one_func_file(run_acc: str, func_file: Path, db_label: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
 """Parses a functional profile file, and returns it as a pandas DataFrame object.
 
 :param run_acc: Run accession of the taxonomy file that will be parsed.
@@ -170,24 +149,16 @@ def parse_one_func_file(
 if res_df.shape[0] > 0:
 validate_dataframe(res_df, FunctionProfileSchema, str(func_file))
 
- count_df = pd.DataFrame(res_df[["read_count"]]).rename(
- columns={"read_count": run_acc}
- )
+ count_df = pd.DataFrame(res_df[["read_count"]]).rename(columns={"read_count": run_acc})
 
- depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(
- columns={"coverage_depth": run_acc}
- )
+ depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(columns={"coverage_depth": run_acc})
 
- breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(
- columns={"coverage_breadth": run_acc}
- )
+ breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(columns={"coverage_breadth": run_acc})
 
 return count_df, depth_df, breadth_df
 
 
- def generate_db_summary(
- db_label: str, analysis_dfs: dict[str, Path], output_prefix: str
- ) -> None:
+ def generate_db_summary(db_label: str, analysis_dfs: dict[str, Path], output_prefix: str) -> None:
 """Takes paired run accessions taxonomy dataframes in the form of a dictionary,
 and respective db_label, joins them together, and generates a study-wide summary
 in the form of a .tsv file.
@@ -225,9 +196,7 @@ def generate_db_summary(
 breadth_df_list = []
 
 for run_acc, analysis_df in analysis_dfs.items():
- count_df, depth_df, breadth_df = parse_one_func_file(
- run_acc, analysis_df, db_label
- )
+ count_df, depth_df, breadth_df = parse_one_func_file(run_acc, analysis_df, db_label)
 count_df_list.append(count_df)
 depth_df_list.append(depth_df)
 breadth_df_list.append(breadth_df)
@@ -308,18 +277,14 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[str,
 help="Input directory to where all the individual analyses subdirectories for summarising",
 type=click.Path(exists=True, path_type=Path, file_okay=False),
 )
- @click.option(
- "-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
- )
+ @click.option("-p", "--output_prefix", required=True, help="Prefix to summary files", type=str)
 @click.option(
 "--non_insdc",
 default=False,
 is_flag=True,
 help="If run accessions aren't INSDC-formatted",
 )
- def summarise_analyses(
- runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
- ) -> None:
+ def summarise_analyses(runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool) -> None:
 """Function that will take a file of pipeline-successful run accessions
 that should be used for the generation of the relevant db-specific
 study-level summary files.
@@ -337,15 +302,12 @@ def summarise_analyses(
 runs_df = pd.read_csv(runs, names=["run", "status"])
 
 if not non_insdc:
- RawReadsPassedRunsSchema(
- runs_df
- ) # Run validation on the successful_runs .csv file
+ RawReadsPassedRunsSchema(runs_df) # Run validation on the successful_runs .csv file
 else:
 RawReadsNonINSDCPassedRunsSchema(runs_df)
 
 all_db_labels = RRAP_TAXDB_LABELS + RRAP_FUNCDB_LABELS
 for db_label in all_db_labels:
-
 analysis_files = {}
 for run_acc in runs_df["run"]:
 analysis_file = get_file(run_acc, analyses_dir, db_label)
@@ -410,9 +372,7 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
 index_label="taxonomy",
 )
 elif len(summaries) == 1:
- logging.info(
- f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
- )
+ logging.info(f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}")
 try:
 shutil.copyfile(summaries[0], merged_summary_name)
 except SameFileError:
@@ -420,21 +380,15 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
 
 if db_label in RRAP_FUNCDB_LABELS:
 for table_type in ["read-count", "coverage-depth", "coverage-breadth"]:
- merged_summary_name = (
- f"{output_prefix}_{db_label}_{table_type}_study_summary.tsv"
- )
- summaries_ = [
- v for v in summaries if Path(v).stem.split("_")[2] == table_type
- ]
+ merged_summary_name = f"{output_prefix}_{db_label}_{table_type}_study_summary.tsv"
+ summaries_ = [v for v in summaries if Path(v).stem.split("_")[2] == table_type]
 if len(summaries_) > 1:
 res_df = pd.read_csv(summaries_[0], sep="\t", index_col=0)
 for summary in summaries_[1:]:
 curr_df = pd.read_csv(summary, sep="\t", index_col=0)
 res_df = res_df.join(curr_df, how="outer")
 res_df = res_df.fillna(0)
- res_df = res_df.astype(
- int if table_type == "read-count" else float
- )
+ res_df = res_df.astype(int if table_type == "read-count" else float)
 
 res_df = res_df.reindex(sorted(res_df.columns), axis=1)
 res_df.to_csv(
@@ -444,9 +398,7 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
 float_format="%.6g",
 )
 elif len(summaries_) == 1:
- logging.info(
- f"Only one summary ({summaries_[0]}) so will use that as {merged_summary_name}"
- )
+ logging.info(f"Only one summary ({summaries_[0]}) so will use that as {merged_summary_name}")
 try:
 shutil.copyfile(summaries_[0], merged_summary_name)
 except SameFileError:
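
The reformatted merge logic in the last two hunks combines per-run functional tables by joining their columns with an outer join, filling gaps with zero, and casting read counts to integers (coverage tables stay float). A minimal, self-contained sketch of that pattern with toy data — the accessions and feature labels below are invented for illustration and are not part of the package:

    import pandas as pd

    # Toy per-run read-count tables indexed by feature (invented data).
    run_a = pd.DataFrame({"ERR0000001": [10, 5]}, index=["featA", "featB"])
    run_b = pd.DataFrame({"ERR0000002": [7, 3]}, index=["featB", "featC"])

    table_type = "read-count"
    res_df = run_a.join(run_b, how="outer")  # union of features across runs
    res_df = res_df.fillna(0)                # features absent from a run get zero
    res_df = res_df.astype(int if table_type == "read-count" else float)
    res_df = res_df.reindex(sorted(res_df.columns), axis=1)  # deterministic column order
    print(res_df)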