mgnify-pipelines-toolkit 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.

@@ -257,7 +257,7 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List
257
257
  @cli.command(
258
258
  "summarise",
259
259
  options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
260
- short_help="Generate study-level analysis summaries.",
260
+ short_help="Generate study-level summaries of amplicon analysis results.",
261
261
  )
262
262
  @click.option(
263
263
  "-r",
@@ -327,7 +327,7 @@ def summarise_analyses(
327
327
  @cli.command(
328
328
  "merge",
329
329
  options_metavar="-a <analyses_dir> -p <output_prefix>",
330
- short_help="Merge multiple study-level analysis summaries.",
330
+ short_help="Merge multiple study-level summaries of amplicon analysis.",
331
331
  )
332
332
  @click.option(
333
333
  "-a",
@@ -0,0 +1,605 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2025 EMBL - European Bioinformatics Institute
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import click
18
+ from functools import reduce
19
+ import glob
20
+ import logging
21
+ from pathlib import Path
22
+ from typing import Literal
23
+
24
+ import pandas as pd
25
+
26
+ from mgnify_pipelines_toolkit.schemas.schemas import (
27
+ CompletedAnalysisSchema,
28
+ TaxonSchema,
29
+ GOSummarySchema,
30
+ InterProSummarySchema,
31
+ KOSummarySchema,
32
+ SanntisSummarySchema,
33
+ AntismashSummarySchema,
34
+ PFAMSummarySchema,
35
+ KEGGModulesSummarySchema,
36
+ GOStudySummarySchema,
37
+ InterProStudySummarySchema,
38
+ TaxonomyStudySummarySchema,
39
+ KOStudySummarySchema,
40
+ SanntisStudySummarySchema,
41
+ AntismashStudySummarySchema,
42
+ PFAMStudySummarySchema,
43
+ KEGGModulesStudySummarySchema,
44
+ validate_dataframe,
45
+ )
46
+
47
+ logging.basicConfig(
48
+ level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
49
+ )
50
+
51
+ # Keys are the original column names in the input files,
52
+ # values are the standardised column names used in the generated study summary files
53
+ # Note: "Count" or "count" column should be excluded
54
+ GO_COLUMN_NAMES = {
55
+ "go": "GO",
56
+ "term": "description",
57
+ "category": "category",
58
+ }
59
+
60
+ INTERPRO_COLUMN_NAMES = {
61
+ "interpro_accession": "IPR",
62
+ "description": "description",
63
+ }
64
+
65
+ SANNTIS_COLUMN_NAMES = {
66
+ "nearest_mibig": "nearest_mibig",
67
+ "nearest_mibig_class": "nearest_mibig_class",
68
+ "description": "description",
69
+ }
70
+
71
+ ANTISMASH_COLUMN_NAMES = {
72
+ "label": "label",
73
+ "description": "description",
74
+ }
75
+
76
+ KEGG_COLUMN_NAMES = {
77
+ "ko": "KO",
78
+ "description": "description",
79
+ }
80
+
81
+ PFAM_COLUMN_NAMES = {
82
+ "pfam": "PFAM",
83
+ "description": "description",
84
+ }
85
+
86
+ KEGG_MODULES_COLUMN_NAMES = {
87
+ "module_accession": "module_accession",
88
+ "pathway_name": "pathway_name",
89
+ "pathway_class": "pathway_class",
90
+ }
91
+
92
+ # This mapping lets a single 'for' loop later process all summary types in the same way
93
+ SUMMARY_TYPES_MAP = {
94
+ "go": {
95
+ "folder": "functional-annotation/go",
96
+ "column_names": GO_COLUMN_NAMES,
97
+ "schema": GOSummarySchema,
98
+ "study_schema": GOStudySummarySchema,
99
+ },
100
+ "goslim": {
101
+ "folder": "functional-annotation/go",
102
+ "column_names": GO_COLUMN_NAMES,
103
+ "schema": GOSummarySchema,
104
+ "study_schema": GOStudySummarySchema,
105
+ },
106
+ "interpro": {
107
+ "folder": "functional-annotation/interpro",
108
+ "column_names": INTERPRO_COLUMN_NAMES,
109
+ "schema": InterProSummarySchema,
110
+ "study_schema": InterProStudySummarySchema,
111
+ },
112
+ "ko": {
113
+ "folder": "functional-annotation/kegg",
114
+ "column_names": KEGG_COLUMN_NAMES,
115
+ "schema": KOSummarySchema,
116
+ "study_schema": KOStudySummarySchema,
117
+ },
118
+ "sanntis": {
119
+ "folder": "pathways-and-systems/sanntis",
120
+ "column_names": SANNTIS_COLUMN_NAMES,
121
+ "schema": SanntisSummarySchema,
122
+ "study_schema": SanntisStudySummarySchema,
123
+ },
124
+ "antismash": {
125
+ "folder": "pathways-and-systems/antismash",
126
+ "column_names": ANTISMASH_COLUMN_NAMES,
127
+ "schema": AntismashSummarySchema,
128
+ "study_schema": AntismashStudySummarySchema,
129
+ },
130
+ "pfam": {
131
+ "folder": "functional-annotation/pfam",
132
+ "column_names": PFAM_COLUMN_NAMES,
133
+ "schema": PFAMSummarySchema,
134
+ "study_schema": PFAMStudySummarySchema,
135
+ },
136
+ "kegg_modules": {
137
+ "folder": "pathways-and-systems/kegg-modules",
138
+ "column_names": KEGG_MODULES_COLUMN_NAMES,
139
+ "schema": KEGGModulesSummarySchema,
140
+ "study_schema": KEGGModulesStudySummarySchema,
141
+ },
142
+ }
143
+
144
+ # The taxonomy file is a tab-separated file without any header
145
+ # containing the following columns:
146
+ TAXONOMY_COLUMN_NAMES = [
147
+ "Count",
148
+ "Superkingdom",
149
+ "Kingdom",
150
+ "Phylum",
151
+ "Class",
152
+ "Order",
153
+ "Family",
154
+ "Genus",
155
+ "Species",
156
+ ]
157
+
158
+ OUTPUT_SUFFIX = "summary.tsv"
159
+
160
+
161
+ @click.group()
162
+ def cli():
163
+ pass
164
+
165
+
166
+ def check_files_exist(file_list: list[Path]) -> None:
167
+ """
168
+ Check that all files in the given list exist on disk.
169
+
170
+ :param file_list: List of file paths to check.
171
+ :raises FileNotFoundError: If any file does not exist.
172
+ """
173
+ missing_files = [str(path) for path in file_list if not path.is_file()]
174
+ if missing_files:
175
+ raise FileNotFoundError(
176
+ f"The following required files are missing: {', '.join(missing_files)}"
177
+ )
178
+
179
+
180
+ def generate_taxonomy_summary(
181
+ file_dict: dict[str, Path],
182
+ output_file_name: str,
183
+ outdir: Path = None,
184
+ ) -> None:
185
+ """
186
+ Generate a combined study-level taxonomic classification summary from multiple input
187
+ assembly-level summary files.
188
+
189
+ :param file_dict: Dictionary mapping assembly accession to its taxonomy file.
190
+ :param output_file_name: Name of the output summary file.
191
+ :param outdir: Optional output directory for the results.
192
+
193
+ Example of the taxonomy file:
194
+ 23651 sk__Bacteria
195
+ 4985 sk__Archaea k__Thermoproteati p__Nitrososphaerota
196
+ 882 sk__Archaea k__Nanobdellati p__ c__ o__ f__ g__ s__Candidatus Pacearchaeota archaeon
197
+ """
198
+ check_files_exist(list(file_dict.values()))
199
+
200
+ tax_dfs = []
201
+ for assembly_acc, path in file_dict.items():
202
+ df = pd.read_csv(path, sep="\t", names=TAXONOMY_COLUMN_NAMES).fillna("")
203
+
204
+ # Note: schema validation will fail if the taxonomy file is empty
205
+ df = validate_dataframe(df, TaxonSchema, str(path))
206
+
207
+ # Combine all taxonomic ranks in the classification into a single string
208
+ df["full_taxon"] = (
209
+ df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")
210
+ )
211
+
212
+ # Create a new DataFrame with taxonomy as index and count as the only column
213
+ result = df[["Count", "full_taxon"]].set_index("full_taxon")
214
+ result.columns = [assembly_acc]
215
+ tax_dfs.append(result)
216
+
217
+ summary_df = pd.concat(tax_dfs, axis=1)
218
+ summary_df = summary_df.fillna(0).astype(int).sort_index()
219
+
220
+ outfile = output_file_name
221
+ if outdir:
222
+ outfile = outdir / output_file_name
223
+
224
+ summary_df.to_csv(outfile, sep="\t", index_label="taxonomy")
225
+
226
+
227
+ def generate_functional_summary(
228
+ file_dict: dict[str, Path],
229
+ column_names: dict[str, str],
230
+ output_prefix: str,
231
+ label: Literal[
232
+ "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
233
+ ],
234
+ outdir: Path = None,
235
+ ) -> None:
236
+ """
237
+ Generate a combined study-level functional annotation summary from multiple input
238
+ assembly-level summary files.
239
+
240
+ :param file_dict: Dictionary mapping assembly accession to its summary file path.
241
+ :param column_names: Dictionary mapping original column names to standard column names.
242
+ :param output_prefix: Prefix for the output summary file.
243
+ :param label: Label for the functional annotation type
244
+ (expected one of ["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"]).
245
+ :param outdir: Optional output directory for the results.
246
+
247
+ In the input files, column orders may vary, but the following columns are expected:
248
+ GO summary input file:
249
+ go term category count
250
+ GO:0016020 membrane cellular_component 30626
251
+ GO:0005524 ATP binding molecular_function 30524
252
+
253
+ InterPro summary input file:
254
+ interpro_accession description count
255
+ IPR036291 NAD(P)-binding domain superfamily 16503
256
+ IPR019734 Tetratricopeptide repeat 14694
257
+
258
+ KEGG summary input file:
259
+ ko description count
260
+ K01552 energy-coupling factor transport system ATP-binding protein [EC:7.-.-.-] 562
261
+ K18889 ATP-binding cassette, subfamily B, multidrug efflux pump 537
262
+ K15497 molybdate/tungstate transport system ATP-binding protein [EC:7.3.2.5 7.3.2.6] 517
263
+
264
+ Sanntis summary input file:
265
+ nearest_mibig nearest_mibig_class description count
266
+ BGC0000787 Saccharide Carbohydrate-based natural products (e.g., aminoglycoside antibiotics) 1
267
+ BGC0000248 Polyketide Built from iterative condensation of acetate units derived from acetyl-CoA 3
268
+ BGC0001327 NRP Polyketide Nonribosomal Peptide Polyketide 2
269
+
270
+ Antismash summary input file:
271
+ label description count
272
+ terpene Terpene 16
273
+ betalactone Beta-lactone containing protease inhibitor 8
274
+ T1PKS Type I PKS (Polyketide synthase) 3
275
+
276
+ PFAM summary input file:
277
+ pfam description count
278
+ PF00265 Thymidine kinase 457
279
+ PF01852 START domain 368
280
+ PF13756 Stimulus-sensing domain 397
281
+
282
+ KEGG modules summary input file:
283
+ module_accession completeness pathway_name pathway_class matching_ko missing_ko
284
+ M00986 100.0 Sulfur reduction, sulfur => sulfide Pathway modules; Energy metabolism; Sulfur metabolism K18367
285
+ M00163 83.33 Photosystem I Pathway modules; Energy metabolism; Photosynthesis K02689,K02690,K02691,K02692,K02694 K02693
286
+ M00615 50.0 Nitrate assimilation Signature modules; Module set; Metabolic capacity K02575 M00531
287
+ """
288
+ check_files_exist(list(file_dict.values()))
289
+
290
+ output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"
291
+
292
+ original_col_names = list(column_names.keys())
293
+ renamed_col_names = list(column_names.values())
294
+ value_col_name = "completeness" if label == "kegg_modules" else "count"
295
+
296
+ dfs = []
297
+ for assembly_acc, filepath in file_dict.items():
298
+ try:
299
+ df = pd.read_csv(filepath, sep="\t")
300
+ except pd.errors.EmptyDataError:
301
+ logging.warning(f"File {filepath.resolve()} is empty. Skipping.")
302
+ continue
303
+
304
+ schema = SUMMARY_TYPES_MAP[label]["schema"]
305
+ df = validate_dataframe(df, schema, str(filepath))
306
+
307
+ # Extract only relevant columns
308
+ df = df[original_col_names + [value_col_name]].copy()
309
+
310
+ # Rename columns: metadata columns are renamed according to column_names dict, "count"/"completeness" -> assembly acc
311
+ df.rename(columns={**column_names, value_col_name: assembly_acc}, inplace=True)
312
+ dfs.append(df)
313
+
314
+ if not dfs:
315
+ logging.warning(
316
+ f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}."
317
+ )
318
+ return
319
+
320
+ # Merge all dataframes on the renamed metadata columns
321
+ merged_df = reduce(
322
+ lambda left, right: pd.merge(left, right, on=renamed_col_names, how="outer"),
323
+ dfs,
324
+ )
325
+
326
+ # Fill missing values appropriately, convert completeness percentages to float, counts to integers
327
+ value_columns = [col for col in merged_df.columns if col not in renamed_col_names]
328
+ fill_value = 0.0 if label == "kegg_modules" else 0
329
+ dtype = float if label == "kegg_modules" else int
330
+ merged_df[value_columns] = merged_df[value_columns].fillna(fill_value).astype(dtype)
331
+
332
+ # Reorder columns: merge keys first, then sorted assembly accessions
333
+ merged_df = merged_df[renamed_col_names + sorted(value_columns)]
334
+
335
+ outfile = output_file_name
336
+ if outdir:
337
+ outfile = outdir / output_file_name
338
+
339
+ merged_df.to_csv(outfile, sep="\t", index=False)
340
+
341
+
342
+ @cli.command(
343
+ "summarise",
344
+ options_metavar="-a <assemblies> -s <study_dir> -p <output_prefix>",
345
+ short_help="Generate study-level summaries for assembly analysis results.",
346
+ )
347
+ @click.option(
348
+ "-a",
349
+ "--assemblies",
350
+ required=True,
351
+ help="CSV file containing successful analyses generated by the pipeline",
352
+ type=click.Path(exists=True, path_type=Path, dir_okay=False),
353
+ )
354
+ @click.option(
355
+ "-s",
356
+ "--study_dir",
357
+ required=True,
358
+ help="Input directory to where all the individual analyses subdirectories for summarising",
359
+ type=click.Path(exists=True, path_type=Path, file_okay=False),
360
+ )
361
+ @click.option(
362
+ "-p",
363
+ "--output_prefix",
364
+ required=True,
365
+ help="Prefix for generated summary files",
366
+ type=str,
367
+ )
368
+ @click.option(
369
+ "-o",
370
+ "--outdir",
371
+ required=False,
372
+ help="Directory for the output files, by default it will use the current working directory.",
373
+ type=click.Path(exists=True, path_type=Path, file_okay=False),
374
+ )
375
+ def summarise_analyses(
376
+ assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path
377
+ ) -> None:
378
+ """
379
+ Generate study-level summaries for successfully processed assemblies.
380
+
381
+ :param assemblies: Path to a file listing completed assembly accessions and their status.
382
+ :param study_dir: Path to the directory containing analysis results for each assembly.
383
+ :param output_prefix: Prefix for the generated summary files.
384
+ """
385
+ logging.info(f"Reading assembly list from {assemblies.resolve()}")
386
+ assemblies_df = pd.read_csv(assemblies, names=["assembly", "status"])
387
+ CompletedAnalysisSchema(assemblies_df)
388
+ assembly_list = assemblies_df["assembly"].tolist()
389
+ logging.info("Assembly list was read successfully.")
390
+
391
+ def get_file_paths(subdir: str, filename_template: str) -> dict[str, Path]:
392
+ """
393
+ Construct file paths for each assembly given a subdirectory and filename template.
394
+ Template must contain {acc} as a placeholder.
395
+ """
396
+ return {
397
+ acc: study_dir / acc / subdir / filename_template.format(acc=acc)
398
+ for acc in assembly_list
399
+ }
400
+
401
+ logging.info("Start processing of assembly-level summaries.")
402
+
403
+ logging.info(
404
+ "Generating taxonomy summary from assembly-level summaries <accession>.krona.txt"
405
+ )
406
+ generate_taxonomy_summary(
407
+ get_file_paths("taxonomy", "{acc}.krona.txt.gz"),
408
+ f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}",
409
+ outdir=outdir,
410
+ )
411
+
412
+ for summary_type, config in SUMMARY_TYPES_MAP.items():
413
+ logging.info(
414
+ f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz"
415
+ )
416
+ generate_functional_summary(
417
+ get_file_paths(config["folder"], f"{{acc}}_{summary_type}_summary.tsv.gz"),
418
+ config["column_names"],
419
+ output_prefix,
420
+ summary_type,
421
+ outdir=outdir,
422
+ )
423
+ logging.info("Assembly-level summaries were generated successfully.")
424
+ logging.info("Done.")
425
+
426
+
427
+ @cli.command(
428
+ "merge",
429
+ options_metavar="-a <study_dir> -p <output_prefix>",
430
+ short_help="Merge multiple study-level summaries of assembly analysis.",
431
+ )
432
+ @click.option(
433
+ "-s",
434
+ "--study_dir",
435
+ required=True,
436
+ help="Input directory to where all the individual analyses subdirectories for merging",
437
+ type=click.Path(exists=True, file_okay=False),
438
+ )
439
+ @click.option(
440
+ "-p",
441
+ "--output_prefix",
442
+ required=True,
443
+ help="Prefix for generated merged summary files",
444
+ type=str,
445
+ )
446
+ def merge_summaries(study_dir: str, output_prefix: str) -> None:
447
+ """
448
+ Merge multiple study-level summary files into combined summary files.
449
+
450
+ :param study_dir: Path to the directory containing study-level summary files.
451
+ :param output_prefix: Prefix for the output merged summary files.
452
+ """
453
+
454
+ def get_file_paths(summary_type: str) -> list[str]:
455
+ return glob.glob(f"{study_dir}/*_{summary_type}_{OUTPUT_SUFFIX}")
456
+
457
+ logging.info("Generating combined assembly-level summaries")
458
+ logging.info("Parsing summary files for taxonomic classification")
459
+ merge_taxonomy_summaries(
460
+ get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}"
461
+ )
462
+
463
+ for summary_type, config in SUMMARY_TYPES_MAP.items():
464
+ logging.info(f"Parsing summary files for {summary_type.capitalize()}.")
465
+ column_names = config["column_names"]
466
+ merge_functional_summaries(
467
+ get_file_paths(summary_type),
468
+ list(column_names.values()),
469
+ output_prefix,
470
+ summary_type,
471
+ )
472
+ logging.info("Merged assembly-level summaries were generated successfully.")
473
+ logging.info("Done.")
474
+
475
+
476
+ def merge_taxonomy_summaries(summary_files: list[str], output_file_name: str) -> None:
477
+ """
478
+ Merge multiple taxonomy study-level summary files into a single study-level summary.
479
+
480
+ :param summary_files: List of paths to taxonomy summary files, each containing
481
+ taxonomic classifications and counts for an individual analysis.
482
+ :param output_file_name: Output path for the merged taxonomy summary.
483
+
484
+ Example of input taxonomy summary file:
485
+ taxonomy ERZ1049444 ERZ1049446
486
+ sk__Eukaryota;k__Metazoa;p__Chordata 2 10
487
+ sk__Eukaryota;k__Metazoa;p__Chordata;c__Mammalia;o__Primates 118 94
488
+ """
489
+ if not summary_files:
490
+ raise FileNotFoundError(
491
+ "The required taxonomic classification summary files are missing. Exiting."
492
+ )
493
+
494
+ summary_dfs = []
495
+ for file in summary_files:
496
+ df = pd.read_csv(file, sep="\t", index_col=0)
497
+ df = validate_dataframe(df, TaxonomyStudySummarySchema, file)
498
+ summary_dfs.append(df)
499
+ merged_df = pd.concat(summary_dfs, axis=1)
500
+ merged_df = merged_df.fillna(0).astype(int)
501
+
502
+ # Reorder columns: taxonomy first, then sorted assembly accessions
503
+ merged_df = merged_df[sorted(merged_df.columns)]
504
+ merged_df = merged_df.sort_index()
505
+
506
+ merged_df.to_csv(
507
+ output_file_name,
508
+ sep="\t",
509
+ index_label="taxonomy",
510
+ )
511
+
512
+
513
+ def merge_functional_summaries(
514
+ summary_files: list[str],
515
+ merge_keys: list[str],
516
+ output_prefix: str,
517
+ label: Literal[
518
+ "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
519
+ ],
520
+ ) -> None:
521
+ """
522
+ Merge multiple functional study-level summary files into a single study-level summary.
523
+
524
+ :param summary_files: List of paths to functional summary files, each containing
525
+ annotation terms and counts for an individual analysis.
526
+ :param merge_keys: List of column names to merge on (e.g. term ID, description).
527
+ :param output_prefix: Prefix for the generated output file.
528
+ :param label: Label describing the functional annotation type
529
+ (expected one of ["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"]).
530
+
531
+ In the input files, column orders may vary, but the following columns are expected:
532
+ GO summary input:
533
+ GO description category ERZ1049444 ERZ1049446
534
+ GO:0016020 membrane cellular_component 30626 673
535
+ GO:0005524 ATP binding molecular_function 30524 2873
536
+
537
+ Example of InterPro summary input:
538
+ IPR description ERZ1049444 ERZ1049446
539
+ IPR036291 NAD(P)-binding domain superfamily 16503 13450
540
+ IPR019734 Tetratricopeptide repeat 14694 11021
541
+
542
+ KEGG summary input:
543
+ GO description category ERZ1049440 ERZ1049443
544
+ GO:0003677 DNA binding molecular_function 6125 16417
545
+ GO:0055085 transmembrane transport biological_process 144 13926
546
+
547
+ Sanntis summary input:
548
+ nearest_mibig nearest_mibig_class description ERZ1049440 ERZ1049443
549
+ BGC0001356 RiPP Ribosomally synthesised and Post-translationally modified Peptide 230 185
550
+ BGC0001432 NRP Polyketide Nonribosomal Peptide Polyketide 0 8
551
+
552
+ Antismash summary input:
553
+ label description ERZ1049440 ERZ1049443
554
+ NRPS Non-ribosomal peptide synthetase 368 0
555
+ arylpolyene Aryl polyene 149 447
556
+
557
+ PFAM summary input:
558
+ PFAM description ERZ1049440 ERZ1049443
559
+ PF24718 HTH-like domain 468 1
560
+ PF06039 Malate:quinone oxidoreductase (Mqo) 490 21
561
+
562
+ KEGG modules summary input:
563
+ module_accession pathway_name pathway_class ERZ1049440 ERZ1049443
564
+ M00109 C21-Steroid hormone biosynthesis, progesterone => cortisol/cortisone Pathway modules; Lipid metabolism; Sterol biosynthesis 38.9 0.0
565
+ M00153 Cytochrome bd ubiquinol oxidase Pathway modules; Energy metabolism; ATP synthesis 44.7 84.4
566
+ """
567
+ output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"
568
+
569
+ if not summary_files:
570
+ logging.warning(
571
+ f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation."
572
+ )
573
+ return
574
+
575
+ validation_schema = SUMMARY_TYPES_MAP[label]["study_schema"]
576
+
577
+ dfs = []
578
+ for filepath in summary_files:
579
+ df = pd.read_csv(filepath, sep="\t")
580
+ df = validate_dataframe(df, validation_schema, filepath)
581
+ dfs.append(df)
582
+
583
+ if len(dfs) == 1:
584
+ merged_df = dfs[0]
585
+ else:
586
+ merged_df = reduce(
587
+ lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs
588
+ )
589
+
590
+ # Identify non-key columns (i.e. counts)
591
+ value_columns = [col for col in merged_df.columns if col not in merge_keys]
592
+
593
+ # Fill NaNs and set dtype accordingly
594
+ fill_value = 0.0 if label == "kegg_modules" else 0
595
+ dtype = float if label == "kegg_modules" else int
596
+ merged_df[value_columns] = merged_df[value_columns].fillna(fill_value).astype(dtype)
597
+
598
+ # Reorder columns
599
+ merged_df = merged_df[merge_keys + sorted(value_columns)]
600
+
601
+ merged_df.to_csv(output_file_name, sep="\t", index=False)
602
+
603
+
604
+ if __name__ == "__main__":
605
+ cli()
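
The new module above registers a click group with "summarise" and "merge" subcommands. A minimal sketch of driving "summarise" in-process with click's test runner follows; the file and directory names and the output prefix are illustrative assumptions, not values from the package.

from click.testing import CliRunner

from mgnify_pipelines_toolkit.analysis.assembly.study_summary_generator import cli

runner = CliRunner()
# Paths below are hypothetical: -a expects the pipeline's completed-analyses CSV,
# -s the directory of per-assembly result subdirectories, -p the output prefix,
# and -o an (optional, existing) output directory.
result = runner.invoke(
    cli,
    [
        "summarise",
        "-a", "completed_assemblies.csv",
        "-s", "study_dir",
        "-p", "PRJEB00000",
        "-o", "summaries",
    ],
)
print(result.exit_code, result.output)
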
@@ -155,7 +155,7 @@ def parse_args():
155
155
  description = (
156
156
  "antiSMASH output summary generator. "
157
157
  "Script takes regions from GFF and counts its appearance in annotation. "
158
- "Output columns contain classID, descriptions and count. "
158
+ "Output columns contain label, descriptions and count. "
159
159
  f"Descriptions were taken from pre-parsed glossary provided on antiSMASH website. "
160
160
  f"Current script supports antiSMASH results for version {ANTISMASH_VERSION} and older."
161
161
  )
@@ -202,15 +202,15 @@ def main():
202
202
  df = pd.DataFrame(dict_list)
203
203
  df = df[df["product"].notna()]
204
204
  df_grouped = (
205
- df.groupby(["product"]).size().reset_index(name="Count")
206
- ).sort_values(by="Count", ascending=False)
205
+ df.groupby(["product"]).size().reset_index(name="count")
206
+ ).sort_values(by="count", ascending=False)
207
207
 
208
208
  df_grouped = df_grouped.rename(
209
209
  columns={
210
210
  "product": "label",
211
211
  }
212
212
  )
213
- df_grouped["Description"] = df_grouped["label"].apply(
213
+ df_grouped["description"] = df_grouped["label"].apply(
214
214
  lambda x: ",".join(
215
215
  [
216
216
  DESCRIPTIONS.get(cls.strip().lower(), cls.strip())
@@ -218,11 +218,7 @@ def main():
218
218
  ]
219
219
  )
220
220
  )
221
- df_grouped = df_grouped[["label", "Description", "Count"]]
222
- df_grouped = df_grouped.rename(columns={
223
- "Description": "description",
224
- "Count": "count"
225
- })
221
+ df_grouped = df_grouped[["label", "description", "count"]]
226
222
  df_grouped.to_csv(output_filename, sep="\t", index=False)
227
223
 
228
224
 
@@ -75,38 +75,40 @@ def main():
75
75
  entry_dict[key] = value
76
76
  dict_list.append(entry_dict)
77
77
 
78
- # Convert to DataFrame
78
+ # Convert to DataFrame
79
79
  df = pd.DataFrame(dict_list)
80
80
  df = df.rename(
81
81
  columns={
82
- "nearest_MiBIG": "nearest_MIBiG",
83
- "nearest_MiBIG_class": "nearest_MIBiG_class",
82
+ "nearest_MiBIG": "nearest_mibig",
83
+ "nearest_MiBIG_class": "nearest_mibig_class",
84
84
  }
85
85
  )
86
86
  df_grouped = (
87
- df.groupby(["nearest_MIBiG", "nearest_MIBiG_class"])
87
+ df.groupby(["nearest_mibig", "nearest_mibig_class"])
88
88
  .size()
89
- .reset_index(name="Count")
89
+ .reset_index(name="count")
90
90
  )
91
- df_grouped = df_grouped.sort_values(by="Count", ascending=False)
91
+ df_grouped = df_grouped.sort_values(by="count", ascending=False)
92
92
 
93
93
  df_desc = pd.DataFrame(
94
- list(DESCRIPTIONS.items()), columns=["MIBiG_class", "Description"]
94
+ list(DESCRIPTIONS.items()), columns=["mibig_class", "description"]
95
95
  )
96
- df_desc = df_desc.set_index("MIBiG_class")
96
+ df_desc = df_desc.set_index("mibig_class")
97
97
  df_merged = df_grouped.merge(
98
- df_desc, left_on="nearest_MIBiG_class", right_index=True, how="left"
98
+ df_desc, left_on="nearest_mibig_class", right_index=True, how="left"
99
99
  )
100
- df_merged["Description"] = df_merged.apply(
101
- lambda row: row["nearest_MIBiG_class"].replace(
102
- "NRP", df_desc.loc["NRP"]["Description"]
103
- )
104
- if pd.isna(row["Description"]) and "NRP" in row["nearest_MIBiG_class"]
105
- else row["Description"],
100
+ df_merged["description"] = df_merged.apply(
101
+ lambda row: (
102
+ row["nearest_mibig_class"].replace(
103
+ "NRP", df_desc.loc["NRP"]["description"]
104
+ )
105
+ if pd.isna(row["description"]) and "NRP" in row["nearest_mibig_class"]
106
+ else row["description"]
107
+ ),
106
108
  axis=1,
107
109
  )
108
110
  df_merged = df_merged[
109
- ["nearest_MIBiG", "nearest_MIBiG_class", "Description", "Count"]
111
+ ["nearest_mibig", "nearest_mibig_class", "description", "count"]
110
112
  ]
111
113
  df_merged = df_merged.rename(columns={
112
114
  "Description": "description",
@@ -17,10 +17,11 @@ import logging
17
17
  import re
18
18
 
19
19
  from enum import Enum
20
- from typing import ClassVar, Optional, Type
20
+ from typing import ClassVar, Optional, Type, Literal
21
21
 
22
22
  import pandas as pd
23
23
  import pandera as pa
24
+ from pandera.typing import Series
24
25
  from pandera.typing.common import DataFrameBase
25
26
 
26
27
  from pydantic import (
@@ -110,6 +111,354 @@ class AmpliconPassedRunsSchema(pa.DataFrameModel):
110
111
  coerce = True
111
112
 
112
113
 
114
+ class CompletedAnalysisRecord(BaseModel):
115
+ """Class defining a Pydantic model for a single "row" of an successfully analysed assemblies file."""
116
+
117
+ assembly: str = Field(
118
+ ...,
119
+ description="Assembly accession",
120
+ examples=["ERZ789012"],
121
+ pattern=r"ERZ\d{6,}",
122
+ )
123
+ status: Literal["success"] = Field(
124
+ ...,
125
+ description="Pipeline output for whether this assembly's analysis succeeded or not",
126
+ )
127
+
128
+
129
+ class CompletedAnalysisSchema(pa.DataFrameModel):
130
+ """Class modelling a Pandera dataframe schema that uses the CompletedAnalysisSchema class as dtype.
131
+ This is what actually validates the generated dataframe when read by pandas.read_csv.
132
+ """
133
+
134
+ assembly: Series[str]
135
+
136
+ @pa.check("assembly")
137
+ def accessions_unique(self, series: Series[str]) -> Series[bool]:
138
+ return ~series.duplicated()
139
+
140
+ class Config:
141
+ """Config with dataframe-level data type."""
142
+
143
+ dtype = PydanticModel(CompletedAnalysisRecord)
144
+ coerce = True
145
+
146
+
147
+ class InterProSummaryRecord(BaseModel):
148
+ """Model of a row in the InterPro summary file."""
149
+
150
+ count: int = Field(
151
+ ..., ge=0, description="Number of hits for the InterPro accession"
152
+ )
153
+ interpro_accession: str = Field(
154
+ ...,
155
+ description="InterPro accession ID",
156
+ examples=["IPR123456"],
157
+ pattern=r"IPR\d{6}",
158
+ )
159
+ description: str = Field(..., description="Description of the InterPro domain")
160
+
161
+
162
+ class GOSummaryRecord(BaseModel):
163
+ """Model of a row in the GO summary file."""
164
+
165
+ go: str = Field(
166
+ ...,
167
+ description="GO term identifier",
168
+ examples=["GO:1234567"],
169
+ pattern=r"GO:\d{7}",
170
+ )
171
+ term: str = Field(..., description="GO term name")
172
+ category: str = Field(
173
+ ...,
174
+ description="GO category",
175
+ examples=["biological_process", "molecular_function", "cellular_component"],
176
+ )
177
+ count: int = Field(..., ge=0, description="Number of times the GO term is observed")
178
+
179
+
180
+ class BaseSummarySchema(pa.DataFrameModel):
181
+ """Base schema for summary files."""
182
+
183
+ @staticmethod
184
+ def is_unique(series: Series[str]) -> Series[bool]:
185
+ return ~series.duplicated()
186
+
187
+
188
+ class InterProSummarySchema(BaseSummarySchema):
189
+ """Schema for InterPro summary file validation."""
190
+
191
+ interpro_accession: Series[str]
192
+
193
+ @pa.check("interpro_accession")
194
+ def interpro_ids_unique(self, series: Series[str]) -> Series[bool]:
195
+ return self.is_unique(series)
196
+
197
+ class Config:
198
+ dtype = PydanticModel(InterProSummaryRecord)
199
+ coerce = True
200
+
201
+
202
+ class GOSummarySchema(BaseSummarySchema):
203
+ """Schema for GO or GOslim summary file validation."""
204
+
205
+ go: Series[str]
206
+
207
+ @pa.check("go")
208
+ def go_ids_unique(self, series: Series[str]) -> Series[bool]:
209
+ return self.is_unique(series)
210
+
211
+ class Config:
212
+ dtype = PydanticModel(GOSummaryRecord)
213
+ coerce = True
214
+
215
+
216
+ class SanntisSummaryRecord(BaseModel):
217
+ """Model of a row in the Sanntis assembly-level summary file."""
218
+
219
+ nearest_mibig: str = Field(
220
+ ...,
221
+ description="The accession ID of the closest matching biosynthetic gene cluster (BGC) in the MIBiG database",
222
+ examples=["BGC0000073"],
223
+ pattern=r"BGC\d{7}",
224
+ )
225
+ nearest_mibig_class: str = Field(
226
+ ...,
227
+ description="The biosynthetic class of the nearest MIBiG BGC",
228
+ examples=["Polyketide"],
229
+ )
230
+ description: str = Field(
231
+ ...,
232
+ description="A brief summary of the biosynthetic process or type of metabolite associated with the nearest MIBiG cluster",
233
+ )
234
+
235
+ count: int = Field(
236
+ ..., ge=0, description="Number of times the MIBiG entry is observed"
237
+ )
238
+
239
+
240
+ class AntismashSummaryRecord(BaseModel):
241
+ """Model of a row in the Antismash summary file."""
242
+
243
+ label: str = Field(
244
+ ...,
245
+ description="Biosynthetic class or label assigned by Antismash based on sequence similarity to known biosynthetic gene clusters.",
246
+ examples=["RiPP-like", "T1PKS", "terpene"],
247
+ )
248
+ description: str = Field(
249
+ ...,
250
+ description="Brief explanation of the biosynthetic class, often indicating compound type or functional characteristics.",
251
+ examples=["Type I PKS (Polyketide synthase)", "Redox-cofactors such as PQQ"],
252
+ )
253
+ count: int = Field(
254
+ ...,
255
+ ge=0,
256
+ description="Number of BGCs (biosynthetic gene clusters) in the dataset assigned to this label.",
257
+ )
258
+
259
+
260
+ class KOSummaryRecord(BaseModel):
261
+ """Model of a row in the KEGG summary file."""
262
+
263
+ ko: str = Field(
264
+ ...,
265
+ description="KEGG Orthology (KO) identifier representing a functional gene or pathway component.",
266
+ examples=["K07547", "K04874", "K19946"],
267
+ pattern=r"K\d{5,}",
268
+ )
269
+ description: str = Field(
270
+ ...,
271
+ description="Name or function of the KO, sometimes including EC numbers and protein families.",
272
+ examples=["optineurin", "MFS transporter, POT/PTR family"],
273
+ )
274
+ count: int = Field(
275
+ ...,
276
+ ge=0,
277
+ description="Number of times this KO identifier is observed in the dataset.",
278
+ )
279
+
280
+
281
+ class PFAMSummaryRecord(BaseModel):
282
+ """Model of a row in the PFAM summary file."""
283
+
284
+ pfam: str = Field(
285
+ ...,
286
+ description="PFAM accession identifier representing a protein domain or family.",
287
+ examples=["PF00265", "PF01956", "PF00673"],
288
+ pattern=r"PF\d{5}",
289
+ )
290
+ description: str = Field(
291
+ ...,
292
+ description="Description of the protein domain or family associated with the PFAM ID.",
293
+ examples=["Thymidine kinase", "Integral membrane protein EMC3/TMCO1-like"],
294
+ )
295
+ count: int = Field(
296
+ ...,
297
+ ge=0,
298
+ description="Number of times the PFAM domain is observed in the dataset.",
299
+ )
300
+
301
+
302
+ class KEGGModulesSummaryRecord(BaseModel):
303
+ """Model of a row in the KEGG Modules summary file."""
304
+
305
+ module_accession: str = Field(
306
+ ...,
307
+ description="KEGG Module identifier representing a specific metabolic pathway or module.",
308
+ examples=["M00123", "M00234"],
309
+ pattern=r"M\d{5}",
310
+ )
311
+ completeness: float = Field(
312
+ ...,
313
+ ge=0,
314
+ description="Completeness score of the KEGG Module, indicating the extent to which the module is present in the metagenome.",
315
+ )
316
+ pathway_name: str = Field(
317
+ ...,
318
+ description="Name of the metabolic pathway associated with the KEGG Module.",
319
+ examples=["Sulfur reduction, sulfur => sulfide"],
320
+ )
321
+ pathway_class: str = Field(
322
+ ...,
323
+ description="Biosynthetic class or category associated with the KEGG Module, semi colon separated.",
324
+ examples=["Pathway modules; Energy metabolism; Photosynthesis"],
325
+ )
326
+
327
+
328
+ class SanntisSummarySchema(BaseSummarySchema):
329
+ nearest_mibig: Series[str]
330
+
331
+ @pa.check("nearest_mibig")
332
+ def mibig_ids_unique(self, series: Series[str]) -> Series[bool]:
333
+ return self.is_unique(series)
334
+
335
+ class Config:
336
+ dtype = PydanticModel(SanntisSummaryRecord)
337
+ coerce = True
338
+
339
+
340
+ class AntismashSummarySchema(BaseSummarySchema):
341
+ label: Series[str]
342
+
343
+ @pa.check("label")
344
+ def class_names_unique(self, series: Series[str]) -> Series[bool]:
345
+ return self.is_unique(series)
346
+
347
+ class Config:
348
+ dtype = PydanticModel(AntismashSummaryRecord)
349
+ coerce = True
350
+
351
+
352
+ class KOSummarySchema(BaseSummarySchema):
353
+ ko: Series[str]
354
+
355
+ @pa.check("ko")
356
+ def ko_ids_unique(self, series: Series[str]) -> Series[bool]:
357
+ return self.is_unique(series)
358
+
359
+ class Config:
360
+ dtype = PydanticModel(KOSummaryRecord)
361
+ coerce = True
362
+
363
+
364
+ class PFAMSummarySchema(BaseSummarySchema):
365
+ pfam: Series[str]
366
+
367
+ @pa.check("pfam")
368
+ def pfam_ids_unique(self, series: Series[str]) -> Series[bool]:
369
+ return self.is_unique(series)
370
+
371
+ class Config:
372
+ dtype = PydanticModel(PFAMSummaryRecord)
373
+ coerce = True
374
+
375
+
376
+ class KEGGModulesSummarySchema(BaseSummarySchema):
377
+ module_accession: Series[str]
378
+
379
+ @pa.check("module_accession")
380
+ def module_ids_unique(self, series: Series[str]) -> Series[bool]:
381
+ return self.is_unique(series)
382
+
383
+ class Config:
384
+ dtype = PydanticModel(KEGGModulesSummaryRecord)
385
+ coerce = True
386
+
387
+
388
+ class BaseStudySummarySchema(BaseSummarySchema):
389
+ """Base schema for study summary files with ERZ* columns and count checks."""
390
+
391
+ @pa.check(regex=r"^ERZ\d+")
392
+ def count_columns_are_non_negative(self, s: Series[int]) -> Series[bool]:
393
+ return s >= 0
394
+
395
+ class Config:
396
+ strict = False # allow extra ERZ* columns not declared above
397
+
398
+
399
+ class GOStudySummarySchema(BaseStudySummarySchema):
400
+ GO: Series[str] = pa.Field(str_matches=r"^GO:\d{7}$")
401
+ description: Series[str]
402
+ category: Series[str]
403
+
404
+ @pa.check("GO")
405
+ def go_ids_unique(self, series: Series[str]) -> Series[bool]:
406
+ return self.is_unique(series)
407
+
408
+
409
+ class InterProStudySummarySchema(BaseStudySummarySchema):
410
+ IPR: Series[str] = pa.Field(str_matches=r"^IPR\d{6}$")
411
+ description: Series[str]
412
+
413
+ @pa.check("IPR")
414
+ def interpro_ids_unique(self, series: Series[str]) -> Series[bool]:
415
+ return self.is_unique(series)
416
+
417
+
418
+ class AntismashStudySummarySchema(BaseStudySummarySchema):
419
+ label: Series[str]
420
+
421
+ @pa.check("label")
422
+ def class_names_unique(self, series: Series[str]) -> Series[bool]:
423
+ return self.is_unique(series)
424
+
425
+
426
+ class SanntisStudySummarySchema(BaseStudySummarySchema):
427
+ nearest_mibig: Series[str]
428
+
429
+ @pa.check("nearest_mibig")
430
+ def mibig_ids_unique(self, series: Series[str]) -> Series[bool]:
431
+ return self.is_unique(series)
432
+
433
+
434
+ class KOStudySummarySchema(BaseStudySummarySchema):
435
+ KO: Series[str]
436
+
437
+ @pa.check("KO")
438
+ def ko_ids_unique(self, series: Series[str]) -> Series[bool]:
439
+ return self.is_unique(series)
440
+
441
+
442
+ class PFAMStudySummarySchema(BaseStudySummarySchema):
443
+ PFAM: Series[str]
444
+
445
+ @pa.check("PFAM")
446
+ def pfam_ids_unique(self, series: Series[str]) -> Series[bool]:
447
+ return self.is_unique(series)
448
+
449
+
450
+ class KEGGModulesStudySummarySchema(BaseStudySummarySchema):
451
+ module_accession: Series[str]
452
+
453
+ @pa.check("module_accession")
454
+ def module_ids_unique(self, series: Series[str]) -> Series[bool]:
455
+ return self.is_unique(series)
456
+
457
+
458
+ class TaxonomyStudySummarySchema(BaseStudySummarySchema):
459
+ pass
460
+
461
+
113
462
  class AmpliconNonINSDCPassedRunsSchema(pa.DataFrameModel):
114
463
  """Class modelling the same dataframe schema as the preceding one, except with no INSDC validation.
115
464
  Uses the AmpliconNonINSDCSPassedRunsRecord as a dtype to achieve this.
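
The added schemas are consumed through validate_dataframe, as in the study summary generator above. A minimal sketch, assuming a hypothetical source file name and reusing the GO rows from the generator's docstring example:

import pandas as pd

from mgnify_pipelines_toolkit.schemas.schemas import GOSummarySchema, validate_dataframe

# Rows taken from the GO summary example in the generator's docstring.
df = pd.DataFrame(
    {
        "go": ["GO:0016020", "GO:0005524"],
        "term": ["membrane", "ATP binding"],
        "category": ["cellular_component", "molecular_function"],
        "count": [30626, 30524],
    }
)

# "ERZ0000000_go_summary.tsv" is a hypothetical name used only for error reporting;
# the schema enforces the GO id pattern, uniqueness, and non-negative counts.
validated = validate_dataframe(df, GOSummarySchema, "ERZ0000000_go_summary.tsv")
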
@@ -145,7 +494,11 @@ class TaxRank(RootModel):
145
494
  def rank_structure_validity_check(cls, taxrank: str) -> bool:
146
495
  taxrank_list = taxrank.split("__")
147
496
  rank = taxrank_list[0]
148
- if rank != "" and rank != "Unclassified" and rank not in cls.valid_tax_ranks:
497
+ if (
498
+ rank != ""
499
+ and rank.capitalize() != "Unclassified"
500
+ and rank not in cls.valid_tax_ranks
501
+ ):
149
502
  raise ValueError(f"Invalid taxonomy rank {rank}.")
150
503
 
151
504
  return taxrank
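
The change above makes the "Unclassified" prefix check case-insensitive via rank.capitalize(). A standalone sketch of the same logic, with an illustrative (assumed) valid_tax_ranks list rather than the package constant:

# valid_tax_ranks here is an assumption for illustration, not the package constant.
valid_tax_ranks = ["sk", "k", "p", "c", "o", "f", "g", "s"]

def rank_is_acceptable(taxrank: str) -> bool:
    rank = taxrank.split("__")[0]
    return rank == "" or rank.capitalize() == "Unclassified" or rank in valid_tax_ranks

assert rank_is_acceptable("sk__Bacteria")
assert rank_is_acceptable("Unclassified__")
assert rank_is_acceptable("unclassified__")  # accepted after this change
assert not rank_is_acceptable("xyz__Something")
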
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.1.0
3
+ Version: 1.1.1
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -8,7 +8,7 @@ Keywords: bioinformatics,pipelines,metagenomics
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: Apache Software License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.9
11
+ Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: biopython>=1.85
@@ -12,6 +12,7 @@ mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=B
12
12
  mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=Wu4tRtuRkgd3hoeuwPl_E5ghxIW7e_1vrcvFGWv_U4A,3173
13
13
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=yLpzkRJXAeXRUNgz60zopEwHcdprM2UDjquE-GkrFys,1722
14
14
  mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=K6gniytuItq5WzHLi1BsaUCOdP4Zm0_ZzW2_ns7-BTI,11114
15
+ mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py,sha256=epVClL10QcllL8yu7YGjx0rXNVHL2GxHi-Ek0MOjsjo,13859
15
16
  mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=NZSNY2bqs_TQyz8riDqiEFPLKcwTgzh1C7DeVHT6V8Q,4366
16
17
  mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=wXrw1B-z4hOu5oA27Vp1WYxGP2Mk6ZY4i_T5jDZgek0,6954
17
18
  mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py,sha256=Pq-9RSt3RCxzDMQVW1VHlHF4NtpVwCWFbg2CMkvpZZc,19089
@@ -23,9 +24,10 @@ mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=eay9e3Xdc8XxnlC_4S
23
24
  mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py,sha256=uex2T6GagtYFBIc39-Xm4SFHL06KAQ5v0_loOmY_eaw,4289
24
25
  mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py,sha256=KaJHOKfbIurbD1iiMssjdAaSAT8Nv-_ZUFwxkLqukAE,7799
25
26
  mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha256=DYZhChGD49M-zAtGkCmNHXDoVTnd5Qy6amG-oePO8Ek,5981
26
- mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=eRAQ0vFbqnWreiBdtFuwLKve9WwYwv9dYQtD1pumaZs,10776
27
+ mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=J4cIWaFyWihqo2JtaOR531aXtVxIfOi_hcwZZw-vP8g,21252
28
+ mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=jUeA7I12YrtIqnm3hUxpdgsWfa2pP1ALGjb9OMKPcgY,10643
27
29
  mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
28
- mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=65szj-H8Hxy_eXy3TyTs48EhPJbJ2w1skHlVbH2YeVM,4538
30
+ mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=1wblLbZl521digIUWoqneAu15gErzvN_oC--5T_xUdw,4582
29
31
  mgnify_pipelines_toolkit/analysis/genomes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
32
  mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
33
  mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py,sha256=kAGU5kQyj-Hlcdx32i-xOJSuHYYUDj-kqnyYHMohHGc,4477
@@ -36,7 +38,6 @@ mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=EH5RyzesL
36
38
  mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py,sha256=6Ck2NhwRWw66GctUtKDdPT5fwJhWFR_YOZq-Vxwoa8A,1996
37
39
  mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=7-U0DN1joVu0ifLOoDUK2Pfqy8rb1RDKT6khVg3jky0,5559
38
40
  mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py,sha256=sKAo_rKEyVAZXSaIFMkpSoYZxiWwXMA3XDA6Z-hbHgg,7904
39
- mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py,sha256=OOqKaQmKGAya6_BZgfcWBZSVlmZ918PQTVMv6KwGIns,13827
40
41
  mgnify_pipelines_toolkit/constants/db_labels.py,sha256=omPINMylAjO2PxeFhSk2MbYNcGZH3P82optSlMey3dw,858
41
42
  mgnify_pipelines_toolkit/constants/ncrna.py,sha256=a_5hWp446S7BhRbe_JcydFgZM7sgPLuMlaiBvKWN_XM,1928
42
43
  mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=7nEOODQq35y9wx9YnvJuo29oBpwTpXg_kIbf_t7N4TQ,1093
@@ -44,13 +45,13 @@ mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=G-xrc9b8zdmPTaOI
44
45
  mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=kMq__kOJcbiwsgolkdvb-XLo3WMnJdEXgedjUyMOYjI,1081
45
46
  mgnify_pipelines_toolkit/constants/thresholds.py,sha256=V_xDBk0RhS3hHeWqOacKzth2gM6zJABRPgwHy-Ciqfk,1157
46
47
  mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=0bM4MwarFiM5yTcp5AbAmQ0o-q-gWy7kknir9zJ9R0A,1312
47
- mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pnH8LUH8i2ACNvFNWyG-n-eIHZcI5O9UDYulkh43mec,7692
48
+ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pyDZvCuWbwccQF0D7c5BN1vv36wQdgcAUXU43_zAu74,18164
48
49
  mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
50
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
50
51
  mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
51
- mgnify_pipelines_toolkit-1.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
52
- mgnify_pipelines_toolkit-1.1.0.dist-info/METADATA,sha256=ZBar6psIFlDE7DNfuDFjeX0HLKsgMwFn6ZW_ifMqEww,5810
53
- mgnify_pipelines_toolkit-1.1.0.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
54
- mgnify_pipelines_toolkit-1.1.0.dist-info/entry_points.txt,sha256=T8soGT2to8c_qafw-0itqCn4sjOnxlfaNWHIaHz4H54,3416
55
- mgnify_pipelines_toolkit-1.1.0.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
56
- mgnify_pipelines_toolkit-1.1.0.dist-info/RECORD,,
52
+ mgnify_pipelines_toolkit-1.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
53
+ mgnify_pipelines_toolkit-1.1.1.dist-info/METADATA,sha256=E86Tp9qJuQUrkNIklK4PEATQ4ovZfhRbgMKVTyxGSx0,5811
54
+ mgnify_pipelines_toolkit-1.1.1.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
55
+ mgnify_pipelines_toolkit-1.1.1.dist-info/entry_points.txt,sha256=JSjuxAr71MTeSUPPpno22wmZYgVO-gbsXfDkgWKkF7A,3533
56
+ mgnify_pipelines_toolkit-1.1.1.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
57
+ mgnify_pipelines_toolkit-1.1.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.0.0)
2
+ Generator: setuptools (80.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,7 +1,9 @@
1
1
  [console_scripts]
2
2
  add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
3
+ amplicon_study_summary_generator = mgnify_pipelines_toolkit.analysis.amplicon.study_summary_generator:cli
3
4
  antismash_gff_builder = mgnify_pipelines_toolkit.analysis.assembly.antismash_gff_builder:main
4
5
  are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
6
+ assembly_study_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.study_summary_generator:cli
5
7
  assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
6
8
  assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
7
9
  classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
@@ -31,7 +33,6 @@ process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbca
31
33
  remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
32
34
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
33
35
  standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
34
- study_summary_generator = mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:cli
35
36
  summarise_antismash_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_antismash_bgcs:main
36
37
  summarise_goslims = mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main
37
38
  summarise_sanntis_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_sanntis_bgcs:main