mgnify-pipelines-toolkit 0.2.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry notes that this version of mgnify-pipelines-toolkit might be problematic.

Files changed (42)
  1. mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +1 -1
  2. mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +1 -1
  3. mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +1 -1
  4. mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +1 -1
  5. mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +1 -1
  6. mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +1 -1
  7. mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +1 -1
  8. mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +1 -1
  9. mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +1 -1
  10. mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +1 -1
  11. mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +1 -1
  12. mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +1 -1
  13. mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +1 -1
  14. mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +1 -1
  15. mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +511 -0
  16. mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +1 -1
  17. mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +1 -1
  18. mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +1 -1
  19. mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +240 -0
  20. mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +1 -1
  21. mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +1 -1
  22. mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +1 -1
  23. mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +1 -1
  24. mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +1 -1
  25. mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +18 -11
  26. mgnify_pipelines_toolkit/constants/db_labels.py +1 -1
  27. mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +1 -1
  28. mgnify_pipelines_toolkit/constants/regex_fasta_header.py +1 -1
  29. mgnify_pipelines_toolkit/constants/tax_ranks.py +1 -1
  30. mgnify_pipelines_toolkit/constants/thresholds.py +1 -1
  31. mgnify_pipelines_toolkit/constants/var_region_coordinates.py +1 -1
  32. mgnify_pipelines_toolkit/schemas/schemas.py +21 -3
  33. mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +1 -1
  34. mgnify_pipelines_toolkit/utils/get_mpt_version.py +1 -1
  35. {mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/METADATA +2 -1
  36. mgnify_pipelines_toolkit-1.0.1.dist-info/RECORD +48 -0
  37. {mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/WHEEL +1 -1
  38. {mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/entry_points.txt +2 -1
  39. mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py +0 -424
  40. mgnify_pipelines_toolkit-0.2.2.dist-info/RECORD +0 -47
  41. {mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/LICENSE +0 -0
  42. {mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py (new file)

@@ -0,0 +1,240 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import defaultdict
+ import pathlib
+ import logging
+ import requests
+
+ import pandas as pd
+ import pyfastx
+
+ logging.basicConfig(level=logging.DEBUG)
+
+ URL = "https://www.ebi.ac.uk/ena/portal/api/search?result"
+ RUNS_URL = f"{URL}=read_run&fields=secondary_study_accession,sample_accession&limit=10&format=json&download=false"
+ SAMPLES_URL = f"{URL}=sample&fields=lat,lon,collection_date,depth&limit=10&format=json&download=false"
+ HEADERS = {"Accept": "application/json"}
+
+
+ def parse_args():
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-i",
+         "--input_path",
+         required=True,
+         type=str,
+         help="Directory where ASV files are.",
+     )
+     parser.add_argument(
+         "-r",
+         "--runs",
+         required=True,
+         type=str,
+         help="Path to CSV file containing successful analyses generated by the pipeline (columns: `run, status`)",
+     )
+     parser.add_argument(
+         "-o", "--output", required=True, type=str, help="Path to output directory."
+     )
+
+     args = parser.parse_args()
+
+     input_path = args.input_path
+     runs = args.runs
+     output = args.output
+
+     return input_path, runs, output
+
+
+ def get_metadata_from_run_acc(run_acc):
+
+     query = f"{RUNS_URL}&includeAccessions={run_acc}"
+     res_run = requests.get(query, headers=HEADERS)
+
+     if res_run.status_code != 200:
+         logging.error(f"Data not found for run {run_acc}")
+         return False
+
+     sample_acc = res_run.json()[0]["sample_accession"]
+
+     query = f"{SAMPLES_URL}&includeAccessions={sample_acc}"
+     res_sample = requests.get(query, headers=HEADERS)
+
+     full_res_dict = res_run.json()[0] | res_sample.json()[0]
+
+     fields_to_clean = ["lat", "lon", "depth"]
+
+     for field in fields_to_clean:
+         val = full_res_dict[field]
+         if val == "":
+             full_res_dict[field] = "NA"
+
+     if full_res_dict["collection_date"] == "":
+         full_res_dict["collectionDate"] = "NA"
+     else:
+         full_res_dict["collectionDate"] = full_res_dict["collection_date"]
+
+     del full_res_dict["collection_date"]
+
+     res_df = pd.DataFrame(full_res_dict, index=[0])
+     res_df.columns = [
+         "RunID",
+         "SampleID",
+         "StudyID",
+         "decimalLongitude",
+         "depth",
+         "decimalLatitude",
+         "collectionDate",
+     ]
+
+     return res_df
+
+
+ def get_all_metadata_from_runs(runs):
+
+     run_metadata_dict = defaultdict(dict)
+
+     for run in runs:
+         res_df = get_metadata_from_run_acc(run)
+         if res_df is not False:
+             run_metadata_dict[run] = res_df
+
+     return run_metadata_dict
+
+
+ def cleanup_taxa(df):
+
+     df.pop("Kingdom")
+     cleaned_df = df.rename(columns={"Superkingdom": "Kingdom", "asv": "ASVID"})
+
+     ranks = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+
+     for rank in ranks:
+         cleaned_df[rank] = cleaned_df[rank].apply(
+             lambda x: x.split("__")[1] if pd.notnull(x) else "NA"
+         )
+
+     for rank in ranks:
+         cleaned_df[rank] = cleaned_df[rank].apply(lambda x: x if x != "" else "NA")
+
+     cleaned_df = cleaned_df[
+         [
+             "ASVID",
+             "StudyID",
+             "SampleID",
+             "RunID",
+             "decimalLongitude",
+             "decimalLatitude",
+             "depth",
+             "collectionDate",
+             "Kingdom",
+             "Phylum",
+             "Class",
+             "Order",
+             "Family",
+             "Genus",
+             "Species",
+             "ASVSeq",
+         ]
+     ]
+
+     return cleaned_df
+
+
+ def get_asv_dict(runs_df, root_path):
+
+     asv_dict = {}
+     for i in range(0, len(runs_df)):
+         run_acc = runs_df.loc[i, "run"]
+         status = runs_df.loc[i, "status"]
+
+         if status != "all_results":
+             continue
+
+         tax_file = sorted(
+             list(
+                 (pathlib.Path(root_path) / run_acc / "asv").glob(
+                     "*_DADA2-SILVA_asv_tax.tsv"
+                 )
+             )
+         )[0]
+         count_files = sorted(
+             list(pathlib.Path(f"{root_path}/{run_acc}/asv").glob("*S-V*/*.tsv"))
+         )
+
+         asv_fasta_file = sorted(
+             list(pathlib.Path(f"{root_path}/{run_acc}/asv").glob("*_asv_seqs.fasta"))
+         )[0]
+         fasta = pyfastx.Fasta(str(asv_fasta_file), build_index=False)
+         asv_fasta_dict = {name: seq for name, seq in fasta}
+         asv_fasta_df = pd.DataFrame(asv_fasta_dict, index=["ASVSeq"]).transpose()
+         asv_fasta_df["asv"] = asv_fasta_df.index
+         run_tax_df = pd.read_csv(tax_file, sep="\t")
+
+         count_dfs = []
+
+         for count_file in count_files:
+             count_df = pd.read_csv(count_file, sep="\t")
+             count_dfs.append(count_df)
+
+         all_ampregions_count_df = pd.concat(count_dfs)
+         merged_df = all_ampregions_count_df.merge(
+             run_tax_df, left_on="asv", right_on="ASV"
+         )
+         merged_df.pop("ASV")
+         run_col = [run_acc] * len(merged_df)
+         merged_df["RunID"] = run_col
+         merged_df = merged_df.merge(asv_fasta_df, on="asv")
+         asv_dict[run_acc] = merged_df
+
+     return asv_dict
+
+
+ def main():
+
+     input_path, runs, output = parse_args()
+
+     root_path = pathlib.Path(input_path)
+
+     if not root_path.exists():
+         logging.error(f"Results path does not exist: {root_path}")
+         exit(1)
+
+     runs_df = pd.read_csv(runs, names=["run", "status"])
+
+     all_runs = runs_df.run.to_list()
+     run_metadata_dict = get_all_metadata_from_runs(all_runs)
+     asv_dict = get_asv_dict(runs_df, root_path)
+
+     all_merged_df = []
+
+     for run in all_runs:
+         if run in asv_dict.keys() and run in run_metadata_dict.keys():
+             run_asv_data = asv_dict[run]
+             run_metadata = run_metadata_dict[run]
+             run_merged_result = run_metadata.merge(run_asv_data, on="RunID")
+             all_merged_df.append(run_merged_result)
+
+     final_df = pd.concat(all_merged_df, ignore_index=True)
+     final_df = cleanup_taxa(final_df)
+
+     final_df.to_csv(f"{output}_dwcready.csv", index=False, na_rep="NA")
+
+
+ if __name__ == "__main__":
+     main()
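
For orientation, the new script's core move is a two-step ENA portal lookup per run: resolve the run to its sample accession, then pull spatial and temporal metadata for that sample. A minimal standalone sketch of that flow (the run accession is a made-up example; the real script wraps this in get_metadata_from_run_acc with error handling and column renaming):

import requests

URL = "https://www.ebi.ac.uk/ena/portal/api/search?result"
RUNS_URL = f"{URL}=read_run&fields=secondary_study_accession,sample_accession&limit=10&format=json&download=false"
SAMPLES_URL = f"{URL}=sample&fields=lat,lon,collection_date,depth&limit=10&format=json&download=false"
HEADERS = {"Accept": "application/json"}

run_acc = "ERR0000001"  # hypothetical accession, for illustration only
res_run = requests.get(f"{RUNS_URL}&includeAccessions={run_acc}", headers=HEADERS)
if res_run.status_code == 200:
    run_record = res_run.json()[0]
    # Step two: the run's sample accession keys the metadata lookup
    sample_acc = run_record["sample_accession"]
    res_sample = requests.get(f"{SAMPLES_URL}&includeAccessions={sample_acc}", headers=HEADERS)
    merged = run_record | res_sample.json()[0]  # dict union (Python 3.9+)
    print(merged)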
The following +1/-1 hunks are the same copyright-year bump (2024 → 2024-2025) repeated across the per-file changes listed above:

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py

@@ -1,7 +1,9 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
+ import shutil
+ from shutil import SameFileError

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -33,6 +35,7 @@ from mgnify_pipelines_toolkit.schemas.schemas import (
      AmpliconNonINSDCPassedRunsSchema,
      TaxonSchema,
      PR2TaxonSchema,
+     validate_dataframe,
  )

  logging.basicConfig(level=logging.DEBUG)
@@ -127,9 +130,9 @@ def parse_one_tax_file(
      # Two different schemas used for validation depending on the database
      # because PR2 schema has different taxonomic ranks than the standard
      if len(long_tax_ranks) == 8:
-         TaxonSchema(res_df)
+         validate_dataframe(res_df, TaxonSchema, str(tax_file))
      elif len(long_tax_ranks) == 9:
-         PR2TaxonSchema(res_df)
+         validate_dataframe(res_df, PR2TaxonSchema, str(tax_file))

      res_df["full_taxon"] = res_df.iloc[:, 1:].apply(
          lambda x: ";".join(x).strip(";"), axis=1
@@ -205,9 +208,7 @@ def generate_db_summary(
          amp_region_dict[amp_region].append(amp_region_df)

      for amp_region, amp_region_dfs in amp_region_dict.items():
-         if (
-             len(amp_region_dfs) > 1
-         ):  # Need at least two analyses with this amp_region to bother with the summary
+         if amp_region_dfs:
              amp_res_df = amp_region_dfs[0]
              for amp_df in amp_region_dfs[1:]:
                  amp_res_df = amp_res_df.join(amp_df, how="outer")
@@ -319,9 +320,7 @@
          if tax_file:
              tax_files[run_acc] = tax_file

-     if (
-         len(tax_files) > 1
-     ):  # If at least two analyses have results from the current DB, generate a study-level summary for it
+     if tax_files:
          generate_db_summary(db_label, tax_files, output_prefix)

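The two guard changes above are the same semantic relaxation: summaries were previously generated only when at least two analyses contributed, and are now generated for any non-empty collection. A quick standalone illustration of the changed condition:

# Old guard vs. new guard for 0, 1, and 2 contributing analyses
for tax_files in ({}, {"ERR1": "a.tsv"}, {"ERR1": "a.tsv", "ERR2": "b.tsv"}):
    old_guard = len(tax_files) > 1  # summary only with two or more analyses
    new_guard = bool(tax_files)     # summary with one or more analyses
    print(len(tax_files), old_guard, new_guard)
# prints: 0 False False / 1 False True / 2 True True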
@@ -356,12 +355,12 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
      :type output_prefix: str
      """

-     # TODO: The way we grab all the summaries might change depending on how the prefect side does things
      all_study_summaries = glob.glob(f"{analyses_dir}/*_study_summary.tsv")

      summaries_dict = organise_study_summaries(all_study_summaries)

      for db_label, summaries in summaries_dict.items():
+         merged_summary_name = f"{output_prefix}_{db_label}_study_summary.tsv"
          if len(summaries) > 1:
              res_df = pd.read_csv(summaries[0], sep="\t", index_col=0)
              for summary in summaries[1:]:
@@ -372,10 +371,18 @@
              res_df = res_df.reindex(sorted(res_df.columns), axis=1)
              res_df.to_csv(
-                 f"{output_prefix}_{db_label}_study_summary.tsv",
+                 merged_summary_name,
                  sep="\t",
                  index_label="taxonomy",
              )
+         elif len(summaries) == 1:
+             logging.info(
+                 f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
+             )
+             try:
+                 shutil.copyfile(summaries[0], merged_summary_name)
+             except SameFileError:
+                 pass


  if __name__ == "__main__":
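
The new single-summary branch copies the lone per-database summary to the merged filename, and the SameFileError guard covers the case where the input already carries that name. A small sketch of that behaviour (filenames are hypothetical):

import shutil
from pathlib import Path
from shutil import SameFileError

src = Path("PRJEB0_SILVA_study_summary.tsv")  # hypothetical lone summary
src.write_text("taxonomy\tERR1\nsk__Bacteria\t10\n")
dst = src  # merged name collides with the input in this scenario
try:
    shutil.copyfile(src, dst)  # copyfile refuses to copy a file onto itself
except SameFileError:
    pass  # already in place; nothing to do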
@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
mgnify_pipelines_toolkit/schemas/schemas.py

@@ -1,7 +1,6 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
-
- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -14,11 +13,15 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ import logging
  import re

  from enum import Enum
- from typing import ClassVar, Optional
+ from typing import ClassVar, Optional, Type
+
+ import pandas as pd
  import pandera as pa
+ from pandera.typing.common import DataFrameBase

  from pydantic import (
      Field,
@@ -215,3 +218,18 @@ class PR2TaxonSchema(pa.DataFrameModel):

      dtype = PydanticModel(PR2TaxonRecord)
      coerce = True
+
+
+ def validate_dataframe(
+     df: pd.DataFrame, schema: Type[pa.DataFrameModel], df_metadata: str
+ ) -> DataFrameBase:
+     """
+     Validate a pandas dataframe using a pandera schema.
+     df_metadata will be shown in logs on failure: example, the TSV filename from which the df was read.
+     """
+     try:
+         dfs = schema.validate(df, lazy=True)
+     except pa.errors.SchemaErrors as e:
+         logging.error(f"{schema.__name__} validation failure for {df_metadata}")
+         raise e
+     return dfs
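
The new helper centralises pandera validation so failures are logged with the offending file before the SchemaErrors propagates. A usage sketch with a toy schema (ToySchema and the filename are invented for illustration; the package itself passes TaxonSchema or PR2TaxonSchema):

import pandas as pd
import pandera as pa
from mgnify_pipelines_toolkit.schemas.schemas import validate_dataframe

class ToySchema(pa.DataFrameModel):  # hypothetical schema, not one from the package
    count: int = pa.Field(ge=0)

df = pd.DataFrame({"count": [3, -1]})  # -1 violates the ge=0 check
# Logs "ToySchema validation failure for toy.tsv", then re-raises SchemaErrors;
# lazy validation inside the helper gathers every failure before raising
validate_dataframe(df, ToySchema, "toy.tsv")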
@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-

- # Copyright 2024 EMBL - European Bioinformatics Institute
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
{mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mgnify_pipelines_toolkit
- Version: 0.2.2
+ Version: 1.0.1
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -19,6 +19,7 @@ Requires-Dist: requests==2.32.3
  Requires-Dist: click==8.1.7
  Requires-Dist: pandera==0.22.1
  Requires-Dist: pyfastx>=2.2.0
+ Requires-Dist: intervaltree==3.1.0
  Provides-Extra: tests
  Requires-Dist: pytest==7.4.0; extra == "tests"
  Requires-Dist: pytest-md==0.2.0; extra == "tests"
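
The one new runtime dependency is intervaltree, which plausibly backs overlap resolution between gene callers in the new combined_gene_caller_merge.py (an assumption on our part; that file's diff body is not shown above). A minimal sketch of the library's core API:

from intervaltree import IntervalTree

tree = IntervalTree()
tree[100:400] = "caller_A_gene"  # hypothetical gene coordinates, for illustration
tree[350:700] = "caller_B_gene"
# Query every stored interval overlapping coordinates 300..500
for iv in sorted(tree.overlap(300, 500)):
    print(iv.begin, iv.end, iv.data)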
mgnify_pipelines_toolkit-1.0.1.dist-info/RECORD (new file)

@@ -0,0 +1,48 @@
+ mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=8qmb57E2XBrwqo6YcJYyvPyuaIMu82Ifho7yyyUdnSM,6572
+ mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py,sha256=2-URxvcl13_8O9bUmoa3-KMPSvdTaLbxfFDY-ycs_4M,5316
+ mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py,sha256=cRoHPM-VB_L3NWYgkNWuyzqIqhzwHJuU3-6BiiS2lnw,7553
+ mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py,sha256=RAdqakH05Qt_LG9jlV7P2M90o5KmlAXmDFQ4X51NIBE,5387
+ mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=EqfaATb5agvtQOhJqrb2YS6OxtCXvxC-q_05UzvDYug,19926
+ mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py,sha256=vC3nKxggnSljfw4HNkugXbXfGvLx7XnryEE7eEGqfqs,3552
+ mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=soTewFddtebW-EcejGh9whs3cBLWJrGCYdPc0KukoAw,8756
+ mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=BLqhflblUegCvuQic16PrFXfIXlFWmGkmWJyl4wJoLQ,5040
+ mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=Bmc4Yu8inpT6AVTG1zwxp9F9mknIDLY33-UuFdaZuq0,3756
+ mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=Wu4tRtuRkgd3hoeuwPl_E5ghxIW7e_1vrcvFGWv_U4A,3173
+ mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=yLpzkRJXAeXRUNgz60zopEwHcdprM2UDjquE-GkrFys,1722
+ mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=K6gniytuItq5WzHLi1BsaUCOdP4Zm0_ZzW2_ns7-BTI,11114
+ mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=HarDM6ay0MbyDfGGjmxP8epjsXciAJHOmqe8G64gLuM,4258
+ mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=wXrw1B-z4hOu5oA27Vp1WYxGP2Mk6ZY4i_T5jDZgek0,6954
+ mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py,sha256=Pq-9RSt3RCxzDMQVW1VHlHF4NtpVwCWFbg2CMkvpZZc,19089
+ mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py,sha256=2T4T7aXMGPac-LZUXJF3lOUzZZF50dAKkKTSaO-4idQ,3587
+ mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py,sha256=IlkeP4DuN7rXJIHa7o2sONHAXLhV9nGP-5Y1_0u8YQo,31393
+ mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py,sha256=8kv_6KWznOVRkeAtghLf4pxKPhAqdn36LOK4MsTz9hU,3282
+ mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py,sha256=uUIo97gmzO2zzN-pYF5paIzeHWBsmmjFp7zGAhf4PKY,5021
+ mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=5D-9rB3omTxKwZuS_WjgyjsaaSPNnvZoXeThofWrK7k,5452
+ mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=07CbJdpo-Gy2aglCFiu2mHbkY18pYMlLFLPnYoD7tyk,5839
+ mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=hggPqv9QawWAccm5tmru4VF9VnQAHF5LCXnqyLw_BWI,6727
+ mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=ye0Jka6_lNn4dQGb2QG3YT46y7QK0QvyaIitIaS8JVQ,4026
+ mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=NhX6cSLu9QB9I5JKNUJwJVMmcRcbG-0MJCEgDJ5DxtE,4777
+ mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=EH5RyzesLqsonnTQbSDs7kAOV6IskS4oyqZYlex1tAY,1934
+ mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py,sha256=6Ck2NhwRWw66GctUtKDdPT5fwJhWFR_YOZq-Vxwoa8A,1996
+ mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=7-U0DN1joVu0ifLOoDUK2Pfqy8rb1RDKT6khVg3jky0,5559
+ mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py,sha256=sKAo_rKEyVAZXSaIFMkpSoYZxiWwXMA3XDA6Z-hbHgg,7904
+ mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py,sha256=OOqKaQmKGAya6_BZgfcWBZSVlmZ918PQTVMv6KwGIns,13827
+ mgnify_pipelines_toolkit/constants/db_labels.py,sha256=omPINMylAjO2PxeFhSk2MbYNcGZH3P82optSlMey3dw,858
+ mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=7nEOODQq35y9wx9YnvJuo29oBpwTpXg_kIbf_t7N4TQ,1093
+ mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=G-xrc9b8zdmPTaOICD2b3RCVeFAEOVkfRkIfotQ7gek,1193
+ mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=kMq__kOJcbiwsgolkdvb-XLo3WMnJdEXgedjUyMOYjI,1081
+ mgnify_pipelines_toolkit/constants/thresholds.py,sha256=guDE7c4KrVJEfg_AcO_cQoJM6LGGaRlmo_U2i8d4N7g,1157
+ mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=0bM4MwarFiM5yTcp5AbAmQ0o-q-gWy7kknir9zJ9R0A,1312
+ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pnH8LUH8i2ACNvFNWyG-n-eIHZcI5O9UDYulkh43mec,7692
+ mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
+ mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
+ mgnify_pipelines_toolkit-1.0.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ mgnify_pipelines_toolkit-1.0.1.dist-info/METADATA,sha256=3xW9nS84AonTMO6tWU03fii6CqyV5-oa7pa4XrlYvWE,6181
+ mgnify_pipelines_toolkit-1.0.1.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+ mgnify_pipelines_toolkit-1.0.1.dist-info/entry_points.txt,sha256=MsQXFdzL_dd7-2V6kHtA-QCf_iSQ-FmDcB9nZMLzJ98,2301
+ mgnify_pipelines_toolkit-1.0.1.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+ mgnify_pipelines_toolkit-1.0.1.dist-info/RECORD,,
{mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/WHEEL

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.8.2)
+ Generator: setuptools (76.0.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

{mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/entry_points.txt

@@ -3,8 +3,9 @@ add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_
  are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
  assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
  assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
- cgc_merge = mgnify_pipelines_toolkit.analysis.assembly.cgc_merge:combine_main
  classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
+ combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
+ dwc_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.dwc_summary_generator:main
  fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
  fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
  find_mcp_inflection_points = mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main