mgnify-pipelines-toolkit 1.2.7__py3-none-any.whl → 1.2.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry they were published to. It is provided for informational purposes only.

Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.

@@ -300,7 +300,7 @@ def main():
300
300
  if paired_end:
301
301
  rev_fr.close()
302
302
 
303
- if asv_dict: # if there are matches between taxonomic and ASV annotations
303
+ if asv_dict: # if there are matches between taxonomic and ASV annotations
304
304
  ref_db = ""
305
305
 
306
306
  if len(taxa_df.columns) == 9:
@@ -89,8 +89,10 @@ def main():
89
89
  if iter_cds and feature["type"] == "CDS":
90
90
  # Annotate CDS features
91
91
 
92
- start = int(feature["location"].split(":")[0][1:])
93
- end = int(feature["location"].split(":")[1].split("]")[0])
92
+ # The > and < are removed to work with pseudogene outputs in Bakta
93
+ # A feature["location"] example that can be seen in Bakta outputs: "[81883:>82231](+)"
94
+ start = int(feature["location"].split(":")[0][1:].lstrip("<>"))
95
+ end = int(feature["location"].split(":")[1].split("]")[0].lstrip("<>"))
94
96
  strand = feature["location"].split("(")[1][0] # + or -
95
97
 
96
98
  if not region_name or not (region_start <= end and start <= region_end):
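
As an aside, a minimal standalone sketch (not part of the package) of the parsing that the new lines implement, using the Bakta-style location string quoted in the added comment; the function name parse_location is illustrative only:

def parse_location(location: str) -> tuple[int, int, str]:
    # e.g. "[81883:>82231](+)"; "<" and ">" mark partial (pseudogene)
    # boundaries and are stripped before casting to int
    start = int(location.split(":")[0][1:].lstrip("<>"))
    end = int(location.split(":")[1].split("]")[0].lstrip("<>"))
    strand = location.split("(")[1][0]  # "+" or "-"
    return start, end, strand

print(parse_location("[81883:>82231](+)"))  # (81883, 82231, '+')
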
@@ -110,10 +110,9 @@ def main():
110
110
  df_merged = df_merged[
111
111
  ["nearest_mibig", "nearest_mibig_class", "description", "count"]
112
112
  ]
113
- df_merged = df_merged.rename(columns={
114
- "Description": "description",
115
- "Count": "count"
116
- })
113
+ df_merged = df_merged.rename(
114
+ columns={"Description": "description", "Count": "count"}
115
+ )
117
116
  df_merged.to_csv(output_filename, sep="\t", index=False)
118
117
 
119
118
 
@@ -53,7 +53,7 @@ def cli():
53
53
 
54
54
  def get_file(
55
55
  run_acc: str, analyses_dir: Path, db_label: str
56
- ) -> Union[Path, List[Path]]:
56
+ ) -> Union[Path, List[Path], None]:
57
57
  """Takes path information for a particular analysis and db_label combo, and returns any existing files.
58
58
 
59
59
  :param run_acc: Run accession for the tax file that should be retrieved.
@@ -84,7 +84,7 @@ def get_file(
84
84
  return
85
85
 
86
86
  analysis_file = Path(
87
- f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt"
87
+ f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt.gz"
88
88
  )
89
89
  if not analysis_file.exists():
90
90
  logging.error(
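
A side note on the switch to the gzipped file name: pandas infers gzip compression from a ".gz" suffix, so the per-run files located by get_file() can be read without an explicit decompression step. A minimal sketch with an illustrative path (the real path is assembled inside get_file()):

from pathlib import Path
import pandas as pd

tax_file = Path("analyses/ERR0000001/taxonomy-summary/silva-ssu/ERR0000001_silva-ssu.txt.gz")
# compression is inferred from the ".gz" suffix
df = pd.read_csv(tax_file, sep="\t", skiprows=1, header=None)
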
@@ -119,20 +119,25 @@ def parse_one_tax_file(run_acc: str, tax_file: Path, db_label: str) -> pd.DataFr
119
119
  :rtype: pd.DataFrame
120
120
  """
121
121
 
122
- tax_ranks = _MOTUS_TAX_RANKS if db_label == "mOTUs" else _SILVA_TAX_RANKS
122
+ tax_ranks = _MOTUS_TAX_RANKS if db_label == "motus" else _SILVA_TAX_RANKS
123
123
  res_df = pd.read_csv(tax_file, sep="\t", skiprows=1, names=["Count"] + tax_ranks)
124
124
  res_df = res_df.fillna("")
125
125
 
126
- validate_dataframe(
127
- res_df, MotusTaxonSchema if db_label == "mOTUs" else TaxonSchema, str(tax_file)
128
- )
126
+ if res_df.shape[0] > 0:
127
+ validate_dataframe(
128
+ res_df,
129
+ MotusTaxonSchema if db_label == "motus" else TaxonSchema,
130
+ str(tax_file),
131
+ )
129
132
 
130
- res_df["full_taxon"] = res_df.iloc[:, 1:].apply(
131
- lambda x: ";".join(x).strip(";"), axis=1
133
+ res_df["full_taxon"] = [
134
+ ";".join(r[tax_ranks]).strip(";") for _, r in res_df.iterrows()
135
+ ]
136
+ final_df = (
137
+ res_df[["Count", "full_taxon"]]
138
+ .set_index("full_taxon")
139
+ .rename(columns={"Count": run_acc})
132
140
  )
133
- final_df = res_df.iloc[:, [0, -1]]
134
- final_df = final_df.set_index("full_taxon")
135
- final_df.columns = [run_acc]
136
141
 
137
142
  return final_df
138
143
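
A toy illustration (made-up values, truncated rank list) of the new full_taxon construction, which joins the rank columns per row and trims the trailing ";" left by empty ranks:

import pandas as pd

tax_ranks = ["Kingdom", "Phylum", "Class"]
res_df = pd.DataFrame(
    {
        "Count": [42, 7],
        "Kingdom": ["Bacteria", "Bacteria"],
        "Phylum": ["Proteobacteria", ""],
        "Class": ["Gammaproteobacteria", ""],
    }
)
res_df["full_taxon"] = [
    ";".join(r[tax_ranks]).strip(";") for _, r in res_df.iterrows()
]
print(res_df["full_taxon"].tolist())
# ['Bacteria;Proteobacteria;Gammaproteobacteria', 'Bacteria']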
 
@@ -162,16 +167,20 @@ def parse_one_func_file(
162
167
  ).set_index("function")
163
168
  res_df = res_df.fillna(0)
164
169
 
165
- validate_dataframe(res_df, FunctionProfileSchema, str(func_file))
170
+ if res_df.shape[0] > 0:
171
+ validate_dataframe(res_df, FunctionProfileSchema, str(func_file))
166
172
 
167
- count_df = res_df[["read_count"]]
168
- count_df.columns = [run_acc]
173
+ count_df = pd.DataFrame(res_df[["read_count"]]).rename(
174
+ columns={"read_count": run_acc}
175
+ )
169
176
 
170
- depth_df = res_df[["coverage_depth"]]
171
- depth_df.columns = [run_acc]
177
+ depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(
178
+ columns={"coverage_depth": run_acc}
179
+ )
172
180
 
173
- breadth_df = res_df[["coverage_breadth"]]
174
- breadth_df.columns = [run_acc]
181
+ breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(
182
+ columns={"coverage_breadth": run_acc}
183
+ )
175
184
 
176
185
  return count_df, depth_df, breadth_df
177
186
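
For clarity, a small sketch (made-up data and a placeholder accession) of the renaming pattern used above: each single-column slice is copied into its own DataFrame and the column is renamed to the run accession, instead of assigning .columns on a slice of res_df:

import pandas as pd

res_df = pd.DataFrame(
    {"read_count": [10, 3], "coverage_depth": [1.5, 0.2], "coverage_breadth": [0.9, 0.4]},
    index=["PF00001", "PF00002"],
)
run_acc = "ERR0000001"  # placeholder accession
count_df = pd.DataFrame(res_df[["read_count"]]).rename(columns={"read_count": run_acc})
print(count_df.columns.tolist())  # ['ERR0000001']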
 
@@ -423,7 +432,9 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
423
432
  curr_df = pd.read_csv(summary, sep="\t", index_col=0)
424
433
  res_df = res_df.join(curr_df, how="outer")
425
434
  res_df = res_df.fillna(0)
426
- res_df = res_df.astype(int if table_type == "count" else float)
435
+ res_df = res_df.astype(
436
+ int if table_type == "read-count" else float
437
+ )
427
438
 
428
439
  res_df = res_df.reindex(sorted(res_df.columns), axis=1)
429
440
  res_df.to_csv(
@@ -14,23 +14,40 @@
14
14
  # See the License for the specific language governing permissions and
15
15
  # limitations under the License.
16
16
 
17
+ import shutil
18
+ from shutil import SameFileError
19
+
17
20
  import argparse
18
21
  from collections import defaultdict
19
22
  import pathlib
20
23
  import logging
24
+
25
+ import click
21
26
  import requests
27
+ from typing import Union, Dict, List, Literal
28
+ from pathlib import Path
22
29
 
23
30
  import pandas as pd
24
31
  import pyfastx
25
32
 
33
+ from mgnify_pipelines_toolkit.constants.tax_ranks import (
34
+ _SILVA_TAX_RANKS,
35
+ _PR2_TAX_RANKS,
36
+ SHORT_PR2_TAX_RANKS,
37
+ SHORT_TAX_RANKS,
38
+ )
39
+
26
40
  logging.basicConfig(level=logging.DEBUG)
27
41
 
28
- URL = "https://www.ebi.ac.uk/ena/portal/api/search?result"
29
- RUNS_URL = f"{URL}=read_run&fields=secondary_study_accession,sample_accession&limit=10&format=json&download=false"
30
- SAMPLES_URL = f"{URL}=sample&fields=lat,lon,collection_date,depth&limit=10&format=json&download=false"
42
+ URL = "https://www.ebi.ac.uk/ena/portal/api/search"
31
43
  HEADERS = {"Accept": "application/json"}
32
44
 
33
45
 
46
+ @click.group()
47
+ def cli():
48
+ pass
49
+
50
+
34
51
  def parse_args():
35
52
 
36
53
  parser = argparse.ArgumentParser()
@@ -61,28 +78,72 @@ def parse_args():
61
78
  return input_path, runs, output
62
79
 
63
80
 
64
- def get_metadata_from_run_acc(run_acc):
81
+ def get_ena_metadata_from_run_acc(run_acc: str) -> Union[pd.DataFrame, bool]:
82
+ """
83
+ Fetches and processes metadata from ENA using the provided run accession.
84
+
85
+ This function queries the European Nucleotide Archive (ENA) API to retrieve
86
+ metadata related to the specified run accession. Once the metadata is
87
+ retrieved, it performs cleaning and formatting to return the data in a
88
+ structured pandas DataFrame.
65
89
 
66
- query = f"{RUNS_URL}&includeAccessions={run_acc}"
67
- res_run = requests.get(query, headers=HEADERS)
90
+ Parameters:
91
+ run_acc: str
92
+ Accession identifier for the run to query from ENA.
93
+
94
+ Returns:
95
+ Union[pd.DataFrame, bool]
96
+ A pandas DataFrame containing the retrieved and processed metadata
97
+ if the query is successful, or False if the data for the given run
98
+ accession is not found.
99
+ """
100
+
101
+ run_fields_list = [
102
+ "secondary_study_accession",
103
+ "sample_accession",
104
+ "instrument_model",
105
+ ]
106
+ run_query_args = {
107
+ "result": "read_run",
108
+ "includeAccessions": run_acc,
109
+ "fields": ",".join(run_fields_list),
110
+ "limit": 10,
111
+ "format": "json",
112
+ "download": "false",
113
+ }
114
+ res_run = requests.get(URL, headers=HEADERS, params=run_query_args)
68
115
 
69
116
  if res_run.status_code != 200:
70
117
  logging.error(f"Data not found for run {run_acc}")
71
118
  return False
72
119
 
73
120
  sample_acc = res_run.json()[0]["sample_accession"]
74
-
75
- query = f"{SAMPLES_URL}&includeAccessions={sample_acc}"
76
- res_sample = requests.get(query, headers=HEADERS)
121
+ sample_fields_list = [
122
+ "lat",
123
+ "lon",
124
+ "collection_date",
125
+ "depth",
126
+ "center_name",
127
+ "temperature",
128
+ "salinity",
129
+ "country",
130
+ ]
131
+ sample_query_args = {
132
+ "result": "sample",
133
+ "includeAccessions": sample_acc,
134
+ "fields": ",".join(sample_fields_list),
135
+ "limit": 10,
136
+ "format": "json",
137
+ "download": "false",
138
+ }
139
+ res_sample = requests.get(URL, headers=HEADERS, params=sample_query_args)
77
140
 
78
141
  full_res_dict = res_run.json()[0] | res_sample.json()[0]
79
142
 
80
- fields_to_clean = ["lat", "lon", "depth"]
81
-
82
- for field in fields_to_clean:
83
- val = full_res_dict[field]
84
- if val == "":
85
- full_res_dict[field] = "NA"
143
+ # Turn empty values into NA
144
+ full_res_dict = {
145
+ field: "NA" if val == "" else val for field, val in full_res_dict.items()
146
+ }
86
147
 
87
148
  if full_res_dict["collection_date"] == "":
88
149
  full_res_dict["collectionDate"] = "NA"
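
For reference, a self-contained sketch of the new query style, which lets requests build and URL-encode the query string from a params dict instead of hand-concatenating it; the run accession here is a placeholder:

import requests

URL = "https://www.ebi.ac.uk/ena/portal/api/search"
HEADERS = {"Accept": "application/json"}
run_acc = "ERR0000001"  # placeholder accession

run_query_args = {
    "result": "read_run",
    "includeAccessions": run_acc,
    "fields": "secondary_study_accession,sample_accession,instrument_model",
    "limit": 10,
    "format": "json",
    "download": "false",
}
res_run = requests.get(URL, headers=HEADERS, params=run_query_args)
if res_run.status_code != 200:
    raise SystemExit(f"Data not found for run {run_acc}")
print(res_run.json()[0]["sample_accession"])
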
@@ -92,38 +153,74 @@ def get_metadata_from_run_acc(run_acc):
92
153
  del full_res_dict["collection_date"]
93
154
 
94
155
  res_df = pd.DataFrame(full_res_dict, index=[0])
95
- res_df.columns = [
96
- "RunID",
97
- "SampleID",
98
- "StudyID",
99
- "decimalLongitude",
100
- "depth",
101
- "decimalLatitude",
102
- "collectionDate",
103
- ]
156
+ res_df = res_df.rename(
157
+ columns={
158
+ "run_accession": "RunID",
159
+ "sample_accession": "SampleID",
160
+ "secondary_study_accession": "StudyID",
161
+ "lon": "decimalLongitude",
162
+ "lat": "decimalLatitude",
163
+ "instrument_model": "seq_meth",
164
+ }
165
+ )
104
166
 
105
167
  return res_df
106
168
 
107
169
 
108
- def get_all_metadata_from_runs(runs):
170
+ def get_all_ena_metadata_from_runs(runs: List[str]) -> Dict[str, pd.DataFrame]:
171
+ """
172
+ Fetches ENA metadata for a list of run accessions.
109
173
 
110
- run_metadata_dict = defaultdict(dict)
174
+ This function retrieves metadata from the European Nucleotide Archive (ENA)
175
+ for the provided list of run accessions. For each valid run accession, the
176
+ metadata is parsed and stored in a dictionary, where the key is the run
177
+ accession and the value is a DataFrame containing the metadata.
178
+
179
+ Parameters:
180
+ runs (List[str]): A list of strings representing run accessions for which
181
+ the metadata needs to be retrieved.
182
+
183
+ Returns:
184
+ Dict[str, pd.DataFrame]: A dictionary where keys are run accessions and
185
+ values are DataFrames containing the corresponding ENA metadata.
186
+ """
187
+ run_metadata_dict = defaultdict(pd.DataFrame)
111
188
 
112
189
  for run in runs:
113
- res_df = get_metadata_from_run_acc(run)
190
+ res_df = get_ena_metadata_from_run_acc(run)
114
191
  if res_df is not False:
115
192
  run_metadata_dict[run] = res_df
116
193
 
117
194
  return run_metadata_dict
118
195
 
119
196
 
120
- def cleanup_taxa(df):
121
-
122
- df.pop("Kingdom")
123
- cleaned_df = df.rename(columns={"Superkingdom": "Kingdom", "asv": "ASVID"})
197
+ def cleanup_asv_taxa(df: pd.DataFrame, db: Literal["SILVA", "PR2"]) -> pd.DataFrame:
198
+ """
199
+ Cleans ASV dataframe by renaming columns, handling empty fields, and adding
200
+ constant metadata fields.
201
+
202
+ Parameters:
203
+ df : pd.DataFrame
204
+ Input DataFrame containing ASV data to clean
205
+ db : Literal["SILVA", "PR2"]
206
+ Reference database used for taxonomic ranks
207
+ """
208
+
209
+ # Rename some columns
210
+ cleaned_df = df.rename(
211
+ columns={
212
+ "asv": "ASVID",
213
+ "count": "MeasurementValue",
214
+ "center_name": "InstitutionCode",
215
+ }
216
+ )
124
217
 
125
- ranks = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
218
+ if db == "SILVA":
219
+ ranks = _SILVA_TAX_RANKS
220
+ else:
221
+ ranks = _PR2_TAX_RANKS
126
222
 
223
+ # Turn empty taxa into NA
127
224
  for rank in ranks:
128
225
  cleaned_df[rank] = cleaned_df[rank].apply(
129
226
  lambda x: x.split("__")[1] if pd.notnull(x) else "NA"
@@ -132,6 +229,12 @@ def cleanup_taxa(df):
132
229
  for rank in ranks:
133
230
  cleaned_df[rank] = cleaned_df[rank].apply(lambda x: x if x != "" else "NA")
134
231
 
232
+ # Add a few constant columns
233
+ cleaned_df["MeasurementUnit"] = ["Number of reads"] * len(cleaned_df)
234
+ cleaned_df["ASVCaller"] = ["DADA2"] * len(cleaned_df)
235
+ cleaned_df["ReferenceDatabase"] = [db] * len(cleaned_df)
236
+ cleaned_df["TaxAnnotationTool"] = ["MAPseq"] * len(cleaned_df)
237
+ # Final order of fields in output csv
135
238
  cleaned_df = cleaned_df[
136
239
  [
137
240
  "ASVID",
@@ -141,14 +244,25 @@ def cleanup_taxa(df):
141
244
  "decimalLongitude",
142
245
  "decimalLatitude",
143
246
  "depth",
247
+ "temperature",
248
+ "salinity",
144
249
  "collectionDate",
145
- "Kingdom",
146
- "Phylum",
147
- "Class",
148
- "Order",
149
- "Family",
150
- "Genus",
151
- "Species",
250
+ "seq_meth",
251
+ "country",
252
+ "InstitutionCode",
253
+ "amplifiedRegion",
254
+ "ASVCaller",
255
+ "ReferenceDatabase",
256
+ "TaxAnnotationTool",
257
+ ]
258
+ + ranks
259
+ + [
260
+ "MeasurementUnit",
261
+ "MeasurementValue",
262
+ "dbhit",
263
+ "dbhitIdentity",
264
+ "dbhitStart",
265
+ "dbhitEnd",
152
266
  "ASVSeq",
153
267
  ]
154
268
  ]
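
A toy example (fabricated values, truncated rank list) of the two cleanup passes shown in the hunks above: the first strips the "k__"/"p__" style prefixes and turns nulls into "NA", the second turns any remaining empty strings into "NA":

import pandas as pd

ranks = ["Kingdom", "Phylum"]
cleaned_df = pd.DataFrame(
    {"Kingdom": ["k__Bacteria", None], "Phylum": ["p__", "p__Ascomycota"]}
)
for rank in ranks:
    cleaned_df[rank] = cleaned_df[rank].apply(
        lambda x: x.split("__")[1] if pd.notnull(x) else "NA"
    )
for rank in ranks:
    cleaned_df[rank] = cleaned_df[rank].apply(lambda x: x if x != "" else "NA")
print(cleaned_df.to_dict("list"))
# {'Kingdom': ['Bacteria', 'NA'], 'Phylum': ['NA', 'Ascomycota']}
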
@@ -156,27 +270,140 @@ def cleanup_taxa(df):
156
270
  return cleaned_df
157
271
 
158
272
 
159
- def get_asv_dict(runs_df, root_path):
273
+ def cleanup_closedref_taxa(
274
+ df: pd.DataFrame, db: Literal["SILVA-SSU", "PR2"]
275
+ ) -> pd.DataFrame:
276
+ """
277
+ Cleans closed-reference taxonomy dataframe by renaming columns, handling empty fields,
278
+ and adding constant metadata fields.
279
+
280
+ Similar to cleanup_asv_taxa() but specifically handles closed-reference taxonomy data
281
+ rather than ASV data. Performs column renaming, empty field handling,
282
+ and adds relevant metadata columns.
283
+
284
+ Parameters:
285
+ df : pd.DataFrame
286
+ Input DataFrame containing closed-reference taxonomy data to clean
287
+ db : Literal["SILVA-SSU", "PR2"]
288
+ Reference database used for taxonomic ranks
289
+
290
+ Returns:
291
+ pd.DataFrame
292
+ Cleaned and formatted DataFrame with standardized column names and metadata fields
293
+ """
294
+
295
+ cleaned_df = df.rename(
296
+ columns={
297
+ "count": "MeasurementValue",
298
+ "center_name": "InstitutionCode",
299
+ }
300
+ )
301
+
302
+ if db == "SILVA-SSU":
303
+ ranks = _SILVA_TAX_RANKS
304
+ else:
305
+ ranks = _PR2_TAX_RANKS
306
+
307
+ # Turn empty taxa into NA
308
+ for rank in ranks:
309
+ cleaned_df[rank] = cleaned_df[rank].apply(lambda x: x if x != "" else "NA")
310
+
311
+ # Add a MeasurementUnit Column for the read count for each asv
312
+ cleaned_df["MeasurementUnit"] = ["Number of reads"] * len(cleaned_df)
313
+ cleaned_df["ReferenceDatabase"] = [db] * len(cleaned_df)
314
+ cleaned_df["TaxAnnotationTool"] = ["MAPseq"] * len(cleaned_df)
315
+
316
+ # Final order of fields in output csv
317
+ cleaned_df = cleaned_df[
318
+ [
319
+ "StudyID",
320
+ "SampleID",
321
+ "RunID",
322
+ "decimalLongitude",
323
+ "decimalLatitude",
324
+ "depth",
325
+ "temperature",
326
+ "salinity",
327
+ "collectionDate",
328
+ "seq_meth",
329
+ "country",
330
+ "InstitutionCode",
331
+ "ReferenceDatabase",
332
+ "TaxAnnotationTool",
333
+ ]
334
+ + ranks
335
+ + [
336
+ "MeasurementUnit",
337
+ "MeasurementValue",
338
+ ]
339
+ ]
340
+
341
+ return cleaned_df
342
+
343
+
344
+ def get_asv_dict(
345
+ runs_df: pd.DataFrame, root_path: Path, db: Literal["SILVA", "PR2"]
346
+ ) -> Dict[str, pd.DataFrame]:
347
+ """
348
+ Generates a dictionary containing ASV (Amplicon Sequence Variant) data for each run.
349
+
350
+ This function processes sequencing run data, extracts relevant information, and
351
+ aggregates it into a dictionary. Each key in the dictionary corresponds to a
352
+ unique run ID, and its value is a DataFrame containing detailed ASV data such
353
+ as taxonomy assignments, sequence read counts, MAPseq hit data, and the ASV sequences
354
+ themselves. The function filters runs to only include those with a
355
+ complete analysis status ("all_results").
356
+
357
+ Arguments:
358
+ runs_df (pd.DataFrame): A DataFrame containing results status info about the runs.
359
+ root_path (Path): The base directory path where analysis results files are stored.
360
+ db (Literal["SILVA", "PR2"]): Specifies the database used for taxonomy assignment
361
+ (e.g., SILVA or PR2).
362
+
363
+ Returns:
364
+ Dict[str, pd.DataFrame]: A dictionary where keys are run IDs and values are
365
+ DataFrames containing merged ASV data for corresponding runs.
366
+ """
160
367
 
161
368
  asv_dict = {}
162
369
  for i in range(0, len(runs_df)):
163
370
  run_acc = runs_df.loc[i, "run"]
164
- status = runs_df.loc[i, "status"]
371
+ analysis_status = runs_df.loc[i, "status"]
165
372
 
166
- if status != "all_results":
373
+ # Only keep runs that have all_results i.e. includes ASV results
374
+ if analysis_status != "all_results":
167
375
  continue
168
376
 
377
+ # Raw MAPseq taxonomy assignment files
378
+ # Used to extract hit data like the exact dbhit, %identity, and matching coords
379
+ mapseq_file = sorted(
380
+ list(
381
+ (
382
+ pathlib.Path(root_path)
383
+ / run_acc
384
+ / "taxonomy-summary"
385
+ / f"DADA2-{db}"
386
+ ).glob(f"*_DADA2-{db}.mseq")
387
+ )
388
+ )[0]
389
+ mapseq_df = pd.read_csv(mapseq_file, sep="\t", usecols=[0, 1, 3, 9, 10])
390
+ mapseq_df.columns = ["asv", "dbhit", "dbhitIdentity", "dbhitStart", "dbhitEnd"]
391
+
392
+ # Processed MAPseq taxonomy assignment files
169
393
  tax_file = sorted(
170
394
  list(
171
395
  (pathlib.Path(root_path) / run_acc / "asv").glob(
172
- "*_DADA2-SILVA_asv_tax.tsv"
396
+ f"*_DADA2-{db}_asv_tax.tsv"
173
397
  )
174
398
  )
175
399
  )[0]
400
+ run_tax_df = pd.read_csv(tax_file, sep="\t")
401
+
402
+ # ASV read count files
176
403
  count_files = sorted(
177
404
  list(pathlib.Path(f"{root_path}/{run_acc}/asv").glob("*S-V*/*.tsv"))
178
405
  )
179
-
406
+ # ASV sequence FASTA files
180
407
  asv_fasta_file = sorted(
181
408
  list(pathlib.Path(f"{root_path}/{run_acc}/asv").glob("*_asv_seqs.fasta"))
182
409
  )[0]
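
For orientation, a sketch of the new MAPseq read shown above; the path is illustrative only (the real file is located via the glob on the taxonomy-summary directory), and the selected columns are renamed exactly as in the code above:

import pandas as pd
from pathlib import Path

mapseq_file = Path("analyses/ERR0000001/taxonomy-summary/DADA2-SILVA/ERR0000001_DADA2-SILVA.mseq")
mapseq_df = pd.read_csv(mapseq_file, sep="\t", usecols=[0, 1, 3, 9, 10])
mapseq_df.columns = ["asv", "dbhit", "dbhitIdentity", "dbhitStart", "dbhitEnd"]
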
@@ -184,32 +411,163 @@ def get_asv_dict(runs_df, root_path):
184
411
  asv_fasta_dict = {name: seq for name, seq in fasta}
185
412
  asv_fasta_df = pd.DataFrame(asv_fasta_dict, index=["ASVSeq"]).transpose()
186
413
  asv_fasta_df["asv"] = asv_fasta_df.index
187
- run_tax_df = pd.read_csv(tax_file, sep="\t")
188
414
 
189
415
  count_dfs = []
190
416
 
191
417
  for count_file in count_files:
418
+ amp_region = count_file.stem.split("_")[1]
192
419
  count_df = pd.read_csv(count_file, sep="\t")
420
+ count_df["amplifiedRegion"] = [amp_region] * len(count_df)
193
421
  count_dfs.append(count_df)
194
422
 
195
- all_ampregions_count_df = pd.concat(count_dfs)
196
- merged_df = all_ampregions_count_df.merge(
423
+ # Merge counts into one DF in case there are multiple amplified regions...
424
+ all_amplified_regions_count_df = pd.concat(count_dfs)
425
+
426
+ # ...then merge with taxonomy dataframes...
427
+ merged_df = all_amplified_regions_count_df.merge(
197
428
  run_tax_df, left_on="asv", right_on="ASV"
198
429
  )
430
+ # ...then merge with MAPseq columns...
431
+ merged_df = merged_df.merge(mapseq_df, on="asv")
432
+
433
+ # ...then merge with ASV FASTA sequences
199
434
  merged_df.pop("ASV")
200
435
  run_col = [run_acc] * len(merged_df)
201
436
  merged_df["RunID"] = run_col
202
437
  merged_df = merged_df.merge(asv_fasta_df, on="asv")
438
+
439
+ # Assign final DF to run_acc in dictionary
203
440
  asv_dict[run_acc] = merged_df
204
441
 
205
442
  return asv_dict
206
443
 
207
444
 
208
- def main():
445
+ def get_closedref_dict(
446
+ runs_df: pd.DataFrame, root_path: Path, db: Literal["SILVA-SSU", "PR2"]
447
+ ) -> Dict[str, pd.DataFrame]:
448
+ """
449
+ Generates a dictionary of closed-reference taxonomy data for multiple sequencing runs.
450
+
451
+ Processes Krona-formatted taxonomy files from analysis results and converts them
452
+ to DataFrames mapping taxonomic ranks to abundances. Returns dictionary with run
453
+ accessions as keys and said DataFrames as values.
209
454
 
210
- input_path, runs, output = parse_args()
455
+ Arguments:
456
+ runs_df (pd.DataFrame): A DataFrame containing results status info about the runs.
457
+ root_path (Path): The base directory path where analysis results files are stored.
458
+ db (Literal["SILVA", "PR2"]): Specifies the database used for taxonomy assignment
459
+ (e.g., SILVA or PR2).
211
460
 
212
- root_path = pathlib.Path(input_path)
461
+ Returns:
462
+ Dict[str, pd.DataFrame]: A dictionary mapping each run accession (str) to its
463
+ corresponding taxonomy DataFrame (pd.DataFrame). Each DataFrame contains taxonomic
464
+ abundance counts.
465
+ """
466
+
467
+ if db == "SILVA-SSU":
468
+ ranks = _SILVA_TAX_RANKS
469
+ short_ranks = SHORT_TAX_RANKS
470
+ else:
471
+ ranks = _PR2_TAX_RANKS
472
+ short_ranks = SHORT_PR2_TAX_RANKS
473
+
474
+ closedref_dict = {}
475
+ for i in range(0, len(runs_df)):
476
+ run_acc = runs_df.loc[i, "run"]
477
+ status = runs_df.loc[i, "status"]
478
+
479
+ if status != "all_results":
480
+ continue
481
+
482
+ # Krona formatted results
483
+ kronatxt_file = sorted(
484
+ list(
485
+ (pathlib.Path(root_path) / run_acc / "taxonomy-summary" / f"{db}").glob(
486
+ "*.txt"
487
+ )
488
+ )
489
+ )[0]
490
+
491
+ column_names = ["count"] + ranks
492
+ tax_df = pd.read_csv(kronatxt_file, sep="\t", names=column_names)
493
+
494
+ # Clean up empty ranks
495
+ tax_df = tax_df.fillna("NA")
496
+ krona_taxranks = [rank + "__" for rank in short_ranks]
497
+ tax_df = tax_df.map(lambda x: "NA" if x in krona_taxranks else x)
498
+
499
+ run_col = [run_acc] * len(tax_df)
500
+ tax_df["RunID"] = run_col
501
+
502
+ # Assign final DF to run_acc in dictionary
503
+ closedref_dict[run_acc] = tax_df
504
+
505
+ return closedref_dict
506
+
507
+
508
+ @cli.command(
509
+ "summarise",
510
+ options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
511
+ short_help='Generate "DarwinCore-ready" study-level summaries of amplicon analysis results.',
512
+ )
513
+ @click.option(
514
+ "-r",
515
+ "--runs",
516
+ required=True,
517
+ help="CSV file containing successful analyses generated by the pipeline",
518
+ type=click.Path(exists=True, path_type=Path, dir_okay=False),
519
+ )
520
+ @click.option(
521
+ "-a",
522
+ "--analyses_dir",
523
+ required=True,
524
+ help="Input directory to where all the individual analyses subdirectories for summarising",
525
+ type=click.Path(exists=True, path_type=Path, file_okay=False),
526
+ )
527
+ @click.option(
528
+ "-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
529
+ )
530
+ def generate_dwcready_summaries(
531
+ runs: Path, analyses_dir: Path, output_prefix: str
532
+ ) -> None:
533
+ """
534
+ Generate Darwin Core-ready study-level summaries of amplicon analysis results.
535
+
536
+ This function processes amplicon analysis results from both ASV (DADA2) and closed-reference
537
+ analyses to create "Darwin Core Ready" summary files. The function handles both
538
+ SILVA and PR2 database results, combining taxonomy assignments with ENA metadata.
539
+
540
+ For ASV data, files are generated per amplified region - that means with SILVA and PR2 as
541
+ reference databases, one CSV is created per amplified region. For example:
542
+ - With one amplified region (e.g. 16S-V3-V4):
543
+ - {output_prefix}_DADA2_SILVA_16S-V3-V4_dwcready.csv
544
+ - {output_prefix}_DADA2_PR2_16S-V3-V4_dwcready.csv
545
+ - With two amplified regions (e.g. 16S-V3-V4 and 18S-V4):
546
+ - {output_prefix}_DADA2_SILVA_16S-V3-V4_dwcready.csv
547
+ - {output_prefix}_DADA2_PR2_16S-V3-V4_dwcready.csv
548
+ - {output_prefix}_DADA2_SILVA_18S-V4_dwcready.csv
549
+ - {output_prefix}_DADA2_PR2_18S-V4_dwcready.csv
550
+
551
+ For closed-reference data, one file per database is generated regardless of amplified regions:
552
+ - {output_prefix}_closedref_SILVA-SSU_dwcready.csv
553
+ - {output_prefix}_closedref_PR2_dwcready.csv
554
+
555
+ Args:
556
+ runs (Path): Path to CSV file containing successful analyses generated by the pipeline.
557
+ The CSV should have columns for run accessions and analysis results status.
558
+ analyses_dir (Path): Input directory containing all individual analyses subdirectories
559
+ to be summarized. Each subdirectory should contain taxonomy-summary
560
+ and ASV results.
561
+ output_prefix (str): Prefix to be used for the output summary files.
562
+
563
+ Returns:
564
+ None: Writes output CSV files with Darwin Core-compliant summaries of amplicon
565
+ analysis results. The total number of output files depends on:
566
+ 1. For ASV data: Number of amplified regions × Number of reference databases
567
+ 2. For closed-reference: Number of reference databases
568
+ """
569
+
570
+ root_path = pathlib.Path(analyses_dir)
213
571
 
214
572
  if not root_path.exists():
215
573
  logging.error(f"Results path does not exist: {root_path}")
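
Usage-wise, the new subcommand is exposed through the existing dwc_summary_generator entry point (see entry_points.txt below), e.g. "dwc_summary_generator summarise -r completed_runs.csv -a analyses/ -p STUDY" with placeholder file names. And as a small sketch of the Krona-table cleanup performed in get_closedref_dict() above (fabricated values, truncated rank lists; DataFrame.map requires pandas 2.1+):

import pandas as pd

short_ranks = ["sk", "k"]
tax_df = pd.DataFrame(
    {"count": [10], "Superkingdom": ["sk__Bacteria"], "Kingdom": ["k__"]}
)
tax_df = tax_df.fillna("NA")
krona_taxranks = [rank + "__" for rank in short_ranks]  # ["sk__", "k__"]
# cells holding only a bare rank prefix are treated as missing
tax_df = tax_df.map(lambda x: "NA" if x in krona_taxranks else x)
print(tax_df.iloc[0].tolist())  # [10, 'sk__Bacteria', 'NA']
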
@@ -218,23 +576,174 @@ def main():
218
576
  runs_df = pd.read_csv(runs, names=["run", "status"])
219
577
 
220
578
  all_runs = runs_df.run.to_list()
221
- run_metadata_dict = get_all_metadata_from_runs(all_runs)
222
- asv_dict = get_asv_dict(runs_df, root_path)
579
+ run_metadata_dict = get_all_ena_metadata_from_runs(all_runs)
580
+
581
+ # Generate DwC-ready files for ASV results
582
+ asv_dbs = ["SILVA", "PR2"]
583
+ for db in asv_dbs:
584
+
585
+ asv_dict = get_asv_dict(runs_df, root_path, db)
586
+ all_merged_df = []
587
+
588
+ for run in all_runs:
589
+ if run in asv_dict.keys() and run in run_metadata_dict.keys():
590
+ run_asv_data = asv_dict[run]
591
+ run_metadata = run_metadata_dict[run]
592
+ run_merged_result = run_metadata.merge(run_asv_data, on="RunID")
593
+ all_merged_df.append(run_merged_result)
594
+
595
+ final_df = pd.concat(all_merged_df, ignore_index=True)
596
+ final_df = cleanup_asv_taxa(final_df, db)
597
+
598
+ # get all amplified regions present in the study
599
+ present_amplified_regions = final_df["amplifiedRegion"].unique()
600
+
601
+ # generate a DataFrame and then write a CSV file on an amplifiedRegion basis
602
+ for amplified_region in present_amplified_regions:
603
+ amplified_region_df = final_df.loc[
604
+ final_df["amplifiedRegion"] == amplified_region
605
+ ]
606
+ amplified_region_df.to_csv(
607
+ f"{output_prefix}_DADA2_{db}_{amplified_region}_dwcready.csv",
608
+ index=False,
609
+ na_rep="NA",
610
+ )
223
611
 
224
- all_merged_df = []
612
+ # Generate DwC-ready files for closed reference results
613
+ closedref_dbs = ["SILVA-SSU", "PR2"]
614
+ for db in closedref_dbs:
225
615
 
226
- for run in all_runs:
227
- if run in asv_dict.keys() and run in run_metadata_dict.keys():
228
- run_asv_data = asv_dict[run]
229
- run_metadata = run_metadata_dict[run]
230
- run_merged_result = run_metadata.merge(run_asv_data, on="RunID")
231
- all_merged_df.append(run_merged_result)
616
+ closedref_dict = get_closedref_dict(runs_df, root_path, db)
617
+ all_merged_df = []
232
618
 
233
- final_df = pd.concat(all_merged_df, ignore_index=True)
234
- final_df = cleanup_taxa(final_df)
619
+ for run in all_runs:
620
+ if run in closedref_dict.keys() and run in run_metadata_dict.keys():
621
+ run_closedref_data = closedref_dict[run]
622
+ run_metadata = run_metadata_dict[run]
623
+ run_merged_result = run_metadata.merge(run_closedref_data, on="RunID")
624
+ all_merged_df.append(run_merged_result)
235
625
 
236
- final_df.to_csv(f"{output}_dwcready.csv", index=False, na_rep="NA")
626
+ final_df = pd.concat(all_merged_df, ignore_index=True)
627
+ final_df = cleanup_closedref_taxa(final_df, db)
628
+
629
+ final_df.to_csv(
630
+ f"{output_prefix}_closedref_{db}_dwcready.csv", index=False, na_rep="NA"
631
+ )
632
+
633
+
634
+ def organise_dwcr_summaries(all_study_summaries: List[Path]) -> defaultdict[List]:
635
+ """
636
+ Organizes Darwin Core-ready summary files into groups based on their analysis type and database.
637
+
638
+ This function processes paths to Darwin Core-ready summary files and organizes them into a
639
+ dictionary based on their type (ASV/DADA2 or closed-reference) and database used. The function
640
+ handles the two types of summaries differently:
641
+
642
+ 1. ASV/DADA2 summaries:
643
+ - Label includes analysis type (DADA2), database, and amplified region
644
+ - Example label: "DADA2_SILVA_16S-V3-V4"
645
+ 2. Closed-reference summaries:
646
+ - Label only includes analysis type and database
647
+ - Example label: "closedref_SILVA-SSU"
648
+
649
+ Args:
650
+ all_study_summaries (List[Path]): List of paths to Darwin Core-ready summary files
651
+ to be organized.
652
+
653
+ Returns:
654
+ defaultdict[List]: Dictionary where keys are summary labels (combining analysis type,
655
+ database, and for ASVs, amplified region) and values are lists of paths to
656
+ corresponding summary files.
657
+ """
658
+
659
+ summaries_dict = defaultdict(list)
660
+
661
+ for summary_path in all_study_summaries:
662
+ summary_filename = summary_path.stem
663
+
664
+ temp_lst = summary_filename.split("_")
665
+ if "DADA2" in summary_filename:
666
+ summary_db_label = "_".join(
667
+ temp_lst[1:4]
668
+ ) # For ASVs we need to include the amplified region in the label
669
+ else:
670
+ summary_db_label = "_".join(
671
+ temp_lst[1:3]
672
+ ) # For closed reference, just the db_label is needed
673
+ summaries_dict[summary_db_label].append(summary_path)
674
+
675
+ return summaries_dict
676
+
677
+
678
+ @cli.command(
679
+ "merge",
680
+ options_metavar="-a <analyses_dir> -p <output_prefix>",
681
+ short_help="Merge multiple DwC-ready summaries of amplicon analysis.",
682
+ )
683
+ @click.option(
684
+ "-a",
685
+ "--analyses_dir",
686
+ required=True,
687
+ help="Input directory where all the individual analyses subdirectories are for merging",
688
+ type=click.Path(exists=True, file_okay=False),
689
+ )
690
+ @click.option(
691
+ "-p",
692
+ "--output_prefix",
693
+ required=True,
694
+ help="Prefix to merged summary files",
695
+ type=str,
696
+ )
697
+ def merge_dwcr_summaries(analyses_dir: str, output_prefix: str) -> None:
698
+ """
699
+ Merges multiple Darwin Core-ready summary files into consolidated summaries by type.
700
+
701
+ This function takes a directory containing multiple Darwin Core-ready summary files
702
+ and merges them based on their analysis type (ASV/DADA2 or closed-reference) and
703
+ reference database. The function processes two types of summaries:
704
+
705
+ 1. ASV/DADA2 summaries:
706
+ - Merged by analysis type, database, and amplified region
707
+ - Output example: "{prefix}_DADA2_SILVA_16S-V3-V4_dwcready.csv"
708
+ 2. Closed-reference summaries:
709
+ - Merged by analysis type and database only
710
+ - Output example: "{prefix}_closedref_SILVA-SSU_dwcready.csv"
711
+
712
+ If only one summary file exists for a particular combination, it is copied to the
713
+ output location instead of being merged.
714
+
715
+ Args:
716
+ analyses_dir (str): Path to directory containing Darwin Core-ready summary files
717
+ (files ending in "_dwcready.csv")
718
+ output_prefix (str): Prefix to use for merged output files
719
+
720
+ Returns:
721
+ None: Writes merged summary files to current directory, with names following the
722
+ pattern "{output_prefix}_{analysis-type}_{database}[_{region}]_dwcready.csv"
723
+ """
724
+
725
+ all_dwcr_summaries = Path(analyses_dir).glob("*_dwcready.csv")
726
+
727
+ summaries_dict = organise_dwcr_summaries(all_dwcr_summaries)
728
+
729
+ for db_label, summaries in summaries_dict.items():
730
+ merged_summary_name = f"{output_prefix}_{db_label}_dwcready.csv"
731
+ if len(summaries) > 1:
732
+ res_df = pd.read_csv(summaries[0])
733
+ for summary in summaries[1:]:
734
+ curr_df = pd.read_csv(summary)
735
+ res_df = pd.concat([res_df, curr_df])
736
+
737
+ res_df.to_csv(merged_summary_name, index=False, na_rep="NA")
738
+ elif len(summaries) == 1:
739
+ logging.info(
740
+ f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
741
+ )
742
+ try:
743
+ shutil.copyfile(summaries[0], merged_summary_name)
744
+ except SameFileError:
745
+ pass
237
746
 
238
747
 
239
748
  if __name__ == "__main__":
240
- main()
749
+ cli()
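
To illustrate how organise_dwcr_summaries() groups per-study files before merging (hypothetical file names that follow the naming scheme documented above); the merge itself would be invoked as e.g. "dwc_summary_generator merge -a summaries/ -p ALLSTUDIES":

from pathlib import Path

paths = [
    Path("STUDY1_DADA2_SILVA_16S-V3-V4_dwcready.csv"),
    Path("STUDY1_DADA2_PR2_16S-V3-V4_dwcready.csv"),
    Path("STUDY1_closedref_SILVA-SSU_dwcready.csv"),
]
for p in paths:
    parts = p.stem.split("_")
    label = "_".join(parts[1:4]) if "DADA2" in p.stem else "_".join(parts[1:3])
    print(label)
# DADA2_SILVA_16S-V3-V4
# DADA2_PR2_16S-V3-V4
# closedref_SILVA-SSU
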
@@ -21,7 +21,7 @@ TAXDB_LABELS = ["SILVA-SSU", "SILVA-LSU", "PR2", "UNITE", "ITSoneDB"]
21
21
  ASV_TAXDB_LABELS = ["DADA2-SILVA", "DADA2-PR2"]
22
22
 
23
23
  # taxonomy_summary labels for Raw Reads Analysis Pipeline
24
- RRAP_TAXDB_LABELS = ['SILVA-SSU', 'SILVA-LSU', 'mOTUs']
24
+ RRAP_TAXDB_LABELS = ["silva-ssu", "silva-lsu", "motus"]
25
25
 
26
26
  # function_summary labels for Raw Reads Analysis Pipeline
27
- RRAP_FUNCDB_LABELS = ['Pfam-A']
27
+ RRAP_FUNCDB_LABELS = ["pfam"]
@@ -35,15 +35,7 @@ _PR2_TAX_RANKS = [
35
35
  "Genus",
36
36
  "Species",
37
37
  ]
38
- _MOTUS_TAX_RANKS = [
39
- 'Kingdom',
40
- 'Phylum',
41
- 'Class',
42
- 'Order',
43
- 'Family',
44
- 'Genus',
45
- 'Species'
46
- ]
38
+ _MOTUS_TAX_RANKS = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
47
39
 
48
40
  SHORT_TAX_RANKS = ["sk", "k", "p", "c", "o", "f", "g", "s"]
49
41
  SHORT_MOTUS_TAX_RANKS = ["k", "p", "c", "o", "f", "g", "s"]
@@ -581,8 +581,8 @@ class RawReadsStatusTypes(StrEnum):
581
581
 
582
582
  all_results = "all_results"
583
583
  no_reads = "no_reads"
584
- no_results = "no_results"
585
- missing_results = "missing_results"
584
+ all_empty_results = "all_empty_results"
585
+ some_empty_results = "some_empty_results"
586
586
 
587
587
 
588
588
  class RawReadsPassedRunsRecord(BaseModel):
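
A brief note on the renamed status values: because RawReadsStatusTypes is a StrEnum (Python 3.11+), its members compare equal to plain strings, so status values read from the runs CSV can be compared directly. A minimal sketch restating the members from the hunk above:

from enum import StrEnum  # Python 3.11+

class RawReadsStatusTypes(StrEnum):
    all_results = "all_results"
    no_reads = "no_reads"
    all_empty_results = "all_empty_results"
    some_empty_results = "some_empty_results"

assert RawReadsStatusTypes.all_empty_results == "all_empty_results"
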
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.2.7
3
+ Version: 1.2.9
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -1,7 +1,7 @@
1
1
  mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=8yFhmHQXVDPXvRX8oWSANV3VMu0X-zNnz12u1fcGwTE,20649
4
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=ohguvrMSg7GuiiZ5aHj1DvCnfThKFUG4s13LUSMM0mo,8892
4
+ mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=-g1FDwdEndWH9VvYLmc_NEs2l204kKjMHk65wag8T_s,8891
5
5
  mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=BLqhflblUegCvuQic16PrFXfIXlFWmGkmWJyl4wJoLQ,5040
6
6
  mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py,sha256=1aGOJX9tC7M1rnd0U2PeJ681sUo02wxk7_ycJqeVt6s,2216
7
7
  mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=-W_QmdmKAIqVC5n-RS8LX11hEQM4xdp5r1jkITB1CI8,5256
@@ -9,7 +9,7 @@ mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=Wu4t
9
9
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=yLpzkRJXAeXRUNgz60zopEwHcdprM2UDjquE-GkrFys,1722
10
10
  mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py,sha256=epVClL10QcllL8yu7YGjx0rXNVHL2GxHi-Ek0MOjsjo,13859
11
11
  mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=NZSNY2bqs_TQyz8riDqiEFPLKcwTgzh1C7DeVHT6V8Q,4366
12
- mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=vZdDIcG09hulgCp0FylwHXVSGSlwl2RsDU4_xvsrUC0,6732
12
+ mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=2Zkm3KJ1Borzch5XSZbsVNTPej3J5QYkqTQQACkRDVo,6944
13
13
  mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py,sha256=Pq-9RSt3RCxzDMQVW1VHlHF4NtpVwCWFbg2CMkvpZZc,19089
14
14
  mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py,sha256=2T4T7aXMGPac-LZUXJF3lOUzZZF50dAKkKTSaO-4idQ,3587
15
15
  mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py,sha256=6gbCRlEX1eBqzFYjOt3og-961dZ--QsCJL-7l5nzg1k,33992
@@ -22,31 +22,31 @@ mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha2
22
22
  mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=eNichqFFmfPsa2J10IUm_PemVs9fBhbKa2vpDqEvJNU,21791
23
23
  mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=jUeA7I12YrtIqnm3hUxpdgsWfa2pP1ALGjb9OMKPcgY,10643
24
24
  mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
25
- mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=1wblLbZl521digIUWoqneAu15gErzvN_oC--5T_xUdw,4582
25
+ mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=lxe7R2RQFyNCzEm6YuNRrqKZLZOUPq5W1P23Pt2sKBU,4570
26
26
  mgnify_pipelines_toolkit/analysis/genomes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py,sha256=A3QefwftUoG1cbpmgCJ_rUcuk7cbPxjn1ZyZk9iDPKY,15731
27
+ mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py,sha256=ltyNHwzaZZkK1ScH2vV2QV1eUXTHQUMYyadJwO-zSQY,16028
28
28
  mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
29
  mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py,sha256=kAGU5kQyj-Hlcdx32i-xOJSuHYYUDj-kqnyYHMohHGc,4477
30
- mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=hggPqv9QawWAccm5tmru4VF9VnQAHF5LCXnqyLw_BWI,6727
30
+ mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=RaFopUjJI4UO1ttnSEHj7iUXpAL5-2FTbDXlhOmNy0s,25534
31
31
  mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=ye0Jka6_lNn4dQGb2QG3YT46y7QK0QvyaIitIaS8JVQ,4026
32
32
  mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=UrU0CpZj3pfHZWI7Uuhv2a_C0JsO8pnVErY0sWGgNdw,4920
33
33
  mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=EH5RyzesLqsonnTQbSDs7kAOV6IskS4oyqZYlex1tAY,1934
34
34
  mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py,sha256=6Ck2NhwRWw66GctUtKDdPT5fwJhWFR_YOZq-Vxwoa8A,1996
35
35
  mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=7-U0DN1joVu0ifLOoDUK2Pfqy8rb1RDKT6khVg3jky0,5559
36
36
  mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py,sha256=sKAo_rKEyVAZXSaIFMkpSoYZxiWwXMA3XDA6Z-hbHgg,7904
37
- mgnify_pipelines_toolkit/constants/db_labels.py,sha256=smYSBBO6QuWUfL2QFPieaSV5oDCQOd9au6g26U6pky4,1064
37
+ mgnify_pipelines_toolkit/constants/db_labels.py,sha256=12mksTtAwTE1smLnemdoItxGw1AmtJPOzbnW2aGj0u0,1062
38
38
  mgnify_pipelines_toolkit/constants/ncrna.py,sha256=a_5hWp446S7BhRbe_JcydFgZM7sgPLuMlaiBvKWN_XM,1928
39
39
  mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=G-xrc9b8zdmPTaOICD2b3RCVeFAEOVkfRkIfotQ7gek,1193
40
- mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=t6FquKhTWK3KUiavm42ryqcYLEUHvhfJYEiyf4zP5v0,1259
40
+ mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=ekZN5OcMBhDRcj7XB_27wQ8fEnmAqMJc4aQ3pv4BRmI,1229
41
41
  mgnify_pipelines_toolkit/constants/thresholds.py,sha256=1AMBmoHBR0WjXZpkwJ7_Q-gfJtHXuCA4tZ-uvPhF0Xc,1085
42
42
  mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=0bM4MwarFiM5yTcp5AbAmQ0o-q-gWy7kknir9zJ9R0A,1312
43
- mgnify_pipelines_toolkit/schemas/schemas.py,sha256=AII14TozgAUfYdvo42Mo2FPVE9rtEo2kGq5cJ2ojPUI,23113
43
+ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=he9igC80YTR32v1e5NslwTgtdVySmnXwK9iY9IBPNBg,23133
44
44
  mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
45
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
46
46
  mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
47
- mgnify_pipelines_toolkit-1.2.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
48
- mgnify_pipelines_toolkit-1.2.7.dist-info/METADATA,sha256=I_SJna7ACyZSKCOyoqjiNNBJ4uOlrB7FHFgCLRgaZ7Y,5775
49
- mgnify_pipelines_toolkit-1.2.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
50
- mgnify_pipelines_toolkit-1.2.7.dist-info/entry_points.txt,sha256=hiSz-RkJWyEH2N6D9qHriTRb9jQtmA8Lji7RyguWDvQ,3229
51
- mgnify_pipelines_toolkit-1.2.7.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
52
- mgnify_pipelines_toolkit-1.2.7.dist-info/RECORD,,
47
+ mgnify_pipelines_toolkit-1.2.9.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
48
+ mgnify_pipelines_toolkit-1.2.9.dist-info/METADATA,sha256=hxvefbAKXzSx05LR0jTdW3iW2fHlr2hxmNhY4TCdJ_4,5775
49
+ mgnify_pipelines_toolkit-1.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
50
+ mgnify_pipelines_toolkit-1.2.9.dist-info/entry_points.txt,sha256=7TJ8GgbKoX1xnQsOdWwMvwhIv4uuHCx7pMxKmZabPOs,3228
51
+ mgnify_pipelines_toolkit-1.2.9.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
52
+ mgnify_pipelines_toolkit-1.2.9.dist-info/RECORD,,
@@ -6,7 +6,7 @@ assembly_study_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.st
6
6
  classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
7
7
  combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
8
8
  convert_cmscan_to_cmsearch_tblout = mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main
9
- dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main
9
+ dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:cli
10
10
  fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
11
11
  fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
12
12
  generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main