mgnify-pipelines-toolkit 1.2.7__py3-none-any.whl → 1.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +1 -1
- mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +3 -4
- mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py +30 -19
- mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +572 -63
- mgnify_pipelines_toolkit/constants/db_labels.py +2 -2
- mgnify_pipelines_toolkit/constants/tax_ranks.py +1 -9
- mgnify_pipelines_toolkit/schemas/schemas.py +2 -2
- {mgnify_pipelines_toolkit-1.2.7.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/METADATA +1 -1
- {mgnify_pipelines_toolkit-1.2.7.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/RECORD +13 -13
- {mgnify_pipelines_toolkit-1.2.7.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/entry_points.txt +1 -1
- {mgnify_pipelines_toolkit-1.2.7.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/WHEEL +0 -0
- {mgnify_pipelines_toolkit-1.2.7.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.2.7.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/top_level.txt +0 -0
|
@@ -300,7 +300,7 @@ def main():
|
|
|
300
300
|
if paired_end:
|
|
301
301
|
rev_fr.close()
|
|
302
302
|
|
|
303
|
-
if asv_dict:
|
|
303
|
+
if asv_dict: # if there are matches between taxonomic and ASV annotations
|
|
304
304
|
ref_db = ""
|
|
305
305
|
|
|
306
306
|
if len(taxa_df.columns) == 9:
|
|
@@ -110,10 +110,9 @@ def main():
|
|
|
110
110
|
df_merged = df_merged[
|
|
111
111
|
["nearest_mibig", "nearest_mibig_class", "description", "count"]
|
|
112
112
|
]
|
|
113
|
-
df_merged = df_merged.rename(
|
|
114
|
-
"Description": "description",
|
|
115
|
-
|
|
116
|
-
})
|
|
113
|
+
df_merged = df_merged.rename(
|
|
114
|
+
columns={"Description": "description", "Count": "count"}
|
|
115
|
+
)
|
|
117
116
|
df_merged.to_csv(output_filename, sep="\t", index=False)
|
|
118
117
|
|
|
119
118
|
|
|
@@ -53,7 +53,7 @@ def cli():
|
|
|
53
53
|
|
|
54
54
|
def get_file(
|
|
55
55
|
run_acc: str, analyses_dir: Path, db_label: str
|
|
56
|
-
) -> Union[Path, List[Path]]:
|
|
56
|
+
) -> Union[Path, List[Path], None]:
|
|
57
57
|
"""Takes path information for a particular analysis and db_label combo, and returns any existing files.
|
|
58
58
|
|
|
59
59
|
:param run_acc: Run accession for the tax file that should be retrieved.
|
|
@@ -84,7 +84,7 @@ def get_file(
|
|
|
84
84
|
return
|
|
85
85
|
|
|
86
86
|
analysis_file = Path(
|
|
87
|
-
f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt"
|
|
87
|
+
f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt.gz"
|
|
88
88
|
)
|
|
89
89
|
if not analysis_file.exists():
|
|
90
90
|
logging.error(
|
|
@@ -119,20 +119,25 @@ def parse_one_tax_file(run_acc: str, tax_file: Path, db_label: str) -> pd.DataFr
|
|
|
119
119
|
:rtype: pd.DataFrame
|
|
120
120
|
"""
|
|
121
121
|
|
|
122
|
-
tax_ranks = _MOTUS_TAX_RANKS if db_label == "
|
|
122
|
+
tax_ranks = _MOTUS_TAX_RANKS if db_label == "motus" else _SILVA_TAX_RANKS
|
|
123
123
|
res_df = pd.read_csv(tax_file, sep="\t", skiprows=1, names=["Count"] + tax_ranks)
|
|
124
124
|
res_df = res_df.fillna("")
|
|
125
125
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
126
|
+
if res_df.shape[0] > 0:
|
|
127
|
+
validate_dataframe(
|
|
128
|
+
res_df,
|
|
129
|
+
MotusTaxonSchema if db_label == "motus" else TaxonSchema,
|
|
130
|
+
str(tax_file),
|
|
131
|
+
)
|
|
129
132
|
|
|
130
|
-
res_df["full_taxon"] =
|
|
131
|
-
|
|
133
|
+
res_df["full_taxon"] = [
|
|
134
|
+
";".join(r[tax_ranks]).strip(";") for _, r in res_df.iterrows()
|
|
135
|
+
]
|
|
136
|
+
final_df = (
|
|
137
|
+
res_df[["Count", "full_taxon"]]
|
|
138
|
+
.set_index("full_taxon")
|
|
139
|
+
.rename(columns={"Count": run_acc})
|
|
132
140
|
)
|
|
133
|
-
final_df = res_df.iloc[:, [0, -1]]
|
|
134
|
-
final_df = final_df.set_index("full_taxon")
|
|
135
|
-
final_df.columns = [run_acc]
|
|
136
141
|
|
|
137
142
|
return final_df
|
|
138
143
|
|
|
@@ -162,16 +167,20 @@ def parse_one_func_file(
|
|
|
162
167
|
).set_index("function")
|
|
163
168
|
res_df = res_df.fillna(0)
|
|
164
169
|
|
|
165
|
-
|
|
170
|
+
if res_df.shape[0] > 0:
|
|
171
|
+
validate_dataframe(res_df, FunctionProfileSchema, str(func_file))
|
|
166
172
|
|
|
167
|
-
count_df = res_df[["read_count"]]
|
|
168
|
-
|
|
173
|
+
count_df = pd.DataFrame(res_df[["read_count"]]).rename(
|
|
174
|
+
columns={"read_count": run_acc}
|
|
175
|
+
)
|
|
169
176
|
|
|
170
|
-
depth_df = res_df[["coverage_depth"]]
|
|
171
|
-
|
|
177
|
+
depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(
|
|
178
|
+
columns={"coverage_depth": run_acc}
|
|
179
|
+
)
|
|
172
180
|
|
|
173
|
-
breadth_df = res_df[["coverage_breadth"]]
|
|
174
|
-
|
|
181
|
+
breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(
|
|
182
|
+
columns={"coverage_breadth": run_acc}
|
|
183
|
+
)
|
|
175
184
|
|
|
176
185
|
return count_df, depth_df, breadth_df
|
|
177
186
|
|
|
@@ -423,7 +432,9 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
|
|
|
423
432
|
curr_df = pd.read_csv(summary, sep="\t", index_col=0)
|
|
424
433
|
res_df = res_df.join(curr_df, how="outer")
|
|
425
434
|
res_df = res_df.fillna(0)
|
|
426
|
-
res_df = res_df.astype(
|
|
435
|
+
res_df = res_df.astype(
|
|
436
|
+
int if table_type == "read-count" else float
|
|
437
|
+
)
|
|
427
438
|
|
|
428
439
|
res_df = res_df.reindex(sorted(res_df.columns), axis=1)
|
|
429
440
|
res_df.to_csv(
|
|
@@ -14,23 +14,40 @@
|
|
|
14
14
|
# See the License for the specific language governing permissions and
|
|
15
15
|
# limitations under the License.
|
|
16
16
|
|
|
17
|
+
import shutil
|
|
18
|
+
from shutil import SameFileError
|
|
19
|
+
|
|
17
20
|
import argparse
|
|
18
21
|
from collections import defaultdict
|
|
19
22
|
import pathlib
|
|
20
23
|
import logging
|
|
24
|
+
|
|
25
|
+
import click
|
|
21
26
|
import requests
|
|
27
|
+
from typing import Union, Dict, List, Literal
|
|
28
|
+
from pathlib import Path
|
|
22
29
|
|
|
23
30
|
import pandas as pd
|
|
24
31
|
import pyfastx
|
|
25
32
|
|
|
33
|
+
from mgnify_pipelines_toolkit.constants.tax_ranks import (
|
|
34
|
+
_SILVA_TAX_RANKS,
|
|
35
|
+
_PR2_TAX_RANKS,
|
|
36
|
+
SHORT_PR2_TAX_RANKS,
|
|
37
|
+
SHORT_TAX_RANKS,
|
|
38
|
+
)
|
|
39
|
+
|
|
26
40
|
logging.basicConfig(level=logging.DEBUG)
|
|
27
41
|
|
|
28
|
-
URL = "https://www.ebi.ac.uk/ena/portal/api/search
|
|
29
|
-
RUNS_URL = f"{URL}=read_run&fields=secondary_study_accession,sample_accession&limit=10&format=json&download=false"
|
|
30
|
-
SAMPLES_URL = f"{URL}=sample&fields=lat,lon,collection_date,depth&limit=10&format=json&download=false"
|
|
42
|
+
URL = "https://www.ebi.ac.uk/ena/portal/api/search"
|
|
31
43
|
HEADERS = {"Accept": "application/json"}
|
|
32
44
|
|
|
33
45
|
|
|
46
|
+
@click.group()
def cli():
    # Root command group: it performs no work itself and only exists so that
    # sub-commands can be attached with ``@cli.command(...)``.
    # (A comment is used instead of a docstring so the generated --help text
    # stays exactly as it was.)
    pass
|
|
49
|
+
|
|
50
|
+
|
|
34
51
|
def parse_args():
|
|
35
52
|
|
|
36
53
|
parser = argparse.ArgumentParser()
|
|
@@ -61,28 +78,72 @@ def parse_args():
|
|
|
61
78
|
return input_path, runs, output
|
|
62
79
|
|
|
63
80
|
|
|
64
|
-
def
|
|
81
|
+
def get_ena_metadata_from_run_acc(run_acc: str) -> Union[pd.DataFrame, bool]:
|
|
82
|
+
"""
|
|
83
|
+
Fetches and processes metadata from ENA using the provided run accession.
|
|
84
|
+
|
|
85
|
+
This function queries the European Nucleotide Archive (ENA) API to retrieve
|
|
86
|
+
metadata related to the specified run accession. Once the metadata is
|
|
87
|
+
retrieved, it performs cleaning and formatting to return the data in a
|
|
88
|
+
structured pandas DataFrame.
|
|
65
89
|
|
|
66
|
-
|
|
67
|
-
|
|
90
|
+
Parameters:
|
|
91
|
+
run_acc: str
|
|
92
|
+
Accession identifier for the run to query from ENA.
|
|
93
|
+
|
|
94
|
+
Returns:
|
|
95
|
+
Union[pd.DataFrame, bool]
|
|
96
|
+
A pandas DataFrame containing the retrieved and processed metadata
|
|
97
|
+
if the query is successful, or False if the data for the given run
|
|
98
|
+
accession is not found.
|
|
99
|
+
"""
|
|
100
|
+
|
|
101
|
+
run_fields_list = [
|
|
102
|
+
"secondary_study_accession",
|
|
103
|
+
"sample_accession",
|
|
104
|
+
"instrument_model",
|
|
105
|
+
]
|
|
106
|
+
run_query_args = {
|
|
107
|
+
"result": "read_run",
|
|
108
|
+
"includeAccessions": run_acc,
|
|
109
|
+
"fields": ",".join(run_fields_list),
|
|
110
|
+
"limit": 10,
|
|
111
|
+
"format": "json",
|
|
112
|
+
"download": "false",
|
|
113
|
+
}
|
|
114
|
+
res_run = requests.get(URL, headers=HEADERS, params=run_query_args)
|
|
68
115
|
|
|
69
116
|
if res_run.status_code != 200:
|
|
70
117
|
logging.error(f"Data not found for run {run_acc}")
|
|
71
118
|
return False
|
|
72
119
|
|
|
73
120
|
sample_acc = res_run.json()[0]["sample_accession"]
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
121
|
+
sample_fields_list = [
|
|
122
|
+
"lat",
|
|
123
|
+
"lon",
|
|
124
|
+
"collection_date",
|
|
125
|
+
"depth",
|
|
126
|
+
"center_name",
|
|
127
|
+
"temperature",
|
|
128
|
+
"salinity",
|
|
129
|
+
"country",
|
|
130
|
+
]
|
|
131
|
+
sample_query_args = {
|
|
132
|
+
"result": "sample",
|
|
133
|
+
"includeAccessions": sample_acc,
|
|
134
|
+
"fields": ",".join(sample_fields_list),
|
|
135
|
+
"limit": 10,
|
|
136
|
+
"format": "json",
|
|
137
|
+
"download": "false",
|
|
138
|
+
}
|
|
139
|
+
res_sample = requests.get(URL, headers=HEADERS, params=sample_query_args)
|
|
77
140
|
|
|
78
141
|
full_res_dict = res_run.json()[0] | res_sample.json()[0]
|
|
79
142
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
if val == "":
|
|
85
|
-
full_res_dict[field] = "NA"
|
|
143
|
+
# Turn empty values into NA
|
|
144
|
+
full_res_dict = {
|
|
145
|
+
field: "NA" if val == "" else val for field, val in full_res_dict.items()
|
|
146
|
+
}
|
|
86
147
|
|
|
87
148
|
if full_res_dict["collection_date"] == "":
|
|
88
149
|
full_res_dict["collectionDate"] = "NA"
|
|
@@ -92,38 +153,74 @@ def get_metadata_from_run_acc(run_acc):
|
|
|
92
153
|
del full_res_dict["collection_date"]
|
|
93
154
|
|
|
94
155
|
res_df = pd.DataFrame(full_res_dict, index=[0])
|
|
95
|
-
res_df
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
156
|
+
res_df = res_df.rename(
|
|
157
|
+
columns={
|
|
158
|
+
"run_accession": "RunID",
|
|
159
|
+
"sample_accession": "SampleID",
|
|
160
|
+
"secondary_study_accession": "StudyID",
|
|
161
|
+
"lon": "decimalLongitude",
|
|
162
|
+
"lat": "decimalLatitude",
|
|
163
|
+
"instrument_model": "seq_meth",
|
|
164
|
+
}
|
|
165
|
+
)
|
|
104
166
|
|
|
105
167
|
return res_df
|
|
106
168
|
|
|
107
169
|
|
|
108
|
-
def
|
|
170
|
+
def get_all_ena_metadata_from_runs(runs: List[str]) -> Dict[str, pd.DataFrame]:
    """
    Collect ENA metadata for every run accession that can be resolved.

    Each accession is passed to get_ena_metadata_from_run_acc(). Accessions for
    which that helper returns ``False`` are left out of the result, so the
    returned mapping may contain fewer keys than ``runs`` has entries.

    Parameters:
        runs (List[str]): Run accessions to look up.

    Returns:
        Dict[str, pd.DataFrame]: Mapping of run accession to its metadata
        DataFrame. The concrete object is a ``defaultdict(pd.DataFrame)``, so
        indexing it with an unknown accession inserts and returns an empty
        DataFrame; use ``in`` to test for membership.
    """
    metadata_by_run = defaultdict(pd.DataFrame)

    for run_acc in runs:
        fetched = get_ena_metadata_from_run_acc(run_acc)
        # The helper signals failure with the literal False; an identity check
        # is required because a DataFrame has no unambiguous truth value.
        if fetched is False:
            continue
        metadata_by_run[run_acc] = fetched

    return metadata_by_run
|
|
118
195
|
|
|
119
196
|
|
|
120
|
-
def
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
197
|
+
def cleanup_asv_taxa(df: pd.DataFrame, db: Literal["SILVA", "PR2"]) -> pd.DataFrame:
|
|
198
|
+
"""
|
|
199
|
+
Cleans ASV dataframe by renaming columns, handling empty fields, and adding
|
|
200
|
+
constant metadata fields.
|
|
201
|
+
|
|
202
|
+
Parameters:
|
|
203
|
+
df : pd.DataFrame
|
|
204
|
+
Input DataFrame containing ASV data to clean
|
|
205
|
+
db : Literal["SILVA", "PR2"]
|
|
206
|
+
Reference database used for taxonomic ranks
|
|
207
|
+
"""
|
|
208
|
+
|
|
209
|
+
# Rename some columns
|
|
210
|
+
cleaned_df = df.rename(
|
|
211
|
+
columns={
|
|
212
|
+
"asv": "ASVID",
|
|
213
|
+
"count": "MeasurementValue",
|
|
214
|
+
"center_name": "InstitutionCode",
|
|
215
|
+
}
|
|
216
|
+
)
|
|
124
217
|
|
|
125
|
-
|
|
218
|
+
if db == "SILVA":
|
|
219
|
+
ranks = _SILVA_TAX_RANKS
|
|
220
|
+
else:
|
|
221
|
+
ranks = _PR2_TAX_RANKS
|
|
126
222
|
|
|
223
|
+
# Turn empty taxa into NA
|
|
127
224
|
for rank in ranks:
|
|
128
225
|
cleaned_df[rank] = cleaned_df[rank].apply(
|
|
129
226
|
lambda x: x.split("__")[1] if pd.notnull(x) else "NA"
|
|
@@ -132,6 +229,12 @@ def cleanup_taxa(df):
|
|
|
132
229
|
for rank in ranks:
|
|
133
230
|
cleaned_df[rank] = cleaned_df[rank].apply(lambda x: x if x != "" else "NA")
|
|
134
231
|
|
|
232
|
+
# Add a few constant columns
|
|
233
|
+
cleaned_df["MeasurementUnit"] = ["Number of reads"] * len(cleaned_df)
|
|
234
|
+
cleaned_df["ASVCaller"] = ["DADA2"] * len(cleaned_df)
|
|
235
|
+
cleaned_df["ReferenceDatabase"] = [db] * len(cleaned_df)
|
|
236
|
+
cleaned_df["TaxAnnotationTool"] = ["MAPseq"] * len(cleaned_df)
|
|
237
|
+
# Final order of fields in output csv
|
|
135
238
|
cleaned_df = cleaned_df[
|
|
136
239
|
[
|
|
137
240
|
"ASVID",
|
|
@@ -141,14 +244,25 @@ def cleanup_taxa(df):
|
|
|
141
244
|
"decimalLongitude",
|
|
142
245
|
"decimalLatitude",
|
|
143
246
|
"depth",
|
|
247
|
+
"temperature",
|
|
248
|
+
"salinity",
|
|
144
249
|
"collectionDate",
|
|
145
|
-
"
|
|
146
|
-
"
|
|
147
|
-
"
|
|
148
|
-
"
|
|
149
|
-
"
|
|
150
|
-
"
|
|
151
|
-
"
|
|
250
|
+
"seq_meth",
|
|
251
|
+
"country",
|
|
252
|
+
"InstitutionCode",
|
|
253
|
+
"amplifiedRegion",
|
|
254
|
+
"ASVCaller",
|
|
255
|
+
"ReferenceDatabase",
|
|
256
|
+
"TaxAnnotationTool",
|
|
257
|
+
]
|
|
258
|
+
+ ranks
|
|
259
|
+
+ [
|
|
260
|
+
"MeasurementUnit",
|
|
261
|
+
"MeasurementValue",
|
|
262
|
+
"dbhit",
|
|
263
|
+
"dbhitIdentity",
|
|
264
|
+
"dbhitStart",
|
|
265
|
+
"dbhitEnd",
|
|
152
266
|
"ASVSeq",
|
|
153
267
|
]
|
|
154
268
|
]
|
|
@@ -156,27 +270,140 @@ def cleanup_taxa(df):
|
|
|
156
270
|
return cleaned_df
|
|
157
271
|
|
|
158
272
|
|
|
159
|
-
def
|
|
273
|
+
def cleanup_closedref_taxa(
    df: pd.DataFrame, db: Literal["SILVA-SSU", "PR2"]
) -> pd.DataFrame:
    """
    Tidy a merged closed-reference taxonomy/metadata table for CSV export.

    The function works on a renamed copy of ``df`` and:
      * renames ``count`` to ``MeasurementValue`` and ``center_name`` to
        ``InstitutionCode``;
      * replaces empty-string values in every taxonomic rank column with "NA";
      * adds the constant columns ``MeasurementUnit``, ``ReferenceDatabase``
        and ``TaxAnnotationTool``;
      * selects and orders the final output columns.

    Parameters:
        df : pd.DataFrame
            Closed-reference taxonomy rows already merged with run metadata.
            It must contain every column named in the final selection below.
        db : Literal["SILVA-SSU", "PR2"]
            Reference database label. "SILVA-SSU" selects the SILVA rank
            columns; any other value selects the PR2 rank columns.

    Returns:
        pd.DataFrame
            A new DataFrame restricted to the ordered output columns.
    """

    ranks = _SILVA_TAX_RANKS if db == "SILVA-SSU" else _PR2_TAX_RANKS

    def _blank_to_na(value):
        # Only the exact empty string is treated as missing.
        return "NA" if value == "" else value

    tidy_df = df.rename(
        columns={
            "count": "MeasurementValue",
            "center_name": "InstitutionCode",
        }
    )

    # Turn empty taxa into NA
    for rank in ranks:
        tidy_df[rank] = tidy_df[rank].apply(_blank_to_na)

    # Constant columns: the unit of the read count, the database and the tool
    row_count = len(tidy_df)
    tidy_df["MeasurementUnit"] = ["Number of reads"] * row_count
    tidy_df["ReferenceDatabase"] = [db] * row_count
    tidy_df["TaxAnnotationTool"] = ["MAPseq"] * row_count

    # Final order of fields in output csv
    leading_columns = [
        "StudyID",
        "SampleID",
        "RunID",
        "decimalLongitude",
        "decimalLatitude",
        "depth",
        "temperature",
        "salinity",
        "collectionDate",
        "seq_meth",
        "country",
        "InstitutionCode",
        "ReferenceDatabase",
        "TaxAnnotationTool",
    ]
    trailing_columns = [
        "MeasurementUnit",
        "MeasurementValue",
    ]

    return tidy_df[leading_columns + ranks + trailing_columns]
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def get_asv_dict(
|
|
345
|
+
runs_df: pd.DataFrame, root_path: Path, db: Literal["SILVA", "PR2"]
|
|
346
|
+
) -> Dict[str, pd.DataFrame]:
|
|
347
|
+
"""
|
|
348
|
+
Generates a dictionary containing ASV (Amplicon Sequence Variant) data for each run.
|
|
349
|
+
|
|
350
|
+
This function processes sequencing run data, extracts relevant information, and
|
|
351
|
+
aggregates it into a dictionary. Each key in the dictionary corresponds to a
|
|
352
|
+
unique run ID, and its value is a DataFrame containing detailed ASV data such
|
|
353
|
+
as taxonomy assignments, sequence read counts, MAPseq hit data, and the ASV sequences
|
|
354
|
+
themselves. The function filters runs to only include those with a
|
|
355
|
+
complete analysis status ("all_results").
|
|
356
|
+
|
|
357
|
+
Arguments:
|
|
358
|
+
runs_df (pd.DataFrame): A DataFrame containing results status info about the runs.
|
|
359
|
+
root_path (Path): The base directory path where analysis results files are stored.
|
|
360
|
+
db (Literal["SILVA", "PR2"]): Specifies the database used for taxonomy assignment
|
|
361
|
+
(e.g., SILVA or PR2).
|
|
362
|
+
|
|
363
|
+
Returns:
|
|
364
|
+
Dict[str, pd.DataFrame]: A dictionary where keys are run IDs and values are
|
|
365
|
+
DataFrames containing merged ASV data for corresponding runs.
|
|
366
|
+
"""
|
|
160
367
|
|
|
161
368
|
asv_dict = {}
|
|
162
369
|
for i in range(0, len(runs_df)):
|
|
163
370
|
run_acc = runs_df.loc[i, "run"]
|
|
164
|
-
|
|
371
|
+
analysis_status = runs_df.loc[i, "status"]
|
|
165
372
|
|
|
166
|
-
|
|
373
|
+
# Only keep runs that have all_results i.e. includes ASV results
|
|
374
|
+
if analysis_status != "all_results":
|
|
167
375
|
continue
|
|
168
376
|
|
|
377
|
+
# Raw MAPseq taxonomy assignment files
|
|
378
|
+
# Used to extract hit data like the exact dbhit, %identity, and matching coords
|
|
379
|
+
mapseq_file = sorted(
|
|
380
|
+
list(
|
|
381
|
+
(
|
|
382
|
+
pathlib.Path(root_path)
|
|
383
|
+
/ run_acc
|
|
384
|
+
/ "taxonomy-summary"
|
|
385
|
+
/ f"DADA2-{db}"
|
|
386
|
+
).glob(f"*_DADA2-{db}.mseq")
|
|
387
|
+
)
|
|
388
|
+
)[0]
|
|
389
|
+
mapseq_df = pd.read_csv(mapseq_file, sep="\t", usecols=[0, 1, 3, 9, 10])
|
|
390
|
+
mapseq_df.columns = ["asv", "dbhit", "dbhitIdentity", "dbhitStart", "dbhitEnd"]
|
|
391
|
+
|
|
392
|
+
# Processed MAPseq taxonomy assignment files
|
|
169
393
|
tax_file = sorted(
|
|
170
394
|
list(
|
|
171
395
|
(pathlib.Path(root_path) / run_acc / "asv").glob(
|
|
172
|
-
"*_DADA2-
|
|
396
|
+
f"*_DADA2-{db}_asv_tax.tsv"
|
|
173
397
|
)
|
|
174
398
|
)
|
|
175
399
|
)[0]
|
|
400
|
+
run_tax_df = pd.read_csv(tax_file, sep="\t")
|
|
401
|
+
|
|
402
|
+
# ASV read count files
|
|
176
403
|
count_files = sorted(
|
|
177
404
|
list(pathlib.Path(f"{root_path}/{run_acc}/asv").glob("*S-V*/*.tsv"))
|
|
178
405
|
)
|
|
179
|
-
|
|
406
|
+
# ASV sequence FASTA files
|
|
180
407
|
asv_fasta_file = sorted(
|
|
181
408
|
list(pathlib.Path(f"{root_path}/{run_acc}/asv").glob("*_asv_seqs.fasta"))
|
|
182
409
|
)[0]
|
|
@@ -184,32 +411,163 @@ def get_asv_dict(runs_df, root_path):
|
|
|
184
411
|
asv_fasta_dict = {name: seq for name, seq in fasta}
|
|
185
412
|
asv_fasta_df = pd.DataFrame(asv_fasta_dict, index=["ASVSeq"]).transpose()
|
|
186
413
|
asv_fasta_df["asv"] = asv_fasta_df.index
|
|
187
|
-
run_tax_df = pd.read_csv(tax_file, sep="\t")
|
|
188
414
|
|
|
189
415
|
count_dfs = []
|
|
190
416
|
|
|
191
417
|
for count_file in count_files:
|
|
418
|
+
amp_region = count_file.stem.split("_")[1]
|
|
192
419
|
count_df = pd.read_csv(count_file, sep="\t")
|
|
420
|
+
count_df["amplifiedRegion"] = [amp_region] * len(count_df)
|
|
193
421
|
count_dfs.append(count_df)
|
|
194
422
|
|
|
195
|
-
|
|
196
|
-
|
|
423
|
+
# Merge counts into one DF in case there are multiple amplified regions...
|
|
424
|
+
all_amplified_regions_count_df = pd.concat(count_dfs)
|
|
425
|
+
|
|
426
|
+
# ...then merge with taxonomy dataframes...
|
|
427
|
+
merged_df = all_amplified_regions_count_df.merge(
|
|
197
428
|
run_tax_df, left_on="asv", right_on="ASV"
|
|
198
429
|
)
|
|
430
|
+
# ...then merge with MAPseq columns...
|
|
431
|
+
merged_df = merged_df.merge(mapseq_df, on="asv")
|
|
432
|
+
|
|
433
|
+
# ...then merge with ASV FASTA sequences
|
|
199
434
|
merged_df.pop("ASV")
|
|
200
435
|
run_col = [run_acc] * len(merged_df)
|
|
201
436
|
merged_df["RunID"] = run_col
|
|
202
437
|
merged_df = merged_df.merge(asv_fasta_df, on="asv")
|
|
438
|
+
|
|
439
|
+
# Assign final DF to run_acc in dictionary
|
|
203
440
|
asv_dict[run_acc] = merged_df
|
|
204
441
|
|
|
205
442
|
return asv_dict
|
|
206
443
|
|
|
207
444
|
|
|
208
|
-
def
|
|
445
|
+
def get_closedref_dict(
    runs_df: pd.DataFrame, root_path: Path, db: Literal["SILVA-SSU", "PR2"]
) -> Dict[str, pd.DataFrame]:
    """
    Generates a dictionary of closed-reference taxonomy data for multiple sequencing runs.

    For every run whose status is "all_results", the first (alphabetically sorted)
    ``*.txt`` file under ``<root_path>/<run>/taxonomy-summary/<db>/`` is read as a
    headerless, tab-separated Krona-formatted table with a count column followed by
    one column per taxonomic rank. Missing values and bare rank prefixes
    (e.g. ``"<short_rank>__"``) are replaced with "NA", and a ``RunID`` column is added.

    Runs with any other status are skipped. Runs for which no ``*.txt`` file is found
    are logged as an error and skipped, instead of aborting the whole summary with an
    IndexError.

    Arguments:
        runs_df (pd.DataFrame): A DataFrame containing results status info about the runs.
            Must have the columns "run" and "status".
        root_path (Path): The base directory path where analysis results files are stored.
        db (Literal["SILVA-SSU", "PR2"]): Specifies the database used for taxonomy
            assignment. "SILVA-SSU" selects the SILVA ranks; any other value selects
            the PR2 ranks.

    Returns:
        Dict[str, pd.DataFrame]: A dictionary mapping each processed run accession (str)
        to its taxonomy DataFrame (columns: "count", the rank columns, "RunID").
    """

    if db == "SILVA-SSU":
        ranks = _SILVA_TAX_RANKS
        short_ranks = SHORT_TAX_RANKS
    else:
        ranks = _PR2_TAX_RANKS
        short_ranks = SHORT_PR2_TAX_RANKS

    # Loop-invariant values, built once rather than once per run
    column_names = ["count"] + ranks
    # A rank with no assignment appears as just its prefix, e.g. "<short_rank>__"
    empty_rank_placeholders = {rank + "__" for rank in short_ranks}

    closedref_dict = {}
    # Iterate over the column values directly so a non-default index on runs_df
    # (e.g. after filtering) cannot break positional ``.loc[i]`` look-ups.
    for run_acc, status in zip(runs_df["run"], runs_df["status"]):

        if status != "all_results":
            continue

        # Krona formatted results
        krona_dir = pathlib.Path(root_path) / run_acc / "taxonomy-summary" / f"{db}"
        kronatxt_files = sorted(krona_dir.glob("*.txt"))
        if not kronatxt_files:
            logging.error(
                f"No closed-reference taxonomy file found for run {run_acc} in {krona_dir}"
            )
            continue

        tax_df = pd.read_csv(kronatxt_files[0], sep="\t", names=column_names)

        # Clean up empty ranks
        tax_df = tax_df.fillna("NA")
        # NOTE: DataFrame.map is only available in pandas >= 2.1
        tax_df = tax_df.map(
            lambda x: "NA" if x in empty_rank_placeholders else x
        )

        tax_df["RunID"] = [run_acc] * len(tax_df)

        # Assign final DF to run_acc in dictionary
        closedref_dict[run_acc] = tax_df

    return closedref_dict
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
@cli.command(
|
|
509
|
+
"summarise",
|
|
510
|
+
options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
|
|
511
|
+
short_help='Generate "DarwinCore-ready" study-level summaries of amplicon analysis results.',
|
|
512
|
+
)
|
|
513
|
+
@click.option(
|
|
514
|
+
"-r",
|
|
515
|
+
"--runs",
|
|
516
|
+
required=True,
|
|
517
|
+
help="CSV file containing successful analyses generated by the pipeline",
|
|
518
|
+
type=click.Path(exists=True, path_type=Path, dir_okay=False),
|
|
519
|
+
)
|
|
520
|
+
@click.option(
|
|
521
|
+
"-a",
|
|
522
|
+
"--analyses_dir",
|
|
523
|
+
required=True,
|
|
524
|
+
help="Input directory to where all the individual analyses subdirectories for summarising",
|
|
525
|
+
type=click.Path(exists=True, path_type=Path, file_okay=False),
|
|
526
|
+
)
|
|
527
|
+
@click.option(
|
|
528
|
+
"-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
|
|
529
|
+
)
|
|
530
|
+
def generate_dwcready_summaries(
|
|
531
|
+
runs: Path, analyses_dir: Path, output_prefix: str
|
|
532
|
+
) -> None:
|
|
533
|
+
"""
|
|
534
|
+
Generate Darwin Core-ready study-level summaries of amplicon analysis results.
|
|
535
|
+
|
|
536
|
+
This function processes amplicon analysis results from both ASV (DADA2) and closed-reference
|
|
537
|
+
analyses to create "Darwin Core Ready" summary files. The function handles both
|
|
538
|
+
SILVA and PR2 database results, combining taxonomy assignments with ENA metadata.
|
|
539
|
+
|
|
540
|
+
For ASV data, files are generated per amplified region - that means with SILVA and PR2 as
|
|
541
|
+
reference databases, one CSV is created per amplified region. For example:
|
|
542
|
+
- With one amplified region (e.g. 16S-V3-V4):
|
|
543
|
+
- {output_prefix}_DADA2_SILVA_16S-V3-V4_dwcready.csv
|
|
544
|
+
- {output_prefix}_DADA2_PR2_16S-V3-V4_dwcready.csv
|
|
545
|
+
- With two amplified regions (e.g. 16S-V3-V4 and 18S-V4):
|
|
546
|
+
- {output_prefix}_DADA2_SILVA_16S-V3-V4_dwcready.csv
|
|
547
|
+
- {output_prefix}_DADA2_PR2_16S-V3-V4_dwcready.csv
|
|
548
|
+
- {output_prefix}_DADA2_SILVA_18S-V4_dwcready.csv
|
|
549
|
+
- {output_prefix}_DADA2_PR2_18S-V4_dwcready.csv
|
|
550
|
+
|
|
551
|
+
For closed-reference data, one file per database is generated regardless of amplified regions:
|
|
552
|
+
- {output_prefix}_closedref_SILVA-SSU_dwcready.csv
|
|
553
|
+
- {output_prefix}_closedref_PR2_dwcready.csv
|
|
554
|
+
|
|
555
|
+
Args:
|
|
556
|
+
runs (Path): Path to CSV file containing successful analyses generated by the pipeline.
|
|
557
|
+
The CSV should have columns for run accessions and analysis results status.
|
|
558
|
+
analyses_dir (Path): Input directory containing all individual analyses subdirectories
|
|
559
|
+
to be summarized. Each subdirectory should contain taxonomy-summary
|
|
560
|
+
and ASV results.
|
|
561
|
+
output_prefix (str): Prefix to be used for the output summary files.
|
|
562
|
+
|
|
563
|
+
Returns:
|
|
564
|
+
None: Writes output CSV files with Darwin Core-compliant summaries of amplicon
|
|
565
|
+
analysis results. The total number of output files depends on:
|
|
566
|
+
1. For ASV data: Number of amplified regions × Number of reference databases
|
|
567
|
+
2. For closed-reference: Number of reference databases
|
|
568
|
+
"""
|
|
569
|
+
|
|
570
|
+
root_path = pathlib.Path(analyses_dir)
|
|
213
571
|
|
|
214
572
|
if not root_path.exists():
|
|
215
573
|
logging.error(f"Results path does not exist: {root_path}")
|
|
@@ -218,23 +576,174 @@ def main():
|
|
|
218
576
|
runs_df = pd.read_csv(runs, names=["run", "status"])
|
|
219
577
|
|
|
220
578
|
all_runs = runs_df.run.to_list()
|
|
221
|
-
run_metadata_dict =
|
|
222
|
-
|
|
579
|
+
run_metadata_dict = get_all_ena_metadata_from_runs(all_runs)
|
|
580
|
+
|
|
581
|
+
# Generate DwC-ready files for ASV results
|
|
582
|
+
asv_dbs = ["SILVA", "PR2"]
|
|
583
|
+
for db in asv_dbs:
|
|
584
|
+
|
|
585
|
+
asv_dict = get_asv_dict(runs_df, root_path, db)
|
|
586
|
+
all_merged_df = []
|
|
587
|
+
|
|
588
|
+
for run in all_runs:
|
|
589
|
+
if run in asv_dict.keys() and run in run_metadata_dict.keys():
|
|
590
|
+
run_asv_data = asv_dict[run]
|
|
591
|
+
run_metadata = run_metadata_dict[run]
|
|
592
|
+
run_merged_result = run_metadata.merge(run_asv_data, on="RunID")
|
|
593
|
+
all_merged_df.append(run_merged_result)
|
|
594
|
+
|
|
595
|
+
final_df = pd.concat(all_merged_df, ignore_index=True)
|
|
596
|
+
final_df = cleanup_asv_taxa(final_df, db)
|
|
597
|
+
|
|
598
|
+
# get all amplified regions present in the study
|
|
599
|
+
present_amplified_regions = final_df["amplifiedRegion"].unique()
|
|
600
|
+
|
|
601
|
+
# generate a DataFrame and then write a CSV file on an amplifiedRegion basis
|
|
602
|
+
for amplified_region in present_amplified_regions:
|
|
603
|
+
amplified_region_df = final_df.loc[
|
|
604
|
+
final_df["amplifiedRegion"] == amplified_region
|
|
605
|
+
]
|
|
606
|
+
amplified_region_df.to_csv(
|
|
607
|
+
f"{output_prefix}_DADA2_{db}_{amplified_region}_dwcready.csv",
|
|
608
|
+
index=False,
|
|
609
|
+
na_rep="NA",
|
|
610
|
+
)
|
|
223
611
|
|
|
224
|
-
|
|
612
|
+
# Generate DwC-ready files for closed reference results
|
|
613
|
+
closedref_dbs = ["SILVA-SSU", "PR2"]
|
|
614
|
+
for db in closedref_dbs:
|
|
225
615
|
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
run_asv_data = asv_dict[run]
|
|
229
|
-
run_metadata = run_metadata_dict[run]
|
|
230
|
-
run_merged_result = run_metadata.merge(run_asv_data, on="RunID")
|
|
231
|
-
all_merged_df.append(run_merged_result)
|
|
616
|
+
closedref_dict = get_closedref_dict(runs_df, root_path, db)
|
|
617
|
+
all_merged_df = []
|
|
232
618
|
|
|
233
|
-
|
|
234
|
-
|
|
619
|
+
for run in all_runs:
|
|
620
|
+
if run in closedref_dict.keys() and run in run_metadata_dict.keys():
|
|
621
|
+
run_closedref_data = closedref_dict[run]
|
|
622
|
+
run_metadata = run_metadata_dict[run]
|
|
623
|
+
run_merged_result = run_metadata.merge(run_closedref_data, on="RunID")
|
|
624
|
+
all_merged_df.append(run_merged_result)
|
|
235
625
|
|
|
236
|
-
|
|
626
|
+
final_df = pd.concat(all_merged_df, ignore_index=True)
|
|
627
|
+
final_df = cleanup_closedref_taxa(final_df, db)
|
|
628
|
+
|
|
629
|
+
final_df.to_csv(
|
|
630
|
+
f"{output_prefix}_closedref_{db}_dwcready.csv", index=False, na_rep="NA"
|
|
631
|
+
)
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def organise_dwcr_summaries(all_study_summaries: List[Path]) -> defaultdict[List]:
    """
    Organizes Darwin Core-ready summary files into groups based on their analysis type and database.

    Each file stem is split on "_" and a label is built from the tokens that follow the
    analysis-type token. The two types of summaries are handled differently:

    1. ASV/DADA2 summaries:
        - Label includes analysis type (DADA2), database, and amplified region
        - Example label: "DADA2_SILVA_16S-V3-V4"
    2. Closed-reference summaries:
        - Label only includes analysis type and database
        - Example label: "closedref_SILVA-SSU"

    The analysis-type token ("DADA2" or "closedref") is located by value rather than by
    position, so an output prefix that itself contains underscores (e.g. "my_study") does
    not shift the label. If neither token is present as a whole token, the label falls back
    to fixed positions right after the first token.

    Args:
        all_study_summaries (List[Path]): Iterable of paths to Darwin Core-ready summary
            files to be organized. Any iterable of paths works (e.g. a glob generator).

    Returns:
        defaultdict[List]: Dictionary where keys are summary labels (combining analysis type,
            database, and for ASVs, amplified region) and values are lists of paths to
            corresponding summary files, in the order they were encountered.
    """

    # Number of "_"-separated tokens that make up the label, keyed by analysis-type token.
    label_widths = {"DADA2": 3, "closedref": 2}

    summaries_dict = defaultdict(list)

    for summary_path in all_study_summaries:
        summary_filename = summary_path.stem
        tokens = summary_filename.split("_")

        # First token that names an analysis type, wherever the prefix pushed it to.
        marker = next((token for token in tokens if token in label_widths), None)

        if marker is not None:
            start = tokens.index(marker)
            summary_db_label = "_".join(tokens[start : start + label_widths[marker]])
        elif "DADA2" in summary_filename:
            # Fallback: "DADA2" only appears inside a larger token, use fixed positions.
            summary_db_label = "_".join(tokens[1:4])
        else:
            # Fallback for unrecognised names: treat like closed reference.
            summary_db_label = "_".join(tokens[1:3])

        summaries_dict[summary_db_label].append(summary_path)

    return summaries_dict
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
@cli.command(
    "merge",
    options_metavar="-a <analyses_dir> -p <output_prefix>",
    short_help="Merge multiple DwC-ready summaries of amplicon analysis.",
)
@click.option(
    "-a",
    "--analyses_dir",
    required=True,
    help="Input directory where all the individual analyses subdirectories are for merging",
    type=click.Path(exists=True, file_okay=False),
)
@click.option(
    "-p",
    "--output_prefix",
    required=True,
    help="Prefix to merged summary files",
    type=str,
)
def merge_dwcr_summaries(analyses_dir: str, output_prefix: str) -> None:
    """
    Merges multiple Darwin Core-ready summary files into consolidated summaries by type.

    This function takes a directory containing multiple Darwin Core-ready summary files
    and merges them based on their analysis type (ASV/DADA2 or closed-reference) and
    reference database. The function processes two types of summaries:

    1. ASV/DADA2 summaries:
        - Merged by analysis type, database, and amplified region
        - Output example: "{prefix}_DADA2_SILVA_16S-V3-V4_dwcready.csv"
    2. Closed-reference summaries:
        - Merged by analysis type and database only
        - Output example: "{prefix}_closedref_SILVA-SSU_dwcready.csv"

    If only one summary file exists for a particular combination, it is copied to the
    output location instead of being merged.

    Args:
        analyses_dir (str): Path to directory containing Darwin Core-ready summary files
            (files ending in "_dwcready.csv"). Only files directly inside this directory
            are matched; the glob is not recursive.
        output_prefix (str): Prefix to use for merged output files

    Returns:
        None: Writes merged summary files named
            "{output_prefix}_{analysis-type}_{database}[_{region}]_dwcready.csv". The
            name is used as given, so a relative prefix resolves against the current
            working directory.
    """

    # NOTE(review): the option help mentions subdirectories, but this pattern only
    # matches files at the top level of analyses_dir - confirm which is intended.
    all_dwcr_summaries = Path(analyses_dir).glob("*_dwcready.csv")

    summaries_dict = organise_dwcr_summaries(all_dwcr_summaries)

    for db_label, summaries in summaries_dict.items():
        merged_summary_name = f"{output_prefix}_{db_label}_dwcready.csv"
        if len(summaries) > 1:
            # Read every summary first, then concatenate once: concatenating inside a
            # loop re-copies the accumulated frame on each iteration.
            summary_frames = [pd.read_csv(summary) for summary in summaries]
            res_df = pd.concat(summary_frames)

            res_df.to_csv(merged_summary_name, index=False, na_rep="NA")
        elif len(summaries) == 1:
            logging.info(
                f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
            )
            try:
                shutil.copyfile(summaries[0], merged_summary_name)
            except SameFileError:
                # Source already is the target file, so there is nothing to copy.
                pass
|
|
237
746
|
|
|
238
747
|
|
|
239
748
|
if __name__ == "__main__":
|
|
240
|
-
|
|
749
|
+
cli()
|
|
@@ -21,7 +21,7 @@ TAXDB_LABELS = ["SILVA-SSU", "SILVA-LSU", "PR2", "UNITE", "ITSoneDB"]
|
|
|
21
21
|
ASV_TAXDB_LABELS = ["DADA2-SILVA", "DADA2-PR2"]
|
|
22
22
|
|
|
23
23
|
# taxonomy_summary labels for Raw Reads Analysis Pipeline
|
|
24
|
-
RRAP_TAXDB_LABELS = [
|
|
24
|
+
RRAP_TAXDB_LABELS = ["silva-ssu", "silva-lsu", "motus"]
|
|
25
25
|
|
|
26
26
|
# function_summary labels for Raw Reads Analysis Pipeline
|
|
27
|
-
RRAP_FUNCDB_LABELS = [
|
|
27
|
+
RRAP_FUNCDB_LABELS = ["pfam"]
|
|
@@ -35,15 +35,7 @@ _PR2_TAX_RANKS = [
|
|
|
35
35
|
"Genus",
|
|
36
36
|
"Species",
|
|
37
37
|
]
|
|
38
|
-
_MOTUS_TAX_RANKS = [
|
|
39
|
-
'Kingdom',
|
|
40
|
-
'Phylum',
|
|
41
|
-
'Class',
|
|
42
|
-
'Order',
|
|
43
|
-
'Family',
|
|
44
|
-
'Genus',
|
|
45
|
-
'Species'
|
|
46
|
-
]
|
|
38
|
+
_MOTUS_TAX_RANKS = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
|
|
47
39
|
|
|
48
40
|
SHORT_TAX_RANKS = ["sk", "k", "p", "c", "o", "f", "g", "s"]
|
|
49
41
|
SHORT_MOTUS_TAX_RANKS = ["k", "p", "c", "o", "f", "g", "s"]
|
|
@@ -581,8 +581,8 @@ class RawReadsStatusTypes(StrEnum):
|
|
|
581
581
|
|
|
582
582
|
all_results = "all_results"
|
|
583
583
|
no_reads = "no_reads"
|
|
584
|
-
|
|
585
|
-
|
|
584
|
+
all_empty_results = "all_empty_results"
|
|
585
|
+
some_empty_results = "some_empty_results"
|
|
586
586
|
|
|
587
587
|
|
|
588
588
|
class RawReadsPassedRunsRecord(BaseModel):
|
{mgnify_pipelines_toolkit-1.2.7.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/RECORD
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=8yFhmHQXVDPXvRX8oWSANV3VMu0X-zNnz12u1fcGwTE,20649
|
|
4
|
-
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256
|
|
4
|
+
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=-g1FDwdEndWH9VvYLmc_NEs2l204kKjMHk65wag8T_s,8891
|
|
5
5
|
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=BLqhflblUegCvuQic16PrFXfIXlFWmGkmWJyl4wJoLQ,5040
|
|
6
6
|
mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py,sha256=1aGOJX9tC7M1rnd0U2PeJ681sUo02wxk7_ycJqeVt6s,2216
|
|
7
7
|
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=-W_QmdmKAIqVC5n-RS8LX11hEQM4xdp5r1jkITB1CI8,5256
|
|
@@ -22,31 +22,31 @@ mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha2
|
|
|
22
22
|
mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=eNichqFFmfPsa2J10IUm_PemVs9fBhbKa2vpDqEvJNU,21791
|
|
23
23
|
mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=jUeA7I12YrtIqnm3hUxpdgsWfa2pP1ALGjb9OMKPcgY,10643
|
|
24
24
|
mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
|
|
25
|
-
mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=
|
|
25
|
+
mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=lxe7R2RQFyNCzEm6YuNRrqKZLZOUPq5W1P23Pt2sKBU,4570
|
|
26
26
|
mgnify_pipelines_toolkit/analysis/genomes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
|
-
mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py,sha256=
|
|
27
|
+
mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py,sha256=ltyNHwzaZZkK1ScH2vV2QV1eUXTHQUMYyadJwO-zSQY,16028
|
|
28
28
|
mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
29
|
mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py,sha256=kAGU5kQyj-Hlcdx32i-xOJSuHYYUDj-kqnyYHMohHGc,4477
|
|
30
|
-
mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=
|
|
30
|
+
mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=RaFopUjJI4UO1ttnSEHj7iUXpAL5-2FTbDXlhOmNy0s,25534
|
|
31
31
|
mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=ye0Jka6_lNn4dQGb2QG3YT46y7QK0QvyaIitIaS8JVQ,4026
|
|
32
32
|
mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=UrU0CpZj3pfHZWI7Uuhv2a_C0JsO8pnVErY0sWGgNdw,4920
|
|
33
33
|
mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=EH5RyzesLqsonnTQbSDs7kAOV6IskS4oyqZYlex1tAY,1934
|
|
34
34
|
mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py,sha256=6Ck2NhwRWw66GctUtKDdPT5fwJhWFR_YOZq-Vxwoa8A,1996
|
|
35
35
|
mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=7-U0DN1joVu0ifLOoDUK2Pfqy8rb1RDKT6khVg3jky0,5559
|
|
36
36
|
mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py,sha256=sKAo_rKEyVAZXSaIFMkpSoYZxiWwXMA3XDA6Z-hbHgg,7904
|
|
37
|
-
mgnify_pipelines_toolkit/constants/db_labels.py,sha256=
|
|
37
|
+
mgnify_pipelines_toolkit/constants/db_labels.py,sha256=12mksTtAwTE1smLnemdoItxGw1AmtJPOzbnW2aGj0u0,1062
|
|
38
38
|
mgnify_pipelines_toolkit/constants/ncrna.py,sha256=a_5hWp446S7BhRbe_JcydFgZM7sgPLuMlaiBvKWN_XM,1928
|
|
39
39
|
mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=G-xrc9b8zdmPTaOICD2b3RCVeFAEOVkfRkIfotQ7gek,1193
|
|
40
|
-
mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=
|
|
40
|
+
mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=ekZN5OcMBhDRcj7XB_27wQ8fEnmAqMJc4aQ3pv4BRmI,1229
|
|
41
41
|
mgnify_pipelines_toolkit/constants/thresholds.py,sha256=1AMBmoHBR0WjXZpkwJ7_Q-gfJtHXuCA4tZ-uvPhF0Xc,1085
|
|
42
42
|
mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=0bM4MwarFiM5yTcp5AbAmQ0o-q-gWy7kknir9zJ9R0A,1312
|
|
43
|
-
mgnify_pipelines_toolkit/schemas/schemas.py,sha256=
|
|
43
|
+
mgnify_pipelines_toolkit/schemas/schemas.py,sha256=he9igC80YTR32v1e5NslwTgtdVySmnXwK9iY9IBPNBg,23133
|
|
44
44
|
mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
45
|
mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
|
|
46
46
|
mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
|
|
47
|
-
mgnify_pipelines_toolkit-1.2.
|
|
48
|
-
mgnify_pipelines_toolkit-1.2.
|
|
49
|
-
mgnify_pipelines_toolkit-1.2.
|
|
50
|
-
mgnify_pipelines_toolkit-1.2.
|
|
51
|
-
mgnify_pipelines_toolkit-1.2.
|
|
52
|
-
mgnify_pipelines_toolkit-1.2.
|
|
47
|
+
mgnify_pipelines_toolkit-1.2.8.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
48
|
+
mgnify_pipelines_toolkit-1.2.8.dist-info/METADATA,sha256=RkF31O7GjADzb2k96oZxbyWOmDvN1bKzIThNTb0e7Qg,5775
|
|
49
|
+
mgnify_pipelines_toolkit-1.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
50
|
+
mgnify_pipelines_toolkit-1.2.8.dist-info/entry_points.txt,sha256=7TJ8GgbKoX1xnQsOdWwMvwhIv4uuHCx7pMxKmZabPOs,3228
|
|
51
|
+
mgnify_pipelines_toolkit-1.2.8.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
|
|
52
|
+
mgnify_pipelines_toolkit-1.2.8.dist-info/RECORD,,
|
|
@@ -6,7 +6,7 @@ assembly_study_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.st
|
|
|
6
6
|
classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
|
|
7
7
|
combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
|
|
8
8
|
convert_cmscan_to_cmsearch_tblout = mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main
|
|
9
|
-
dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:
|
|
9
|
+
dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:cli
|
|
10
10
|
fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
|
|
11
11
|
fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
|
|
12
12
|
generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
|
|
File without changes
|
|
File without changes
|
{mgnify_pipelines_toolkit-1.2.7.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/top_level.txt
RENAMED
|
File without changes
|