mgnify-pipelines-toolkit 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

@@ -83,70 +83,63 @@ def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_line
83
83
  file_out.write("##gff-version 3\n")
84
84
  with fileinput.hook_compressed(overview_file, "r", encoding="utf-8") as file_in:
85
85
  for line in file_in:
86
- if line.startswith("MGYG") or line.startswith("ERZ"):
87
- (
88
- transcript,
89
- ec_number_raw,
90
- dbcan_hmmer,
91
- dbcan_sub_ecami,
92
- diamond,
93
- num_of_tools,
94
- ) = line.strip().split("\t")
95
- # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
96
-
97
- ec_number = ""
98
- ec_list = ec_number_raw.split("|")
99
- for ec in ec_list:
100
- if ec != "-":
101
- ec_number += ec.split(":")[0] + "|"
102
-
103
- ec_number = ec_number.strip("|")
104
-
105
- # Dbcan recommends to use subfamily preference as dbcan_hmmer > dbcan_sub_ecami > diamond
106
- # diamond is messier, so we don't report it here
107
- if dbcan_hmmer != "-":
108
- # the field dbcan_hmmer reports match positions in parentheses, clear them out first:
109
- subfamily = dbcan_hmmer.split("(")[0]
110
- elif dbcan_sub_ecami != "-":
111
- subfamily = dbcan_sub_ecami
112
- else:
113
- continue
114
- cleaned_substrates = ",".join(
115
- sorted(
116
- {
117
- subsrate.strip()
118
- for subsrate in substrates.get(transcript, "N/A").split(
119
- ","
120
- )
121
- }
122
- )
86
+
87
+ if not line.startswith("MGYG") and not line.startswith("ERZ"):
88
+ continue
89
+
90
+ line = line.strip()
91
+ temp_list = line.split("\t")
92
+ transcript = temp_list[0]
93
+ ec_number_raw = temp_list[1]
94
+ num_of_tools = temp_list[5]
95
+ recc_subfamily = temp_list[6]
96
+
97
+ # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
98
+
99
+ ec_number = ""
100
+ ec_list = ec_number_raw.split("|")
101
+ for ec in ec_list:
102
+ if ec != "-":
103
+ ec_number += ec.split(":")[0] + "|"
104
+
105
+ ec_number = ec_number.strip("|")
106
+ cleaned_substrates = ",".join(
107
+ sorted(
108
+ {
109
+ subsrate.strip()
110
+ for subsrate in substrates.get(transcript, "N/A").split(",")
111
+ }
123
112
  )
124
- # Assemble information to add to the 9th column
125
- col9_parts = [
126
- f"protein_family={subfamily}",
127
- f"substrate_dbcan-sub={cleaned_substrates}",
128
- ]
129
-
130
- if ec_number:
131
- col9_parts.append(f"eC_number={ec_number}")
132
-
133
- col9_parts.append(f"num_tools={num_of_tools}")
134
- col9_text = ";".join(col9_parts)
135
-
136
- for gff_line in genome_gff_lines[transcript]:
137
- fields = gff_line.strip().split("\t")
138
- # Replace the tool
139
- fields[1] = f"dbCAN:{dbcan_version}"
140
- # Replace the feature
141
- fields[2] = "CAZyme"
142
- # Replace the confidence value
143
- fields[5] = "."
144
- # Keep only the ID in the 9th column
145
- attributes = fields[8].split(";")[0]
146
- # Add dbcan information to the 9th column
147
- attributes = f"{attributes};{col9_text};"
148
- fields[8] = attributes
149
- file_out.write("\t".join(fields) + "\n")
113
+ )
114
+ # Assemble information to add to the 9th column
115
+ if recc_subfamily == "-":
116
+ continue
117
+
118
+ col9_parts = [
119
+ f"protein_family={recc_subfamily}",
120
+ f"substrate_dbcan-sub={cleaned_substrates}",
121
+ ]
122
+
123
+ if ec_number:
124
+ col9_parts.append(f"eC_number={ec_number}")
125
+
126
+ col9_parts.append(f"num_tools={num_of_tools}")
127
+ col9_text = ";".join(col9_parts)
128
+
129
+ for gff_line in genome_gff_lines[transcript]:
130
+ fields = gff_line.strip().split("\t")
131
+ # Replace the tool
132
+ fields[1] = f"dbCAN:{dbcan_version}"
133
+ # Replace the feature
134
+ fields[2] = "CAZyme"
135
+ # Replace the confidence value
136
+ fields[5] = "."
137
+ # Keep only the ID in the 9th column
138
+ attributes = fields[8].split(";")[0]
139
+ # Add dbcan information to the 9th column
140
+ attributes = f"{attributes};{col9_text};"
141
+ fields[8] = attributes
142
+ file_out.write("\t".join(fields) + "\n")
150
143
 
151
144
 
152
145
  def load_substrates(hmm_path):
@@ -155,8 +148,8 @@ def load_substrates(hmm_path):
155
148
  header = next(file_in)
156
149
  header_fields = header.strip().split("\t")
157
150
  substrate_idx = header_fields.index("Substrate")
158
- gene_idx = header_fields.index("Gene ID")
159
- evalue_idx = header_fields.index("E Value")
151
+ gene_idx = header_fields.index("Target Name")
152
+ evalue_idx = header_fields.index("i-Evalue")
160
153
  for line in file_in:
161
154
  fields = line.strip().split("\t")
162
155
  if float(fields[evalue_idx]) < 1e-15: # evalue is the default from dbcan
@@ -117,6 +117,7 @@ SUMMARY_TYPES_MAP = {
117
117
  },
118
118
  "sanntis": {
119
119
  "folder": "pathways-and-systems/sanntis",
120
+ "allow_missing": True,
120
121
  "column_names": SANNTIS_COLUMN_NAMES,
121
122
  "schema": SanntisSummarySchema,
122
123
  "study_schema": SanntisStudySummarySchema,
@@ -232,6 +233,7 @@ def generate_functional_summary(
232
233
  "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
233
234
  ],
234
235
  outdir: Path = None,
236
+ allow_missing: bool = False,
235
237
  ) -> None:
236
238
  """
237
239
  Generate a combined study-level functional annotation summary from multiple input
@@ -243,6 +245,7 @@ def generate_functional_summary(
243
245
  :param label: Label for the functional annotation type
244
246
  (expected one of ["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"]).
245
247
  :param outdir: Optional output directory for the results.
248
+ :param allow_missing: Whether to allow the summary files to be missing (e.g. because the pipeline doesn't emit them if acceptably empty).
246
249
 
247
250
  In the input files, column orders may vary, but the following columns are expected:
248
251
  GO summary input file:
@@ -285,7 +288,16 @@ def generate_functional_summary(
285
288
  M00163 83.33 Photosystem I Pathway modules; Energy metabolism; Photosynthesis K02689,K02690,K02691,K02692,K02694 K02693
286
289
  M00615 50.0 Nitrate assimilation Signature modules; Module set; Metabolic capacity K02575 M00531
287
290
  """
288
- check_files_exist(list(file_dict.values()))
291
+ try:
292
+ check_files_exist(list(file_dict.values()))
293
+ except FileNotFoundError as e:
294
+ if allow_missing:
295
+ logging.warning(
296
+ f"One of the expected files is missing, but this is allowed for {label}."
297
+ )
298
+ logging.warning(e)
299
+ return
300
+ raise
289
301
 
290
302
  output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"
291
303
 
@@ -419,6 +431,7 @@ def summarise_analyses(
419
431
  output_prefix,
420
432
  summary_type,
421
433
  outdir=outdir,
434
+ allow_missing=config.get("allow_missing", False),
422
435
  )
423
436
  logging.info("Assembly-level summaries were generated successfully.")
424
437
  logging.info("Done.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.1.1
3
+ Version: 1.1.2
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -22,9 +22,9 @@ mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py,sha256=_4J31wAjK5B1
22
22
  mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py,sha256=_iaTBvMKbQDi_02_QuSPqLJ_rC37ruxiPHv5lLQmI-w,5480
23
23
  mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=eay9e3Xdc8XxnlC_4SHHjN89k-M9i_cFMc2lI_ZFxqY,5596
24
24
  mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py,sha256=uex2T6GagtYFBIc39-Xm4SFHL06KAQ5v0_loOmY_eaw,4289
25
- mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py,sha256=KaJHOKfbIurbD1iiMssjdAaSAT8Nv-_ZUFwxkLqukAE,7799
25
+ mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py,sha256=5m5AwWEKidJx1FI0y93AFka7z0zEE8dBf1ofgP8TV_Y,7108
26
26
  mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha256=DYZhChGD49M-zAtGkCmNHXDoVTnd5Qy6amG-oePO8Ek,5981
27
- mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=J4cIWaFyWihqo2JtaOR531aXtVxIfOi_hcwZZw-vP8g,21252
27
+ mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=eNichqFFmfPsa2J10IUm_PemVs9fBhbKa2vpDqEvJNU,21791
28
28
  mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=jUeA7I12YrtIqnm3hUxpdgsWfa2pP1ALGjb9OMKPcgY,10643
29
29
  mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
30
30
  mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=1wblLbZl521digIUWoqneAu15gErzvN_oC--5T_xUdw,4582
@@ -49,9 +49,9 @@ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pyDZvCuWbwccQF0D7c5BN1vv36wQd
49
49
  mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
50
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
51
51
  mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
52
- mgnify_pipelines_toolkit-1.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
53
- mgnify_pipelines_toolkit-1.1.1.dist-info/METADATA,sha256=E86Tp9qJuQUrkNIklK4PEATQ4ovZfhRbgMKVTyxGSx0,5811
54
- mgnify_pipelines_toolkit-1.1.1.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
55
- mgnify_pipelines_toolkit-1.1.1.dist-info/entry_points.txt,sha256=JSjuxAr71MTeSUPPpno22wmZYgVO-gbsXfDkgWKkF7A,3533
56
- mgnify_pipelines_toolkit-1.1.1.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
57
- mgnify_pipelines_toolkit-1.1.1.dist-info/RECORD,,
52
+ mgnify_pipelines_toolkit-1.1.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
53
+ mgnify_pipelines_toolkit-1.1.2.dist-info/METADATA,sha256=5ByyTshLjj3V5NYnqFinX0ROLb9GmST0m_CltiUdBiY,5811
54
+ mgnify_pipelines_toolkit-1.1.2.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
55
+ mgnify_pipelines_toolkit-1.1.2.dist-info/entry_points.txt,sha256=JSjuxAr71MTeSUPPpno22wmZYgVO-gbsXfDkgWKkF7A,3533
56
+ mgnify_pipelines_toolkit-1.1.2.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
57
+ mgnify_pipelines_toolkit-1.1.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.1.0)
2
+ Generator: setuptools (80.3.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5