moducomp 0.7.3__tar.gz → 0.7.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {moducomp-0.7.3 → moducomp-0.7.4}/PKG-INFO +2 -2
- {moducomp-0.7.3 → moducomp-0.7.4}/README.md +1 -1
- {moducomp-0.7.3 → moducomp-0.7.4}/moducomp/__init__.py +1 -1
- {moducomp-0.7.3 → moducomp-0.7.4}/moducomp/moducomp.py +230 -96
- {moducomp-0.7.3 → moducomp-0.7.4}/recipe.yaml +2 -2
- {moducomp-0.7.3 → moducomp-0.7.4}/.gitignore +0 -0
- {moducomp-0.7.3 → moducomp-0.7.4}/LICENSE.txt +0 -0
- {moducomp-0.7.3 → moducomp-0.7.4}/moducomp/__main__.py +0 -0
- {moducomp-0.7.3 → moducomp-0.7.4}/moducomp/data/test_genomes/IMG2562617132.faa +0 -0
- {moducomp-0.7.3 → moducomp-0.7.4}/moducomp/data/test_genomes/IMG2568526683.faa +0 -0
- {moducomp-0.7.3 → moducomp-0.7.4}/moducomp/data/test_genomes/IMG2740892217.faa +0 -0
- {moducomp-0.7.3 → moducomp-0.7.4}/pixi.lock +0 -0
- {moducomp-0.7.3 → moducomp-0.7.4}/pyproject.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: moducomp
|
|
3
|
-
Version: 0.7.3
|
|
3
|
+
Version: 0.7.4
|
|
4
4
|
Summary: moducomp: metabolic module completeness and complementarity for microbiomes.
|
|
5
5
|
Keywords: bioinformatics,microbiome,metabolic,kegg,genomics
|
|
6
6
|
Author-email: "Juan C. Villada" <jvillada@lbl.gov>
|
|
@@ -74,7 +74,7 @@ If `EGGNOG_DATA_DIR` is not set, `moducomp download-eggnog-data` defaults to `${
|
|
|
74
74
|
Small test data sets ship with `moducomp`. After installation you can confirm the pipeline by running:
|
|
75
75
|
|
|
76
76
|
```bash
|
|
77
|
-
moducomp test --ncpus
|
|
77
|
+
moducomp test --ncpus 16 --eggnog-data-dir "$EGGNOG_DATA_DIR"
|
|
78
78
|
```
|
|
79
79
|
|
|
80
80
|
### Developer install (Pixi)
|
|
@@ -49,7 +49,7 @@ If `EGGNOG_DATA_DIR` is not set, `moducomp download-eggnog-data` defaults to `${
|
|
|
49
49
|
Small test data sets ship with `moducomp`. After installation you can confirm the pipeline by running:
|
|
50
50
|
|
|
51
51
|
```bash
|
|
52
|
-
moducomp test --ncpus
|
|
52
|
+
moducomp test --ncpus 16 --eggnog-data-dir "$EGGNOG_DATA_DIR"
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
### Developer install (Pixi)
|
|
@@ -141,8 +141,39 @@ def emit_error(message: str, logger: Optional[logging.Logger] = None) -> None:
|
|
|
141
141
|
"""Log and emit an error to both stdout and stderr."""
|
|
142
142
|
if logger:
|
|
143
143
|
logger.error(message)
|
|
144
|
-
typer.secho(f"
|
|
145
|
-
typer.secho(f"
|
|
144
|
+
typer.secho(f"ERROR: {message}", fg="red", err=True)
|
|
145
|
+
typer.secho(f"ERROR: {message}", fg="red")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def format_bytes(num_bytes: float) -> str:
|
|
149
|
+
"""Format bytes as a human-readable string."""
|
|
150
|
+
units = ["B", "KB", "MB", "GB", "TB", "PB"]
|
|
151
|
+
value = float(num_bytes)
|
|
152
|
+
for unit in units:
|
|
153
|
+
if value < 1024 or unit == units[-1]:
|
|
154
|
+
return f"{value:.1f} {unit}"
|
|
155
|
+
value /= 1024
|
|
156
|
+
return f"{value:.1f} PB"
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def get_dir_size(path: Path) -> int:
|
|
160
|
+
"""Return total size of files under path."""
|
|
161
|
+
total = 0
|
|
162
|
+
for root, _, files in os.walk(path):
|
|
163
|
+
for name in files:
|
|
164
|
+
try:
|
|
165
|
+
total += (Path(root) / name).stat().st_size
|
|
166
|
+
except OSError:
|
|
167
|
+
continue
|
|
168
|
+
return total
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def count_files(path: Path) -> int:
|
|
172
|
+
"""Return number of files under path."""
|
|
173
|
+
total = 0
|
|
174
|
+
for _, _, files in os.walk(path):
|
|
175
|
+
total += len(files)
|
|
176
|
+
return total
|
|
146
177
|
|
|
147
178
|
|
|
148
179
|
def default_eggnog_data_dir() -> Path:
|
|
@@ -329,7 +360,7 @@ def run_subprocess_with_logging(
|
|
|
329
360
|
error_msg = f"Exception running command {' '.join(cmd)}: {str(e)}"
|
|
330
361
|
if logger:
|
|
331
362
|
logger.error(error_msg)
|
|
332
|
-
print(f"
|
|
363
|
+
print(f"ERROR: {error_msg}", file=sys.stderr)
|
|
333
364
|
return -1, "", str(e)
|
|
334
365
|
|
|
335
366
|
|
|
@@ -876,10 +907,10 @@ def how_many_genomes(genomedir: str, verbose: bool = True):
|
|
|
876
907
|
"""
|
|
877
908
|
n_files = len(get_path_to_each_genome(genomedir))
|
|
878
909
|
if n_files > 0:
|
|
879
|
-
conditional_output(f"
|
|
910
|
+
conditional_output(f"OK: {n_files} faa files were found in '{genomedir}'\n", "white", verbose)
|
|
880
911
|
else:
|
|
881
912
|
# Always show errors regardless of verbose setting
|
|
882
|
-
typer.secho(f"
|
|
913
|
+
typer.secho(f"ERROR: No FAA files were found in '{genomedir}'\n", fg="red")
|
|
883
914
|
exit()
|
|
884
915
|
|
|
885
916
|
|
|
@@ -896,10 +927,10 @@ def create_output_dir(savedir: str, verbose: bool = True):
|
|
|
896
927
|
"""
|
|
897
928
|
conditional_output("\nCreating output directory", "green", verbose)
|
|
898
929
|
if os.path.exists(savedir):
|
|
899
|
-
conditional_output(f"
|
|
930
|
+
conditional_output(f"OK: Output directory already exists at: {savedir}\n", "white", verbose)
|
|
900
931
|
else:
|
|
901
932
|
os.makedirs(savedir, exist_ok=True)
|
|
902
|
-
conditional_output(f"
|
|
933
|
+
conditional_output(f"OK: Output directory created at: {savedir}\n", "white", verbose)
|
|
903
934
|
|
|
904
935
|
|
|
905
936
|
def get_tmp_dir(savedir:str) -> str:
|
|
@@ -934,10 +965,10 @@ def create_tmp_dir(savedir: str, verbose: bool = True):
|
|
|
934
965
|
conditional_output("\nCreating tmp dir", "green", verbose)
|
|
935
966
|
tmp_dir_path = get_tmp_dir(savedir)
|
|
936
967
|
if (os.path.exists(tmp_dir_path)):
|
|
937
|
-
conditional_output(f"
|
|
968
|
+
conditional_output(f"OK: Tmp directory already exists at: {tmp_dir_path}\n", "white", verbose)
|
|
938
969
|
else:
|
|
939
970
|
os.mkdir(tmp_dir_path)
|
|
940
|
-
conditional_output(f"
|
|
971
|
+
conditional_output(f"OK: Tmp directory created at: {tmp_dir_path}\n", "white", verbose)
|
|
941
972
|
|
|
942
973
|
|
|
943
974
|
def adapt_fasta_headers(genomedir: str, savedir: str, verbose: bool = True) -> None:
|
|
@@ -962,7 +993,7 @@ def adapt_fasta_headers(genomedir: str, savedir: str, verbose: bool = True) -> N
|
|
|
962
993
|
path_to_each_genome = get_path_to_each_genome(genomedir)
|
|
963
994
|
output_dir = f"{get_tmp_dir(savedir)}/faa"
|
|
964
995
|
if os.path.exists(output_dir):
|
|
965
|
-
conditional_output(f"
|
|
996
|
+
conditional_output(f"OK: Fasta headers already modified at: {output_dir}\n", "white", verbose)
|
|
966
997
|
return
|
|
967
998
|
|
|
968
999
|
os.mkdir(output_dir)
|
|
@@ -978,7 +1009,7 @@ def adapt_fasta_headers(genomedir: str, savedir: str, verbose: bool = True) -> N
|
|
|
978
1009
|
i+=1
|
|
979
1010
|
else:
|
|
980
1011
|
outfile.write(line)
|
|
981
|
-
conditional_output(f"
|
|
1012
|
+
conditional_output(f"OK: Fasta headers modified at: {output_dir}\n", "white", verbose)
|
|
982
1013
|
|
|
983
1014
|
|
|
984
1015
|
def copy_faa_to_tmp(genomedir: str, savedir: str, verbose: bool = True) -> None:
|
|
@@ -1002,14 +1033,14 @@ def copy_faa_to_tmp(genomedir: str, savedir: str, verbose: bool = True) -> None:
|
|
|
1002
1033
|
path_to_each_genome = get_path_to_each_genome(genomedir)
|
|
1003
1034
|
output_dir = f"{get_tmp_dir(savedir)}/faa"
|
|
1004
1035
|
if os.path.exists(output_dir):
|
|
1005
|
-
conditional_output(f"
|
|
1036
|
+
conditional_output(f"OK: Fasta files already exist at: {output_dir}\n", "white", verbose)
|
|
1006
1037
|
return
|
|
1007
1038
|
|
|
1008
1039
|
os.mkdir(output_dir)
|
|
1009
1040
|
conditional_output("Copying genome files to temporary directory...", "yellow", verbose)
|
|
1010
1041
|
for each_file in path_to_each_genome:
|
|
1011
1042
|
shutil.copy(each_file, output_dir)
|
|
1012
|
-
conditional_output(f"
|
|
1043
|
+
conditional_output(f"OK: Fasta files copied to: {output_dir}\n", "white", verbose)
|
|
1013
1044
|
|
|
1014
1045
|
|
|
1015
1046
|
def merge_genomes(savedir: str, logger: Optional[logging.Logger] = None, verbose: bool = True) -> bool:
|
|
@@ -1036,7 +1067,7 @@ def merge_genomes(savedir: str, logger: Optional[logging.Logger] = None, verbose
|
|
|
1036
1067
|
|
|
1037
1068
|
|
|
1038
1069
|
if os.path.exists(output_file):
|
|
1039
|
-
conditional_output(f"
|
|
1070
|
+
conditional_output(f"OK: Merged genomes file already exists at: {output_file}\n", "white", verbose)
|
|
1040
1071
|
if logger:
|
|
1041
1072
|
logger.info(f"Using existing merged genomes file: {output_file}")
|
|
1042
1073
|
return True
|
|
@@ -1045,7 +1076,7 @@ def merge_genomes(savedir: str, logger: Optional[logging.Logger] = None, verbose
|
|
|
1045
1076
|
if not genome_file_paths:
|
|
1046
1077
|
error_msg = f"No FAA files found in {get_tmp_dir(savedir)}/faa/"
|
|
1047
1078
|
# Always show errors regardless of verbose setting
|
|
1048
|
-
typer.secho(f"
|
|
1079
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1049
1080
|
if logger:
|
|
1050
1081
|
logger.error(error_msg)
|
|
1051
1082
|
return False
|
|
@@ -1057,14 +1088,14 @@ def merge_genomes(savedir: str, logger: Optional[logging.Logger] = None, verbose
|
|
|
1057
1088
|
with open(each_file) as infile:
|
|
1058
1089
|
for line in infile:
|
|
1059
1090
|
outfile.write(line)
|
|
1060
|
-
conditional_output(f"
|
|
1091
|
+
conditional_output(f"OK: Fasta files merged at: {output_file}\n", "white", verbose)
|
|
1061
1092
|
if logger:
|
|
1062
1093
|
logger.info(f"Successfully created merged genome file: {output_file}")
|
|
1063
1094
|
return True
|
|
1064
1095
|
except Exception as e:
|
|
1065
1096
|
error_msg = f"Error merging genome files: {str(e)}"
|
|
1066
1097
|
# Always show errors regardless of verbose setting
|
|
1067
|
-
typer.secho(f"
|
|
1098
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1068
1099
|
if logger:
|
|
1069
1100
|
logger.error(error_msg)
|
|
1070
1101
|
return False
|
|
@@ -1090,12 +1121,12 @@ def run_emapper(savedir: str, ncpus: int, resource_log_file: str, lowmem: bool =
|
|
|
1090
1121
|
bool
|
|
1091
1122
|
True if emapper ran successfully or outputs already exist, False otherwise
|
|
1092
1123
|
"""
|
|
1093
|
-
|
|
1124
|
+
conditional_output("\nStarting eggNOG-mapper", "green", verbose)
|
|
1094
1125
|
|
|
1095
1126
|
|
|
1096
1127
|
final_emapper_annotation_file = f"{savedir}/emapper_out.emapper.annotations"
|
|
1097
1128
|
if os.path.exists(final_emapper_annotation_file):
|
|
1098
|
-
typer.secho(f"
|
|
1129
|
+
typer.secho(f"OK: Emapper annotations already exist at: {final_emapper_annotation_file}\n", fg="white")
|
|
1099
1130
|
if logger:
|
|
1100
1131
|
logger.info(f"Using existing emapper annotations: {final_emapper_annotation_file}")
|
|
1101
1132
|
return True
|
|
@@ -1108,14 +1139,14 @@ def run_emapper(savedir: str, ncpus: int, resource_log_file: str, lowmem: bool =
|
|
|
1108
1139
|
|
|
1109
1140
|
if not os.path.exists(merged_genomes_file):
|
|
1110
1141
|
error_msg = f"Merged genomes file not found at: {merged_genomes_file}"
|
|
1111
|
-
typer.secho(f"
|
|
1142
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1112
1143
|
if logger:
|
|
1113
1144
|
logger.error(error_msg)
|
|
1114
1145
|
return False
|
|
1115
1146
|
|
|
1116
1147
|
|
|
1117
1148
|
if os.path.exists(emapper_tmp_file):
|
|
1118
|
-
typer.secho(f"
|
|
1149
|
+
typer.secho(f"OK: Emapper output already exists at: {emapper_tmp_file}\n", fg="white")
|
|
1119
1150
|
if logger:
|
|
1120
1151
|
logger.info(f"Using existing emapper output from temporary directory: {emapper_tmp_file}")
|
|
1121
1152
|
|
|
@@ -1150,7 +1181,7 @@ def run_emapper(savedir: str, ncpus: int, resource_log_file: str, lowmem: bool =
|
|
|
1150
1181
|
cmd_emapper,
|
|
1151
1182
|
resource_log_file,
|
|
1152
1183
|
logger,
|
|
1153
|
-
"
|
|
1184
|
+
"eggNOG-mapper",
|
|
1154
1185
|
verbose
|
|
1155
1186
|
)
|
|
1156
1187
|
|
|
@@ -1158,7 +1189,7 @@ def run_emapper(savedir: str, ncpus: int, resource_log_file: str, lowmem: bool =
|
|
|
1158
1189
|
error_msg = f"emapper failed with return code {returncode}"
|
|
1159
1190
|
if stderr:
|
|
1160
1191
|
error_msg += f": {stderr}"
|
|
1161
|
-
typer.secho(f"
|
|
1192
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1162
1193
|
if logger:
|
|
1163
1194
|
logger.error(error_msg)
|
|
1164
1195
|
return False
|
|
@@ -1170,7 +1201,7 @@ def run_emapper(savedir: str, ncpus: int, resource_log_file: str, lowmem: bool =
|
|
|
1170
1201
|
|
|
1171
1202
|
if not os.path.exists(emapper_tmp_file):
|
|
1172
1203
|
error_msg = f"emapper did not generate expected output: {emapper_tmp_file}"
|
|
1173
|
-
typer.secho(f"
|
|
1204
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1174
1205
|
if logger:
|
|
1175
1206
|
logger.error(error_msg)
|
|
1176
1207
|
return False
|
|
@@ -1178,15 +1209,15 @@ def run_emapper(savedir: str, ncpus: int, resource_log_file: str, lowmem: bool =
|
|
|
1178
1209
|
|
|
1179
1210
|
shutil.copy(emapper_tmp_file, final_emapper_annotation_file)
|
|
1180
1211
|
|
|
1181
|
-
typer.secho(f"
|
|
1182
|
-
typer.secho(f"
|
|
1212
|
+
typer.secho(f"OK: emapper output saved at: {output_folder_emapper}\n", fg="white")
|
|
1213
|
+
typer.secho(f"OK: emapper annotations copied to: {final_emapper_annotation_file}\n", fg="white")
|
|
1183
1214
|
if logger:
|
|
1184
1215
|
logger.info(f"Successfully ran emapper and saved annotations to: {final_emapper_annotation_file}")
|
|
1185
1216
|
return True
|
|
1186
1217
|
|
|
1187
1218
|
except Exception as e:
|
|
1188
1219
|
error_msg = f"Error running emapper: {str(e)}"
|
|
1189
|
-
typer.secho(f"
|
|
1220
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1190
1221
|
if logger:
|
|
1191
1222
|
logger.error(error_msg)
|
|
1192
1223
|
return False
|
|
@@ -1207,11 +1238,11 @@ def remove_temp_files(savedir: str, logger: Optional[logging.Logger] = None) ->
|
|
|
1207
1238
|
if os.path.exists(tmp_dir):
|
|
1208
1239
|
try:
|
|
1209
1240
|
shutil.rmtree(tmp_dir)
|
|
1210
|
-
typer.secho(f"
|
|
1241
|
+
typer.secho(f"OK: Temporary files removed from: {tmp_dir}", fg="white")
|
|
1211
1242
|
if logger:
|
|
1212
1243
|
logger.info(f"Removed temporary directory: {tmp_dir}")
|
|
1213
1244
|
except Exception as e:
|
|
1214
|
-
typer.secho(f"
|
|
1245
|
+
typer.secho(f"WARNING: Failed to remove temporary files: {str(e)}", fg="yellow")
|
|
1215
1246
|
if logger:
|
|
1216
1247
|
logger.warning(f"Failed to remove temporary directory {tmp_dir}: {str(e)}")
|
|
1217
1248
|
|
|
@@ -1318,7 +1349,7 @@ def generate_complementarity_report(
|
|
|
1318
1349
|
if os.path.exists(output_file):
|
|
1319
1350
|
if logger:
|
|
1320
1351
|
logger.info(f"Complementarity report already exists at {output_file}")
|
|
1321
|
-
conditional_output(f"
|
|
1352
|
+
conditional_output(f"OK: Complementarity report already exists at: {output_file}", "white", verbose)
|
|
1322
1353
|
return
|
|
1323
1354
|
|
|
1324
1355
|
|
|
@@ -1327,7 +1358,7 @@ def generate_complementarity_report(
|
|
|
1327
1358
|
error_msg = f"Module completeness matrix not found at: {module_matrix_file}"
|
|
1328
1359
|
if logger:
|
|
1329
1360
|
logger.error(error_msg)
|
|
1330
|
-
typer.secho(f"
|
|
1361
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1331
1362
|
return
|
|
1332
1363
|
|
|
1333
1364
|
|
|
@@ -1344,13 +1375,13 @@ def generate_complementarity_report(
|
|
|
1344
1375
|
emapper_file = possible_file
|
|
1345
1376
|
if logger:
|
|
1346
1377
|
logger.info(f"Found emapper annotation file at: {emapper_file}")
|
|
1347
|
-
typer.secho(f"
|
|
1378
|
+
typer.secho(f"OK: Using emapper annotations from: {emapper_file}", fg="white")
|
|
1348
1379
|
break
|
|
1349
1380
|
|
|
1350
1381
|
if not emapper_file:
|
|
1351
1382
|
if logger:
|
|
1352
1383
|
logger.warning(f"Emapper annotation file not found in any of the expected locations. Will use placeholder protein IDs.")
|
|
1353
|
-
typer.secho(f"
|
|
1384
|
+
typer.secho(f"WARNING: Emapper annotation file not found. Will use placeholder protein IDs.", fg="yellow")
|
|
1354
1385
|
|
|
1355
1386
|
|
|
1356
1387
|
kpct_output_file = None
|
|
@@ -1370,7 +1401,7 @@ def generate_complementarity_report(
|
|
|
1370
1401
|
error_msg = "KPCT output file not found. Cannot extract module metadata."
|
|
1371
1402
|
if logger:
|
|
1372
1403
|
logger.error(error_msg)
|
|
1373
|
-
typer.secho(f"
|
|
1404
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1374
1405
|
return
|
|
1375
1406
|
|
|
1376
1407
|
try:
|
|
@@ -1452,7 +1483,7 @@ def generate_complementarity_report(
|
|
|
1452
1483
|
if module_id_col: logger.error(f"Found module_id_col: {module_id_col}")
|
|
1453
1484
|
if module_name_col: logger.error(f"Found module_name_col: {module_name_col}")
|
|
1454
1485
|
logger.error(f"Available columns: {kpct_df.columns.tolist()}")
|
|
1455
|
-
typer.secho(f"
|
|
1486
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1456
1487
|
return
|
|
1457
1488
|
|
|
1458
1489
|
if not contig_col or not matching_ko_col:
|
|
@@ -1460,7 +1491,7 @@ def generate_complementarity_report(
|
|
|
1460
1491
|
logger.warning(f"Cannot identify contig or matching_ko columns in KPCT output.")
|
|
1461
1492
|
logger.warning(f"Found contig_col: {contig_col}, matching_ko_col: {matching_ko_col}")
|
|
1462
1493
|
logger.warning(f"Available columns: {kpct_df.columns.tolist()}")
|
|
1463
|
-
typer.secho(f"
|
|
1494
|
+
typer.secho(f"WARNING: Missing columns in KPCT output may affect mapping of KOs to combinations.", fg="yellow")
|
|
1464
1495
|
|
|
1465
1496
|
|
|
1466
1497
|
module_metadata = {}
|
|
@@ -1726,7 +1757,7 @@ def generate_complementarity_report(
|
|
|
1726
1757
|
logger.info(f"Found {len(report_df)} complementary modules in {n_members}-member combinations")
|
|
1727
1758
|
logger.info(f"Complementarity report saved to: {output_file}")
|
|
1728
1759
|
|
|
1729
|
-
conditional_output(f"
|
|
1760
|
+
conditional_output(f"OK: Found {len(report_df)} complementary modules in {n_members}-member combinations", "green", verbose)
|
|
1730
1761
|
conditional_output(f"Complementarity report saved to: {output_file}", "white", verbose)
|
|
1731
1762
|
else:
|
|
1732
1763
|
if logger:
|
|
@@ -1746,14 +1777,14 @@ def generate_complementarity_report(
|
|
|
1746
1777
|
report_df = pd.DataFrame(columns=columns)
|
|
1747
1778
|
report_df.to_csv(output_file, sep='\t', index=False)
|
|
1748
1779
|
|
|
1749
|
-
typer.secho(f"
|
|
1780
|
+
typer.secho(f"WARNING: No complementary modules found in {n_members}-member combinations", fg="yellow")
|
|
1750
1781
|
typer.secho(f"Empty report saved to: {output_file}", fg="white")
|
|
1751
1782
|
|
|
1752
1783
|
except Exception as e:
|
|
1753
1784
|
error_msg = f"Error generating complementarity report: {str(e)}"
|
|
1754
1785
|
if logger:
|
|
1755
1786
|
logger.error(error_msg, exc_info=True)
|
|
1756
|
-
typer.secho(f"
|
|
1787
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1757
1788
|
|
|
1758
1789
|
|
|
1759
1790
|
def ko_matrix_to_kpct_format(kos_matrix: str, savedir: str, calculate_complementarity: int = 0, logger: Optional[logging.Logger] = None) -> str:
|
|
@@ -1840,7 +1871,7 @@ def ko_matrix_to_kpct_format(kos_matrix: str, savedir: str, calculate_complement
|
|
|
1840
1871
|
msg = "Invalid KO matrix format: missing 'taxon_oid' column"
|
|
1841
1872
|
if logger:
|
|
1842
1873
|
logger.error(msg)
|
|
1843
|
-
typer.secho(f"
|
|
1874
|
+
typer.secho(f"ERROR: {msg}", fg="red")
|
|
1844
1875
|
exit(1)
|
|
1845
1876
|
|
|
1846
1877
|
|
|
@@ -1885,14 +1916,14 @@ def ko_matrix_to_kpct_format(kos_matrix: str, savedir: str, calculate_complement
|
|
|
1885
1916
|
|
|
1886
1917
|
if logger:
|
|
1887
1918
|
logger.info(f"KO matrix converted to KPCT format: {output_path}")
|
|
1888
|
-
typer.secho(f"
|
|
1919
|
+
typer.secho(f"OK: KO matrix converted to KPCT format: {output_path}", fg="white")
|
|
1889
1920
|
return output_path
|
|
1890
1921
|
|
|
1891
1922
|
except Exception as e:
|
|
1892
1923
|
error_msg = f"Error converting KO matrix to KPCT format: {str(e)}"
|
|
1893
1924
|
if logger:
|
|
1894
1925
|
logger.error(error_msg)
|
|
1895
|
-
typer.secho(f"
|
|
1926
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1896
1927
|
raise
|
|
1897
1928
|
|
|
1898
1929
|
|
|
@@ -1982,7 +2013,7 @@ def create_module_completeness_matrix(savedir: str, kpct_outprefix: str, logger:
|
|
|
1982
2013
|
error_msg = f"KPCT output file not found: tried {kpct_outprefix}_contigs.with_weights.tsv and alternatives"
|
|
1983
2014
|
if logger:
|
|
1984
2015
|
logger.error(error_msg)
|
|
1985
|
-
typer.secho(f"
|
|
2016
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1986
2017
|
return
|
|
1987
2018
|
|
|
1988
2019
|
try:
|
|
@@ -2082,7 +2113,7 @@ def create_module_completeness_matrix(savedir: str, kpct_outprefix: str, logger:
|
|
|
2082
2113
|
error_msg = "Could not identify module columns in the KPCT output"
|
|
2083
2114
|
if logger:
|
|
2084
2115
|
logger.error(error_msg)
|
|
2085
|
-
typer.secho(f"
|
|
2116
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2086
2117
|
return
|
|
2087
2118
|
|
|
2088
2119
|
# Build the result data
|
|
@@ -2125,13 +2156,13 @@ def create_module_completeness_matrix(savedir: str, kpct_outprefix: str, logger:
|
|
|
2125
2156
|
logger.info(f"Matrix contains {single_genomes} single genomes out of {total_genomes} total entries")
|
|
2126
2157
|
if all_genomes:
|
|
2127
2158
|
logger.info(f"Expected {len(all_genomes)} single genomes from KPCT input")
|
|
2128
|
-
typer.secho(f"
|
|
2159
|
+
typer.secho(f"OK: Module completeness matrix saved to: {output_file}", fg="white")
|
|
2129
2160
|
|
|
2130
2161
|
except Exception as e:
|
|
2131
2162
|
error_msg = f"Error creating module completeness matrix: {str(e)}"
|
|
2132
2163
|
if logger:
|
|
2133
2164
|
logger.error(error_msg)
|
|
2134
|
-
typer.secho(f"
|
|
2165
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2135
2166
|
|
|
2136
2167
|
if logger:
|
|
2137
2168
|
logger.error(f"Error details: {e}", exc_info=True)
|
|
@@ -2180,7 +2211,7 @@ def create_ko_matrix_from_emapper_annotation(emapper_file_path: str, output_file
|
|
|
2180
2211
|
|
|
2181
2212
|
|
|
2182
2213
|
if os.path.exists(output_file_path):
|
|
2183
|
-
typer.secho(f"
|
|
2214
|
+
typer.secho(f"OK: KO matrix already exists at: {output_file_path}", fg="white")
|
|
2184
2215
|
if logger:
|
|
2185
2216
|
logger.info(f"KO matrix already exists at: {output_file_path}")
|
|
2186
2217
|
return
|
|
@@ -2189,7 +2220,7 @@ def create_ko_matrix_from_emapper_annotation(emapper_file_path: str, output_file
|
|
|
2189
2220
|
error_msg = f"eMapper annotation file not found at {emapper_file_path}. Cannot proceed."
|
|
2190
2221
|
if logger:
|
|
2191
2222
|
logger.error(error_msg)
|
|
2192
|
-
typer.secho(f"
|
|
2223
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2193
2224
|
exit(1)
|
|
2194
2225
|
|
|
2195
2226
|
try:
|
|
@@ -2255,7 +2286,7 @@ def create_ko_matrix_from_emapper_annotation(emapper_file_path: str, output_file
|
|
|
2255
2286
|
error_msg = "No KO data found in the eMapper annotations file"
|
|
2256
2287
|
if logger:
|
|
2257
2288
|
logger.error(error_msg)
|
|
2258
|
-
typer.secho(f"
|
|
2289
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2259
2290
|
return
|
|
2260
2291
|
|
|
2261
2292
|
|
|
@@ -2286,13 +2317,13 @@ def create_ko_matrix_from_emapper_annotation(emapper_file_path: str, output_file
|
|
|
2286
2317
|
logger.info(f"Created KO matrix with {len(kos_count_df)} genomes and {len(kos_count_df.columns)-1} KOs")
|
|
2287
2318
|
logger.info(f"KO matrix saved to: {output_file_path}")
|
|
2288
2319
|
|
|
2289
|
-
typer.secho(f"
|
|
2320
|
+
typer.secho(f"OK: KO matrix created and saved to: {output_file_path}", fg="white")
|
|
2290
2321
|
|
|
2291
2322
|
except Exception as e:
|
|
2292
2323
|
error_msg = f"Error creating KO matrix: {str(e)}"
|
|
2293
2324
|
if logger:
|
|
2294
2325
|
logger.error(error_msg, exc_info=True)
|
|
2295
|
-
typer.secho(f"
|
|
2326
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2296
2327
|
exit(1)
|
|
2297
2328
|
|
|
2298
2329
|
|
|
@@ -2322,14 +2353,14 @@ def check_kpct_installed(logger: Optional[logging.Logger] = None) -> bool:
|
|
|
2322
2353
|
error_msg = "KPCT 'give_completeness' tool not found in PATH. Please install it via pip: pip install kegg-pathways-completeness"
|
|
2323
2354
|
if logger:
|
|
2324
2355
|
logger.error(error_msg)
|
|
2325
|
-
typer.secho(f"
|
|
2356
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2326
2357
|
return False
|
|
2327
2358
|
return True
|
|
2328
2359
|
except Exception as e:
|
|
2329
2360
|
error_msg = f"Error checking for KPCT installation: {str(e)}"
|
|
2330
2361
|
if logger:
|
|
2331
2362
|
logger.error(error_msg)
|
|
2332
|
-
typer.secho(f"
|
|
2363
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2333
2364
|
return False
|
|
2334
2365
|
|
|
2335
2366
|
|
|
@@ -2416,7 +2447,7 @@ def chunk_kpct_input_file(kpct_input_file: str, savedir: str, n_chunks: int, log
|
|
|
2416
2447
|
error_msg = f"KPCT input file is empty: {kpct_input_file}"
|
|
2417
2448
|
if logger:
|
|
2418
2449
|
logger.error(error_msg)
|
|
2419
|
-
typer.secho(f"
|
|
2450
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2420
2451
|
return []
|
|
2421
2452
|
|
|
2422
2453
|
# Calculate lines per chunk using ceiling division to ensure we create exactly n_chunks
|
|
@@ -2470,7 +2501,7 @@ def chunk_kpct_input_file(kpct_input_file: str, savedir: str, n_chunks: int, log
|
|
|
2470
2501
|
error_msg = f"Error chunking KPCT input file: {str(e)}"
|
|
2471
2502
|
if logger:
|
|
2472
2503
|
logger.error(error_msg)
|
|
2473
|
-
typer.secho(f"
|
|
2504
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2474
2505
|
return []
|
|
2475
2506
|
|
|
2476
2507
|
|
|
@@ -2611,7 +2642,7 @@ def concatenate_kpct_outputs(chunk_dirs: List[str], savedir: str, kpct_outprefix
|
|
|
2611
2642
|
error_msg = f"Error concatenating KPCT outputs: {str(e)}"
|
|
2612
2643
|
if logger:
|
|
2613
2644
|
logger.error(error_msg)
|
|
2614
|
-
typer.secho(f"
|
|
2645
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2615
2646
|
return False
|
|
2616
2647
|
|
|
2617
2648
|
|
|
@@ -2666,7 +2697,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2666
2697
|
if all(os.path.exists(f) for f in final_outputs):
|
|
2667
2698
|
if logger:
|
|
2668
2699
|
logger.info("KPCT output files already exist, skipping parallel processing")
|
|
2669
|
-
typer.secho("
|
|
2700
|
+
typer.secho("OK: KPCT output files already exist", fg="white")
|
|
2670
2701
|
return True
|
|
2671
2702
|
|
|
2672
2703
|
|
|
@@ -2694,7 +2725,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2694
2725
|
if all_chunks_exist:
|
|
2695
2726
|
if logger:
|
|
2696
2727
|
logger.info("All chunk outputs already exist, proceeding to concatenation")
|
|
2697
|
-
typer.secho("
|
|
2728
|
+
typer.secho("OK: All chunks already processed, concatenating results", fg="white")
|
|
2698
2729
|
|
|
2699
2730
|
|
|
2700
2731
|
concatenation_success = concatenate_kpct_outputs(existing_chunk_dirs, savedir, kpct_outprefix, logger)
|
|
@@ -2751,7 +2782,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2751
2782
|
if not chunks_to_process:
|
|
2752
2783
|
if logger:
|
|
2753
2784
|
logger.info("All chunks already processed, proceeding to concatenation")
|
|
2754
|
-
typer.secho("
|
|
2785
|
+
typer.secho("OK: All chunks already processed, concatenating results", fg="white")
|
|
2755
2786
|
else:
|
|
2756
2787
|
if logger:
|
|
2757
2788
|
logger.info(f"Processing {len(chunks_to_process)} remaining chunks")
|
|
@@ -2799,7 +2830,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2799
2830
|
error_msg = f"Failed to process {len(failed_chunks)} chunks: {failed_chunks}"
|
|
2800
2831
|
if logger:
|
|
2801
2832
|
logger.error(error_msg)
|
|
2802
|
-
typer.secho(f"
|
|
2833
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2803
2834
|
return False
|
|
2804
2835
|
|
|
2805
2836
|
|
|
@@ -2812,7 +2843,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2812
2843
|
error_msg = f"Not all chunks were processed successfully. Expected {len(all_chunk_dirs)}, got {len(final_chunk_dirs)}"
|
|
2813
2844
|
if logger:
|
|
2814
2845
|
logger.error(error_msg)
|
|
2815
|
-
typer.secho(f"
|
|
2846
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2816
2847
|
return False
|
|
2817
2848
|
|
|
2818
2849
|
|
|
@@ -2830,12 +2861,12 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2830
2861
|
error_msg = f"Failed to create final output files: {missing_outputs}"
|
|
2831
2862
|
if logger:
|
|
2832
2863
|
logger.error(error_msg)
|
|
2833
|
-
typer.secho(f"
|
|
2864
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2834
2865
|
return False
|
|
2835
2866
|
|
|
2836
2867
|
if logger:
|
|
2837
2868
|
logger.info("Successfully completed parallel KPCT processing")
|
|
2838
|
-
typer.secho("
|
|
2869
|
+
typer.secho("OK: KPCT parallel processing completed successfully", fg="green")
|
|
2839
2870
|
|
|
2840
2871
|
return True
|
|
2841
2872
|
|
|
@@ -2843,7 +2874,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2843
2874
|
error_msg = f"Error in parallel KPCT processing: {str(e)}"
|
|
2844
2875
|
if logger:
|
|
2845
2876
|
logger.error(error_msg)
|
|
2846
|
-
typer.secho(f"
|
|
2877
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2847
2878
|
return False
|
|
2848
2879
|
|
|
2849
2880
|
|
|
@@ -2894,7 +2925,7 @@ def run_kpct(kpct_input_file: str, savedir: str, kpct_outprefix: str, resource_l
|
|
|
2894
2925
|
error_msg += f": {stderr}"
|
|
2895
2926
|
if logger:
|
|
2896
2927
|
logger.error(error_msg)
|
|
2897
|
-
typer.secho(f"
|
|
2928
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2898
2929
|
return False
|
|
2899
2930
|
|
|
2900
2931
|
|
|
@@ -2914,14 +2945,14 @@ def run_kpct(kpct_input_file: str, savedir: str, kpct_outprefix: str, resource_l
|
|
|
2914
2945
|
error_msg = f"KPCT did not generate any output files with prefix '{kpct_outprefix}'"
|
|
2915
2946
|
if logger:
|
|
2916
2947
|
logger.error(error_msg)
|
|
2917
|
-
typer.secho(f"
|
|
2948
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2918
2949
|
return False
|
|
2919
2950
|
|
|
2920
2951
|
|
|
2921
2952
|
created_files = [f for f in possible_kpct_files if os.path.exists(f)]
|
|
2922
2953
|
if logger:
|
|
2923
2954
|
logger.info(f"KPCT successfully created output files: {created_files}")
|
|
2924
|
-
typer.secho(f"
|
|
2955
|
+
typer.secho(f"OK: KPCT completed successfully. Created files: {[os.path.basename(f) for f in created_files]}", fg="green")
|
|
2925
2956
|
|
|
2926
2957
|
return True
|
|
2927
2958
|
|
|
@@ -2929,7 +2960,7 @@ def run_kpct(kpct_input_file: str, savedir: str, kpct_outprefix: str, resource_l
|
|
|
2929
2960
|
error_msg = f"Error running KPCT: {str(e)}"
|
|
2930
2961
|
if logger:
|
|
2931
2962
|
logger.error(error_msg)
|
|
2932
|
-
typer.secho(f"
|
|
2963
|
+
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2933
2964
|
return False
|
|
2934
2965
|
|
|
2935
2966
|
|
|
@@ -2967,7 +2998,7 @@ def run_kpct_with_fallback(kpct_input_file: str, savedir: str, kpct_outprefix: s
|
|
|
2967
2998
|
else:
|
|
2968
2999
|
if logger:
|
|
2969
3000
|
logger.warning("Parallel KPCT processing failed, falling back to sequential processing")
|
|
2970
|
-
typer.secho("
|
|
3001
|
+
typer.secho("WARNING: Parallel processing failed, trying sequential approach", fg="yellow")
|
|
2971
3002
|
|
|
2972
3003
|
|
|
2973
3004
|
if logger:
|
|
@@ -3086,9 +3117,9 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3086
3117
|
|
|
3087
3118
|
# Check if all outputs already exist
|
|
3088
3119
|
if check_final_reports_exist(savedir, calculate_complementarity, logger):
|
|
3089
|
-
conditional_output("
|
|
3120
|
+
conditional_output("OK: All output files already exist. Skipping processing.", "green", verbose)
|
|
3090
3121
|
if not del_tmp:
|
|
3091
|
-
conditional_output("
|
|
3122
|
+
conditional_output("INFO: Keeping temporary files as requested.", "blue", verbose)
|
|
3092
3123
|
logger.info("Pipeline skipped as all output files already exist")
|
|
3093
3124
|
return
|
|
3094
3125
|
|
|
@@ -3111,15 +3142,15 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3111
3142
|
# Process annotations and create KO matrix
|
|
3112
3143
|
if os.path.exists(ko_matrix_path):
|
|
3113
3144
|
logger.info(f"KO matrix already exists: {ko_matrix_path}")
|
|
3114
|
-
conditional_output(f"
|
|
3145
|
+
conditional_output(f"OK: Using existing KO matrix: {ko_matrix_path}", "white", verbose)
|
|
3115
3146
|
else:
|
|
3116
3147
|
# Check for existing emapper annotations
|
|
3117
3148
|
if os.path.exists(emapper_annotation_file):
|
|
3118
3149
|
logger.info(f"Emapper annotations already exist: {emapper_annotation_file}")
|
|
3119
|
-
conditional_output(f"
|
|
3150
|
+
conditional_output(f"OK: Using existing emapper annotations: {emapper_annotation_file}", "white", verbose)
|
|
3120
3151
|
elif os.path.exists(tmp_emapper_file):
|
|
3121
3152
|
logger.info(f"Emapper annotations found in temp directory: {tmp_emapper_file}")
|
|
3122
|
-
conditional_output(f"
|
|
3153
|
+
conditional_output(f"OK: Using existing emapper annotations from temp directory", "white", verbose)
|
|
3123
3154
|
# Copy to final location
|
|
3124
3155
|
try:
|
|
3125
3156
|
shutil.copy(tmp_emapper_file, emapper_annotation_file)
|
|
@@ -3149,7 +3180,7 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3149
3180
|
merge_success = merge_genomes(savedir, logger, verbose)
|
|
3150
3181
|
if not merge_success:
|
|
3151
3182
|
logger.error("Failed to merge genomes. Exiting pipeline.")
|
|
3152
|
-
typer.secho("
|
|
3183
|
+
typer.secho("ERROR: Failed to merge genomes. Exiting pipeline.", fg="red")
|
|
3153
3184
|
return
|
|
3154
3185
|
|
|
3155
3186
|
# Run eggNOG-mapper
|
|
@@ -3157,7 +3188,7 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3157
3188
|
emapper_success = run_emapper(savedir, ncpus, resource_log_file, lowmem, logger, verbose)
|
|
3158
3189
|
if not emapper_success:
|
|
3159
3190
|
logger.error("Failed to run emapper. Exiting pipeline.")
|
|
3160
|
-
typer.secho("
|
|
3191
|
+
typer.secho("ERROR: Failed to run emapper. Exiting pipeline.", fg="red")
|
|
3161
3192
|
return
|
|
3162
3193
|
|
|
3163
3194
|
# Create KO matrix from annotations
|
|
@@ -3170,7 +3201,7 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3170
3201
|
|
|
3171
3202
|
if os.path.exists(module_completeness_file):
|
|
3172
3203
|
logger.info(f"Module completeness matrix already exists: {module_completeness_file}")
|
|
3173
|
-
typer.secho(f"
|
|
3204
|
+
typer.secho(f"OK: Using existing module completeness matrix: {module_completeness_file}", fg="white")
|
|
3174
3205
|
else:
|
|
3175
3206
|
# Set up KPCT processing
|
|
3176
3207
|
kpct_outprefix = "output_give_completeness"
|
|
@@ -3191,7 +3222,7 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3191
3222
|
ko_matrix_to_kpct_format(ko_matrix_path, savedir, calculate_complementarity, logger)
|
|
3192
3223
|
else:
|
|
3193
3224
|
logger.info(f"KPCT input file already exists: {kpct_input_file}")
|
|
3194
|
-
typer.secho(f"
|
|
3225
|
+
typer.secho(f"OK: Using existing KPCT input file: {kpct_input_file}", fg="white")
|
|
3195
3226
|
|
|
3196
3227
|
# Run KPCT if needed
|
|
3197
3228
|
if not kpct_file_exists:
|
|
@@ -3206,7 +3237,7 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3206
3237
|
return
|
|
3207
3238
|
else:
|
|
3208
3239
|
logger.info(f"KPCT output file(s) already exist with prefix '{kpct_outprefix}'")
|
|
3209
|
-
typer.secho(f"
|
|
3240
|
+
typer.secho(f"OK: Using existing KPCT output files with prefix '{kpct_outprefix}'", fg="white")
|
|
3210
3241
|
|
|
3211
3242
|
# Create module completeness matrix
|
|
3212
3243
|
logger.info(f"Creating module completeness matrix")
|
|
@@ -3221,7 +3252,7 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3221
3252
|
complementarity_report_file = f"{savedir}/module_completeness_complementarity_{n_members}member.tsv"
|
|
3222
3253
|
if os.path.exists(complementarity_report_file):
|
|
3223
3254
|
logger.info(f"Complementarity report for {n_members}-member combinations already exists: {complementarity_report_file}")
|
|
3224
|
-
typer.secho(f"
|
|
3255
|
+
typer.secho(f"OK: Using existing {n_members}-member complementarity report: {complementarity_report_file}", fg="white")
|
|
3225
3256
|
else:
|
|
3226
3257
|
logger.info(f"Generating complementarity report for {n_members}-member combinations")
|
|
3227
3258
|
generate_complementarity_report(savedir, n_members, logger, verbose)
|
|
@@ -3354,16 +3385,13 @@ def download_eggnog_data(
|
|
|
3354
3385
|
logger.info("Starting eggNOG data download.")
|
|
3355
3386
|
logger.info("CLI command: %s", " ".join(shlex.quote(arg) for arg in sys.argv))
|
|
3356
3387
|
|
|
3357
|
-
if eggnog_data_dir:
|
|
3358
|
-
os.environ["EGGNOG_DATA_DIR"] = eggnog_data_dir
|
|
3359
|
-
|
|
3360
3388
|
env_value = os.environ.get("EGGNOG_DATA_DIR", "").strip()
|
|
3361
3389
|
if not env_value:
|
|
3362
3390
|
default_dir = default_eggnog_data_dir()
|
|
3363
3391
|
os.environ["EGGNOG_DATA_DIR"] = str(default_dir)
|
|
3364
3392
|
env_value = str(default_dir)
|
|
3365
3393
|
typer.secho(
|
|
3366
|
-
f"
|
|
3394
|
+
f"INFO: EGGNOG_DATA_DIR not set; using default {env_value}",
|
|
3367
3395
|
fg="yellow",
|
|
3368
3396
|
)
|
|
3369
3397
|
logger.info("EGGNOG_DATA_DIR not set; using default %s", env_value)
|
|
@@ -3381,12 +3409,118 @@ def download_eggnog_data(
|
|
|
3381
3409
|
emit_error(message, logger)
|
|
3382
3410
|
raise typer.Exit(1)
|
|
3383
3411
|
|
|
3384
|
-
|
|
3385
|
-
|
|
3386
|
-
|
|
3387
|
-
|
|
3388
|
-
|
|
3412
|
+
# Run the downloader with progress updates based on data directory growth.
|
|
3413
|
+
cmd = [downloader]
|
|
3414
|
+
logger.info("Downloading eggNOG data: %s", downloader)
|
|
3415
|
+
if verbose:
|
|
3416
|
+
typer.secho("Running download_eggnog_data.py", fg="yellow")
|
|
3417
|
+
typer.secho(f" Command: {' '.join(cmd)}", fg="blue")
|
|
3418
|
+
|
|
3419
|
+
start_time = time.time()
|
|
3420
|
+
last_progress_time = start_time
|
|
3421
|
+
last_size = get_dir_size(data_dir)
|
|
3422
|
+
last_files = count_files(data_dir)
|
|
3423
|
+
progress_interval = 60
|
|
3424
|
+
|
|
3425
|
+
process = subprocess.Popen(
|
|
3426
|
+
cmd,
|
|
3427
|
+
stdout=subprocess.PIPE,
|
|
3428
|
+
stderr=subprocess.PIPE,
|
|
3429
|
+
text=True,
|
|
3430
|
+
bufsize=1,
|
|
3431
|
+
universal_newlines=True,
|
|
3432
|
+
)
|
|
3433
|
+
|
|
3434
|
+
stdout_queue: "queue.Queue[Tuple[str, str]]" = queue.Queue()
|
|
3435
|
+
stderr_queue: "queue.Queue[Tuple[str, str]]" = queue.Queue()
|
|
3436
|
+
|
|
3437
|
+
def stream_reader(stream, q, stream_type):
|
|
3438
|
+
try:
|
|
3439
|
+
for line in iter(stream.readline, ""):
|
|
3440
|
+
q.put((stream_type, line.rstrip("\n\r")))
|
|
3441
|
+
finally:
|
|
3442
|
+
try:
|
|
3443
|
+
stream.close()
|
|
3444
|
+
except Exception:
|
|
3445
|
+
pass
|
|
3446
|
+
|
|
3447
|
+
stdout_thread = threading.Thread(
|
|
3448
|
+
target=stream_reader,
|
|
3449
|
+
args=(process.stdout, stdout_queue, "stdout"),
|
|
3450
|
+
daemon=True,
|
|
3389
3451
|
)
|
|
3452
|
+
stderr_thread = threading.Thread(
|
|
3453
|
+
target=stream_reader,
|
|
3454
|
+
args=(process.stderr, stderr_queue, "stderr"),
|
|
3455
|
+
daemon=True,
|
|
3456
|
+
)
|
|
3457
|
+
stdout_thread.start()
|
|
3458
|
+
stderr_thread.start()
|
|
3459
|
+
|
|
3460
|
+
while process.poll() is None or not stdout_queue.empty() or not stderr_queue.empty():
|
|
3461
|
+
now = time.time()
|
|
3462
|
+
|
|
3463
|
+
# Drain stdout
|
|
3464
|
+
try:
|
|
3465
|
+
while True:
|
|
3466
|
+
stream_type, line = stdout_queue.get_nowait()
|
|
3467
|
+
if line:
|
|
3468
|
+
if verbose:
|
|
3469
|
+
print(line, flush=True)
|
|
3470
|
+
logger.info(line)
|
|
3471
|
+
except queue.Empty:
|
|
3472
|
+
pass
|
|
3473
|
+
|
|
3474
|
+
# Drain stderr
|
|
3475
|
+
try:
|
|
3476
|
+
while True:
|
|
3477
|
+
stream_type, line = stderr_queue.get_nowait()
|
|
3478
|
+
if line:
|
|
3479
|
+
if verbose:
|
|
3480
|
+
print(line, file=sys.stderr, flush=True)
|
|
3481
|
+
logger.warning(line)
|
|
3482
|
+
except queue.Empty:
|
|
3483
|
+
pass
|
|
3484
|
+
|
|
3485
|
+
if now - last_progress_time >= progress_interval:
|
|
3486
|
+
try:
|
|
3487
|
+
current_size = get_dir_size(data_dir)
|
|
3488
|
+
current_files = count_files(data_dir)
|
|
3489
|
+
delta = current_size - last_size
|
|
3490
|
+
elapsed = now - last_progress_time
|
|
3491
|
+
speed = delta / elapsed if elapsed > 0 else 0.0
|
|
3492
|
+
file_delta = current_files - last_files
|
|
3493
|
+
msg = (
|
|
3494
|
+
f"Download progress: {format_bytes(current_size)} total "
|
|
3495
|
+
f"(+{format_bytes(delta)} in {int(elapsed)}s, "
|
|
3496
|
+
f"{format_bytes(speed)}/s, +{file_delta} files)"
|
|
3497
|
+
)
|
|
3498
|
+
logger.info(msg)
|
|
3499
|
+
if verbose:
|
|
3500
|
+
typer.secho(msg, fg="cyan")
|
|
3501
|
+
last_size = current_size
|
|
3502
|
+
last_files = current_files
|
|
3503
|
+
last_progress_time = now
|
|
3504
|
+
except Exception as exc:
|
|
3505
|
+
logger.warning("Progress check failed: %s", exc)
|
|
3506
|
+
last_progress_time = now
|
|
3507
|
+
|
|
3508
|
+
time.sleep(0.2)
|
|
3509
|
+
|
|
3510
|
+
stdout_thread.join(timeout=1.0)
|
|
3511
|
+
stderr_thread.join(timeout=1.0)
|
|
3512
|
+
|
|
3513
|
+
returncode = process.returncode
|
|
3514
|
+
total_size = get_dir_size(data_dir)
|
|
3515
|
+
total_files = count_files(data_dir)
|
|
3516
|
+
total_elapsed = time.time() - start_time
|
|
3517
|
+
summary = (
|
|
3518
|
+
f"Download finished: {format_bytes(total_size)} in {int(total_elapsed)}s "
|
|
3519
|
+
f"across {total_files} files"
|
|
3520
|
+
)
|
|
3521
|
+
logger.info(summary)
|
|
3522
|
+
if verbose:
|
|
3523
|
+
typer.secho(summary, fg="green")
|
|
3390
3524
|
|
|
3391
3525
|
if returncode != 0:
|
|
3392
3526
|
raise typer.Exit(returncode)
|
|
@@ -3472,7 +3606,7 @@ def analyze_ko_matrix(
|
|
|
3472
3606
|
|
|
3473
3607
|
|
|
3474
3608
|
if not os.path.exists(kos_matrix):
|
|
3475
|
-
typer.secho(f"
|
|
3609
|
+
typer.secho(f"ERROR: KO matrix file not found at: {kos_matrix}", fg="red")
|
|
3476
3610
|
exit(1)
|
|
3477
3611
|
|
|
3478
3612
|
|
|
@@ -3486,7 +3620,7 @@ def analyze_ko_matrix(
|
|
|
3486
3620
|
|
|
3487
3621
|
|
|
3488
3622
|
if check_final_reports_exist(savedir, calculate_complementarity, logger):
|
|
3489
|
-
typer.secho("
|
|
3623
|
+
typer.secho("OK: All output files already exist. Skipping processing.", fg="green")
|
|
3490
3624
|
logger.info("Analysis skipped as all output files already exist")
|
|
3491
3625
|
return
|
|
3492
3626
|
|
|
@@ -3524,7 +3658,7 @@ def analyze_ko_matrix(
|
|
|
3524
3658
|
ko_matrix_to_kpct_format(kos_matrix, savedir, calculate_complementarity, logger)
|
|
3525
3659
|
else:
|
|
3526
3660
|
logger.info(f"KPCT input file already exists: {kpct_input_file}")
|
|
3527
|
-
typer.secho(f"
|
|
3661
|
+
typer.secho(f"OK: Using existing KPCT input file: {kpct_input_file}", fg="white")
|
|
3528
3662
|
|
|
3529
3663
|
|
|
3530
3664
|
if not kpct_file_exists:
|
|
@@ -3539,7 +3673,7 @@ def analyze_ko_matrix(
|
|
|
3539
3673
|
exit(1)
|
|
3540
3674
|
else:
|
|
3541
3675
|
logger.info(f"KPCT output file(s) already exist with prefix '{kpct_outprefix}'")
|
|
3542
|
-
typer.secho(f"
|
|
3676
|
+
typer.secho(f"OK: Using existing KPCT output files with prefix '{kpct_outprefix}'", fg="white")
|
|
3543
3677
|
|
|
3544
3678
|
|
|
3545
3679
|
if not os.path.exists(module_completeness_file):
|
|
@@ -3549,7 +3683,7 @@ def analyze_ko_matrix(
|
|
|
3549
3683
|
else:
|
|
3550
3684
|
if logger:
|
|
3551
3685
|
logger.info(f"Module completeness matrix already exists: {module_completeness_file}")
|
|
3552
|
-
typer.secho(f"
|
|
3686
|
+
typer.secho(f"OK: Using existing module completeness matrix: {module_completeness_file}", fg="white")
|
|
3553
3687
|
|
|
3554
3688
|
|
|
3555
3689
|
if calculate_complementarity >= 2:
|
|
@@ -3561,7 +3695,7 @@ def analyze_ko_matrix(
|
|
|
3561
3695
|
complementarity_report_file = f"{savedir}/module_completeness_complementarity_{n_members}member.tsv"
|
|
3562
3696
|
if os.path.exists(complementarity_report_file):
|
|
3563
3697
|
logger.info(f"Complementarity report for {n_members}-member combinations already exists: {complementarity_report_file}")
|
|
3564
|
-
typer.secho(f"
|
|
3698
|
+
typer.secho(f"OK: Using existing {n_members}-member complementarity report: {complementarity_report_file}", fg="white")
|
|
3565
3699
|
else:
|
|
3566
3700
|
logger.info(f"Generating complementarity report for {n_members}-member combinations")
|
|
3567
3701
|
generate_complementarity_report(savedir, n_members, logger, verbose)
|
|
@@ -3581,7 +3715,7 @@ def analyze_ko_matrix(
|
|
|
3581
3715
|
except Exception as e:
|
|
3582
3716
|
if logger:
|
|
3583
3717
|
logger.error(f"Error in KPCT analysis: {str(e)}", exc_info=True)
|
|
3584
|
-
typer.secho(f"
|
|
3718
|
+
typer.secho(f"ERROR: Error in KPCT analysis: {str(e)}", fg="red")
|
|
3585
3719
|
exit(1)
|
|
3586
3720
|
|
|
3587
3721
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
context:
|
|
2
|
-
version: 0.7.
|
|
2
|
+
version: 0.7.4
|
|
3
3
|
|
|
4
4
|
package:
|
|
5
5
|
name: moducomp
|
|
@@ -7,7 +7,7 @@ package:
|
|
|
7
7
|
|
|
8
8
|
source:
|
|
9
9
|
- url: https://pypi.org/packages/source/m/moducomp/moducomp-${{ version }}.tar.gz
|
|
10
|
-
sha256:
|
|
10
|
+
sha256: b6d5648b660aadc9ecdb9375d35a984a02ef23d99d9d8085067a18d72037aabf
|
|
11
11
|
|
|
12
12
|
build:
|
|
13
13
|
script:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|