moducomp 0.7.11__py3-none-any.whl → 0.7.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- moducomp/__init__.py +1 -1
- moducomp/moducomp.py +977 -7
- {moducomp-0.7.11.dist-info → moducomp-0.7.12.dist-info}/METADATA +80 -8
- {moducomp-0.7.11.dist-info → moducomp-0.7.12.dist-info}/RECORD +7 -7
- {moducomp-0.7.11.dist-info → moducomp-0.7.12.dist-info}/WHEEL +0 -0
- {moducomp-0.7.11.dist-info → moducomp-0.7.12.dist-info}/entry_points.txt +0 -0
- {moducomp-0.7.11.dist-info → moducomp-0.7.12.dist-info}/licenses/LICENSE.txt +0 -0
moducomp/__init__.py
CHANGED
moducomp/moducomp.py
CHANGED
|
@@ -23,12 +23,15 @@ License: See LICENSE.txt
|
|
|
23
23
|
Version: See moducomp.__version__ for current version
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
|
+
import csv
|
|
26
27
|
import datetime
|
|
27
28
|
import glob
|
|
28
29
|
import itertools
|
|
30
|
+
import json
|
|
29
31
|
import logging
|
|
30
32
|
import os
|
|
31
33
|
import queue
|
|
34
|
+
import re
|
|
32
35
|
import shlex
|
|
33
36
|
import shutil
|
|
34
37
|
import subprocess
|
|
@@ -223,6 +226,112 @@ def count_files(path: Path) -> int:
|
|
|
223
226
|
return total
|
|
224
227
|
|
|
225
228
|
|
|
229
|
+
def _find_emapper_annotations(savedir: Union[str, Path]) -> Optional[Path]:
    """Locate the eggNOG-mapper annotations file inside an output directory.

    Two locations are probed, in this order: directly under ``savedir`` and
    under ``savedir/tmp/emapper_output``. The first path that exists is
    returned; ``None`` is returned when neither exists.
    """
    root = Path(savedir)
    filename = "emapper_out.emapper.annotations"
    # Order matters: the top-level copy wins over the one in the tmp tree.
    search_order = (
        root / filename,
        root / "tmp" / "emapper_output" / filename,
    )
    return next((path for path in search_order if path.exists()), None)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _read_ko_matrix_file(kos_matrix: Union[str, Path], logger: Optional[logging.Logger] = None) -> Tuple[pd.DataFrame, str]:
    """Load a KO matrix with pandas and report which delimiter was used.

    The delimiter is a comma for paths ending in ``.csv`` (case-insensitive)
    and a tab otherwise. When the tab-delimited read of a path ending in
    ``.tsv`` raises, one retry with a comma delimiter is made.

    Args:
        kos_matrix: Path of the KO matrix file.
        logger: Optional logger that receives progress and error messages.

    Returns:
        A ``(dataframe, delimiter)`` tuple, where ``delimiter`` is the
        separator of the read that succeeded.

    Raises:
        Exception: Whatever ``pandas.read_csv`` raised on the last attempt.
    """
    kos_matrix = str(kos_matrix)
    lowered_path = kos_matrix.lower()
    initial_delimiter = "\t"
    if lowered_path.endswith(".csv"):
        initial_delimiter = ","

    try:
        if logger:
            logger.info(f"Reading KO matrix file with delimiter '{initial_delimiter}': {kos_matrix}")
        return pd.read_csv(kos_matrix, sep=initial_delimiter), initial_delimiter
    except Exception as e_initial:
        # Only a failed tab-delimited read of a *.tsv path is retried.
        retry_with_comma = initial_delimiter == "\t" and lowered_path.endswith(".tsv")
        if not retry_with_comma:
            if logger:
                logger.error(f"Failed to read KO matrix {kos_matrix}: {e_initial}")
            raise e_initial
        try:
            if logger:
                logger.info(f"Tab-delimited read failed. Attempting comma delimiter for {kos_matrix}.")
            return pd.read_csv(kos_matrix, sep=","), ","
        except Exception as e_fallback:
            if logger:
                logger.error(f"Fallback comma delimiter also failed: {e_fallback}")
            raise e_fallback
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _read_kpct_input_file(kpct_input_file: Union[str, Path]) -> Dict[str, Set[str]]:
    """Parse a KPCT input file into a genome-to-KO-set mapping.

    Each line is stripped and split on tabs; the first field is the genome
    identifier and the remaining non-empty fields are its KOs. Lines with
    fewer than two fields (including blank lines) are ignored. If an
    identifier occurs on several lines, the last occurrence wins.
    """
    mapping: Dict[str, Set[str]] = {}
    with open(kpct_input_file, "r") as stream:
        for raw_line in stream:
            fields = raw_line.strip().split("\t")
            # A blank line splits to a single empty field, so it is skipped here too.
            if len(fields) < 2:
                continue
            identifier, *ko_fields = fields
            mapping[identifier] = set(filter(None, ko_fields))
    return mapping
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _compare_kpct_outputs(contigs_file: Path, pathways_file: Path) -> Tuple[bool, str]:
    """
    Compare KPCT contigs and pathways outputs. Returns (match, detail).

    Both files are read as tab-separated text. Every row of the contigs file,
    header included, must equal the corresponding pathways row once the
    contigs row's first column is dropped. Rows that are empty in both files
    are skipped, and blank lines after the end of the shorter file are
    ignored.

    Args:
        contigs_file: Path to the KPCT contigs output.
        pathways_file: Path to the KPCT pathways output.

    Returns:
        ``(True, message)`` when the files agree, otherwise ``(False, reason)``
        naming a header mismatch, the first mismatching data row (counted
        from 1, header excluded), or a row count mismatch.
    """
    with contigs_file.open("r") as contigs, pathways_file.open("r") as pathways:
        contig_header = contigs.readline().rstrip("\n").split("\t")
        pathway_header = pathways.readline().rstrip("\n").split("\t")
        if contig_header[1:] != pathway_header:
            return False, "Header mismatch between contigs and pathways outputs."

        # zip_longest rather than zip: zip() pulls a line from `contigs` before
        # it notices `pathways` is exhausted and then drops it, so a contigs
        # file with exactly one extra row would be reported as matching.
        paired_rows = itertools.zip_longest(contigs, pathways)
        for line_no, (contig_line, pathway_line) in enumerate(paired_rows, start=1):
            if contig_line is None or pathway_line is None:
                # One file has ended; only non-blank leftovers count as extra rows.
                leftover = pathway_line if contig_line is None else contig_line
                if leftover.strip():
                    return False, "Row count mismatch between contigs and pathways outputs."
                continue

            contig_line = contig_line.rstrip("\n")
            pathway_line = pathway_line.rstrip("\n")
            if not contig_line and not pathway_line:
                continue
            if contig_line.split("\t")[1:] != pathway_line.split("\t"):
                return False, f"Row mismatch at line {line_no}."

    return True, "Contigs and pathways outputs match."
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _record_validation_check(report: Dict[str, Any], name: str, status: str, detail: str) -> None:
    """Append one check result to a validation report, in place.

    The check is always added to ``report["checks"]``. A ``"fail"`` status
    also adds ``"<name>: <detail>"`` to ``report["errors"]`` and a ``"warn"``
    status adds it to ``report["warnings"]``; any other status is recorded
    in ``checks`` only.
    """
    report["checks"].append({"name": name, "status": status, "detail": detail})

    # Statuses that additionally feed one of the summary lists.
    summary_key = {"fail": "errors", "warn": "warnings"}.get(status)
    if summary_key is not None:
        report[summary_key].append(f"{name}: {detail}")
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _count_emapper_header_issues(emapper_file: Path) -> Tuple[int, int]:
    """Count emapper annotation rows whose query name lacks a ``|`` separator.

    Lines starting with ``#`` and blank (whitespace-only) lines are ignored.
    For every other line the query is the text before the first tab.

    Args:
        emapper_file: Path to the emapper annotations file.

    Returns:
        ``(total, bad)``: the number of query rows examined and how many of
        them have no ``|`` in the query field.
    """
    total = 0
    bad = 0
    with emapper_file.open("r") as handle:
        for line in handle:
            # A blank line is not a query; counting it would raise a spurious
            # "lacks genome|protein format" warning for a trailing newline.
            if line.startswith("#") or not line.strip():
                continue
            total += 1
            query = line.split("\t", 1)[0]
            if "|" not in query:
                bad += 1
    return total, bad
|
|
333
|
+
|
|
334
|
+
|
|
226
335
|
def default_eggnog_data_dir() -> Path:
|
|
227
336
|
"""Return a safe default location for eggNOG data downloads."""
|
|
228
337
|
xdg_home = os.environ.get("XDG_DATA_HOME")
|
|
@@ -3165,6 +3274,21 @@ def pipeline(
|
|
|
3165
3274
|
"--verbose/--quiet",
|
|
3166
3275
|
help="Enable verbose output with detailed progress information.",
|
|
3167
3276
|
),
|
|
3277
|
+
run_validation: bool = typer.Option(
|
|
3278
|
+
True,
|
|
3279
|
+
"--validate/--no-validate",
|
|
3280
|
+
help="Run post-run validation checks (default: enabled).",
|
|
3281
|
+
),
|
|
3282
|
+
validation_report: bool = typer.Option(
|
|
3283
|
+
True,
|
|
3284
|
+
"--validate-report/--no-validate-report",
|
|
3285
|
+
help="Write validation_report.json in the output directory.",
|
|
3286
|
+
),
|
|
3287
|
+
validate_strict: bool = typer.Option(
|
|
3288
|
+
False,
|
|
3289
|
+
"--validate-strict/--validate-lenient",
|
|
3290
|
+
help="Treat validation warnings as failures.",
|
|
3291
|
+
),
|
|
3168
3292
|
log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level (DEBUG, INFO, WARNING, ERROR)."),
|
|
3169
3293
|
eggnog_data_dir: Optional[str] = typer.Option(
|
|
3170
3294
|
None,
|
|
@@ -3233,15 +3357,42 @@ def pipeline(
|
|
|
3233
3357
|
logger.info(f"Resource monitoring enabled. Log file: {resource_log_file}")
|
|
3234
3358
|
|
|
3235
3359
|
# Run the main pipeline logic
|
|
3236
|
-
_run_pipeline_core(
|
|
3237
|
-
|
|
3238
|
-
|
|
3360
|
+
_run_pipeline_core(
|
|
3361
|
+
genomedir,
|
|
3362
|
+
savedir,
|
|
3363
|
+
ncpus,
|
|
3364
|
+
adapt_headers,
|
|
3365
|
+
del_tmp,
|
|
3366
|
+
calculate_complementarity,
|
|
3367
|
+
lowmem,
|
|
3368
|
+
verbose,
|
|
3369
|
+
logger,
|
|
3370
|
+
resource_log_file,
|
|
3371
|
+
eggnog_data_dir,
|
|
3372
|
+
run_validation,
|
|
3373
|
+
validation_report,
|
|
3374
|
+
validate_strict,
|
|
3375
|
+
log_level,
|
|
3376
|
+
)
|
|
3239
3377
|
|
|
3240
3378
|
|
|
3241
|
-
def _run_pipeline_core(
|
|
3242
|
-
|
|
3243
|
-
|
|
3244
|
-
|
|
3379
|
+
def _run_pipeline_core(
|
|
3380
|
+
genomedir: str,
|
|
3381
|
+
savedir: str,
|
|
3382
|
+
ncpus: int,
|
|
3383
|
+
adapt_headers: bool,
|
|
3384
|
+
del_tmp: bool,
|
|
3385
|
+
calculate_complementarity: int,
|
|
3386
|
+
lowmem: bool,
|
|
3387
|
+
verbose: bool,
|
|
3388
|
+
logger: logging.Logger,
|
|
3389
|
+
resource_log_file: str,
|
|
3390
|
+
eggnog_data_dir: Optional[str],
|
|
3391
|
+
run_validation: bool,
|
|
3392
|
+
validation_report: bool,
|
|
3393
|
+
validate_strict: bool,
|
|
3394
|
+
log_level: str,
|
|
3395
|
+
) -> None:
|
|
3245
3396
|
"""
|
|
3246
3397
|
Core pipeline logic separated for resource monitoring.
|
|
3247
3398
|
"""
|
|
@@ -3435,6 +3586,30 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3435
3586
|
# Generate final resource usage summary
|
|
3436
3587
|
log_final_resource_summary(resource_log_file, start_time, logger, verbose)
|
|
3437
3588
|
|
|
3589
|
+
if run_validation:
|
|
3590
|
+
logger.info("Running post-run validation checks.")
|
|
3591
|
+
report_path = None
|
|
3592
|
+
if validation_report:
|
|
3593
|
+
report_path = os.path.join(savedir, "validation_report.json")
|
|
3594
|
+
try:
|
|
3595
|
+
validate(
|
|
3596
|
+
savedir=savedir,
|
|
3597
|
+
mode="ko-matrix",
|
|
3598
|
+
calculate_complementarity=calculate_complementarity,
|
|
3599
|
+
kpct_outprefix=kpct_outprefix,
|
|
3600
|
+
strict=validate_strict,
|
|
3601
|
+
report=report_path,
|
|
3602
|
+
verbose=verbose,
|
|
3603
|
+
log_level=log_level,
|
|
3604
|
+
)
|
|
3605
|
+
except typer.Exit as exc:
|
|
3606
|
+
if logger:
|
|
3607
|
+
logger.error("Validation failed with exit code %s.", exc.exit_code)
|
|
3608
|
+
logger.error("Outputs written to: %s", savedir)
|
|
3609
|
+
if report_path:
|
|
3610
|
+
logger.error("Validation report: %s", report_path)
|
|
3611
|
+
raise
|
|
3612
|
+
|
|
3438
3613
|
# Display pipeline completion summary
|
|
3439
3614
|
display_pipeline_completion_summary(start_time, savedir, logger, verbose)
|
|
3440
3615
|
|
|
@@ -3481,6 +3656,21 @@ def test(
|
|
|
3481
3656
|
"--verbose/--quiet",
|
|
3482
3657
|
help="Enable verbose output with detailed progress information.",
|
|
3483
3658
|
),
|
|
3659
|
+
run_validation: bool = typer.Option(
|
|
3660
|
+
True,
|
|
3661
|
+
"--validate/--no-validate",
|
|
3662
|
+
help="Run post-run validation checks (default: enabled).",
|
|
3663
|
+
),
|
|
3664
|
+
validation_report: bool = typer.Option(
|
|
3665
|
+
True,
|
|
3666
|
+
"--validate-report/--no-validate-report",
|
|
3667
|
+
help="Write validation_report.json in the output directory.",
|
|
3668
|
+
),
|
|
3669
|
+
validate_strict: bool = typer.Option(
|
|
3670
|
+
False,
|
|
3671
|
+
"--validate-strict/--validate-lenient",
|
|
3672
|
+
help="Treat validation warnings as failures.",
|
|
3673
|
+
),
|
|
3484
3674
|
log_level: str = typer.Option(
|
|
3485
3675
|
"INFO",
|
|
3486
3676
|
"--log-level",
|
|
@@ -3526,6 +3716,10 @@ def test(
|
|
|
3526
3716
|
logger,
|
|
3527
3717
|
resource_log_file,
|
|
3528
3718
|
eggnog_data_dir,
|
|
3719
|
+
run_validation,
|
|
3720
|
+
validation_report,
|
|
3721
|
+
validate_strict,
|
|
3722
|
+
log_level,
|
|
3529
3723
|
)
|
|
3530
3724
|
|
|
3531
3725
|
|
|
@@ -3735,6 +3929,21 @@ def analyze_ko_matrix(
|
|
|
3735
3929
|
"--verbose/--quiet",
|
|
3736
3930
|
help="Enable verbose output with detailed progress information.",
|
|
3737
3931
|
),
|
|
3932
|
+
run_validation: bool = typer.Option(
|
|
3933
|
+
True,
|
|
3934
|
+
"--validate/--no-validate",
|
|
3935
|
+
help="Run post-run validation checks (default: enabled).",
|
|
3936
|
+
),
|
|
3937
|
+
validation_report: bool = typer.Option(
|
|
3938
|
+
True,
|
|
3939
|
+
"--validate-report/--no-validate-report",
|
|
3940
|
+
help="Write validation_report.json in the output directory.",
|
|
3941
|
+
),
|
|
3942
|
+
validate_strict: bool = typer.Option(
|
|
3943
|
+
False,
|
|
3944
|
+
"--validate-strict/--validate-lenient",
|
|
3945
|
+
help="Treat validation warnings as failures.",
|
|
3946
|
+
),
|
|
3738
3947
|
log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level (DEBUG, INFO, WARNING, ERROR)."),
|
|
3739
3948
|
) -> None:
|
|
3740
3949
|
"""
|
|
@@ -3924,6 +4133,22 @@ def analyze_ko_matrix(
|
|
|
3924
4133
|
# Display pipeline completion summary
|
|
3925
4134
|
display_pipeline_completion_summary(start_time, savedir, logger, verbose)
|
|
3926
4135
|
|
|
4136
|
+
if run_validation:
|
|
4137
|
+
logger.info("Running post-run validation checks.")
|
|
4138
|
+
report_path = None
|
|
4139
|
+
if validation_report:
|
|
4140
|
+
report_path = os.path.join(savedir, "validation_report.json")
|
|
4141
|
+
validate(
|
|
4142
|
+
savedir=savedir,
|
|
4143
|
+
mode="ko-matrix",
|
|
4144
|
+
calculate_complementarity=calculate_complementarity,
|
|
4145
|
+
kpct_outprefix=kpct_outprefix,
|
|
4146
|
+
strict=validate_strict,
|
|
4147
|
+
report=report_path,
|
|
4148
|
+
verbose=verbose,
|
|
4149
|
+
log_level=log_level,
|
|
4150
|
+
)
|
|
4151
|
+
|
|
3927
4152
|
except Exception as e:
|
|
3928
4153
|
if logger:
|
|
3929
4154
|
logger.error(f"Error in KPCT analysis: {str(e)}", exc_info=True)
|
|
@@ -3932,5 +4157,750 @@ def analyze_ko_matrix(
|
|
|
3932
4157
|
exit(1)
|
|
3933
4158
|
|
|
3934
4159
|
|
|
4160
|
+
@app.command()
|
|
4161
|
+
def validate(
|
|
4162
|
+
savedir: str = typer.Argument(
|
|
4163
|
+
...,
|
|
4164
|
+
help="Output directory to validate (from pipeline or analyze-ko-matrix).",
|
|
4165
|
+
),
|
|
4166
|
+
mode: str = typer.Option(
|
|
4167
|
+
"auto",
|
|
4168
|
+
"--mode",
|
|
4169
|
+
help="Validation mode: auto, pipeline, or ko-matrix.",
|
|
4170
|
+
),
|
|
4171
|
+
calculate_complementarity: Optional[int] = typer.Option(
|
|
4172
|
+
None,
|
|
4173
|
+
"--calculate-complementarity",
|
|
4174
|
+
"-c",
|
|
4175
|
+
help="Expected complementarity size (0 disables). If omitted, detects from outputs.",
|
|
4176
|
+
),
|
|
4177
|
+
kpct_outprefix: str = typer.Option(
|
|
4178
|
+
"output_give_completeness",
|
|
4179
|
+
"--kpct-outprefix",
|
|
4180
|
+
help="Prefix for KPCT output files (use if you changed it in analyze-ko-matrix).",
|
|
4181
|
+
),
|
|
4182
|
+
strict: bool = typer.Option(
|
|
4183
|
+
False,
|
|
4184
|
+
"--strict/--lenient",
|
|
4185
|
+
help="Treat warnings as failures.",
|
|
4186
|
+
),
|
|
4187
|
+
report: Optional[str] = typer.Option(
|
|
4188
|
+
None,
|
|
4189
|
+
"--report",
|
|
4190
|
+
help="Write JSON validation report to this path.",
|
|
4191
|
+
),
|
|
4192
|
+
verbose: bool = typer.Option(
|
|
4193
|
+
False,
|
|
4194
|
+
"--verbose/--quiet",
|
|
4195
|
+
help="Enable verbose output with detailed progress information.",
|
|
4196
|
+
),
|
|
4197
|
+
log_level: str = typer.Option(
|
|
4198
|
+
"INFO",
|
|
4199
|
+
"--log-level",
|
|
4200
|
+
"-l",
|
|
4201
|
+
help="Logging level (DEBUG, INFO, WARNING, ERROR).",
|
|
4202
|
+
),
|
|
4203
|
+
) -> None:
|
|
4204
|
+
"""Run scientific validation checks on a ModuComp output directory."""
|
|
4205
|
+
savedir = os.path.abspath(savedir)
|
|
4206
|
+
if not os.path.isdir(savedir):
|
|
4207
|
+
log_error(f"Output directory not found: {savedir}")
|
|
4208
|
+
raise typer.Exit(1)
|
|
4209
|
+
|
|
4210
|
+
log_dir = Path(savedir) / "logs"
|
|
4211
|
+
logger = configure_logging(log_level, log_dir)
|
|
4212
|
+
RESOURCE_SUMMARIES.clear()
|
|
4213
|
+
logger.info("Starting moducomp validation.")
|
|
4214
|
+
logger.info("Output directory: %s", savedir)
|
|
4215
|
+
logger.info("CLI command: %s", " ".join(shlex.quote(arg) for arg in sys.argv))
|
|
4216
|
+
|
|
4217
|
+
mode = mode.lower().strip()
|
|
4218
|
+
if mode not in {"auto", "pipeline", "ko-matrix"}:
|
|
4219
|
+
log_error(f"Invalid mode '{mode}'. Use auto, pipeline, or ko-matrix.", logger=logger)
|
|
4220
|
+
raise typer.Exit(1)
|
|
4221
|
+
|
|
4222
|
+
report_data: Dict[str, Any] = {
|
|
4223
|
+
"savedir": savedir,
|
|
4224
|
+
"mode": mode,
|
|
4225
|
+
"timestamp": datetime.datetime.now().isoformat(),
|
|
4226
|
+
"checks": [],
|
|
4227
|
+
"warnings": [],
|
|
4228
|
+
"errors": [],
|
|
4229
|
+
"stats": {},
|
|
4230
|
+
}
|
|
4231
|
+
|
|
4232
|
+
emapper_file = _find_emapper_annotations(savedir)
|
|
4233
|
+
if mode == "auto":
|
|
4234
|
+
mode = "pipeline" if emapper_file else "ko-matrix"
|
|
4235
|
+
report_data["mode"] = mode
|
|
4236
|
+
|
|
4237
|
+
if mode == "pipeline" and not emapper_file:
|
|
4238
|
+
_record_validation_check(
|
|
4239
|
+
report_data,
|
|
4240
|
+
"emapper_annotations",
|
|
4241
|
+
"fail",
|
|
4242
|
+
"Pipeline mode selected but emapper annotations were not found.",
|
|
4243
|
+
)
|
|
4244
|
+
logger.error("Pipeline mode requires emapper annotations. Validation aborted.")
|
|
4245
|
+
if report:
|
|
4246
|
+
with open(report, "w") as handle:
|
|
4247
|
+
json.dump(report_data, handle, indent=2)
|
|
4248
|
+
raise typer.Exit(1)
|
|
4249
|
+
|
|
4250
|
+
if mode == "ko-matrix" and emapper_file:
|
|
4251
|
+
_record_validation_check(
|
|
4252
|
+
report_data,
|
|
4253
|
+
"emapper_annotations",
|
|
4254
|
+
"warn",
|
|
4255
|
+
f"KO-matrix mode selected but emapper annotations exist at {emapper_file}.",
|
|
4256
|
+
)
|
|
4257
|
+
|
|
4258
|
+
kos_matrix_path = Path(savedir) / "kos_matrix.csv"
|
|
4259
|
+
kpct_input_file = Path(savedir) / "ko_file_for_kpct.txt"
|
|
4260
|
+
module_matrix_file = Path(savedir) / "module_completeness.tsv"
|
|
4261
|
+
|
|
4262
|
+
contigs_file = None
|
|
4263
|
+
pathways_file = None
|
|
4264
|
+
contigs_candidates = [
|
|
4265
|
+
Path(savedir) / f"{kpct_outprefix}_contigs.with_weights.tsv",
|
|
4266
|
+
Path(savedir) / f"{kpct_outprefix}_contigs.tsv",
|
|
4267
|
+
]
|
|
4268
|
+
pathways_candidates = [
|
|
4269
|
+
Path(savedir) / f"{kpct_outprefix}_pathways.with_weights.tsv",
|
|
4270
|
+
Path(savedir) / f"{kpct_outprefix}_pathways.tsv",
|
|
4271
|
+
]
|
|
4272
|
+
for candidate in contigs_candidates:
|
|
4273
|
+
if candidate.exists():
|
|
4274
|
+
contigs_file = candidate
|
|
4275
|
+
break
|
|
4276
|
+
for candidate in pathways_candidates:
|
|
4277
|
+
if candidate.exists():
|
|
4278
|
+
pathways_file = candidate
|
|
4279
|
+
break
|
|
4280
|
+
|
|
4281
|
+
if not contigs_file and not pathways_file:
|
|
4282
|
+
_record_validation_check(
|
|
4283
|
+
report_data,
|
|
4284
|
+
"kpct_outputs",
|
|
4285
|
+
"warn",
|
|
4286
|
+
f"No KPCT output files found for prefix '{kpct_outprefix}'.",
|
|
4287
|
+
)
|
|
4288
|
+
|
|
4289
|
+
required_files = {
|
|
4290
|
+
"KO matrix": kos_matrix_path,
|
|
4291
|
+
"KPCT input": kpct_input_file,
|
|
4292
|
+
"Module completeness matrix": module_matrix_file,
|
|
4293
|
+
}
|
|
4294
|
+
if contigs_file:
|
|
4295
|
+
required_files["KPCT contigs output"] = contigs_file
|
|
4296
|
+
if pathways_file:
|
|
4297
|
+
required_files["KPCT pathways output"] = pathways_file
|
|
4298
|
+
|
|
4299
|
+
for label, path in required_files.items():
|
|
4300
|
+
if not path.exists():
|
|
4301
|
+
_record_validation_check(
|
|
4302
|
+
report_data,
|
|
4303
|
+
f"file_exists:{label}",
|
|
4304
|
+
"fail",
|
|
4305
|
+
f"Missing required file: {path}",
|
|
4306
|
+
)
|
|
4307
|
+
else:
|
|
4308
|
+
_record_validation_check(
|
|
4309
|
+
report_data,
|
|
4310
|
+
f"file_exists:{label}",
|
|
4311
|
+
"ok",
|
|
4312
|
+
f"Found {path}",
|
|
4313
|
+
)
|
|
4314
|
+
|
|
4315
|
+
if report_data["errors"]:
|
|
4316
|
+
logger.error("Validation halted due to missing required files.")
|
|
4317
|
+
if report:
|
|
4318
|
+
with open(report, "w") as handle:
|
|
4319
|
+
json.dump(report_data, handle, indent=2)
|
|
4320
|
+
raise typer.Exit(1)
|
|
4321
|
+
|
|
4322
|
+
# Read KO matrix
|
|
4323
|
+
ko_df, ko_delimiter = _read_ko_matrix_file(kos_matrix_path, logger)
|
|
4324
|
+
if "taxon_oid" not in ko_df.columns:
|
|
4325
|
+
_record_validation_check(
|
|
4326
|
+
report_data,
|
|
4327
|
+
"ko_matrix_format",
|
|
4328
|
+
"fail",
|
|
4329
|
+
"KO matrix missing required 'taxon_oid' column.",
|
|
4330
|
+
)
|
|
4331
|
+
logger.error("KO matrix validation failed: missing taxon_oid.")
|
|
4332
|
+
if report:
|
|
4333
|
+
with open(report, "w") as handle:
|
|
4334
|
+
json.dump(report_data, handle, indent=2)
|
|
4335
|
+
raise typer.Exit(1)
|
|
4336
|
+
|
|
4337
|
+
ko_columns = [col for col in ko_df.columns if col != "taxon_oid"]
|
|
4338
|
+
if not ko_columns:
|
|
4339
|
+
_record_validation_check(
|
|
4340
|
+
report_data,
|
|
4341
|
+
"ko_matrix_format",
|
|
4342
|
+
"fail",
|
|
4343
|
+
"KO matrix has no KO columns.",
|
|
4344
|
+
)
|
|
4345
|
+
logger.error("KO matrix validation failed: no KO columns.")
|
|
4346
|
+
if report:
|
|
4347
|
+
with open(report, "w") as handle:
|
|
4348
|
+
json.dump(report_data, handle, indent=2)
|
|
4349
|
+
raise typer.Exit(1)
|
|
4350
|
+
|
|
4351
|
+
ko_pattern = re.compile(r"^K\d{5}$")
|
|
4352
|
+
invalid_kos = [col for col in ko_columns if not ko_pattern.match(col)]
|
|
4353
|
+
if invalid_kos:
|
|
4354
|
+
_record_validation_check(
|
|
4355
|
+
report_data,
|
|
4356
|
+
"ko_matrix_columns",
|
|
4357
|
+
"warn",
|
|
4358
|
+
f"Found {len(invalid_kos)} non-KO columns (expected KXXXXX). Example: {invalid_kos[:5]}",
|
|
4359
|
+
)
|
|
4360
|
+
else:
|
|
4361
|
+
_record_validation_check(
|
|
4362
|
+
report_data,
|
|
4363
|
+
"ko_matrix_columns",
|
|
4364
|
+
"ok",
|
|
4365
|
+
f"Found {len(ko_columns)} KO columns.",
|
|
4366
|
+
)
|
|
4367
|
+
|
|
4368
|
+
ko_df["taxon_oid"] = ko_df["taxon_oid"].astype(str)
|
|
4369
|
+
genomes = ko_df["taxon_oid"].tolist()
|
|
4370
|
+
report_data["stats"]["genomes"] = len(genomes)
|
|
4371
|
+
report_data["stats"]["ko_columns"] = len(ko_columns)
|
|
4372
|
+
if len(genomes) != len(set(genomes)):
|
|
4373
|
+
_record_validation_check(
|
|
4374
|
+
report_data,
|
|
4375
|
+
"genome_ids_unique",
|
|
4376
|
+
"warn",
|
|
4377
|
+
"Duplicate genome identifiers found in KO matrix.",
|
|
4378
|
+
)
|
|
4379
|
+
else:
|
|
4380
|
+
_record_validation_check(
|
|
4381
|
+
report_data,
|
|
4382
|
+
"genome_ids_unique",
|
|
4383
|
+
"ok",
|
|
4384
|
+
"Genome identifiers are unique in KO matrix.",
|
|
4385
|
+
)
|
|
4386
|
+
|
|
4387
|
+
bad_names = [g for g in genomes if not g or g.strip() != g or re.search(r"\\s", g)]
|
|
4388
|
+
if bad_names:
|
|
4389
|
+
_record_validation_check(
|
|
4390
|
+
report_data,
|
|
4391
|
+
"genome_id_format",
|
|
4392
|
+
"warn",
|
|
4393
|
+
f"Found {len(bad_names)} genome identifiers with whitespace or empty values. Example: {bad_names[:3]}",
|
|
4394
|
+
)
|
|
4395
|
+
else:
|
|
4396
|
+
_record_validation_check(
|
|
4397
|
+
report_data,
|
|
4398
|
+
"genome_id_format",
|
|
4399
|
+
"ok",
|
|
4400
|
+
"Genome identifiers contain no whitespace.",
|
|
4401
|
+
)
|
|
4402
|
+
|
|
4403
|
+
ko_numeric = ko_df[ko_columns].apply(pd.to_numeric, errors="coerce")
|
|
4404
|
+
if ko_numeric.isna().any().any():
|
|
4405
|
+
_record_validation_check(
|
|
4406
|
+
report_data,
|
|
4407
|
+
"ko_matrix_numeric",
|
|
4408
|
+
"warn",
|
|
4409
|
+
"Non-numeric KO counts detected in KO matrix.",
|
|
4410
|
+
)
|
|
4411
|
+
else:
|
|
4412
|
+
_record_validation_check(
|
|
4413
|
+
report_data,
|
|
4414
|
+
"ko_matrix_numeric",
|
|
4415
|
+
"ok",
|
|
4416
|
+
"KO matrix counts are numeric.",
|
|
4417
|
+
)
|
|
4418
|
+
|
|
4419
|
+
ko_totals_series = ko_numeric.sum(axis=1)
|
|
4420
|
+
ko_totals = {str(ko_df.at[idx, "taxon_oid"]): float(total) for idx, total in ko_totals_series.items()}
|
|
4421
|
+
ko_present = {}
|
|
4422
|
+
for idx, row in ko_numeric.iterrows():
|
|
4423
|
+
genome_id = str(ko_df.at[idx, "taxon_oid"])
|
|
4424
|
+
ko_present[genome_id] = {ko for ko in ko_columns if row[ko] > 0}
|
|
4425
|
+
|
|
4426
|
+
# KPCT input consistency
|
|
4427
|
+
kpct_genomes_to_kos = _read_kpct_input_file(kpct_input_file)
|
|
4428
|
+
missing_kpct = set(genomes) - set(kpct_genomes_to_kos.keys())
|
|
4429
|
+
if missing_kpct:
|
|
4430
|
+
_record_validation_check(
|
|
4431
|
+
report_data,
|
|
4432
|
+
"kpct_input_genomes",
|
|
4433
|
+
"warn",
|
|
4434
|
+
f"{len(missing_kpct)} genomes from KO matrix missing in KPCT input. Example: {list(missing_kpct)[:3]}",
|
|
4435
|
+
)
|
|
4436
|
+
else:
|
|
4437
|
+
_record_validation_check(
|
|
4438
|
+
report_data,
|
|
4439
|
+
"kpct_input_genomes",
|
|
4440
|
+
"ok",
|
|
4441
|
+
"All KO-matrix genomes are present in KPCT input.",
|
|
4442
|
+
)
|
|
4443
|
+
|
|
4444
|
+
ko_mismatch = []
|
|
4445
|
+
for genome_id in genomes:
|
|
4446
|
+
kpct_kos = kpct_genomes_to_kos.get(genome_id)
|
|
4447
|
+
if kpct_kos is None:
|
|
4448
|
+
continue
|
|
4449
|
+
if ko_present[genome_id] != kpct_kos:
|
|
4450
|
+
ko_mismatch.append(genome_id)
|
|
4451
|
+
if ko_mismatch:
|
|
4452
|
+
_record_validation_check(
|
|
4453
|
+
report_data,
|
|
4454
|
+
"kpct_input_kos",
|
|
4455
|
+
"warn",
|
|
4456
|
+
f"KO sets differ between KO matrix and KPCT input for {len(ko_mismatch)} genomes. Example: {ko_mismatch[:3]}",
|
|
4457
|
+
)
|
|
4458
|
+
else:
|
|
4459
|
+
_record_validation_check(
|
|
4460
|
+
report_data,
|
|
4461
|
+
"kpct_input_kos",
|
|
4462
|
+
"ok",
|
|
4463
|
+
"KPCT input KO sets match KO matrix for all genomes.",
|
|
4464
|
+
)
|
|
4465
|
+
|
|
4466
|
+
combo_ids = [gid for gid in kpct_genomes_to_kos.keys() if "__" in gid]
|
|
4467
|
+
if combo_ids:
|
|
4468
|
+
max_checks = 100
|
|
4469
|
+
mismatch_count = 0
|
|
4470
|
+
for combo_id in combo_ids[:max_checks]:
|
|
4471
|
+
members = combo_id.split("__")
|
|
4472
|
+
if any(member not in ko_present for member in members):
|
|
4473
|
+
continue
|
|
4474
|
+
union_kos = set()
|
|
4475
|
+
for member in members:
|
|
4476
|
+
union_kos.update(ko_present[member])
|
|
4477
|
+
if union_kos != kpct_genomes_to_kos[combo_id]:
|
|
4478
|
+
mismatch_count += 1
|
|
4479
|
+
if mismatch_count:
|
|
4480
|
+
_record_validation_check(
|
|
4481
|
+
report_data,
|
|
4482
|
+
"kpct_combo_kos",
|
|
4483
|
+
"warn",
|
|
4484
|
+
f"{mismatch_count} of {min(len(combo_ids), max_checks)} combination KO sets do not match union of members.",
|
|
4485
|
+
)
|
|
4486
|
+
else:
|
|
4487
|
+
_record_validation_check(
|
|
4488
|
+
report_data,
|
|
4489
|
+
"kpct_combo_kos",
|
|
4490
|
+
"ok",
|
|
4491
|
+
"Combination KO sets match union of members (sampled).",
|
|
4492
|
+
)
|
|
4493
|
+
|
|
4494
|
+
# KPCT contigs vs pathways outputs
|
|
4495
|
+
if contigs_file and pathways_file:
|
|
4496
|
+
match, detail = _compare_kpct_outputs(contigs_file, pathways_file)
|
|
4497
|
+
_record_validation_check(
|
|
4498
|
+
report_data,
|
|
4499
|
+
"kpct_output_consistency",
|
|
4500
|
+
"ok" if match else "warn",
|
|
4501
|
+
detail,
|
|
4502
|
+
)
|
|
4503
|
+
|
|
4504
|
+
# Module completeness checks
|
|
4505
|
+
module_df = pd.read_csv(module_matrix_file, sep="\t")
|
|
4506
|
+
if "n_members" not in module_df.columns or "taxon_oid" not in module_df.columns:
|
|
4507
|
+
_record_validation_check(
|
|
4508
|
+
report_data,
|
|
4509
|
+
"module_completeness_format",
|
|
4510
|
+
"fail",
|
|
4511
|
+
"module_completeness.tsv missing n_members or taxon_oid.",
|
|
4512
|
+
)
|
|
4513
|
+
logger.error("module_completeness.tsv missing required columns. Validation aborted.")
|
|
4514
|
+
if report:
|
|
4515
|
+
with open(report, "w") as handle:
|
|
4516
|
+
json.dump(report_data, handle, indent=2)
|
|
4517
|
+
raise typer.Exit(1)
|
|
4518
|
+
else:
|
|
4519
|
+
_record_validation_check(
|
|
4520
|
+
report_data,
|
|
4521
|
+
"module_completeness_format",
|
|
4522
|
+
"ok",
|
|
4523
|
+
"module_completeness.tsv has required columns.",
|
|
4524
|
+
)
|
|
4525
|
+
|
|
4526
|
+
module_df["taxon_oid"] = module_df["taxon_oid"].astype(str)
|
|
4527
|
+
module_df["n_members"] = pd.to_numeric(module_df["n_members"], errors="coerce")
|
|
4528
|
+
if module_df["n_members"].isna().any():
|
|
4529
|
+
_record_validation_check(
|
|
4530
|
+
report_data,
|
|
4531
|
+
"module_completeness_n_members",
|
|
4532
|
+
"warn",
|
|
4533
|
+
"Non-numeric n_members values detected in module completeness matrix.",
|
|
4534
|
+
)
|
|
4535
|
+
module_df["n_members"] = module_df["n_members"].fillna(-1).astype(int)
|
|
4536
|
+
|
|
4537
|
+
module_cols = [col for col in module_df.columns if re.match(r"^M\d{5}$", col)]
|
|
4538
|
+
if not module_cols:
|
|
4539
|
+
_record_validation_check(
|
|
4540
|
+
report_data,
|
|
4541
|
+
"module_columns",
|
|
4542
|
+
"fail",
|
|
4543
|
+
"No KEGG module columns detected in module completeness matrix.",
|
|
4544
|
+
)
|
|
4545
|
+
else:
|
|
4546
|
+
_record_validation_check(
|
|
4547
|
+
report_data,
|
|
4548
|
+
"module_columns",
|
|
4549
|
+
"ok",
|
|
4550
|
+
f"Detected {len(module_cols)} module columns.",
|
|
4551
|
+
)
|
|
4552
|
+
report_data["stats"]["modules"] = len(module_cols)
|
|
4553
|
+
|
|
4554
|
+
completeness_scale = 100.0
|
|
4555
|
+
if module_cols:
|
|
4556
|
+
module_values = module_df[module_cols].apply(pd.to_numeric, errors="coerce")
|
|
4557
|
+
if module_values.isna().any().any():
|
|
4558
|
+
_record_validation_check(
|
|
4559
|
+
report_data,
|
|
4560
|
+
"module_completeness_numeric",
|
|
4561
|
+
"warn",
|
|
4562
|
+
"Non-numeric completeness values found in module completeness matrix.",
|
|
4563
|
+
)
|
|
4564
|
+
min_val = float(module_values.min().min())
|
|
4565
|
+
max_val = float(module_values.max().max())
|
|
4566
|
+
report_data["stats"]["module_min"] = min_val
|
|
4567
|
+
report_data["stats"]["module_max"] = max_val
|
|
4568
|
+
if min_val < -1e-6 or max_val > 100.0 + 1e-6:
|
|
4569
|
+
_record_validation_check(
|
|
4570
|
+
report_data,
|
|
4571
|
+
"module_completeness_range",
|
|
4572
|
+
"fail",
|
|
4573
|
+
f"Module completeness values out of expected range (min={min_val}, max={max_val}).",
|
|
4574
|
+
)
|
|
4575
|
+
else:
|
|
4576
|
+
_record_validation_check(
|
|
4577
|
+
report_data,
|
|
4578
|
+
"module_completeness_range",
|
|
4579
|
+
"ok",
|
|
4580
|
+
f"Module completeness range OK (min={min_val}, max={max_val}).",
|
|
4581
|
+
)
|
|
4582
|
+
completeness_scale = 100.0 if max_val > 1.5 else 1.0
|
|
4583
|
+
report_data["stats"]["completeness_scale"] = completeness_scale
|
|
4584
|
+
|
|
4585
|
+
# Validate n_members vs taxon_oid format
|
|
4586
|
+
mismatch_members = 0
|
|
4587
|
+
for _, row in module_df.iterrows():
|
|
4588
|
+
taxon_id = str(row["taxon_oid"])
|
|
4589
|
+
expected_members = taxon_id.count("__") + 1 if "__" in taxon_id else 1
|
|
4590
|
+
if int(row["n_members"]) != expected_members:
|
|
4591
|
+
mismatch_members += 1
|
|
4592
|
+
if mismatch_members:
|
|
4593
|
+
_record_validation_check(
|
|
4594
|
+
report_data,
|
|
4595
|
+
"n_members_consistency",
|
|
4596
|
+
"warn",
|
|
4597
|
+
f"{mismatch_members} rows have n_members inconsistent with taxon_oid combination size.",
|
|
4598
|
+
)
|
|
4599
|
+
else:
|
|
4600
|
+
_record_validation_check(
|
|
4601
|
+
report_data,
|
|
4602
|
+
"n_members_consistency",
|
|
4603
|
+
"ok",
|
|
4604
|
+
"n_members values match taxon_oid combination sizes.",
|
|
4605
|
+
)
|
|
4606
|
+
|
|
4607
|
+
# Compare contig ids and module ids if KPCT outputs available
|
|
4608
|
+
if contigs_file:
|
|
4609
|
+
contig_ids: Set[str] = set()
|
|
4610
|
+
kpct_module_ids: Set[str] = set()
|
|
4611
|
+
with contigs_file.open("r") as handle:
|
|
4612
|
+
reader = csv.DictReader(handle, delimiter="\t")
|
|
4613
|
+
for row in reader:
|
|
4614
|
+
contig_value = row.get("contig") or row.get("Contig") or row.get("genome") or row.get("Genome") or row.get("taxon_oid")
|
|
4615
|
+
if contig_value:
|
|
4616
|
+
contig_ids.add(str(contig_value))
|
|
4617
|
+
module_value = (
|
|
4618
|
+
row.get("module_accession")
|
|
4619
|
+
or row.get("module_id")
|
|
4620
|
+
or row.get("Module")
|
|
4621
|
+
)
|
|
4622
|
+
if module_value:
|
|
4623
|
+
kpct_module_ids.add(str(module_value))
|
|
4624
|
+
missing_contigs = set(module_df["taxon_oid"]) - contig_ids
|
|
4625
|
+
if missing_contigs:
|
|
4626
|
+
_record_validation_check(
|
|
4627
|
+
report_data,
|
|
4628
|
+
"kpct_contigs_coverage",
|
|
4629
|
+
"warn",
|
|
4630
|
+
f"{len(missing_contigs)} taxon_oids from module matrix missing in KPCT contigs output.",
|
|
4631
|
+
)
|
|
4632
|
+
else:
|
|
4633
|
+
_record_validation_check(
|
|
4634
|
+
report_data,
|
|
4635
|
+
"kpct_contigs_coverage",
|
|
4636
|
+
"ok",
|
|
4637
|
+
"KPCT contigs output covers all taxon_oids in module matrix.",
|
|
4638
|
+
)
|
|
4639
|
+
|
|
4640
|
+
missing_modules = set(module_cols) - kpct_module_ids
|
|
4641
|
+
if missing_modules:
|
|
4642
|
+
_record_validation_check(
|
|
4643
|
+
report_data,
|
|
4644
|
+
"kpct_module_coverage",
|
|
4645
|
+
"warn",
|
|
4646
|
+
f"{len(missing_modules)} module columns missing from KPCT contigs output.",
|
|
4647
|
+
)
|
|
4648
|
+
else:
|
|
4649
|
+
_record_validation_check(
|
|
4650
|
+
report_data,
|
|
4651
|
+
"kpct_module_coverage",
|
|
4652
|
+
"ok",
|
|
4653
|
+
"KPCT contigs output covers all module columns.",
|
|
4654
|
+
)
|
|
4655
|
+
|
|
4656
|
+
# Emapper checks (pipeline mode)
|
|
4657
|
+
if mode == "pipeline" and emapper_file:
|
|
4658
|
+
total_queries, bad_queries = _count_emapper_header_issues(emapper_file)
|
|
4659
|
+
if bad_queries:
|
|
4660
|
+
_record_validation_check(
|
|
4661
|
+
report_data,
|
|
4662
|
+
"emapper_header_format",
|
|
4663
|
+
"warn",
|
|
4664
|
+
f"{bad_queries} of {total_queries} emapper queries lack genome|protein format.",
|
|
4665
|
+
)
|
|
4666
|
+
else:
|
|
4667
|
+
_record_validation_check(
|
|
4668
|
+
report_data,
|
|
4669
|
+
"emapper_header_format",
|
|
4670
|
+
"ok",
|
|
4671
|
+
"All emapper queries follow genome|protein format.",
|
|
4672
|
+
)
|
|
4673
|
+
|
|
4674
|
+
genome_ko_proteins = parse_emapper_annotations(str(emapper_file), logger)
|
|
4675
|
+
emapper_genomes = set(genome_ko_proteins.keys())
|
|
4676
|
+
if set(genomes) - emapper_genomes:
|
|
4677
|
+
_record_validation_check(
|
|
4678
|
+
report_data,
|
|
4679
|
+
"emapper_genome_coverage",
|
|
4680
|
+
"warn",
|
|
4681
|
+
"Some genomes in KO matrix are missing in emapper annotations.",
|
|
4682
|
+
)
|
|
4683
|
+
else:
|
|
4684
|
+
_record_validation_check(
|
|
4685
|
+
report_data,
|
|
4686
|
+
"emapper_genome_coverage",
|
|
4687
|
+
"ok",
|
|
4688
|
+
"All KO-matrix genomes are present in emapper annotations.",
|
|
4689
|
+
)
|
|
4690
|
+
|
|
4691
|
+
emapper_ko_set: Set[str] = set()
|
|
4692
|
+
emapper_totals: Dict[str, int] = {}
|
|
4693
|
+
for genome_id, ko_dict in genome_ko_proteins.items():
|
|
4694
|
+
emapper_totals[genome_id] = sum(len(proteins) for proteins in ko_dict.values())
|
|
4695
|
+
emapper_ko_set.update(ko_dict.keys())
|
|
4696
|
+
|
|
4697
|
+
if set(ko_columns) != emapper_ko_set:
|
|
4698
|
+
_record_validation_check(
|
|
4699
|
+
report_data,
|
|
4700
|
+
"emapper_vs_matrix_kos",
|
|
4701
|
+
"warn",
|
|
4702
|
+
f"KO sets differ between emapper annotations and KO matrix (emapper={len(emapper_ko_set)}, matrix={len(ko_columns)}).",
|
|
4703
|
+
)
|
|
4704
|
+
else:
|
|
4705
|
+
_record_validation_check(
|
|
4706
|
+
report_data,
|
|
4707
|
+
"emapper_vs_matrix_kos",
|
|
4708
|
+
"ok",
|
|
4709
|
+
"KO sets match between emapper annotations and KO matrix.",
|
|
4710
|
+
)
|
|
4711
|
+
|
|
4712
|
+
mismatched_totals = []
|
|
4713
|
+
for genome_id, total in emapper_totals.items():
|
|
4714
|
+
if genome_id not in ko_totals:
|
|
4715
|
+
continue
|
|
4716
|
+
matrix_total = float(ko_totals[genome_id])
|
|
4717
|
+
if abs(matrix_total - total) > 1e-6:
|
|
4718
|
+
mismatched_totals.append(genome_id)
|
|
4719
|
+
if mismatched_totals:
|
|
4720
|
+
_record_validation_check(
|
|
4721
|
+
report_data,
|
|
4722
|
+
"emapper_vs_matrix_counts",
|
|
4723
|
+
"fail",
|
|
4724
|
+
f"KO counts differ between emapper and KO matrix for {len(mismatched_totals)} genomes. Example: {mismatched_totals[:3]}",
|
|
4725
|
+
)
|
|
4726
|
+
else:
|
|
4727
|
+
_record_validation_check(
|
|
4728
|
+
report_data,
|
|
4729
|
+
"emapper_vs_matrix_counts",
|
|
4730
|
+
"ok",
|
|
4731
|
+
"KO counts match between emapper and KO matrix.",
|
|
4732
|
+
)
|
|
4733
|
+
|
|
4734
|
+
# Complementarity checks
|
|
4735
|
+
comp_pattern = re.compile(r"module_completeness_complementarity_(\\d+)member\\.tsv$")
|
|
4736
|
+
comp_files: Dict[int, Path] = {}
|
|
4737
|
+
for file_path in Path(savedir).glob("module_completeness_complementarity_*member.tsv"):
|
|
4738
|
+
match = comp_pattern.match(file_path.name)
|
|
4739
|
+
if match:
|
|
4740
|
+
comp_files[int(match.group(1))] = file_path
|
|
4741
|
+
|
|
4742
|
+
if calculate_complementarity is None:
|
|
4743
|
+
expected_sizes = sorted(comp_files.keys())
|
|
4744
|
+
elif calculate_complementarity >= 2:
|
|
4745
|
+
expected_sizes = list(range(2, calculate_complementarity + 1))
|
|
4746
|
+
else:
|
|
4747
|
+
expected_sizes = []
|
|
4748
|
+
|
|
4749
|
+
for n_members in expected_sizes:
|
|
4750
|
+
if n_members not in comp_files:
|
|
4751
|
+
_record_validation_check(
|
|
4752
|
+
report_data,
|
|
4753
|
+
f"complementarity_file_{n_members}",
|
|
4754
|
+
"fail",
|
|
4755
|
+
f"Expected complementarity report missing for {n_members}-member combinations.",
|
|
4756
|
+
)
|
|
4757
|
+
else:
|
|
4758
|
+
_record_validation_check(
|
|
4759
|
+
report_data,
|
|
4760
|
+
f"complementarity_file_{n_members}",
|
|
4761
|
+
"ok",
|
|
4762
|
+
f"Found complementarity report for {n_members}-member combinations.",
|
|
4763
|
+
)
|
|
4764
|
+
|
|
4765
|
+
# Prepare module lookup for complementarity validation
|
|
4766
|
+
module_df_indexed = module_df.set_index(["n_members", "taxon_oid"])
|
|
4767
|
+
|
|
4768
|
+
for n_members, comp_path in comp_files.items():
|
|
4769
|
+
comp_df = pd.read_csv(comp_path, sep="\t")
|
|
4770
|
+
taxon_cols = [col for col in comp_df.columns if col.startswith("taxon_oid_")]
|
|
4771
|
+
completeness_cols = [col for col in comp_df.columns if col.startswith("completeness_taxon_oid_")]
|
|
4772
|
+
|
|
4773
|
+
if not taxon_cols or "module_id" not in comp_df.columns:
|
|
4774
|
+
_record_validation_check(
|
|
4775
|
+
report_data,
|
|
4776
|
+
f"complementarity_format_{n_members}",
|
|
4777
|
+
"fail",
|
|
4778
|
+
f"Complementarity report {comp_path.name} missing required columns.",
|
|
4779
|
+
)
|
|
4780
|
+
continue
|
|
4781
|
+
|
|
4782
|
+
taxon_cols = sorted(taxon_cols, key=lambda x: int(x.split("_")[-1]))
|
|
4783
|
+
completeness_cols = sorted(completeness_cols, key=lambda x: int(x.split("_")[-1])) if completeness_cols else []
|
|
4784
|
+
|
|
4785
|
+
missing_rows = 0
|
|
4786
|
+
bad_combo = 0
|
|
4787
|
+
bad_individual = 0
|
|
4788
|
+
mismatch_reported = 0
|
|
4789
|
+
missing_proteins = 0
|
|
4790
|
+
non_placeholder = 0
|
|
4791
|
+
protein_cols = [col for col in comp_df.columns if col.startswith("proteins_taxon_oid_")]
|
|
4792
|
+
|
|
4793
|
+
for _, row in comp_df.iterrows():
|
|
4794
|
+
taxon_ids = [str(row[col]) for col in taxon_cols]
|
|
4795
|
+
module_id = str(row["module_id"])
|
|
4796
|
+
combo_id = "__".join(taxon_ids)
|
|
4797
|
+
combo_key = (n_members, combo_id)
|
|
4798
|
+
if combo_key not in module_df_indexed.index:
|
|
4799
|
+
combo_id_sorted = "__".join(sorted(taxon_ids))
|
|
4800
|
+
combo_key = (n_members, combo_id_sorted)
|
|
4801
|
+
if combo_key not in module_df_indexed.index:
|
|
4802
|
+
missing_rows += 1
|
|
4803
|
+
continue
|
|
4804
|
+
|
|
4805
|
+
if module_id not in module_df.columns:
|
|
4806
|
+
missing_rows += 1
|
|
4807
|
+
continue
|
|
4808
|
+
combo_val = module_df_indexed.loc[combo_key, module_id]
|
|
4809
|
+
if isinstance(combo_val, pd.Series):
|
|
4810
|
+
combo_val = combo_val.iloc[0]
|
|
4811
|
+
combo_val = float(combo_val)
|
|
4812
|
+
if abs(combo_val - completeness_scale) > 1e-6:
|
|
4813
|
+
bad_combo += 1
|
|
4814
|
+
|
|
4815
|
+
for idx, taxon_id in enumerate(taxon_ids):
|
|
4816
|
+
try:
|
|
4817
|
+
individual_val = module_df_indexed.loc[(1, taxon_id), module_id]
|
|
4818
|
+
if isinstance(individual_val, pd.Series):
|
|
4819
|
+
individual_val = individual_val.iloc[0]
|
|
4820
|
+
individual_val = float(individual_val)
|
|
4821
|
+
except KeyError:
|
|
4822
|
+
missing_rows += 1
|
|
4823
|
+
continue
|
|
4824
|
+
if individual_val >= completeness_scale - 1e-6:
|
|
4825
|
+
bad_individual += 1
|
|
4826
|
+
if completeness_cols:
|
|
4827
|
+
reported_val = float(row[completeness_cols[idx]])
|
|
4828
|
+
if abs(reported_val - individual_val) > 1e-6:
|
|
4829
|
+
mismatch_reported += 1
|
|
4830
|
+
|
|
4831
|
+
if protein_cols:
|
|
4832
|
+
values = [str(row[col]) for col in protein_cols]
|
|
4833
|
+
if mode == "pipeline" and emapper_file:
|
|
4834
|
+
if any(val.startswith("No protein data available") for val in values):
|
|
4835
|
+
missing_proteins += 1
|
|
4836
|
+
elif mode == "ko-matrix":
|
|
4837
|
+
if any(not val.startswith("No protein data available") for val in values):
|
|
4838
|
+
non_placeholder += 1
|
|
4839
|
+
|
|
4840
|
+
if missing_rows:
|
|
4841
|
+
_record_validation_check(
|
|
4842
|
+
report_data,
|
|
4843
|
+
f"complementarity_lookup_{n_members}",
|
|
4844
|
+
"warn",
|
|
4845
|
+
f"{missing_rows} complementarity rows could not be matched to module completeness matrix.",
|
|
4846
|
+
)
|
|
4847
|
+
if bad_combo:
|
|
4848
|
+
_record_validation_check(
|
|
4849
|
+
report_data,
|
|
4850
|
+
f"complementarity_combo_{n_members}",
|
|
4851
|
+
"warn",
|
|
4852
|
+
f"{bad_combo} rows have combination completeness != {completeness_scale}.",
|
|
4853
|
+
)
|
|
4854
|
+
if bad_individual:
|
|
4855
|
+
_record_validation_check(
|
|
4856
|
+
report_data,
|
|
4857
|
+
f"complementarity_individual_{n_members}",
|
|
4858
|
+
"warn",
|
|
4859
|
+
f"{bad_individual} rows have individuals already complete.",
|
|
4860
|
+
)
|
|
4861
|
+
if mismatch_reported:
|
|
4862
|
+
_record_validation_check(
|
|
4863
|
+
report_data,
|
|
4864
|
+
f"complementarity_reported_{n_members}",
|
|
4865
|
+
"warn",
|
|
4866
|
+
f"{mismatch_reported} rows have completeness values inconsistent with module matrix.",
|
|
4867
|
+
)
|
|
4868
|
+
if missing_proteins:
|
|
4869
|
+
_record_validation_check(
|
|
4870
|
+
report_data,
|
|
4871
|
+
f"complementarity_proteins_{n_members}",
|
|
4872
|
+
"warn",
|
|
4873
|
+
f"{missing_proteins} rows missing protein provenance in pipeline mode.",
|
|
4874
|
+
)
|
|
4875
|
+
if non_placeholder:
|
|
4876
|
+
_record_validation_check(
|
|
4877
|
+
report_data,
|
|
4878
|
+
f"complementarity_placeholder_{n_members}",
|
|
4879
|
+
"warn",
|
|
4880
|
+
f"{non_placeholder} rows contain protein provenance in KO-matrix mode.",
|
|
4881
|
+
)
|
|
4882
|
+
|
|
4883
|
+
summary = (
|
|
4884
|
+
f"Validation summary: {len(report_data['errors'])} errors, "
|
|
4885
|
+
f"{len(report_data['warnings'])} warnings."
|
|
4886
|
+
)
|
|
4887
|
+
if report_data["errors"]:
|
|
4888
|
+
logger.error(summary)
|
|
4889
|
+
elif report_data["warnings"]:
|
|
4890
|
+
logger.warning(summary)
|
|
4891
|
+
else:
|
|
4892
|
+
logger.info(summary)
|
|
4893
|
+
|
|
4894
|
+
if report:
|
|
4895
|
+
report_path = Path(report)
|
|
4896
|
+
report_path.parent.mkdir(parents=True, exist_ok=True)
|
|
4897
|
+
with report_path.open("w") as handle:
|
|
4898
|
+
json.dump(report_data, handle, indent=2)
|
|
4899
|
+
logger.info("Validation report written to %s", report_path)
|
|
4900
|
+
|
|
4901
|
+
if report_data["errors"] or (strict and report_data["warnings"]):
|
|
4902
|
+
raise typer.Exit(1)
|
|
4903
|
+
|
|
4904
|
+
|
|
3935
4905
|
if __name__ == "__main__":
|
|
3936
4906
|
app()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: moducomp
|
|
3
|
-
Version: 0.7.
|
|
3
|
+
Version: 0.7.12
|
|
4
4
|
Summary: moducomp: metabolic module completeness and complementarity for microbiomes.
|
|
5
5
|
Keywords: bioinformatics,microbiome,metabolic,kegg,genomics
|
|
6
6
|
Author-email: "Juan C. Villada" <jvillada@lbl.gov>
|
|
@@ -38,6 +38,7 @@ Project-URL: Repository, https://github.com/NeLLi-team/moducomp
|
|
|
38
38
|
- Tracks and reports the actual proteins that are responsible for the completion of the module in the combination of N genomes.
|
|
39
39
|
- **Automatic resource monitoring** with timestamped logs tracking CPU usage, memory consumption, and runtime for reproducibility.
|
|
40
40
|
- **Consistent logging to stdout/stderr** with a per-command resource summary emitted at the end of each run.
|
|
41
|
+
- **Built-in validation (`moducomp validate`)** for scientific consistency checks across annotations, KO matrices, KPCT outputs, and complementarity reports.
|
|
41
42
|
|
|
42
43
|
## Installation (Recommended)
|
|
43
44
|
|
|
@@ -148,6 +149,9 @@ This section lists all CLI options implemented today, along with their default v
|
|
|
148
149
|
| `--del-tmp/--keep-tmp` | `true` | Delete temporary files after completion. |
|
|
149
150
|
| `--lowmem/--fullmem` (`--low-mem/--full-mem`) | `fullmem` | With `--lowmem`, run eggNOG-mapper without `--dbmem` to reduce RAM usage. |
|
|
150
151
|
| `--verbose/--quiet` | `false` | Enable verbose progress output. |
|
|
152
|
+
| `--validate/--no-validate` | `validate` | Run post-run validation checks. |
|
|
153
|
+
| `--validate-report/--no-validate-report` | `validate-report` | Write `validation_report.json` in the output directory. |
|
|
154
|
+
| `--validate-strict/--validate-lenient` | `lenient` | Treat validation warnings as failures when strict. |
|
|
151
155
|
| `--log-level`, `-l` | `INFO` | Logging level: `DEBUG`, `INFO`, `WARNING`, `ERROR`. |
|
|
152
156
|
| `--eggnog-data-dir` | `EGGNOG_DATA_DIR` | Path to eggNOG-mapper data (sets `EGGNOG_DATA_DIR`). |
|
|
153
157
|
|
|
@@ -162,6 +166,9 @@ This section lists all CLI options implemented today, along with their default v
|
|
|
162
166
|
| `--del-tmp/--keep-tmp` | `true` | Delete temporary files after the test completes. |
|
|
163
167
|
| `--lowmem/--fullmem` (`--low-mem/--full-mem`) | `lowmem` | Low-memory mode is the default for tests. |
|
|
164
168
|
| `--verbose/--quiet` | `verbose` | Verbose output is the default for tests. |
|
|
169
|
+
| `--validate/--no-validate` | `validate` | Run post-run validation checks. |
|
|
170
|
+
| `--validate-report/--no-validate-report` | `validate-report` | Write `validation_report.json` in the output directory. |
|
|
171
|
+
| `--validate-strict/--validate-lenient` | `lenient` | Treat validation warnings as failures when strict. |
|
|
165
172
|
| `--log-level`, `-l` | `INFO` | Logging level: `DEBUG`, `INFO`, `WARNING`, `ERROR`. |
|
|
166
173
|
| `--eggnog-data-dir` | `EGGNOG_DATA_DIR` | Path to eggNOG-mapper data (sets `EGGNOG_DATA_DIR`). |
|
|
167
174
|
|
|
@@ -174,6 +181,21 @@ This section lists all CLI options implemented today, along with their default v
|
|
|
174
181
|
| `--del-tmp/--keep-tmp` | `true` | Delete temporary files after completion. |
|
|
175
182
|
| `--ncpus`, `-n` | `16` | CPU cores for KPCT parallel processing. |
|
|
176
183
|
| `--verbose/--quiet` | `false` | Enable verbose progress output. |
|
|
184
|
+
| `--validate/--no-validate` | `validate` | Run post-run validation checks. |
|
|
185
|
+
| `--validate-report/--no-validate-report` | `validate-report` | Write `validation_report.json` in the output directory. |
|
|
186
|
+
| `--validate-strict/--validate-lenient` | `lenient` | Treat validation warnings as failures when strict. |
|
|
187
|
+
| `--log-level`, `-l` | `INFO` | Logging level: `DEBUG`, `INFO`, `WARNING`, `ERROR`. |
|
|
188
|
+
|
|
189
|
+
#### `validate` command (positional args: `savedir`)
|
|
190
|
+
|
|
191
|
+
| Option | Default | Description |
|
|
192
|
+
| --- | --- | --- |
|
|
193
|
+
| `--mode` | `auto` | Validation mode: `auto`, `pipeline`, or `ko-matrix`. |
|
|
194
|
+
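| `--calculate-complementarity`, `-c` | `auto-detect` | Largest expected combination size `N`; reports for 2..`N` members must be present. When omitted, sizes are detected from the complementarity reports found in `savedir` (0 disables the expectation). |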
|
|
195
|
+
| `--kpct-outprefix` | `output_give_completeness` | KPCT output prefix used during analysis. |
|
|
196
|
+
| `--strict/--lenient` | `lenient` | Treat warnings as failures when strict. |
|
|
197
|
+
| `--report` | _none_ | Write JSON validation report to this path. |
|
|
198
|
+
| `--verbose/--quiet` | `false` | Enable verbose progress output. |
|
|
177
199
|
| `--log-level`, `-l` | `INFO` | Logging level: `DEBUG`, `INFO`, `WARNING`, `ERROR`. |
|
|
178
200
|
|
|
179
201
|
#### `download-eggnog-data` command
|
|
@@ -198,6 +220,33 @@ This section lists all CLI options implemented today, along with their default v
|
|
|
198
220
|
- For KPCT parallel processing, the system creates the same number of chunks as CPU cores specified
|
|
199
221
|
- Example: `--ncpus 8` will use 8 cores and create 8 chunks for optimal parallel processing
|
|
200
222
|
|
|
223
|
+
### Validation (QC)
|
|
224
|
+
|
|
225
|
+
Use the built-in validator to check scientific consistency across outputs after a run. The validator compares:
|
|
226
|
+
- KO sets and counts between eggNOG-mapper annotations and `kos_matrix.csv`
|
|
227
|
+
- KO sets between `kos_matrix.csv` and `ko_file_for_kpct.txt`
|
|
228
|
+
- KPCT contigs vs pathways outputs
|
|
229
|
+
- Module completeness ranges and combination naming
|
|
230
|
+
- Complementarity reports versus module completeness values
|
|
231
|
+
- Protein provenance fields (pipeline mode) or placeholders (KO-matrix mode)
|
|
232
|
+
|
|
233
|
+
Example:
|
|
234
|
+
|
|
235
|
+
```bash
|
|
236
|
+
# Validation runs by default after pipeline/analyze/test.
|
|
237
|
+
# Use --no-validate to disable or --no-validate-report to skip JSON output.
|
|
238
|
+
# When validation reports errors (or warnings in strict mode), the command exits non-zero.
|
|
239
|
+
|
|
240
|
+
# Validate a pipeline run and write a JSON report
|
|
241
|
+
moducomp validate /path/to/output --mode pipeline --report /path/to/output/validation_report.json
|
|
242
|
+
|
|
243
|
+
# Validate KO-matrix mode outputs (non-default KPCT prefix)
|
|
244
|
+
moducomp validate /path/to/output --mode ko-matrix --kpct-outprefix my_prefix
|
|
245
|
+
|
|
246
|
+
# Treat warnings as failures
|
|
247
|
+
moducomp validate /path/to/output --strict
|
|
248
|
+
```
|
|
249
|
+
|
|
201
250
|
### ⚠️ Important note 1
|
|
202
251
|
|
|
203
252
|
**Prepare FAA files**: Ensure FAA headers are in the form `>genomeName|proteinId`, or use the `--adapt-headers` option to format your headers into `>fileName_prefix|protein_id_counter`.
|
|
@@ -298,15 +347,38 @@ moducomp analyze-ko-matrix ./ko_matrix.csv ./output_moderate --ncpus 16 --calcul
|
|
|
298
347
|
moducomp pipeline ./genomes ./output_lowmem --ncpus 8 --lowmem --calculate-complementarity 2
|
|
299
348
|
```
|
|
300
349
|
|
|
301
|
-
##
|
|
350
|
+
## Expected outputs
|
|
351
|
+
|
|
352
|
+
The sections below describe the expected output files, naming conventions, and the column-level meaning of each file. These details apply in full to `moducomp pipeline` and `moducomp test` (pipeline mode); `moducomp analyze-ko-matrix` (KO-matrix mode) produces the subset listed under its own heading.
|
|
353
|
+
|
|
354
|
+
**Naming conventions**
|
|
355
|
+
|
|
356
|
+
Genome identifiers are stored as `taxon_oid`. In pipeline mode, ModuComp expects protein headers in the format `genome_id|protein_id`. If you set `--adapt-headers`, ModuComp rewrites headers to `>genomeName|protein_N`, where `genomeName` is the FAA filename stem. Combination identifiers use `__` (double underscore), for example `GenomeA__GenomeB`, and `n_members` in `module_completeness.tsv` records the size of each combination.
|
|
357
|
+
|
|
358
|
+
**Pipeline mode outputs (`moducomp pipeline`, `moducomp test`)**
|
|
359
|
+
|
|
360
|
+
- `emapper_out.emapper.annotations`: Full eggNOG-mapper annotations. The `#query` column must match `genome_id|protein_id`. `KEGG_ko` entries are prefixed `ko:KXXXXX` and are converted to `KXXXXX` for downstream matrices.
|
|
361
|
+
- `kos_matrix.csv`: Genome × KO count matrix. Columns: `taxon_oid` followed by KO IDs (e.g., `K00001`). Values are integer protein counts per KO.
|
|
362
|
+
- `ko_file_for_kpct.txt`: KPCT input file. Each line starts with `taxon_oid` followed by the set of KO IDs present in that genome or combination. If `--calculate-complementarity` is `N>=2`, combinations up to `N` are included as `GenomeA__GenomeB`.
|
|
363
|
+
- `output_give_completeness_contigs.with_weights.tsv`: KPCT module results per genome/combination. Columns: `contig` (genome/combination ID), `module_accession`, `completeness` (0–100), `pathway_name`, `pathway_class`, `matching_ko` (KO weights), `missing_ko`.
|
|
364
|
+
- `output_give_completeness_pathways.with_weights.tsv`: Same rows and order as the contigs file, but without the `contig` column. This is provided for compatibility with legacy tools; prefer the contigs file when you need genome-level provenance.
|
|
365
|
+
- `module_completeness.tsv`: Pivoted module completeness matrix. Columns: `n_members`, `taxon_oid`, followed by KEGG module IDs (`M00001`, …). Values are numeric percentages in the range 0–100.
|
|
366
|
+
- `module_completeness_complementarity_Nmember.tsv`: Complementarity report for `N`-member combinations (only when `--calculate-complementarity N` is set). Columns: `taxon_oid_1..N`, `completeness_taxon_oid_1..N`, `module_id`, `module_name`, `pathway_class`, `matching_ko`, `proteins_taxon_oid_1..N`. Protein fields list contributing proteins per KO (from eggNOG-mapper) as `{'KXXXXX': 'genome|protein'}`.
|
|
367
|
+
- `logs/moducomp.log`: Detailed run log with structured progress messages and per-command resource summaries.
|
|
368
|
+
- `logs/resource_usage_YYYYMMDD_HHMMSS.log`: Resource monitoring log capturing wall time, CPU time, CPU utilization, peak RAM, and exit code for each monitored command.
|
|
369
|
+
- `tmp/` (only if `--keep-tmp`): Intermediate files such as `merged_genomes.faa`, `emapper_output/`, and KPCT chunk outputs.
|
|
370
|
+
- `validation_report.json` (default when validation is enabled): JSON report produced by the validator.
|
|
302
371
|
|
|
303
|
-
|
|
372
|
+
**KO-matrix mode outputs (`moducomp analyze-ko-matrix`)**
|
|
304
373
|
|
|
305
|
-
-
|
|
306
|
-
-
|
|
307
|
-
-
|
|
308
|
-
-
|
|
309
|
-
-
|
|
374
|
+
- `kos_matrix.csv`: A copy of the input KO matrix (same format as above).
|
|
375
|
+
- `ko_file_for_kpct.txt`: KPCT input generated from the KO matrix. If `--calculate-complementarity` is set, combination lines are added using `GenomeA__GenomeB` identifiers.
|
|
376
|
+
- `output_give_completeness_contigs.with_weights.tsv`: KPCT module results per genome/combination (same format as pipeline mode).
|
|
377
|
+
- `output_give_completeness_pathways.with_weights.tsv`: Same rows as the contigs file, without the `contig` column.
|
|
378
|
+
- `module_completeness.tsv`: Module completeness matrix (same format as pipeline mode).
|
|
379
|
+
- `module_completeness_complementarity_Nmember.tsv`: Complementarity report. Protein contribution columns are filled with `No protein data available for <genome>` because no eggNOG-mapper annotations are available in KO-matrix mode.
|
|
380
|
+
- `logs/moducomp.log` and `logs/resource_usage_YYYYMMDD_HHMMSS.log`: Standard run logs and resource summaries.
|
|
381
|
+
- `validation_report.json` (default when validation is enabled): JSON report produced by the validator.
|
|
310
382
|
|
|
311
383
|
## Citation
|
|
312
384
|
Villada, JC. & Schulz, F. (2025). Assessment of metabolic module completeness of genomes and metabolic complementarity in microbiomes with `moducomp`. `moducomp` (v0.5.1) Zenodo. https://doi.org/10.5281/zenodo.16116092
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
moducomp/__init__.py,sha256=
|
|
1
|
+
moducomp/__init__.py,sha256=P0sHK6IWgEersAkHlDWWFlWULUH_C-ytp5vStZDJqiY,659
|
|
2
2
|
moducomp/__main__.py,sha256=1O2pv6IGjUgqnbqsiMLtVqjxWQpRtZUjp8LDljZ1bsI,185
|
|
3
|
-
moducomp/moducomp.py,sha256=
|
|
3
|
+
moducomp/moducomp.py,sha256=9eLxngLe0zRcevy6x9a-80-MGUjIEOYLTCQr6crnYRM,177904
|
|
4
4
|
moducomp/data/test_genomes/IMG2562617132.faa,sha256=gZPh-08pMRdAWJRr3__TbnU1F68CdkDb3gxtpaCLTTc,356863
|
|
5
5
|
moducomp/data/test_genomes/IMG2568526683.faa,sha256=PxFJwe-68UGw7il1hGlNhZt4-2WzzxXxGE1GTskDnow,343109
|
|
6
6
|
moducomp/data/test_genomes/IMG2740892217.faa,sha256=WsId4sIPxENbqF6tYFouAgDCy6T0SXNY6TywxBNe-3E,548954
|
|
7
|
-
moducomp-0.7.
|
|
8
|
-
moducomp-0.7.
|
|
9
|
-
moducomp-0.7.
|
|
10
|
-
moducomp-0.7.
|
|
11
|
-
moducomp-0.7.
|
|
7
|
+
moducomp-0.7.12.dist-info/entry_points.txt,sha256=dwt0_w7Ex9p1vhfp2fl4WXJLBh50u9fXTRNlAOJkAd4,114
|
|
8
|
+
moducomp-0.7.12.dist-info/licenses/LICENSE.txt,sha256=pt0cfIq9Wop21KDZYyQgP0M1YWYvKG0PomA5cUDC4TI,1536
|
|
9
|
+
moducomp-0.7.12.dist-info/WHEEL,sha256=_2ozNFCLWc93bK4WKHCO-eDUENDlo-dgc9cU3qokYO4,82
|
|
10
|
+
moducomp-0.7.12.dist-info/METADATA,sha256=L7JsOoEk8dYNWZcCWmmQXy1ZPow3iRb_d7XrSsanOAg,21146
|
|
11
|
+
moducomp-0.7.12.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|