moducomp 0.7.11__py3-none-any.whl → 0.7.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
moducomp/__init__.py CHANGED
@@ -2,7 +2,7 @@
2
2
  moducomp: metabolic module completeness and complementarity for microbiomes.
3
3
  """
4
4
 
5
- __version__ = "0.7.11"
5
+ __version__ = "0.7.12"
6
6
  __author__ = "Juan C. Villada"
7
7
  __email__ = "jvillada@lbl.gov"
8
8
  __title__ = "moducomp"
moducomp/moducomp.py CHANGED
@@ -23,12 +23,15 @@ License: See LICENSE.txt
23
23
  Version: See moducomp.__version__ for current version
24
24
  """
25
25
 
26
+ import csv
26
27
  import datetime
27
28
  import glob
28
29
  import itertools
30
+ import json
29
31
  import logging
30
32
  import os
31
33
  import queue
34
+ import re
32
35
  import shlex
33
36
  import shutil
34
37
  import subprocess
@@ -223,6 +226,112 @@ def count_files(path: Path) -> int:
223
226
  return total
224
227
 
225
228
 
229
+ def _find_emapper_annotations(savedir: Union[str, Path]) -> Optional[Path]:
230
+ savedir_path = Path(savedir)
231
+ candidates = [
232
+ savedir_path / "emapper_out.emapper.annotations",
233
+ savedir_path / "tmp" / "emapper_output" / "emapper_out.emapper.annotations",
234
+ ]
235
+ for candidate in candidates:
236
+ if candidate.exists():
237
+ return candidate
238
+ return None
239
+
240
+
241
+ def _read_ko_matrix_file(kos_matrix: Union[str, Path], logger: Optional[logging.Logger] = None) -> Tuple[pd.DataFrame, str]:
242
+ kos_matrix = str(kos_matrix)
243
+ initial_delimiter = "," if kos_matrix.lower().endswith(".csv") else "\t"
244
+ delimiter_used = initial_delimiter
245
+ try:
246
+ if logger:
247
+ logger.info(f"Reading KO matrix file with delimiter '{initial_delimiter}': {kos_matrix}")
248
+ ko_df = pd.read_csv(kos_matrix, sep=initial_delimiter)
249
+ except Exception as e_initial:
250
+ if kos_matrix.lower().endswith(".tsv") and initial_delimiter == "\t":
251
+ try:
252
+ if logger:
253
+ logger.info(f"Tab-delimited read failed. Attempting comma delimiter for {kos_matrix}.")
254
+ ko_df = pd.read_csv(kos_matrix, sep=",")
255
+ delimiter_used = ","
256
+ except Exception as e_fallback:
257
+ if logger:
258
+ logger.error(f"Fallback comma delimiter also failed: {e_fallback}")
259
+ raise e_fallback
260
+ else:
261
+ if logger:
262
+ logger.error(f"Failed to read KO matrix {kos_matrix}: {e_initial}")
263
+ raise e_initial
264
+ return ko_df, delimiter_used
265
+
266
+
267
+ def _read_kpct_input_file(kpct_input_file: Union[str, Path]) -> Dict[str, Set[str]]:
268
+ genome_to_kos: Dict[str, Set[str]] = {}
269
+ with open(kpct_input_file, "r") as handle:
270
+ for line in handle:
271
+ line = line.strip()
272
+ if not line:
273
+ continue
274
+ parts = line.split("\t")
275
+ if len(parts) < 2:
276
+ continue
277
+ genome_id = parts[0]
278
+ kos = {ko for ko in parts[1:] if ko}
279
+ genome_to_kos[genome_id] = kos
280
+ return genome_to_kos
281
+
282
+
283
+ def _compare_kpct_outputs(contigs_file: Path, pathways_file: Path) -> Tuple[bool, str]:
284
+ """
285
+ Compare KPCT contigs and pathways outputs. Returns (match, detail).
286
+ """
287
+ with contigs_file.open("r") as contigs, pathways_file.open("r") as pathways:
288
+ contig_header = contigs.readline().rstrip("\n").split("\t")
289
+ pathway_header = pathways.readline().rstrip("\n").split("\t")
290
+ if contig_header[1:] != pathway_header:
291
+ return False, "Header mismatch between contigs and pathways outputs."
292
+
293
+ line_no = 0
294
+ for contig_line, pathway_line in zip(contigs, pathways):
295
+ line_no += 1
296
+ contig_line = contig_line.rstrip("\n")
297
+ pathway_line = pathway_line.rstrip("\n")
298
+ if not contig_line and not pathway_line:
299
+ continue
300
+ if contig_line.split("\t")[1:] != pathway_line.split("\t"):
301
+ return False, f"Row mismatch at line {line_no}."
302
+
303
+ # Check for extra trailing lines in either file
304
+ extra_contig = any(line.strip() for line in contigs)
305
+ extra_path = any(line.strip() for line in pathways)
306
+ if extra_contig or extra_path:
307
+ return False, "Row count mismatch between contigs and pathways outputs."
308
+
309
+ return True, "Contigs and pathways outputs match."
310
+
311
+
312
+ def _record_validation_check(report: Dict[str, Any], name: str, status: str, detail: str) -> None:
313
+ entry = {"name": name, "status": status, "detail": detail}
314
+ report["checks"].append(entry)
315
+ if status == "fail":
316
+ report["errors"].append(f"{name}: {detail}")
317
+ elif status == "warn":
318
+ report["warnings"].append(f"{name}: {detail}")
319
+
320
+
321
+ def _count_emapper_header_issues(emapper_file: Path) -> Tuple[int, int]:
322
+ total = 0
323
+ bad = 0
324
+ with emapper_file.open("r") as handle:
325
+ for line in handle:
326
+ if line.startswith("#"):
327
+ continue
328
+ total += 1
329
+ query = line.split("\t", 1)[0]
330
+ if "|" not in query:
331
+ bad += 1
332
+ return total, bad
333
+
334
+
226
335
  def default_eggnog_data_dir() -> Path:
227
336
  """Return a safe default location for eggNOG data downloads."""
228
337
  xdg_home = os.environ.get("XDG_DATA_HOME")
@@ -3165,6 +3274,21 @@ def pipeline(
3165
3274
  "--verbose/--quiet",
3166
3275
  help="Enable verbose output with detailed progress information.",
3167
3276
  ),
3277
+ run_validation: bool = typer.Option(
3278
+ True,
3279
+ "--validate/--no-validate",
3280
+ help="Run post-run validation checks (default: enabled).",
3281
+ ),
3282
+ validation_report: bool = typer.Option(
3283
+ True,
3284
+ "--validate-report/--no-validate-report",
3285
+ help="Write validation_report.json in the output directory.",
3286
+ ),
3287
+ validate_strict: bool = typer.Option(
3288
+ False,
3289
+ "--validate-strict/--validate-lenient",
3290
+ help="Treat validation warnings as failures.",
3291
+ ),
3168
3292
  log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level (DEBUG, INFO, WARNING, ERROR)."),
3169
3293
  eggnog_data_dir: Optional[str] = typer.Option(
3170
3294
  None,
@@ -3233,15 +3357,42 @@ def pipeline(
3233
3357
  logger.info(f"Resource monitoring enabled. Log file: {resource_log_file}")
3234
3358
 
3235
3359
  # Run the main pipeline logic
3236
- _run_pipeline_core(genomedir, savedir, ncpus, adapt_headers, del_tmp,
3237
- calculate_complementarity, lowmem, verbose, logger, resource_log_file,
3238
- eggnog_data_dir)
3360
+ _run_pipeline_core(
3361
+ genomedir,
3362
+ savedir,
3363
+ ncpus,
3364
+ adapt_headers,
3365
+ del_tmp,
3366
+ calculate_complementarity,
3367
+ lowmem,
3368
+ verbose,
3369
+ logger,
3370
+ resource_log_file,
3371
+ eggnog_data_dir,
3372
+ run_validation,
3373
+ validation_report,
3374
+ validate_strict,
3375
+ log_level,
3376
+ )
3239
3377
 
3240
3378
 
3241
- def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers: bool,
3242
- del_tmp: bool, calculate_complementarity: int, lowmem: bool,
3243
- verbose: bool, logger: logging.Logger, resource_log_file: str,
3244
- eggnog_data_dir: Optional[str]) -> None:
3379
+ def _run_pipeline_core(
3380
+ genomedir: str,
3381
+ savedir: str,
3382
+ ncpus: int,
3383
+ adapt_headers: bool,
3384
+ del_tmp: bool,
3385
+ calculate_complementarity: int,
3386
+ lowmem: bool,
3387
+ verbose: bool,
3388
+ logger: logging.Logger,
3389
+ resource_log_file: str,
3390
+ eggnog_data_dir: Optional[str],
3391
+ run_validation: bool,
3392
+ validation_report: bool,
3393
+ validate_strict: bool,
3394
+ log_level: str,
3395
+ ) -> None:
3245
3396
  """
3246
3397
  Core pipeline logic separated for resource monitoring.
3247
3398
  """
@@ -3435,6 +3586,30 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
3435
3586
  # Generate final resource usage summary
3436
3587
  log_final_resource_summary(resource_log_file, start_time, logger, verbose)
3437
3588
 
3589
+ if run_validation:
3590
+ logger.info("Running post-run validation checks.")
3591
+ report_path = None
3592
+ if validation_report:
3593
+ report_path = os.path.join(savedir, "validation_report.json")
3594
+ try:
3595
+ validate(
3596
+ savedir=savedir,
3597
+ mode="ko-matrix",
3598
+ calculate_complementarity=calculate_complementarity,
3599
+ kpct_outprefix=kpct_outprefix,
3600
+ strict=validate_strict,
3601
+ report=report_path,
3602
+ verbose=verbose,
3603
+ log_level=log_level,
3604
+ )
3605
+ except typer.Exit as exc:
3606
+ if logger:
3607
+ logger.error("Validation failed with exit code %s.", exc.exit_code)
3608
+ logger.error("Outputs written to: %s", savedir)
3609
+ if report_path:
3610
+ logger.error("Validation report: %s", report_path)
3611
+ raise
3612
+
3438
3613
  # Display pipeline completion summary
3439
3614
  display_pipeline_completion_summary(start_time, savedir, logger, verbose)
3440
3615
 
@@ -3481,6 +3656,21 @@ def test(
3481
3656
  "--verbose/--quiet",
3482
3657
  help="Enable verbose output with detailed progress information.",
3483
3658
  ),
3659
+ run_validation: bool = typer.Option(
3660
+ True,
3661
+ "--validate/--no-validate",
3662
+ help="Run post-run validation checks (default: enabled).",
3663
+ ),
3664
+ validation_report: bool = typer.Option(
3665
+ True,
3666
+ "--validate-report/--no-validate-report",
3667
+ help="Write validation_report.json in the output directory.",
3668
+ ),
3669
+ validate_strict: bool = typer.Option(
3670
+ False,
3671
+ "--validate-strict/--validate-lenient",
3672
+ help="Treat validation warnings as failures.",
3673
+ ),
3484
3674
  log_level: str = typer.Option(
3485
3675
  "INFO",
3486
3676
  "--log-level",
@@ -3526,6 +3716,10 @@ def test(
3526
3716
  logger,
3527
3717
  resource_log_file,
3528
3718
  eggnog_data_dir,
3719
+ run_validation,
3720
+ validation_report,
3721
+ validate_strict,
3722
+ log_level,
3529
3723
  )
3530
3724
 
3531
3725
 
@@ -3735,6 +3929,21 @@ def analyze_ko_matrix(
3735
3929
  "--verbose/--quiet",
3736
3930
  help="Enable verbose output with detailed progress information.",
3737
3931
  ),
3932
+ run_validation: bool = typer.Option(
3933
+ True,
3934
+ "--validate/--no-validate",
3935
+ help="Run post-run validation checks (default: enabled).",
3936
+ ),
3937
+ validation_report: bool = typer.Option(
3938
+ True,
3939
+ "--validate-report/--no-validate-report",
3940
+ help="Write validation_report.json in the output directory.",
3941
+ ),
3942
+ validate_strict: bool = typer.Option(
3943
+ False,
3944
+ "--validate-strict/--validate-lenient",
3945
+ help="Treat validation warnings as failures.",
3946
+ ),
3738
3947
  log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level (DEBUG, INFO, WARNING, ERROR)."),
3739
3948
  ) -> None:
3740
3949
  """
@@ -3924,6 +4133,22 @@ def analyze_ko_matrix(
3924
4133
  # Display pipeline completion summary
3925
4134
  display_pipeline_completion_summary(start_time, savedir, logger, verbose)
3926
4135
 
4136
+ if run_validation:
4137
+ logger.info("Running post-run validation checks.")
4138
+ report_path = None
4139
+ if validation_report:
4140
+ report_path = os.path.join(savedir, "validation_report.json")
4141
+ validate(
4142
+ savedir=savedir,
4143
+ mode="ko-matrix",
4144
+ calculate_complementarity=calculate_complementarity,
4145
+ kpct_outprefix=kpct_outprefix,
4146
+ strict=validate_strict,
4147
+ report=report_path,
4148
+ verbose=verbose,
4149
+ log_level=log_level,
4150
+ )
4151
+
3927
4152
  except Exception as e:
3928
4153
  if logger:
3929
4154
  logger.error(f"Error in KPCT analysis: {str(e)}", exc_info=True)
@@ -3932,5 +4157,750 @@ def analyze_ko_matrix(
3932
4157
  exit(1)
3933
4158
 
3934
4159
 
4160
+ @app.command()
4161
+ def validate(
4162
+ savedir: str = typer.Argument(
4163
+ ...,
4164
+ help="Output directory to validate (from pipeline or analyze-ko-matrix).",
4165
+ ),
4166
+ mode: str = typer.Option(
4167
+ "auto",
4168
+ "--mode",
4169
+ help="Validation mode: auto, pipeline, or ko-matrix.",
4170
+ ),
4171
+ calculate_complementarity: Optional[int] = typer.Option(
4172
+ None,
4173
+ "--calculate-complementarity",
4174
+ "-c",
4175
+ help="Expected complementarity size (0 disables). If omitted, detects from outputs.",
4176
+ ),
4177
+ kpct_outprefix: str = typer.Option(
4178
+ "output_give_completeness",
4179
+ "--kpct-outprefix",
4180
+ help="Prefix for KPCT output files (use if you changed it in analyze-ko-matrix).",
4181
+ ),
4182
+ strict: bool = typer.Option(
4183
+ False,
4184
+ "--strict/--lenient",
4185
+ help="Treat warnings as failures.",
4186
+ ),
4187
+ report: Optional[str] = typer.Option(
4188
+ None,
4189
+ "--report",
4190
+ help="Write JSON validation report to this path.",
4191
+ ),
4192
+ verbose: bool = typer.Option(
4193
+ False,
4194
+ "--verbose/--quiet",
4195
+ help="Enable verbose output with detailed progress information.",
4196
+ ),
4197
+ log_level: str = typer.Option(
4198
+ "INFO",
4199
+ "--log-level",
4200
+ "-l",
4201
+ help="Logging level (DEBUG, INFO, WARNING, ERROR).",
4202
+ ),
4203
+ ) -> None:
4204
+ """Run scientific validation checks on a ModuComp output directory."""
4205
+ savedir = os.path.abspath(savedir)
4206
+ if not os.path.isdir(savedir):
4207
+ log_error(f"Output directory not found: {savedir}")
4208
+ raise typer.Exit(1)
4209
+
4210
+ log_dir = Path(savedir) / "logs"
4211
+ logger = configure_logging(log_level, log_dir)
4212
+ RESOURCE_SUMMARIES.clear()
4213
+ logger.info("Starting moducomp validation.")
4214
+ logger.info("Output directory: %s", savedir)
4215
+ logger.info("CLI command: %s", " ".join(shlex.quote(arg) for arg in sys.argv))
4216
+
4217
+ mode = mode.lower().strip()
4218
+ if mode not in {"auto", "pipeline", "ko-matrix"}:
4219
+ log_error(f"Invalid mode '{mode}'. Use auto, pipeline, or ko-matrix.", logger=logger)
4220
+ raise typer.Exit(1)
4221
+
4222
+ report_data: Dict[str, Any] = {
4223
+ "savedir": savedir,
4224
+ "mode": mode,
4225
+ "timestamp": datetime.datetime.now().isoformat(),
4226
+ "checks": [],
4227
+ "warnings": [],
4228
+ "errors": [],
4229
+ "stats": {},
4230
+ }
4231
+
4232
+ emapper_file = _find_emapper_annotations(savedir)
4233
+ if mode == "auto":
4234
+ mode = "pipeline" if emapper_file else "ko-matrix"
4235
+ report_data["mode"] = mode
4236
+
4237
+ if mode == "pipeline" and not emapper_file:
4238
+ _record_validation_check(
4239
+ report_data,
4240
+ "emapper_annotations",
4241
+ "fail",
4242
+ "Pipeline mode selected but emapper annotations were not found.",
4243
+ )
4244
+ logger.error("Pipeline mode requires emapper annotations. Validation aborted.")
4245
+ if report:
4246
+ with open(report, "w") as handle:
4247
+ json.dump(report_data, handle, indent=2)
4248
+ raise typer.Exit(1)
4249
+
4250
+ if mode == "ko-matrix" and emapper_file:
4251
+ _record_validation_check(
4252
+ report_data,
4253
+ "emapper_annotations",
4254
+ "warn",
4255
+ f"KO-matrix mode selected but emapper annotations exist at {emapper_file}.",
4256
+ )
4257
+
4258
+ kos_matrix_path = Path(savedir) / "kos_matrix.csv"
4259
+ kpct_input_file = Path(savedir) / "ko_file_for_kpct.txt"
4260
+ module_matrix_file = Path(savedir) / "module_completeness.tsv"
4261
+
4262
+ contigs_file = None
4263
+ pathways_file = None
4264
+ contigs_candidates = [
4265
+ Path(savedir) / f"{kpct_outprefix}_contigs.with_weights.tsv",
4266
+ Path(savedir) / f"{kpct_outprefix}_contigs.tsv",
4267
+ ]
4268
+ pathways_candidates = [
4269
+ Path(savedir) / f"{kpct_outprefix}_pathways.with_weights.tsv",
4270
+ Path(savedir) / f"{kpct_outprefix}_pathways.tsv",
4271
+ ]
4272
+ for candidate in contigs_candidates:
4273
+ if candidate.exists():
4274
+ contigs_file = candidate
4275
+ break
4276
+ for candidate in pathways_candidates:
4277
+ if candidate.exists():
4278
+ pathways_file = candidate
4279
+ break
4280
+
4281
+ if not contigs_file and not pathways_file:
4282
+ _record_validation_check(
4283
+ report_data,
4284
+ "kpct_outputs",
4285
+ "warn",
4286
+ f"No KPCT output files found for prefix '{kpct_outprefix}'.",
4287
+ )
4288
+
4289
+ required_files = {
4290
+ "KO matrix": kos_matrix_path,
4291
+ "KPCT input": kpct_input_file,
4292
+ "Module completeness matrix": module_matrix_file,
4293
+ }
4294
+ if contigs_file:
4295
+ required_files["KPCT contigs output"] = contigs_file
4296
+ if pathways_file:
4297
+ required_files["KPCT pathways output"] = pathways_file
4298
+
4299
+ for label, path in required_files.items():
4300
+ if not path.exists():
4301
+ _record_validation_check(
4302
+ report_data,
4303
+ f"file_exists:{label}",
4304
+ "fail",
4305
+ f"Missing required file: {path}",
4306
+ )
4307
+ else:
4308
+ _record_validation_check(
4309
+ report_data,
4310
+ f"file_exists:{label}",
4311
+ "ok",
4312
+ f"Found {path}",
4313
+ )
4314
+
4315
+ if report_data["errors"]:
4316
+ logger.error("Validation halted due to missing required files.")
4317
+ if report:
4318
+ with open(report, "w") as handle:
4319
+ json.dump(report_data, handle, indent=2)
4320
+ raise typer.Exit(1)
4321
+
4322
+ # Read KO matrix
4323
+ ko_df, ko_delimiter = _read_ko_matrix_file(kos_matrix_path, logger)
4324
+ if "taxon_oid" not in ko_df.columns:
4325
+ _record_validation_check(
4326
+ report_data,
4327
+ "ko_matrix_format",
4328
+ "fail",
4329
+ "KO matrix missing required 'taxon_oid' column.",
4330
+ )
4331
+ logger.error("KO matrix validation failed: missing taxon_oid.")
4332
+ if report:
4333
+ with open(report, "w") as handle:
4334
+ json.dump(report_data, handle, indent=2)
4335
+ raise typer.Exit(1)
4336
+
4337
+ ko_columns = [col for col in ko_df.columns if col != "taxon_oid"]
4338
+ if not ko_columns:
4339
+ _record_validation_check(
4340
+ report_data,
4341
+ "ko_matrix_format",
4342
+ "fail",
4343
+ "KO matrix has no KO columns.",
4344
+ )
4345
+ logger.error("KO matrix validation failed: no KO columns.")
4346
+ if report:
4347
+ with open(report, "w") as handle:
4348
+ json.dump(report_data, handle, indent=2)
4349
+ raise typer.Exit(1)
4350
+
4351
+ ko_pattern = re.compile(r"^K\d{5}$")
4352
+ invalid_kos = [col for col in ko_columns if not ko_pattern.match(col)]
4353
+ if invalid_kos:
4354
+ _record_validation_check(
4355
+ report_data,
4356
+ "ko_matrix_columns",
4357
+ "warn",
4358
+ f"Found {len(invalid_kos)} non-KO columns (expected KXXXXX). Example: {invalid_kos[:5]}",
4359
+ )
4360
+ else:
4361
+ _record_validation_check(
4362
+ report_data,
4363
+ "ko_matrix_columns",
4364
+ "ok",
4365
+ f"Found {len(ko_columns)} KO columns.",
4366
+ )
4367
+
4368
+ ko_df["taxon_oid"] = ko_df["taxon_oid"].astype(str)
4369
+ genomes = ko_df["taxon_oid"].tolist()
4370
+ report_data["stats"]["genomes"] = len(genomes)
4371
+ report_data["stats"]["ko_columns"] = len(ko_columns)
4372
+ if len(genomes) != len(set(genomes)):
4373
+ _record_validation_check(
4374
+ report_data,
4375
+ "genome_ids_unique",
4376
+ "warn",
4377
+ "Duplicate genome identifiers found in KO matrix.",
4378
+ )
4379
+ else:
4380
+ _record_validation_check(
4381
+ report_data,
4382
+ "genome_ids_unique",
4383
+ "ok",
4384
+ "Genome identifiers are unique in KO matrix.",
4385
+ )
4386
+
4387
+ bad_names = [g for g in genomes if not g or g.strip() != g or re.search(r"\\s", g)]
4388
+ if bad_names:
4389
+ _record_validation_check(
4390
+ report_data,
4391
+ "genome_id_format",
4392
+ "warn",
4393
+ f"Found {len(bad_names)} genome identifiers with whitespace or empty values. Example: {bad_names[:3]}",
4394
+ )
4395
+ else:
4396
+ _record_validation_check(
4397
+ report_data,
4398
+ "genome_id_format",
4399
+ "ok",
4400
+ "Genome identifiers contain no whitespace.",
4401
+ )
4402
+
4403
+ ko_numeric = ko_df[ko_columns].apply(pd.to_numeric, errors="coerce")
4404
+ if ko_numeric.isna().any().any():
4405
+ _record_validation_check(
4406
+ report_data,
4407
+ "ko_matrix_numeric",
4408
+ "warn",
4409
+ "Non-numeric KO counts detected in KO matrix.",
4410
+ )
4411
+ else:
4412
+ _record_validation_check(
4413
+ report_data,
4414
+ "ko_matrix_numeric",
4415
+ "ok",
4416
+ "KO matrix counts are numeric.",
4417
+ )
4418
+
4419
+ ko_totals_series = ko_numeric.sum(axis=1)
4420
+ ko_totals = {str(ko_df.at[idx, "taxon_oid"]): float(total) for idx, total in ko_totals_series.items()}
4421
+ ko_present = {}
4422
+ for idx, row in ko_numeric.iterrows():
4423
+ genome_id = str(ko_df.at[idx, "taxon_oid"])
4424
+ ko_present[genome_id] = {ko for ko in ko_columns if row[ko] > 0}
4425
+
4426
+ # KPCT input consistency
4427
+ kpct_genomes_to_kos = _read_kpct_input_file(kpct_input_file)
4428
+ missing_kpct = set(genomes) - set(kpct_genomes_to_kos.keys())
4429
+ if missing_kpct:
4430
+ _record_validation_check(
4431
+ report_data,
4432
+ "kpct_input_genomes",
4433
+ "warn",
4434
+ f"{len(missing_kpct)} genomes from KO matrix missing in KPCT input. Example: {list(missing_kpct)[:3]}",
4435
+ )
4436
+ else:
4437
+ _record_validation_check(
4438
+ report_data,
4439
+ "kpct_input_genomes",
4440
+ "ok",
4441
+ "All KO-matrix genomes are present in KPCT input.",
4442
+ )
4443
+
4444
+ ko_mismatch = []
4445
+ for genome_id in genomes:
4446
+ kpct_kos = kpct_genomes_to_kos.get(genome_id)
4447
+ if kpct_kos is None:
4448
+ continue
4449
+ if ko_present[genome_id] != kpct_kos:
4450
+ ko_mismatch.append(genome_id)
4451
+ if ko_mismatch:
4452
+ _record_validation_check(
4453
+ report_data,
4454
+ "kpct_input_kos",
4455
+ "warn",
4456
+ f"KO sets differ between KO matrix and KPCT input for {len(ko_mismatch)} genomes. Example: {ko_mismatch[:3]}",
4457
+ )
4458
+ else:
4459
+ _record_validation_check(
4460
+ report_data,
4461
+ "kpct_input_kos",
4462
+ "ok",
4463
+ "KPCT input KO sets match KO matrix for all genomes.",
4464
+ )
4465
+
4466
+ combo_ids = [gid for gid in kpct_genomes_to_kos.keys() if "__" in gid]
4467
+ if combo_ids:
4468
+ max_checks = 100
4469
+ mismatch_count = 0
4470
+ for combo_id in combo_ids[:max_checks]:
4471
+ members = combo_id.split("__")
4472
+ if any(member not in ko_present for member in members):
4473
+ continue
4474
+ union_kos = set()
4475
+ for member in members:
4476
+ union_kos.update(ko_present[member])
4477
+ if union_kos != kpct_genomes_to_kos[combo_id]:
4478
+ mismatch_count += 1
4479
+ if mismatch_count:
4480
+ _record_validation_check(
4481
+ report_data,
4482
+ "kpct_combo_kos",
4483
+ "warn",
4484
+ f"{mismatch_count} of {min(len(combo_ids), max_checks)} combination KO sets do not match union of members.",
4485
+ )
4486
+ else:
4487
+ _record_validation_check(
4488
+ report_data,
4489
+ "kpct_combo_kos",
4490
+ "ok",
4491
+ "Combination KO sets match union of members (sampled).",
4492
+ )
4493
+
4494
+ # KPCT contigs vs pathways outputs
4495
+ if contigs_file and pathways_file:
4496
+ match, detail = _compare_kpct_outputs(contigs_file, pathways_file)
4497
+ _record_validation_check(
4498
+ report_data,
4499
+ "kpct_output_consistency",
4500
+ "ok" if match else "warn",
4501
+ detail,
4502
+ )
4503
+
4504
+ # Module completeness checks
4505
+ module_df = pd.read_csv(module_matrix_file, sep="\t")
4506
+ if "n_members" not in module_df.columns or "taxon_oid" not in module_df.columns:
4507
+ _record_validation_check(
4508
+ report_data,
4509
+ "module_completeness_format",
4510
+ "fail",
4511
+ "module_completeness.tsv missing n_members or taxon_oid.",
4512
+ )
4513
+ logger.error("module_completeness.tsv missing required columns. Validation aborted.")
4514
+ if report:
4515
+ with open(report, "w") as handle:
4516
+ json.dump(report_data, handle, indent=2)
4517
+ raise typer.Exit(1)
4518
+ else:
4519
+ _record_validation_check(
4520
+ report_data,
4521
+ "module_completeness_format",
4522
+ "ok",
4523
+ "module_completeness.tsv has required columns.",
4524
+ )
4525
+
4526
+ module_df["taxon_oid"] = module_df["taxon_oid"].astype(str)
4527
+ module_df["n_members"] = pd.to_numeric(module_df["n_members"], errors="coerce")
4528
+ if module_df["n_members"].isna().any():
4529
+ _record_validation_check(
4530
+ report_data,
4531
+ "module_completeness_n_members",
4532
+ "warn",
4533
+ "Non-numeric n_members values detected in module completeness matrix.",
4534
+ )
4535
+ module_df["n_members"] = module_df["n_members"].fillna(-1).astype(int)
4536
+
4537
+ module_cols = [col for col in module_df.columns if re.match(r"^M\d{5}$", col)]
4538
+ if not module_cols:
4539
+ _record_validation_check(
4540
+ report_data,
4541
+ "module_columns",
4542
+ "fail",
4543
+ "No KEGG module columns detected in module completeness matrix.",
4544
+ )
4545
+ else:
4546
+ _record_validation_check(
4547
+ report_data,
4548
+ "module_columns",
4549
+ "ok",
4550
+ f"Detected {len(module_cols)} module columns.",
4551
+ )
4552
+ report_data["stats"]["modules"] = len(module_cols)
4553
+
4554
+ completeness_scale = 100.0
4555
+ if module_cols:
4556
+ module_values = module_df[module_cols].apply(pd.to_numeric, errors="coerce")
4557
+ if module_values.isna().any().any():
4558
+ _record_validation_check(
4559
+ report_data,
4560
+ "module_completeness_numeric",
4561
+ "warn",
4562
+ "Non-numeric completeness values found in module completeness matrix.",
4563
+ )
4564
+ min_val = float(module_values.min().min())
4565
+ max_val = float(module_values.max().max())
4566
+ report_data["stats"]["module_min"] = min_val
4567
+ report_data["stats"]["module_max"] = max_val
4568
+ if min_val < -1e-6 or max_val > 100.0 + 1e-6:
4569
+ _record_validation_check(
4570
+ report_data,
4571
+ "module_completeness_range",
4572
+ "fail",
4573
+ f"Module completeness values out of expected range (min={min_val}, max={max_val}).",
4574
+ )
4575
+ else:
4576
+ _record_validation_check(
4577
+ report_data,
4578
+ "module_completeness_range",
4579
+ "ok",
4580
+ f"Module completeness range OK (min={min_val}, max={max_val}).",
4581
+ )
4582
+ completeness_scale = 100.0 if max_val > 1.5 else 1.0
4583
+ report_data["stats"]["completeness_scale"] = completeness_scale
4584
+
4585
+ # Validate n_members vs taxon_oid format
4586
+ mismatch_members = 0
4587
+ for _, row in module_df.iterrows():
4588
+ taxon_id = str(row["taxon_oid"])
4589
+ expected_members = taxon_id.count("__") + 1 if "__" in taxon_id else 1
4590
+ if int(row["n_members"]) != expected_members:
4591
+ mismatch_members += 1
4592
+ if mismatch_members:
4593
+ _record_validation_check(
4594
+ report_data,
4595
+ "n_members_consistency",
4596
+ "warn",
4597
+ f"{mismatch_members} rows have n_members inconsistent with taxon_oid combination size.",
4598
+ )
4599
+ else:
4600
+ _record_validation_check(
4601
+ report_data,
4602
+ "n_members_consistency",
4603
+ "ok",
4604
+ "n_members values match taxon_oid combination sizes.",
4605
+ )
4606
+
4607
+ # Compare contig ids and module ids if KPCT outputs available
4608
+ if contigs_file:
4609
+ contig_ids: Set[str] = set()
4610
+ kpct_module_ids: Set[str] = set()
4611
+ with contigs_file.open("r") as handle:
4612
+ reader = csv.DictReader(handle, delimiter="\t")
4613
+ for row in reader:
4614
+ contig_value = row.get("contig") or row.get("Contig") or row.get("genome") or row.get("Genome") or row.get("taxon_oid")
4615
+ if contig_value:
4616
+ contig_ids.add(str(contig_value))
4617
+ module_value = (
4618
+ row.get("module_accession")
4619
+ or row.get("module_id")
4620
+ or row.get("Module")
4621
+ )
4622
+ if module_value:
4623
+ kpct_module_ids.add(str(module_value))
4624
+ missing_contigs = set(module_df["taxon_oid"]) - contig_ids
4625
+ if missing_contigs:
4626
+ _record_validation_check(
4627
+ report_data,
4628
+ "kpct_contigs_coverage",
4629
+ "warn",
4630
+ f"{len(missing_contigs)} taxon_oids from module matrix missing in KPCT contigs output.",
4631
+ )
4632
+ else:
4633
+ _record_validation_check(
4634
+ report_data,
4635
+ "kpct_contigs_coverage",
4636
+ "ok",
4637
+ "KPCT contigs output covers all taxon_oids in module matrix.",
4638
+ )
4639
+
4640
+ missing_modules = set(module_cols) - kpct_module_ids
4641
+ if missing_modules:
4642
+ _record_validation_check(
4643
+ report_data,
4644
+ "kpct_module_coverage",
4645
+ "warn",
4646
+ f"{len(missing_modules)} module columns missing from KPCT contigs output.",
4647
+ )
4648
+ else:
4649
+ _record_validation_check(
4650
+ report_data,
4651
+ "kpct_module_coverage",
4652
+ "ok",
4653
+ "KPCT contigs output covers all module columns.",
4654
+ )
4655
+
4656
+ # Emapper checks (pipeline mode)
4657
+ if mode == "pipeline" and emapper_file:
4658
+ total_queries, bad_queries = _count_emapper_header_issues(emapper_file)
4659
+ if bad_queries:
4660
+ _record_validation_check(
4661
+ report_data,
4662
+ "emapper_header_format",
4663
+ "warn",
4664
+ f"{bad_queries} of {total_queries} emapper queries lack genome|protein format.",
4665
+ )
4666
+ else:
4667
+ _record_validation_check(
4668
+ report_data,
4669
+ "emapper_header_format",
4670
+ "ok",
4671
+ "All emapper queries follow genome|protein format.",
4672
+ )
4673
+
4674
+ genome_ko_proteins = parse_emapper_annotations(str(emapper_file), logger)
4675
+ emapper_genomes = set(genome_ko_proteins.keys())
4676
+ if set(genomes) - emapper_genomes:
4677
+ _record_validation_check(
4678
+ report_data,
4679
+ "emapper_genome_coverage",
4680
+ "warn",
4681
+ "Some genomes in KO matrix are missing in emapper annotations.",
4682
+ )
4683
+ else:
4684
+ _record_validation_check(
4685
+ report_data,
4686
+ "emapper_genome_coverage",
4687
+ "ok",
4688
+ "All KO-matrix genomes are present in emapper annotations.",
4689
+ )
4690
+
4691
+ emapper_ko_set: Set[str] = set()
4692
+ emapper_totals: Dict[str, int] = {}
4693
+ for genome_id, ko_dict in genome_ko_proteins.items():
4694
+ emapper_totals[genome_id] = sum(len(proteins) for proteins in ko_dict.values())
4695
+ emapper_ko_set.update(ko_dict.keys())
4696
+
4697
+ if set(ko_columns) != emapper_ko_set:
4698
+ _record_validation_check(
4699
+ report_data,
4700
+ "emapper_vs_matrix_kos",
4701
+ "warn",
4702
+ f"KO sets differ between emapper annotations and KO matrix (emapper={len(emapper_ko_set)}, matrix={len(ko_columns)}).",
4703
+ )
4704
+ else:
4705
+ _record_validation_check(
4706
+ report_data,
4707
+ "emapper_vs_matrix_kos",
4708
+ "ok",
4709
+ "KO sets match between emapper annotations and KO matrix.",
4710
+ )
4711
+
4712
+ mismatched_totals = []
4713
+ for genome_id, total in emapper_totals.items():
4714
+ if genome_id not in ko_totals:
4715
+ continue
4716
+ matrix_total = float(ko_totals[genome_id])
4717
+ if abs(matrix_total - total) > 1e-6:
4718
+ mismatched_totals.append(genome_id)
4719
+ if mismatched_totals:
4720
+ _record_validation_check(
4721
+ report_data,
4722
+ "emapper_vs_matrix_counts",
4723
+ "fail",
4724
+ f"KO counts differ between emapper and KO matrix for {len(mismatched_totals)} genomes. Example: {mismatched_totals[:3]}",
4725
+ )
4726
+ else:
4727
+ _record_validation_check(
4728
+ report_data,
4729
+ "emapper_vs_matrix_counts",
4730
+ "ok",
4731
+ "KO counts match between emapper and KO matrix.",
4732
+ )
4733
+
4734
+ # Complementarity checks
4735
+ comp_pattern = re.compile(r"module_completeness_complementarity_(\\d+)member\\.tsv$")
4736
+ comp_files: Dict[int, Path] = {}
4737
+ for file_path in Path(savedir).glob("module_completeness_complementarity_*member.tsv"):
4738
+ match = comp_pattern.match(file_path.name)
4739
+ if match:
4740
+ comp_files[int(match.group(1))] = file_path
4741
+
4742
+ if calculate_complementarity is None:
4743
+ expected_sizes = sorted(comp_files.keys())
4744
+ elif calculate_complementarity >= 2:
4745
+ expected_sizes = list(range(2, calculate_complementarity + 1))
4746
+ else:
4747
+ expected_sizes = []
4748
+
4749
+ for n_members in expected_sizes:
4750
+ if n_members not in comp_files:
4751
+ _record_validation_check(
4752
+ report_data,
4753
+ f"complementarity_file_{n_members}",
4754
+ "fail",
4755
+ f"Expected complementarity report missing for {n_members}-member combinations.",
4756
+ )
4757
+ else:
4758
+ _record_validation_check(
4759
+ report_data,
4760
+ f"complementarity_file_{n_members}",
4761
+ "ok",
4762
+ f"Found complementarity report for {n_members}-member combinations.",
4763
+ )
4764
+
4765
+ # Prepare module lookup for complementarity validation
4766
+ module_df_indexed = module_df.set_index(["n_members", "taxon_oid"])
4767
+
4768
+ for n_members, comp_path in comp_files.items():
4769
+ comp_df = pd.read_csv(comp_path, sep="\t")
4770
+ taxon_cols = [col for col in comp_df.columns if col.startswith("taxon_oid_")]
4771
+ completeness_cols = [col for col in comp_df.columns if col.startswith("completeness_taxon_oid_")]
4772
+
4773
+ if not taxon_cols or "module_id" not in comp_df.columns:
4774
+ _record_validation_check(
4775
+ report_data,
4776
+ f"complementarity_format_{n_members}",
4777
+ "fail",
4778
+ f"Complementarity report {comp_path.name} missing required columns.",
4779
+ )
4780
+ continue
4781
+
4782
+ taxon_cols = sorted(taxon_cols, key=lambda x: int(x.split("_")[-1]))
4783
+ completeness_cols = sorted(completeness_cols, key=lambda x: int(x.split("_")[-1])) if completeness_cols else []
4784
+
4785
+ missing_rows = 0
4786
+ bad_combo = 0
4787
+ bad_individual = 0
4788
+ mismatch_reported = 0
4789
+ missing_proteins = 0
4790
+ non_placeholder = 0
4791
+ protein_cols = [col for col in comp_df.columns if col.startswith("proteins_taxon_oid_")]
4792
+
4793
+ for _, row in comp_df.iterrows():
4794
+ taxon_ids = [str(row[col]) for col in taxon_cols]
4795
+ module_id = str(row["module_id"])
4796
+ combo_id = "__".join(taxon_ids)
4797
+ combo_key = (n_members, combo_id)
4798
+ if combo_key not in module_df_indexed.index:
4799
+ combo_id_sorted = "__".join(sorted(taxon_ids))
4800
+ combo_key = (n_members, combo_id_sorted)
4801
+ if combo_key not in module_df_indexed.index:
4802
+ missing_rows += 1
4803
+ continue
4804
+
4805
+ if module_id not in module_df.columns:
4806
+ missing_rows += 1
4807
+ continue
4808
+ combo_val = module_df_indexed.loc[combo_key, module_id]
4809
+ if isinstance(combo_val, pd.Series):
4810
+ combo_val = combo_val.iloc[0]
4811
+ combo_val = float(combo_val)
4812
+ if abs(combo_val - completeness_scale) > 1e-6:
4813
+ bad_combo += 1
4814
+
4815
+ for idx, taxon_id in enumerate(taxon_ids):
4816
+ try:
4817
+ individual_val = module_df_indexed.loc[(1, taxon_id), module_id]
4818
+ if isinstance(individual_val, pd.Series):
4819
+ individual_val = individual_val.iloc[0]
4820
+ individual_val = float(individual_val)
4821
+ except KeyError:
4822
+ missing_rows += 1
4823
+ continue
4824
+ if individual_val >= completeness_scale - 1e-6:
4825
+ bad_individual += 1
4826
+ if completeness_cols:
4827
+ reported_val = float(row[completeness_cols[idx]])
4828
+ if abs(reported_val - individual_val) > 1e-6:
4829
+ mismatch_reported += 1
4830
+
4831
+ if protein_cols:
4832
+ values = [str(row[col]) for col in protein_cols]
4833
+ if mode == "pipeline" and emapper_file:
4834
+ if any(val.startswith("No protein data available") for val in values):
4835
+ missing_proteins += 1
4836
+ elif mode == "ko-matrix":
4837
+ if any(not val.startswith("No protein data available") for val in values):
4838
+ non_placeholder += 1
4839
+
4840
+ if missing_rows:
4841
+ _record_validation_check(
4842
+ report_data,
4843
+ f"complementarity_lookup_{n_members}",
4844
+ "warn",
4845
+ f"{missing_rows} complementarity rows could not be matched to module completeness matrix.",
4846
+ )
4847
+ if bad_combo:
4848
+ _record_validation_check(
4849
+ report_data,
4850
+ f"complementarity_combo_{n_members}",
4851
+ "warn",
4852
+ f"{bad_combo} rows have combination completeness != {completeness_scale}.",
4853
+ )
4854
+ if bad_individual:
4855
+ _record_validation_check(
4856
+ report_data,
4857
+ f"complementarity_individual_{n_members}",
4858
+ "warn",
4859
+ f"{bad_individual} rows have individuals already complete.",
4860
+ )
4861
+ if mismatch_reported:
4862
+ _record_validation_check(
4863
+ report_data,
4864
+ f"complementarity_reported_{n_members}",
4865
+ "warn",
4866
+ f"{mismatch_reported} rows have completeness values inconsistent with module matrix.",
4867
+ )
4868
+ if missing_proteins:
4869
+ _record_validation_check(
4870
+ report_data,
4871
+ f"complementarity_proteins_{n_members}",
4872
+ "warn",
4873
+ f"{missing_proteins} rows missing protein provenance in pipeline mode.",
4874
+ )
4875
+ if non_placeholder:
4876
+ _record_validation_check(
4877
+ report_data,
4878
+ f"complementarity_placeholder_{n_members}",
4879
+ "warn",
4880
+ f"{non_placeholder} rows contain protein provenance in KO-matrix mode.",
4881
+ )
4882
+
4883
+ summary = (
4884
+ f"Validation summary: {len(report_data['errors'])} errors, "
4885
+ f"{len(report_data['warnings'])} warnings."
4886
+ )
4887
+ if report_data["errors"]:
4888
+ logger.error(summary)
4889
+ elif report_data["warnings"]:
4890
+ logger.warning(summary)
4891
+ else:
4892
+ logger.info(summary)
4893
+
4894
+ if report:
4895
+ report_path = Path(report)
4896
+ report_path.parent.mkdir(parents=True, exist_ok=True)
4897
+ with report_path.open("w") as handle:
4898
+ json.dump(report_data, handle, indent=2)
4899
+ logger.info("Validation report written to %s", report_path)
4900
+
4901
+ if report_data["errors"] or (strict and report_data["warnings"]):
4902
+ raise typer.Exit(1)
4903
+
4904
+
3935
4905
  if __name__ == "__main__":
3936
4906
  app()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: moducomp
3
- Version: 0.7.11
3
+ Version: 0.7.12
4
4
  Summary: moducomp: metabolic module completeness and complementarity for microbiomes.
5
5
  Keywords: bioinformatics,microbiome,metabolic,kegg,genomics
6
6
  Author-email: "Juan C. Villada" <jvillada@lbl.gov>
@@ -38,6 +38,7 @@ Project-URL: Repository, https://github.com/NeLLi-team/moducomp
38
38
  - Tracks and reports the actual proteins that are responsible for the completion of the module in the combination of N genomes.
39
39
  - **Automatic resource monitoring** with timestamped logs tracking CPU usage, memory consumption, and runtime for reproducibility.
40
40
  - **Consistent logging to stdout/stderr** with a per-command resource summary emitted at the end of each run.
41
+ - **Built-in validation (`moducomp validate`)** for scientific consistency checks across annotations, KO matrices, KPCT outputs, and complementarity reports.
41
42
 
42
43
  ## Installation (Recommended)
43
44
 
@@ -148,6 +149,9 @@ This section lists all CLI options implemented today, along with their default v
148
149
  | `--del-tmp/--keep-tmp` | `true` | Delete temporary files after completion. |
149
150
  | `--lowmem/--fullmem` (`--low-mem/--full-mem`) | `fullmem` | Run eggNOG-mapper without `--dbmem` to reduce RAM. |
150
151
  | `--verbose/--quiet` | `false` | Enable verbose progress output. |
152
+ | `--validate/--no-validate` | `validate` | Run post-run validation checks. |
153
+ | `--validate-report/--no-validate-report` | `validate-report` | Write `validation_report.json` in the output directory. |
154
+ | `--validate-strict/--validate-lenient` | `lenient` | Treat validation warnings as failures when strict. |
151
155
  | `--log-level`, `-l` | `INFO` | Logging level: `DEBUG`, `INFO`, `WARNING`, `ERROR`. |
152
156
  | `--eggnog-data-dir` | `EGGNOG_DATA_DIR` | Path to eggNOG-mapper data (sets `EGGNOG_DATA_DIR`). |
153
157
 
@@ -162,6 +166,9 @@ This section lists all CLI options implemented today, along with their default v
162
166
  | `--del-tmp/--keep-tmp` | `true` | Delete temporary files after the test completes. |
163
167
  | `--lowmem/--fullmem` (`--low-mem/--full-mem`) | `lowmem` | Low-memory mode is the default for tests. |
164
168
  | `--verbose/--quiet` | `verbose` | Verbose output is the default for tests. |
169
+ | `--validate/--no-validate` | `validate` | Run post-run validation checks. |
170
+ | `--validate-report/--no-validate-report` | `validate-report` | Write `validation_report.json` in the output directory. |
171
+ | `--validate-strict/--validate-lenient` | `lenient` | Treat validation warnings as failures when strict. |
165
172
  | `--log-level`, `-l` | `INFO` | Logging level: `DEBUG`, `INFO`, `WARNING`, `ERROR`. |
166
173
  | `--eggnog-data-dir` | `EGGNOG_DATA_DIR` | Path to eggNOG-mapper data (sets `EGGNOG_DATA_DIR`). |
167
174
 
@@ -174,6 +181,21 @@ This section lists all CLI options implemented today, along with their default v
174
181
  | `--del-tmp/--keep-tmp` | `true` | Delete temporary files after completion. |
175
182
  | `--ncpus`, `-n` | `16` | CPU cores for KPCT parallel processing. |
176
183
  | `--verbose/--quiet` | `false` | Enable verbose progress output. |
184
+ | `--validate/--no-validate` | `validate` | Run post-run validation checks. |
185
+ | `--validate-report/--no-validate-report` | `validate-report` | Write `validation_report.json` in the output directory. |
186
+ | `--validate-strict/--validate-lenient` | `lenient` | Treat validation warnings as failures when strict. |
187
+ | `--log-level`, `-l` | `INFO` | Logging level: `DEBUG`, `INFO`, `WARNING`, `ERROR`. |
188
+
189
+ #### `validate` command (positional args: `savedir`)
190
+
191
+ | Option | Default | Description |
192
+ | --- | --- | --- |
193
+ | `--mode` | `auto` | Validation mode: `auto`, `pipeline`, or `ko-matrix`. |
194
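+ | `--calculate-complementarity`, `-c` | `auto-detect` | Largest expected combination size `N`; a complementarity report must exist for every size from 2 to `N`. When omitted, sizes are detected from the report files found. `0` disables the missing-report check. |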
195
+ | `--kpct-outprefix` | `output_give_completeness` | KPCT output prefix used during analysis. |
196
+ | `--strict/--lenient` | `lenient` | Treat warnings as failures when strict. |
197
+ | `--report` | _none_ | Write JSON validation report to this path. |
198
+ | `--verbose/--quiet` | `false` | Enable verbose progress output. |
177
199
  | `--log-level`, `-l` | `INFO` | Logging level: `DEBUG`, `INFO`, `WARNING`, `ERROR`. |
178
200
 
179
201
  #### `download-eggnog-data` command
@@ -198,6 +220,33 @@ This section lists all CLI options implemented today, along with their default v
198
220
  - For KPCT parallel processing, the system creates the same number of chunks as CPU cores specified
199
221
  - Example: `--ncpus 8` will use 8 cores and create 8 chunks for optimal parallel processing
200
222
 
223
+ ### Validation (QC)
224
+
225
+ Use the built-in validator to verify scientific consistency across outputs after a run. The validator checks:
226
+ - KO sets and counts between eggNOG-mapper annotations and `kos_matrix.csv`
227
+ - KO sets between `kos_matrix.csv` and `ko_file_for_kpct.txt`
228
+ - KPCT contigs vs pathways outputs
229
+ - Module completeness ranges and combination naming
230
+ - Complementarity reports versus module completeness values
231
+ - Protein provenance fields (pipeline mode) or placeholders (KO-matrix mode)
232
+
233
+ Example:
234
+
235
+ ```bash
236
+ # Validation runs by default after the pipeline, analyze-ko-matrix, and test commands.
237
+ # Use --no-validate to disable or --no-validate-report to skip JSON output.
238
+ # When validation reports errors (or warnings in strict mode), the command exits non-zero.
239
+
240
+ # Validate a pipeline run and write a JSON report
241
+ moducomp validate /path/to/output --mode pipeline --report /path/to/output/validation_report.json
242
+
243
+ # Validate KO-matrix mode outputs (non-default KPCT prefix)
244
+ moducomp validate /path/to/output --mode ko-matrix --kpct-outprefix my_prefix
245
+
246
+ # Treat warnings as failures
247
+ moducomp validate /path/to/output --strict
248
+ ```
249
+
201
250
  ### ⚠️ Important note 1
202
251
 
203
252
  **Prepare FAA files**: Ensure FAA headers are in the form `>genomeName|proteinId`, or use the `--adapt-headers` option to format your headers into `>fileName_prefix|protein_id_counter`.
@@ -298,15 +347,38 @@ moducomp analyze-ko-matrix ./ko_matrix.csv ./output_moderate --ncpus 16 --calcul
298
347
  moducomp pipeline ./genomes ./output_lowmem --ncpus 8 --lowmem --calculate-complementarity 2
299
348
  ```
300
349
 
301
- ## Output files
350
+ ## Expected outputs
351
+
352
+ The sections below describe the expected output files, their naming conventions, and the column-level meaning of each file. These details are identical for `moducomp pipeline` and `moducomp test` (pipeline mode); `moducomp analyze-ko-matrix` (KO-matrix mode) produces the subset listed under its own heading.
353
+
354
+ **Naming conventions**
355
+
356
+ Genome identifiers are stored as `taxon_oid`. In pipeline mode, ModuComp expects protein headers in the format `genome_id|protein_id`. If you set `--adapt-headers`, ModuComp rewrites headers to `>genomeName|protein_N`, where `genomeName` is the FAA filename stem. Combination identifiers use `__` (double underscore), for example `GenomeA__GenomeB`, and `n_members` in `module_completeness.tsv` records the size of each combination.
357
+
358
+ **Pipeline mode outputs (`moducomp pipeline`, `moducomp test`)**
359
+
360
+ - `emapper_out.emapper.annotations`: Full eggNOG-mapper annotations. The `#query` column must match `genome_id|protein_id`. `KEGG_ko` entries are prefixed `ko:KXXXXX` and are converted to `KXXXXX` for downstream matrices.
361
+ - `kos_matrix.csv`: Genome × KO count matrix. Columns: `taxon_oid` followed by KO IDs (e.g., `K00001`). Values are integer protein counts per KO.
362
+ - `ko_file_for_kpct.txt`: KPCT input file. Each line starts with `taxon_oid` followed by the set of KO IDs present in that genome or combination. If `--calculate-complementarity` is `N>=2`, combinations up to `N` are included as `GenomeA__GenomeB`.
363
+ - `output_give_completeness_contigs.with_weights.tsv`: KPCT module results per genome/combination. Columns: `contig` (genome/combination ID), `module_accession`, `completeness` (0–100), `pathway_name`, `pathway_class`, `matching_ko` (KO weights), `missing_ko`.
364
+ - `output_give_completeness_pathways.with_weights.tsv`: Same rows and order as the contigs file, but without the `contig` column. This is provided for compatibility with legacy tools; prefer the contigs file when you need genome-level provenance.
365
+ - `module_completeness.tsv`: Pivoted module completeness matrix. Columns: `n_members`, `taxon_oid`, followed by KEGG module IDs (`M00001`, …). Values are numeric percentages in the range 0–100.
366
+ - `module_completeness_complementarity_Nmember.tsv`: Complementarity report for `N`-member combinations (only when `--calculate-complementarity N` is set). Columns: `taxon_oid_1..N`, `completeness_taxon_oid_1..N`, `module_id`, `module_name`, `pathway_class`, `matching_ko`, `proteins_taxon_oid_1..N`. Protein fields list contributing proteins per KO (from eggNOG-mapper) as `{'KXXXXX': 'genome|protein'}`.
367
+ - `logs/moducomp.log`: Detailed run log with structured progress messages and per-command resource summaries.
368
+ - `logs/resource_usage_YYYYMMDD_HHMMSS.log`: Resource monitoring log capturing wall time, CPU time, CPU utilization, peak RAM, and exit code for each monitored command.
369
+ - `tmp/` (only if `--keep-tmp`): Intermediate files such as `merged_genomes.faa`, `emapper_output/`, and KPCT chunk outputs.
370
+ - `validation_report.json` (default when validation is enabled): JSON report produced by the validator.
302
371
 
303
- `moducomp` generates several output files in the specified output directory:
372
+ **KO-matrix mode outputs (`moducomp analyze-ko-matrix`)**
304
373
 
305
- - **`kos_matrix.csv`**: Matrix of KO counts for each genome
306
- - **`module_completeness.tsv`**: Module completeness scores for individual genomes and combinations
307
- - **`module_completeness_complementarity_Nmember.tsv`**: Complementarity reports (if requested)
308
- - **`logs/resource_usage_YYYYMMDD_HHMMSS.log`**: Resource monitoring log with CPU, memory, and runtime metrics for reproducibility
309
- - **`logs/moducomp.log`**: Detailed pipeline execution log with a per-command resource summary at the end of the run
374
+ - `kos_matrix.csv`: A copy of the input KO matrix (same format as above).
375
+ - `ko_file_for_kpct.txt`: KPCT input generated from the KO matrix. If `--calculate-complementarity` is set, combination lines are added using `GenomeA__GenomeB` identifiers.
376
+ - `output_give_completeness_contigs.with_weights.tsv`: KPCT module results per genome/combination (same format as pipeline mode).
377
+ - `output_give_completeness_pathways.with_weights.tsv`: Same rows as the contigs file, without the `contig` column.
378
+ - `module_completeness.tsv`: Module completeness matrix (same format as pipeline mode).
379
+ - `module_completeness_complementarity_Nmember.tsv`: Complementarity report. Protein contribution columns are filled with `No protein data available for <genome>` because no eggNOG-mapper annotations are available in KO-matrix mode.
380
+ - `logs/moducomp.log` and `logs/resource_usage_YYYYMMDD_HHMMSS.log`: Standard run logs and resource summaries.
381
+ - `validation_report.json` (default when validation is enabled): JSON report produced by the validator.
310
382
 
311
383
  ## Citation
312
384
  Villada, JC. & Schulz, F. (2025). Assessment of metabolic module completeness of genomes and metabolic complementarity in microbiomes with `moducomp`. `moducomp` (v0.5.1). Zenodo. https://doi.org/10.5281/zenodo.16116092
@@ -1,11 +1,11 @@
1
- moducomp/__init__.py,sha256=aCyJFC9uD2L4Pv7NJxSFMmFDtLYnxQ-lFkkXBu3ri4U,659
1
+ moducomp/__init__.py,sha256=P0sHK6IWgEersAkHlDWWFlWULUH_C-ytp5vStZDJqiY,659
2
2
  moducomp/__main__.py,sha256=1O2pv6IGjUgqnbqsiMLtVqjxWQpRtZUjp8LDljZ1bsI,185
3
- moducomp/moducomp.py,sha256=R4_mXvfpe_ojfDKibduMvgkTC1QDn4sFUt9TFc9xVUw,142734
3
+ moducomp/moducomp.py,sha256=9eLxngLe0zRcevy6x9a-80-MGUjIEOYLTCQr6crnYRM,177904
4
4
  moducomp/data/test_genomes/IMG2562617132.faa,sha256=gZPh-08pMRdAWJRr3__TbnU1F68CdkDb3gxtpaCLTTc,356863
5
5
  moducomp/data/test_genomes/IMG2568526683.faa,sha256=PxFJwe-68UGw7il1hGlNhZt4-2WzzxXxGE1GTskDnow,343109
6
6
  moducomp/data/test_genomes/IMG2740892217.faa,sha256=WsId4sIPxENbqF6tYFouAgDCy6T0SXNY6TywxBNe-3E,548954
7
- moducomp-0.7.11.dist-info/entry_points.txt,sha256=dwt0_w7Ex9p1vhfp2fl4WXJLBh50u9fXTRNlAOJkAd4,114
8
- moducomp-0.7.11.dist-info/licenses/LICENSE.txt,sha256=pt0cfIq9Wop21KDZYyQgP0M1YWYvKG0PomA5cUDC4TI,1536
9
- moducomp-0.7.11.dist-info/WHEEL,sha256=_2ozNFCLWc93bK4WKHCO-eDUENDlo-dgc9cU3qokYO4,82
10
- moducomp-0.7.11.dist-info/METADATA,sha256=jtYz1MoBhV6tRLfJOJ0cuKg_o6Qjnqt2kl0qc1I-V0Y,14715
11
- moducomp-0.7.11.dist-info/RECORD,,
7
+ moducomp-0.7.12.dist-info/entry_points.txt,sha256=dwt0_w7Ex9p1vhfp2fl4WXJLBh50u9fXTRNlAOJkAd4,114
8
+ moducomp-0.7.12.dist-info/licenses/LICENSE.txt,sha256=pt0cfIq9Wop21KDZYyQgP0M1YWYvKG0PomA5cUDC4TI,1536
9
+ moducomp-0.7.12.dist-info/WHEEL,sha256=_2ozNFCLWc93bK4WKHCO-eDUENDlo-dgc9cU3qokYO4,82
10
+ moducomp-0.7.12.dist-info/METADATA,sha256=L7JsOoEk8dYNWZcCWmmQXy1ZPow3iRb_d7XrSsanOAg,21146
11
+ moducomp-0.7.12.dist-info/RECORD,,