bioartifact 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioartifact/__init__.py +25 -0
- bioartifact/__main__.py +4 -0
- bioartifact/cli/__init__.py +1 -0
- bioartifact/cli/main.py +149 -0
- bioartifact/contracts/__init__.py +52 -0
- bioartifact/contracts/alignment.py +100 -0
- bioartifact/contracts/common.py +26 -0
- bioartifact/contracts/fastq.py +174 -0
- bioartifact/contracts/intervals.py +72 -0
- bioartifact/contracts/tables.py +134 -0
- bioartifact/contracts/vcf.py +68 -0
- bioartifact/detection.py +55 -0
- bioartifact/exceptions.py +14 -0
- bioartifact/inspectors/__init__.py +64 -0
- bioartifact/inspectors/alignment.py +191 -0
- bioartifact/inspectors/bed.py +131 -0
- bioartifact/inspectors/fasta.py +66 -0
- bioartifact/inspectors/fastq.py +99 -0
- bioartifact/inspectors/gtf.py +92 -0
- bioartifact/inspectors/html.py +52 -0
- bioartifact/inspectors/tables.py +69 -0
- bioartifact/inspectors/vcf.py +84 -0
- bioartifact/io.py +29 -0
- bioartifact/json.py +8 -0
- bioartifact/manifest.py +211 -0
- bioartifact/metadata.py +144 -0
- bioartifact/models.py +89 -0
- bioartifact/summarize.py +59 -0
- bioartifact-0.1.0.dist-info/METADATA +389 -0
- bioartifact-0.1.0.dist-info/RECORD +33 -0
- bioartifact-0.1.0.dist-info/WHEEL +4 -0
- bioartifact-0.1.0.dist-info/entry_points.txt +2 -0
- bioartifact-0.1.0.dist-info/licenses/LICENSE +21 -0
bioartifact/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Agent-friendly inspection and validation of bioinformatics artifacts."""
|
|
2
|
+
|
|
3
|
+
from bioartifact.contracts import available_contracts, validate_artifact
|
|
4
|
+
from bioartifact.detection import detect_artifact_type
|
|
5
|
+
from bioartifact.inspectors import inspect_artifact
|
|
6
|
+
from bioartifact.manifest import validate_manifest
|
|
7
|
+
from bioartifact.metadata import artifact_type_details, contract_details
|
|
8
|
+
from bioartifact.models import ArtifactResult, CheckResult, ContractResult
|
|
9
|
+
from bioartifact.summarize import summarize_directory
|
|
10
|
+
|
|
11
|
+
# Public API of the package: every name listed here is imported at the top
# of this module and re-exported for `from bioartifact import *`.
__all__ = [
    "ArtifactResult",
    "CheckResult",
    "ContractResult",
    "available_contracts",
    "artifact_type_details",
    "contract_details",
    "detect_artifact_type",
    "inspect_artifact",
    "summarize_directory",
    "validate_artifact",
    "validate_manifest",
]

# Version string of this release.
__version__ = "0.1.0"
|
bioartifact/__main__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Command-line interface package."""
|
bioartifact/cli/main.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from bioartifact.contracts import available_contracts, validate_artifact
|
|
8
|
+
from bioartifact.inspectors import inspect_artifact
|
|
9
|
+
from bioartifact.json import dumps_json
|
|
10
|
+
from bioartifact.manifest import validate_manifest
|
|
11
|
+
from bioartifact.metadata import artifact_type_details, contract_details
|
|
12
|
+
from bioartifact.summarize import summarize_directory
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _print_payload(payload: dict[str, Any], args: argparse.Namespace) -> None:
    """Write ``payload`` to stdout as JSON or as a simple key/value listing.

    The output mode is decided by ``_should_emit_json(args)``. In the
    human-readable mode, list values are printed one item per line, dict
    values one ``key: value`` pair per line, and everything else inline.
    """
    if not _should_emit_json(args):
        for name, entry in payload.items():
            if isinstance(entry, list):
                print(f"{name}:")
                for element in entry:
                    print(f" - {element}")
                continue
            if isinstance(entry, dict):
                print(f"{name}:")
                for inner_name, inner_entry in entry.items():
                    print(f" {inner_name}: {inner_entry}")
                continue
            print(f"{name}: {entry}")
        return

    print(dumps_json(payload))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _should_emit_json(args: argparse.Namespace) -> bool:
|
|
34
|
+
if getattr(args, "human", False):
|
|
35
|
+
return False
|
|
36
|
+
if getattr(args, "json", False):
|
|
37
|
+
return True
|
|
38
|
+
return getattr(args, "output", "json") != "human"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _inspect(args: argparse.Namespace) -> int:
    """Handle the ``inspect`` subcommand.

    Inspects ``args.path`` (optionally forcing ``args.type``), prints the
    result, and returns 0 when the result is valid, 1 otherwise.
    """
    inspection = inspect_artifact(args.path, artifact_type=args.type)
    _print_payload(inspection.to_dict(), args)
    if inspection.valid:
        return 0
    return 1
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _validate(args: argparse.Namespace) -> int:
    """Handle the ``validate`` subcommand.

    Validates ``args.path`` against ``args.contract`` (forwarding
    ``args.mate``), prints the result, and returns 0 when the contract
    passed, 1 otherwise.
    """
    outcome = validate_artifact(args.path, args.contract, mate=args.mate)
    _print_payload(outcome.to_dict(), args)
    if outcome.passed:
        return 0
    return 1
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _summarize(args: argparse.Namespace) -> int:
    """Handle the ``summarize`` subcommand.

    Summarizes the directory at ``args.path`` (recursing when
    ``args.recursive`` is set), prints the summary, and returns 0 when its
    ``"valid"`` entry is truthy, 1 otherwise.
    """
    summary = summarize_directory(args.path, recursive=args.recursive)
    _print_payload(summary, args)
    if summary["valid"]:
        return 0
    return 1
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _contracts(args: argparse.Namespace) -> int:
    """Handle the ``contracts`` subcommand: print contract details, return 0."""
    details = contract_details()
    _print_payload(details, args)
    return 0
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _types(args: argparse.Namespace) -> int:
    """Handle the ``types`` subcommand: print artifact type details, return 0."""
    details = artifact_type_details()
    _print_payload(details, args)
    return 0
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _validate_manifest(args: argparse.Namespace) -> int:
    """Handle the ``validate-manifest`` subcommand.

    Validates the manifest at ``args.path`` with ``args.base_dir`` as the
    base directory, prints the report, and returns 0 when its ``"passed"``
    entry is truthy, 1 otherwise.
    """
    report = validate_manifest(args.path, base_dir=args.base_dir)
    _print_payload(report, args)
    if report["passed"]:
        return 0
    return 1
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _add_output_arguments(parser: argparse.ArgumentParser) -> None:
|
|
76
|
+
parser.add_argument(
|
|
77
|
+
"--output",
|
|
78
|
+
choices=["json", "human"],
|
|
79
|
+
default="json",
|
|
80
|
+
help="output mode; defaults to JSON",
|
|
81
|
+
)
|
|
82
|
+
parser.add_argument(
|
|
83
|
+
"--json", action="store_true", help="emit structured JSON output; this is the default"
|
|
84
|
+
)
|
|
85
|
+
parser.add_argument("--human", action="store_true", help="force human-readable output")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``bioartifact`` command-line parser.

    A subcommand is required. Each subcommand gets the shared output options
    and stores its handler function under the ``func`` default, which
    ``main`` then calls.
    """
    root = argparse.ArgumentParser(
        description="Inspect and validate bioinformatics artifacts.",
        prog="bioartifact",
    )
    commands = root.add_subparsers(required=True, dest="command")

    # inspect <path> [--type TYPE]
    inspect_cmd = commands.add_parser("inspect", help="inspect one artifact")
    inspect_cmd.add_argument("path", help="artifact path")
    inspect_cmd.add_argument("--type", help="override artifact type detection")
    _add_output_arguments(inspect_cmd)
    inspect_cmd.set_defaults(func=_inspect)

    # validate <path> --contract NAME [--mate PATH]
    validate_cmd = commands.add_parser("validate", help="validate an artifact contract")
    validate_cmd.add_argument("path", help="artifact path")
    validate_cmd.add_argument(
        "--contract",
        choices=available_contracts(),
        required=True,
        help="contract name",
    )
    validate_cmd.add_argument("--mate", help="mate FASTQ path for paired_fastq")
    _add_output_arguments(validate_cmd)
    validate_cmd.set_defaults(func=_validate)

    # summarize <path> [--recursive]
    summarize_cmd = commands.add_parser("summarize", help="summarize a directory")
    summarize_cmd.add_argument("path", help="directory path")
    summarize_cmd.add_argument("--recursive", action="store_true", help="scan recursively")
    _add_output_arguments(summarize_cmd)
    summarize_cmd.set_defaults(func=_summarize)

    # Listing commands that take no arguments beyond the output options.
    listing_commands = (
        ("contracts", "list supported contracts", _contracts),
        ("types", "list supported artifact types", _types),
    )
    for command_name, summary, handler in listing_commands:
        listing_cmd = commands.add_parser(command_name, help=summary)
        _add_output_arguments(listing_cmd)
        listing_cmd.set_defaults(func=handler)

    # validate-manifest <path> [--base-dir DIR]
    manifest_cmd = commands.add_parser(
        "validate-manifest",
        help="validate expected workflow outputs from a JSON or YAML manifest",
    )
    manifest_cmd.add_argument("path", help="manifest path")
    manifest_cmd.add_argument(
        "--base-dir",
        help="base directory for relative output paths; defaults to the manifest directory",
    )
    _add_output_arguments(manifest_cmd)
    manifest_cmd.set_defaults(func=_validate_manifest)

    return root
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and run the selected subcommand.

    ``argv`` is passed straight to ``ArgumentParser.parse_args``. Returns the
    handler's result converted to ``int``, for use as the process exit code.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    exit_code = handler(namespace)
    return int(exit_code)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Script entry point: run the CLI and exit with its status code.
if __name__ == "__main__":
    exit_status = main(sys.argv[1:])
    raise SystemExit(exit_status)
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from bioartifact.contracts.alignment import validate_indexed_bam, validate_sorted_bam
|
|
8
|
+
from bioartifact.contracts.fastq import validate_fastq, validate_paired_fastq
|
|
9
|
+
from bioartifact.contracts.intervals import validate_narrowpeak
|
|
10
|
+
from bioartifact.contracts.tables import validate_de_table
|
|
11
|
+
from bioartifact.contracts.vcf import validate_valid_vcf
|
|
12
|
+
from bioartifact.models import ContractResult, failed
|
|
13
|
+
|
|
14
|
+
# A validator takes the artifact path plus optional keyword arguments and
# returns a ContractResult.
ContractValidator = Callable[..., ContractResult]

# Registry mapping each contract name to its validator function.
CONTRACTS: dict[str, ContractValidator] = dict(
    de_table=validate_de_table,
    fastq=validate_fastq,
    indexed_bam=validate_indexed_bam,
    narrowpeak=validate_narrowpeak,
    paired_fastq=validate_paired_fastq,
    sorted_bam=validate_sorted_bam,
    valid_vcf=validate_valid_vcf,
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def available_contracts() -> list[str]:
    """Return the registered contract names in sorted order."""

    names = list(CONTRACTS)
    names.sort()
    return names
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def validate_artifact(path: str | Path, contract_name: str, **kwargs: Any) -> ContractResult:
    """Run the validator registered under ``contract_name`` on ``path``.

    Extra keyword arguments are forwarded to the validator. An unknown
    contract name does not raise: a failed ``ContractResult`` with a single
    ``known_contract`` check is returned instead.
    """

    validator = CONTRACTS.get(contract_name)
    if validator is not None:
        return validator(Path(path), **kwargs)

    # Unknown contract: report it as a failed check listing the valid names.
    message = f"unknown contract '{contract_name}'"
    unknown_check = failed(
        "known_contract",
        message,
        remediation="Run `bioartifact contracts` and choose one of the listed contract names.",
        available_contracts=available_contracts(),
    )
    return ContractResult(
        contract_name=contract_name,
        passed=False,
        path=str(path),
        checks=[unknown_check],
        errors=[message],
    )
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from bioartifact.contracts.common import result
|
|
6
|
+
from bioartifact.inspectors import inspect_artifact
|
|
7
|
+
from bioartifact.models import ContractResult, failed, passed
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def validate_sorted_bam(path: Path, **_: object) -> ContractResult:
    """Validate the ``sorted_bam`` contract for the artifact at ``path``.

    Emits three checks: ``readable`` (the inspection is valid),
    ``artifact_type`` (detected type is ``bam`` or ``sam``), and
    ``coordinate_sorted`` (the inspection summary has ``sorted`` set to
    ``True``). Extra keyword arguments are ignored.
    """
    artifact = inspect_artifact(path)
    checks = []

    # Check 1: the inspector could read the file.
    if artifact.valid:
        checks.append(passed("readable", "alignment artifact is readable"))
    else:
        checks.append(
            failed(
                "readable",
                "alignment artifact is not readable",
                remediation="Regenerate the alignment file or inspect the upstream aligner output.",
                errors=artifact.errors,
            )
        )

    # Check 2: the detected type is an alignment format.
    if artifact.artifact_type in {"bam", "sam"}:
        checks.append(passed("artifact_type", "artifact is an alignment file"))
    else:
        checks.append(
            failed(
                "artifact_type",
                "sorted_bam contract expects a BAM or SAM artifact",
                remediation="Provide a `.bam` or `.sam` alignment file for this contract.",
                artifact_type=artifact.artifact_type,
            )
        )

    # Check 3: only an explicit True counts as sorted.
    if artifact.summary.get("sorted") is not True:
        checks.append(
            failed(
                "coordinate_sorted",
                "alignment is not declared coordinate sorted",
                remediation="Sort the alignment by coordinate, for example with `samtools sort`, then rerun validation.",
                sort_order=artifact.summary.get("sort_order"),
            )
        )
    else:
        checks.append(passed("coordinate_sorted", "alignment is coordinate sorted"))

    return result(
        "sorted_bam",
        checks,
        path=str(path),
        artifact_type=artifact.artifact_type,
        warnings=artifact.warnings,
        errors=artifact.errors,
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def validate_indexed_bam(path: Path, **_: object) -> ContractResult:
    """Validate the ``indexed_bam`` contract for the artifact at ``path``.

    Emits three checks: ``readable`` (the inspection is valid),
    ``artifact_type`` (detected type is exactly ``bam``), and
    ``index_present`` (the inspection summary has ``index_present`` set to
    ``True``). Extra keyword arguments are ignored.
    """
    artifact = inspect_artifact(path)
    checks = []

    # Check 1: the inspector could read the file.
    if artifact.valid:
        checks.append(passed("readable", "BAM is readable"))
    else:
        checks.append(
            failed(
                "readable",
                "BAM is not readable",
                remediation="Regenerate the BAM file or check that it is BGZF-compressed BAM.",
                errors=artifact.errors,
            )
        )

    # Check 2: this contract accepts BAM only.
    if artifact.artifact_type == "bam":
        checks.append(passed("artifact_type", "artifact is a BAM file"))
    else:
        checks.append(
            failed(
                "artifact_type",
                "indexed_bam contract expects a BAM artifact",
                remediation="Provide a `.bam` file for the indexed_bam contract.",
                artifact_type=artifact.artifact_type,
            )
        )

    # Check 3: only an explicit True counts as an index being present.
    if artifact.summary.get("index_present") is not True:
        checks.append(
            failed(
                "index_present",
                "no BAM index was found next to the BAM file",
                remediation="Create an index with `samtools index aligned.bam` or provide an adjacent `.bai`/`.csi` file.",
            )
        )
    else:
        checks.append(passed("index_present", "BAM index was found"))

    return result(
        "indexed_bam",
        checks,
        path=str(path),
        artifact_type=artifact.artifact_type,
        warnings=artifact.warnings,
        errors=artifact.errors,
    )
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
|
|
5
|
+
from bioartifact.models import CheckResult, ContractResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def result(
    contract_name: str,
    checks: Iterable[CheckResult],
    *,
    path: str | None = None,
    artifact_type: str | None = None,
    warnings: list[str] | None = None,
    errors: list[str] | None = None,
) -> ContractResult:
    """Assemble a ``ContractResult`` from individual checks.

    The contract passes when no check has status ``"fail"``. ``checks`` is
    materialized into a list first, so a one-shot iterable is safe to pass.
    Falsy ``warnings`` / ``errors`` are replaced with fresh empty lists.
    """
    collected = list(checks)
    has_failure = any(check.status == "fail" for check in collected)
    return ContractResult(
        contract_name=contract_name,
        passed=not has_failure,
        checks=collected,
        path=path,
        artifact_type=artifact_type,
        warnings=warnings if warnings else [],
        errors=errors if errors else [],
    )
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from itertools import zip_longest
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from bioartifact.contracts.common import result
|
|
7
|
+
from bioartifact.inspectors.fastq import inspect_fastq, normalize_fastq_read_id
|
|
8
|
+
from bioartifact.io import open_text
|
|
9
|
+
from bioartifact.models import ContractResult, failed, passed
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def validate_fastq(path: Path, **_: object) -> ContractResult:
    """Validate the ``fastq`` contract for the file at ``path``.

    Checks emitted:

    * ``readable`` - the FASTQ inspection is valid.
    * ``records_present`` - the inspection summary reports more than zero
      records.
    * ``valid_gzip`` - only when the filename ends in ``.gz``: the summary's
      ``gzip`` entry is truthy.
    * ``sequence_quality_lengths`` - passes when the inspection is valid;
      fails when the inspection errors mention mismatched sequence/quality
      lengths. For any other kind of invalid file this check is omitted.

    Extra keyword arguments are ignored.
    """
    artifact = inspect_fastq(path)

    # Read the record count defensively, as validate_paired_fastq does: a
    # summary without "records" becomes a failed check, not a KeyError.
    record_count = artifact.summary.get("records", 0)

    checks = [
        passed("readable", "FASTQ is readable")
        if artifact.valid
        else failed(
            "readable",
            "FASTQ is not structurally valid",
            remediation="Regenerate or repair the FASTQ file before using it downstream.",
            errors=artifact.errors,
        ),
        passed("records_present", "FASTQ contains records", records=record_count)
        if record_count > 0
        else failed(
            "records_present",
            "FASTQ contains no records",
            remediation="Check that the workflow wrote reads to the expected FASTQ path.",
        ),
    ]

    # The gzip check applies only when the filename claims gzip compression.
    if path.name.lower().endswith(".gz"):
        if artifact.summary.get("gzip"):
            checks.append(passed("valid_gzip", "gzip encoding is valid"))
        else:
            checks.append(
                failed(
                    "valid_gzip",
                    "file extension indicates gzip but gzip magic is absent",
                    remediation="Recompress the file with gzip or correct the filename extension.",
                )
            )

    if artifact.valid:
        checks.append(passed("sequence_quality_lengths", "all sequence and quality lengths match"))
    else:
        # Length mismatches are recognized by the inspector's error text.
        length_errors = [
            error for error in artifact.errors if "sequence and quality lengths differ" in error
        ]
        if length_errors:
            checks.append(
                failed(
                    "sequence_quality_lengths",
                    "one or more FASTQ records have mismatched sequence and quality lengths",
                    remediation="Regenerate the FASTQ or trim/filter with a tool that preserves sequence and quality synchronization.",
                    examples=length_errors[:5],  # cap the examples at five
                )
            )

    return result(
        "fastq",
        checks,
        path=str(path),
        artifact_type=artifact.artifact_type,
        warnings=artifact.warnings,
        errors=artifact.errors,
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _iter_fastq_ids(path: Path):
    """Yield the normalized read ID of each four-line record in ``path``.

    Iteration stops at end of file, at the first blank header line, or at a
    record whose sequence, separator, or quality line is missing.
    """
    with open_text(path) as stream:
        header = stream.readline()
        # An empty string (EOF) and a whitespace-only line both strip to "".
        while header.strip():
            # Always consume the three remaining lines of the record.
            remainder = [stream.readline() for _ in range(3)]
            if not all(remainder):
                break
            yield normalize_fastq_read_id(header)
            header = stream.readline()
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def validate_paired_fastq(
    path: Path, mate: str | Path | None = None, **_: object
) -> ContractResult:
    """Validate the ``paired_fastq`` contract for ``path`` and its ``mate``.

    Without ``mate`` the result holds a single failed ``mate_provided``
    check. Otherwise both files are inspected and these checks are emitted:
    ``first_readable``, ``second_readable``, ``synchronized_read_counts``
    (equal record counts in the two summaries) and, only when both files
    are valid, ``matching_read_ids`` (normalized read IDs compared
    pairwise, stopping after ten mismatches). Extra keyword arguments are
    ignored.
    """
    if mate is None:
        missing_mate = failed(
            "mate_provided",
            "paired_fastq contract requires --mate",
            remediation="Pass the second FASTQ file with `--mate`.",
        )
        return result(
            "paired_fastq",
            [missing_mate],
            path=str(path),
            artifact_type="fastq",
        )

    mate_path = Path(mate)
    first = inspect_fastq(path)
    second = inspect_fastq(mate_path)
    checks = []

    # Structural validity of each mate.
    if first.valid:
        checks.append(passed("first_readable", "first FASTQ is valid"))
    else:
        checks.append(
            failed(
                "first_readable",
                "first FASTQ is invalid",
                remediation="Repair or regenerate the first mate FASTQ.",
                errors=first.errors,
            )
        )
    if second.valid:
        checks.append(passed("second_readable", "second FASTQ is valid"))
    else:
        checks.append(
            failed(
                "second_readable",
                "second FASTQ is invalid",
                remediation="Repair or regenerate the second mate FASTQ.",
                errors=second.errors,
            )
        )

    # Record counts as reported by the two inspections.
    first_records = first.summary.get("records", 0)
    second_records = second.summary.get("records", 0)
    if first_records != second_records:
        checks.append(
            failed(
                "synchronized_read_counts",
                "FASTQ files contain different read counts",
                remediation="Recreate the paired FASTQ files from the same synchronized filtering step.",
                first_records=first_records,
                second_records=second_records,
            )
        )
    else:
        checks.append(passed("synchronized_read_counts", "FASTQ files contain equal read counts"))

    # Read IDs are compared only when both files parsed cleanly.
    if first.valid and second.valid:
        mismatches = []
        compared = 0
        id_pairs = zip_longest(_iter_fastq_ids(path), _iter_fastq_ids(mate_path))
        for left_id, right_id in id_pairs:
            compared += 1  # 1-based index of the current record pair
            if left_id != right_id:
                mismatches.append(
                    {"record": compared, "first_id": left_id, "second_id": right_id}
                )
            if len(mismatches) >= 10:
                break

        if not mismatches:
            checks.append(
                passed("matching_read_ids", "paired FASTQ read IDs match", compared=compared)
            )
        else:
            checks.append(
                failed(
                    "matching_read_ids",
                    "paired FASTQ read IDs differ",
                    remediation="Verify that R1 and R2 files belong to the same sample and filtering step.",
                    examples=mismatches,
                )
            )

    return result(
        "paired_fastq",
        checks,
        path=str(path),
        artifact_type="fastq",
        warnings=first.warnings + second.warnings,
        errors=first.errors + second.errors,
    )
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from bioartifact.contracts.common import result
|
|
6
|
+
from bioartifact.inspectors.bed import inspect_narrowpeak
|
|
7
|
+
from bioartifact.models import ContractResult, failed, passed
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def validate_narrowpeak(path: Path, **_: object) -> ContractResult:
    """Validate the ``narrowpeak`` contract for the file at ``path``.

    Emits four checks: ``readable`` (the inspection is valid),
    ``records_present`` (the summary reports a non-zero record count),
    ``required_columns`` (no inspection error mentions "fewer than 10"), and
    ``coordinates_valid`` (no inspection error mentions a coordinate
    problem). Extra keyword arguments are ignored.
    """
    artifact = inspect_narrowpeak(path)
    checks = []

    # Check 1: the inspector accepted the file.
    if artifact.valid:
        checks.append(passed("readable", "narrowPeak file is readable"))
    else:
        checks.append(
            failed(
                "readable",
                "narrowPeak file is invalid",
                remediation="Regenerate the peak caller output or validate that the file is tab-delimited narrowPeak.",
                errors=artifact.errors,
            )
        )

    # Check 2: at least one record was counted.
    records = artifact.summary.get("records", 0)
    if not records:
        checks.append(
            failed(
                "records_present",
                "narrowPeak contains no records",
                remediation="Check that peak calling completed and wrote peaks to the expected path.",
            )
        )
    else:
        checks.append(passed("records_present", "narrowPeak contains records", records=records))

    # Check 3: column-count problems, recognized by the inspector's error text.
    column_errors = []
    for message in artifact.errors:
        if "fewer than 10" in message:
            column_errors.append(message)
    if not column_errors:
        checks.append(passed("required_columns", "all rows contain required narrowPeak columns"))
    else:
        checks.append(
            failed(
                "required_columns",
                "one or more rows have fewer than 10 columns",
                remediation="Use a narrowPeak output, not a BED3/BED6 peak file, or choose a BED-oriented contract.",
                examples=column_errors,
            )
        )

    # Check 4: coordinate problems, recognized by the inspector's error text.
    coordinate_markers = ("coordinate", "end before start", "negative start")
    coordinate_errors = [
        message
        for message in artifact.errors
        if any(marker in message for marker in coordinate_markers)
    ]
    if not coordinate_errors:
        checks.append(passed("coordinates_valid", "all genomic coordinates are valid"))
    else:
        checks.append(
            failed(
                "coordinates_valid",
                "one or more rows contain invalid genomic coordinates",
                remediation="Ensure starts are non-negative integers and ends are greater than or equal to starts.",
                examples=coordinate_errors,
            )
        )

    return result(
        "narrowpeak",
        checks,
        path=str(path),
        artifact_type=artifact.artifact_type,
        warnings=artifact.warnings,
        errors=artifact.errors,
    )
|