pipeconcord 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ """Semantic comparison of bioinformatics pipeline outputs."""
2
+
3
+ from pipeconcord._version import __version__
4
+ from pipeconcord.core.engine import ComparisonEngine
5
+ from pipeconcord.core.report import ConcordanceReport
6
+
7
+ __all__ = ["ComparisonEngine", "ConcordanceReport", "__version__"]
8
+
@@ -0,0 +1,5 @@
1
+ from pipeconcord.cli import main
2
+
3
+ if __name__ == "__main__":
4
+ raise SystemExit(main())
5
+
@@ -0,0 +1 @@
1
+ __version__ = "0.2.0"
pipeconcord/cli.py ADDED
@@ -0,0 +1,147 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+
6
+ from pipeconcord.core.batch import run_batch
7
+ from pipeconcord.core.engine import ComparisonEngine
8
+ from pipeconcord.io.report_writers import (
9
+ batch_to_html,
10
+ batch_to_json,
11
+ batch_to_text,
12
+ batch_to_tsv,
13
+ report_to_html,
14
+ report_to_json,
15
+ report_to_text,
16
+ write_batch,
17
+ write_report,
18
+ )
19
+
20
+
21
def build_parser() -> argparse.ArgumentParser:
    """Build the ``pipeconcord`` command-line parser.

    The parser exposes two subcommands, ``compare`` and ``batch``. Both
    receive the shared comparator options from :func:`add_common_options`
    before their own positionals and output options are attached, so the
    option order in ``--help`` is: shared options, positionals, output.

    Returns:
        The fully configured top-level parser. The selected subcommand is
        stored on the parsed namespace as ``command``.
    """
    root = argparse.ArgumentParser(
        prog="pipeconcord",
        description="Generate semantic concordance reports for bioinformatics outputs.",
    )
    commands = root.add_subparsers(dest="command")

    # --- compare: a single pair of files -------------------------------
    compare = commands.add_parser("compare", help="Compare one pair of files.")
    add_common_options(compare)
    for positional, description in (
        ("file_a", "First output file to compare."),
        ("file_b", "Second output file to compare."),
    ):
        compare.add_argument(positional, help=description)
    compare.add_argument(
        "-o",
        "--output",
        help="Write the report to a file instead of stdout.",
    )
    compare.add_argument(
        "--format",
        choices=["html", "json", "text"],
        default="json",
        help="Output format.",
    )

    # --- batch: many pairs listed in a manifest -------------------------
    batch = commands.add_parser(
        "batch",
        help="Compare file pairs listed in a CSV/TSV manifest.",
    )
    add_common_options(batch)
    batch.add_argument(
        "manifest",
        help="CSV/TSV manifest with file_a and file_b columns.",
    )
    batch.add_argument(
        "--min-concordance",
        type=float,
        help="Fail if any successful comparison is below this threshold.",
    )
    batch.add_argument(
        "--stop-on-error",
        action="store_true",
        help="Stop on the first failed comparison.",
    )
    batch.add_argument(
        "-o",
        "--output",
        help="Write the batch report to a file instead of stdout.",
    )
    batch.add_argument(
        "--format",
        choices=["html", "json", "tsv", "text"],
        default="tsv",
        help="Batch output format.",
    )

    return root
44
+
45
+
46
def add_common_options(parser: argparse.ArgumentParser) -> None:
    """Attach the comparator options shared by every subcommand.

    None of the options declares a default, so an option the user did not
    pass is ``None`` on the parsed namespace.

    Args:
        parser: The (sub)parser to extend in place.
    """
    # (flags, add_argument keyword arguments), in the order they should
    # appear in ``--help``.
    option_table = (
        (
            ("-t", "--type"),
            {
                "dest": "file_type",
                "help": "Force a comparator/file type such as bam_stats, bed, counts, deg, expression, fasta, fastq, table, csv, tsv, or vcf.",
            },
        ),
        (("--key",), {"dest": "key_column", "help": "Column to use for row alignment."}),
        (("--delimiter",), {"help": "Force a delimiter for tabular files."}),
        (("--alpha",), {"type": float, "help": "DEG adjusted p-value threshold. Default: 0.05."}),
        (
            ("--lfc-threshold",),
            {"type": float, "help": "DEG absolute log-fold-change threshold. Default: 0.0."},
        ),
        (
            ("--top-n",),
            {"type": int, "help": "Number of top-ranked DEG genes to compare. Default: 50."},
        ),
        (
            ("--gene-column",),
            {"help": "Gene identifier column override for DEG/count matrices."},
        ),
        (
            ("--sample-columns",),
            {"help": "Counts comparator sample columns as a comma-separated list."},
        ),
        (
            ("--min-reciprocal-overlap",),
            {"type": float, "help": "BED interval match threshold. Default: 0.0 for any overlap."},
        ),
        (
            ("--reference-fasta",),
            {"help": "Reference FASTA for optional VCF indel left-alignment."},
        ),
        (("--logfc-column",), {"help": "DEG log-fold-change column override."}),
        (("--padj-column",), {"help": "DEG adjusted p-value column override."}),
        (
            ("--pvalue-column",),
            {"help": "DEG raw p-value column override when adjusted p-values are absent."},
        ),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
60
+
61
+
62
def main(argv: list[str] | None = None) -> int:
    """Run the ``pipeconcord`` command line and return its exit status.

    When the first argument is not a subcommand name or a help flag,
    ``compare`` is prepended, so ``pipeconcord a b`` behaves like
    ``pipeconcord compare a b``.

    Args:
        argv: Arguments to parse; ``sys.argv[1:]`` is used when ``None``.

    Returns:
        The status from the selected subcommand, or ``2`` after printing
        the help text when no arguments were given.

    Raises:
        SystemExit: With status ``2`` when the subcommand raises; the
            error is reported through ``parser.exit``.
    """
    raw_args = list(argv) if argv is not None else list(sys.argv[1:])
    recognised_first_tokens = {"compare", "batch", "-h", "--help"}
    if raw_args and raw_args[0] not in recognised_first_tokens:
        raw_args.insert(0, "compare")

    parser = build_parser()
    args = parser.parse_args(raw_args)
    if args.command is None:
        parser.print_help()
        return 2

    handler = run_batch_command if args.command == "batch" else run_compare_command
    try:
        return handler(args)
    except Exception as exc:
        # parser.exit() prints the message and terminates via SystemExit.
        parser.exit(2, f"pipeconcord: error: {exc}\n")
    return 0
79
+
80
+
81
def run_compare_command(args: argparse.Namespace) -> int:
    """Compare one pair of files and emit the report.

    The report is written to ``args.output`` when that is set; otherwise it
    is rendered in ``args.format`` and printed to stdout. Any format other
    than ``html`` or ``text`` is printed as JSON.

    Args:
        args: Parsed namespace of the ``compare`` subcommand.

    Returns:
        Always ``0``; failures surface as exceptions.
    """
    report = ComparisonEngine().compare(
        args.file_a,
        args.file_b,
        **comparison_kwargs(args),
    )

    if args.output:
        write_report(report, args.output, fmt=args.format)
        return 0

    stdout_renderers = {"html": report_to_html, "text": report_to_text}
    render = stdout_renderers.get(args.format, report_to_json)
    print(render(report))
    return 0
97
+
98
+
99
def run_batch_command(args: argparse.Namespace) -> int:
    """Run every comparison in a manifest and emit the batch report.

    The ``--min-concordance`` threshold is validated *before* any work is
    done, so an out-of-range value fails fast instead of after the whole
    batch has run and its report has already been written or printed.

    Args:
        args: Parsed namespace of the ``batch`` subcommand.

    Returns:
        ``1`` when any comparison reported an error, or when a successful
        comparison scored below ``args.min_concordance``; ``0`` otherwise.

    Raises:
        ValueError: If ``args.min_concordance`` is given but lies outside
            the closed interval ``[0.0, 1.0]`` (NaN is rejected as well).
    """
    threshold = args.min_concordance
    if threshold is not None and not 0.0 <= threshold <= 1.0:
        raise ValueError("min-concordance must be between 0.0 and 1.0")

    results = run_batch(
        args.manifest,
        stop_on_error=args.stop_on_error,
        default_file_type=args.file_type,
        **comparison_kwargs(args, include_file_type=False),
    )

    if args.output:
        write_batch(results, args.output, fmt=args.format)
    elif args.format == "html":
        print(batch_to_html(results))
    elif args.format == "json":
        print(batch_to_json(results))
    elif args.format == "text":
        print(batch_to_text(results))
    else:
        print(batch_to_tsv(results))

    has_errors = any(result.error for result in results)
    # Only comparisons that produced a report can be judged against the
    # threshold; failed ones are already covered by ``has_errors``.
    below_threshold = threshold is not None and any(
        result.report is not None and result.report.overall_concordance < threshold
        for result in results
    )
    return 1 if has_errors or below_threshold else 0
124
+
125
+
126
def comparison_kwargs(args: argparse.Namespace, *, include_file_type: bool = True) -> dict[str, object]:
    """Collect the shared comparator options from a parsed namespace.

    Args:
        args: Namespace produced by a parser extended with
            :func:`add_common_options`.
        include_file_type: When true, also copy ``args.file_type`` under
            the ``"file_type"`` key (added last).

    Returns:
        A new dict mapping option names to their parsed values; options
        the user did not pass are present with whatever value the
        namespace holds for them.
    """
    forwarded_options = (
        "key_column",
        "delimiter",
        "alpha",
        "lfc_threshold",
        "top_n",
        "gene_column",
        "sample_columns",
        "min_reciprocal_overlap",
        "reference_fasta",
        "logfc_column",
        "padj_column",
        "pvalue_column",
    )
    kwargs: dict[str, object] = {option: getattr(args, option) for option in forwarded_options}
    if include_file_type:
        kwargs["file_type"] = args.file_type
    return kwargs
144
+
145
+
146
+ if __name__ == "__main__":
147
+ raise SystemExit(main())
@@ -0,0 +1,25 @@
1
+ from __future__ import annotations
2
+
3
+ from pipeconcord.comparators.bam_stats import BAMStatsComparator
4
+ from pipeconcord.comparators.bed import BEDComparator
5
+ from pipeconcord.comparators.counts import CountsComparator
6
+ from pipeconcord.comparators.deg import DEGComparator
7
+ from pipeconcord.comparators.expression import ExpressionComparator
8
+ from pipeconcord.comparators.fasta import FASTAComparator
9
+ from pipeconcord.comparators.table import TableComparator
10
+ from pipeconcord.comparators.vcf import VCFComparator
11
+ from pipeconcord.core.registry import ComparatorRegistry
12
+
13
+
14
def register_builtin_comparators(registry: type[ComparatorRegistry] = ComparatorRegistry) -> None:
    """Register every built-in comparator class with ``registry``.

    Args:
        registry: Registry class whose ``register`` is called once per
            comparator class; defaults to :class:`ComparatorRegistry`.
    """
    # NOTE(review): registration order is kept exactly as released;
    # presumably the registry consults comparators in this order, with the
    # generic table comparator last - confirm against ComparatorRegistry.
    builtin_comparators = (
        DEGComparator,
        ExpressionComparator,
        CountsComparator,
        BEDComparator,
        FASTAComparator,
        VCFComparator,
        BAMStatsComparator,
        TableComparator,
    )
    for comparator_cls in builtin_comparators:
        registry.register(comparator_cls)
23
+
24
+
25
+ __all__ = ["BAMStatsComparator", "BEDComparator", "CountsComparator", "DEGComparator", "ExpressionComparator", "FASTAComparator", "TableComparator", "VCFComparator", "register_builtin_comparators"]
@@ -0,0 +1,229 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+
7
+ from pipeconcord.comparators.base import Comparator
8
+ from pipeconcord.core.report import ConcordanceReport
9
+ from pipeconcord.core.utils import clamp01
10
+
11
+
12
+ FLAGSTAT_COUNT_RE = re.compile(r"^(\d+)\s+\+\s+\d+\s+(.+)$")
13
+
14
+
15
+ @dataclass(slots=True)
16
+ class AlignmentStats:
17
+ path: str
18
+ total_reads: float | None = None
19
+ mapped_reads: float | None = None
20
+ duplicate_reads: float | None = None
21
+ paired_reads: float | None = None
22
+ properly_paired_reads: float | None = None
23
+ insert_size_average: float | None = None
24
+ average_length: float | None = None
25
+ error_rate: float | None = None
26
+ recognized_metrics: int = 0
27
+
28
+ @property
29
+ def alignment_rate(self) -> float | None:
30
+ return fraction(self.mapped_reads, self.total_reads)
31
+
32
+ @property
33
+ def duplicate_rate(self) -> float | None:
34
+ return fraction(self.duplicate_reads, self.total_reads)
35
+
36
+ @property
37
+ def proper_pair_rate(self) -> float | None:
38
+ return fraction(self.properly_paired_reads, self.paired_reads)
39
+
40
+
41
class BAMStatsComparator(Comparator):
    """Comparator for samtools flagstat/stats alignment summaries."""

    name = "bam_stats"
    supported_types = ("bam_stats", "bam-stats", "flagstat", "samtools-stats")

    def can_handle(self, file_a: str, file_b: str, **kwargs: object) -> bool:
        """Decide whether this comparator applies to the two inputs.

        An explicit ``file_type`` keyword is authoritative: the answer is
        simply whether it is one of ``supported_types``. Without it, both
        files are sniffed with ``looks_like_bam_stats``.
        """
        requested_type = kwargs.get("file_type")
        if requested_type is None:
            return looks_like_bam_stats(file_a) and looks_like_bam_stats(file_b)
        return requested_type in self.supported_types

    def compare(self, file_a: str, file_b: str, **kwargs: object) -> ConcordanceReport:
        """Parse both summaries and score how closely they agree.

        Each scored metric contributes one value to the overall
        concordance, which is the clamped mean of all scored values. The
        per-file alignment and duplicate rates are reported as extra,
        unscored metrics when available.

        Raises:
            ValueError: Propagated from ``parse_alignment_stats`` when a
                file contains no recognised metric.
        """
        parsed_a = parse_alignment_stats(file_a)
        parsed_b = parse_alignment_stats(file_b)
        metrics: dict[str, float] = {}
        scores: list[float] = []

        # (recorder, metric name, AlignmentStats attribute), in report order.
        scored_metrics = (
            (add_ratio_metric, "total_reads_ratio", "total_reads"),
            (add_ratio_metric, "mapped_reads_ratio", "mapped_reads"),
            (add_rate_similarity, "alignment_rate_similarity", "alignment_rate"),
            (add_rate_similarity, "duplicate_rate_similarity", "duplicate_rate"),
            (add_rate_similarity, "proper_pair_rate_similarity", "proper_pair_rate"),
            (add_ratio_metric, "insert_size_average_ratio", "insert_size_average"),
            (add_ratio_metric, "average_length_ratio", "average_length"),
            (add_rate_similarity, "error_rate_similarity", "error_rate"),
        )
        for record, metric_name, attribute in scored_metrics:
            record(
                metrics,
                scores,
                metric_name,
                getattr(parsed_a, attribute),
                getattr(parsed_b, attribute),
            )

        # Informational per-file rates; they do not feed the overall score.
        per_file_rates = (
            ("alignment_rate_file_a", parsed_a.alignment_rate),
            ("alignment_rate_file_b", parsed_b.alignment_rate),
            ("duplicate_rate_file_a", parsed_a.duplicate_rate),
            ("duplicate_rate_file_b", parsed_b.duplicate_rate),
        )
        for metric_name, rate in per_file_rates:
            if rate is not None:
                metrics[metric_name] = rate

        return ConcordanceReport(
            comparator=type(self).__name__,
            file_a=str(file_a),
            file_b=str(file_b),
            overall_concordance=clamp01(mean(scores)),
            metrics=metrics,
            details={
                "file_a": stats_to_dict(parsed_a),
                "file_b": stats_to_dict(parsed_b),
                "scored_metric_count": len(scores),
            },
            warnings=[],
        )
93
+
94
+
95
def looks_like_bam_stats(path: str) -> bool:
    """Sniff whether ``path`` is samtools ``stats`` or ``flagstat`` output.

    Only the first 8192 characters are read, so probing a very large file
    during auto-detection does not load it into memory. Undecodable bytes
    are replaced rather than raising.

    Args:
        path: File to inspect.

    Returns:
        ``True`` when the prefix contains a samtools stats summary line
        (a line starting with ``SN<TAB>``) or both flagstat markers
        (`` in total `` and `` mapped (``, case-insensitively).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with Path(path).open(encoding="utf-8", errors="replace") as handle:
        head = handle.read(8192)
    if head.startswith("SN\t") or "\nSN\t" in head:
        return True
    lowered = head.lower()
    return " in total " in lowered and " mapped (" in lowered
101
+
102
+
103
def parse_alignment_stats(path: str) -> AlignmentStats:
    """Parse a samtools ``stats`` or ``flagstat`` report.

    Blank lines are skipped. Lines starting with ``SN<TAB>`` go to the
    samtools stats parser; every other line is offered to the flagstat
    parser, which ignores lines it does not recognise.

    Args:
        path: Report file to read (decoded as UTF-8, bad bytes replaced).

    Returns:
        The populated :class:`AlignmentStats`.

    Raises:
        ValueError: If no line of the file yielded a recognised metric.
    """
    parsed = AlignmentStats(path=str(path))
    content = Path(path).read_text(encoding="utf-8", errors="replace")
    for entry in filter(None, map(str.strip, content.splitlines())):
        line_parser = parse_samtools_stats_line if entry.startswith("SN\t") else parse_flagstat_line
        line_parser(parsed, entry)
    if parsed.recognized_metrics == 0:
        raise ValueError(f"{path!r} does not look like samtools stats or flagstat output")
    return parsed
116
+
117
+
118
def parse_samtools_stats_line(stats: AlignmentStats, line: str) -> None:
    """Store one samtools ``stats`` summary-number line on ``stats``.

    The line is expected as ``SN<TAB>key:<TAB>value[<TAB>comment]``. Lines
    with fewer than three fields, a non-numeric value, or a key that is
    not tracked are ignored.

    Besides the read totals, the paired-end counters (``reads paired`` and
    ``reads properly paired``) are captured, so ``proper_pair_rate`` is
    available for stats files just as it is for flagstat files.

    Args:
        stats: Accumulator updated in place; ``recognized_metrics`` is
            incremented for every stored value.
        line: One stripped line starting with ``SN<TAB>``.
    """
    fields = line.split("\t")
    if len(fields) < 3:
        return
    key = fields[1].rstrip(":").lower()
    value = parse_number(fields[2])
    if value is None:
        return
    # samtools stats summary key -> AlignmentStats attribute
    mapping = {
        "raw total sequences": "total_reads",
        "reads mapped": "mapped_reads",
        "reads duplicated": "duplicate_reads",
        "reads paired": "paired_reads",
        "reads properly paired": "properly_paired_reads",
        "insert size average": "insert_size_average",
        "average length": "average_length",
        "error rate": "error_rate",
    }
    attribute = mapping.get(key)
    if attribute is not None:
        setattr(stats, attribute, value)
        stats.recognized_metrics += 1
138
+
139
+
140
def parse_flagstat_line(stats: AlignmentStats, line: str) -> None:
    """Store one samtools ``flagstat`` line on ``stats``.

    Only the first count of a ``<count> + <count> <label>`` line is used.
    Lines that do not match that shape, or whose label is not tracked,
    are ignored.

    Args:
        stats: Accumulator updated in place; ``recognized_metrics`` is
            incremented for every stored value.
        line: One stripped, non-empty line of the report.
    """
    parsed = FLAGSTAT_COUNT_RE.match(line)
    if parsed is None:
        return
    raw_count, raw_label = parsed.groups()
    label = raw_label.lower()

    if " in total " in f" {label} ":
        target = "total_reads"
    else:
        # Checked in order; the first matching label prefix wins.
        prefix_targets = (
            ("mapped ", "mapped_reads"),
            ("duplicates", "duplicate_reads"),
            ("paired in sequencing", "paired_reads"),
            ("properly paired", "properly_paired_reads"),
        )
        target = next(
            (attribute for prefix, attribute in prefix_targets if label.startswith(prefix)),
            None,
        )
    if target is None:
        return

    setattr(stats, target, float(raw_count))
    stats.recognized_metrics += 1
159
+
160
+
161
def parse_number(value: str) -> float | None:
    """Convert ``value`` to ``float``, or return ``None`` if it is not numeric.

    Surrounding whitespace is ignored.
    """
    try:
        number = float(value.strip())
    except ValueError:
        number = None
    return number
166
+
167
+
168
def fraction(numerator: float | None, denominator: float | None) -> float | None:
    """Return ``numerator / denominator``, or ``None`` when it is undefined.

    The quotient is undefined when either operand is ``None`` or the
    denominator equals zero.
    """
    if numerator is not None and denominator is not None and denominator != 0:
        return numerator / denominator
    return None
172
+
173
+
174
def add_ratio_metric(
    metrics: dict[str, float],
    scores: list[float],
    name: str,
    left: float | None,
    right: float | None,
) -> None:
    """Record a magnitude-ratio score for one metric.

    Nothing is recorded when both sides are missing. When exactly one side
    is missing the score is ``0.0``, matching ``add_rate_similarity``; a
    missing value is deliberately *not* treated as zero, because that
    would report perfect agreement (``1.0``) between "not reported" and a
    genuine ``0`` on the other side.

    Args:
        metrics: Named metric values, updated in place under ``name``.
        scores: Values that feed the overall score, appended in place.
        name: Metric name to record.
        left: Value from the first file, or ``None`` when absent.
        right: Value from the second file, or ``None`` when absent.
    """
    if left is None and right is None:
        return
    if left is None or right is None:
        score = 0.0
    else:
        score = magnitude_ratio(left, right)
    metrics[name] = score
    scores.append(score)
186
+
187
+
188
def add_rate_similarity(
    metrics: dict[str, float],
    scores: list[float],
    name: str,
    left: float | None,
    right: float | None,
) -> None:
    """Record how similar two rates are.

    Nothing is recorded when both rates are missing. With both present the
    score is one minus their absolute difference, clamped by ``clamp01``;
    with exactly one present the score is ``0.0``.

    Args:
        metrics: Named metric values, updated in place under ``name``.
        scores: Values that feed the overall score, appended in place.
        name: Metric name to record.
        left: Rate from the first file, or ``None`` when absent.
        right: Rate from the second file, or ``None`` when absent.
    """
    if left is None and right is None:
        return
    if left is not None and right is not None:
        similarity = clamp01(1.0 - abs(left - right))
    else:
        similarity = 0.0
    metrics[name] = similarity
    scores.append(similarity)
200
+
201
+
202
def magnitude_ratio(left: float, right: float) -> float:
    """Return the smaller magnitude divided by the larger one.

    Signs are ignored. Two zeros count as identical and give ``1.0``.
    """
    size_left, size_right = abs(left), abs(right)
    if not (size_left or size_right):
        return 1.0
    smaller = min(size_left, size_right)
    larger = max(size_left, size_right)
    return smaller / larger
206
+
207
+
208
def mean(values: list[float]) -> float:
    """Return the arithmetic mean of ``values``; ``0.0`` for an empty list."""
    return sum(values) / len(values) if values else 0.0
212
+
213
+
214
def stats_to_dict(stats: AlignmentStats) -> dict[str, float | str | None]:
    """Flatten ``stats`` into a plain dict.

    The dict holds the source path, every parsed value and the three
    derived rates, in that order; ``recognized_metrics`` is not included.
    """
    exported_fields = (
        "path",
        "total_reads",
        "mapped_reads",
        "duplicate_reads",
        "paired_reads",
        "properly_paired_reads",
        "insert_size_average",
        "average_length",
        "error_rate",
        "alignment_rate",
        "duplicate_rate",
        "proper_pair_rate",
    )
    return {field_name: getattr(stats, field_name) for field_name in exported_fields}
229
+
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ from pipeconcord.core.report import ConcordanceReport
6
+
7
+
8
class Comparator(ABC):
    """Abstract interface implemented by every semantic output comparator.

    Subclasses override ``name`` and ``supported_types`` and implement both
    abstract methods; the base class itself cannot be instantiated.
    """

    # Identifier of the comparator.
    name: str = "base"
    # File-type labels this comparator accepts when a type is forced.
    supported_types: tuple[str, ...] = ()

    @abstractmethod
    def can_handle(self, file_a: str, file_b: str, **kwargs: object) -> bool:
        """Report whether this comparator can compare the two inputs.

        Returns:
            ``True`` when the comparator applies to ``file_a`` and
            ``file_b`` given the supplied keyword options.
        """

    @abstractmethod
    def compare(self, file_a: str, file_b: str, **kwargs: object) -> ConcordanceReport:
        """Compare the two inputs.

        Returns:
            A unified concordance report describing the comparison.
        """
21
+