pipeconcord 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pipeconcord-0.2.0/LICENSE +21 -0
- pipeconcord-0.2.0/PKG-INFO +232 -0
- pipeconcord-0.2.0/README.md +205 -0
- pipeconcord-0.2.0/pipeconcord/__init__.py +8 -0
- pipeconcord-0.2.0/pipeconcord/__main__.py +5 -0
- pipeconcord-0.2.0/pipeconcord/_version.py +1 -0
- pipeconcord-0.2.0/pipeconcord/cli.py +147 -0
- pipeconcord-0.2.0/pipeconcord/comparators/__init__.py +25 -0
- pipeconcord-0.2.0/pipeconcord/comparators/bam_stats.py +229 -0
- pipeconcord-0.2.0/pipeconcord/comparators/base.py +21 -0
- pipeconcord-0.2.0/pipeconcord/comparators/bed.py +269 -0
- pipeconcord-0.2.0/pipeconcord/comparators/counts.py +342 -0
- pipeconcord-0.2.0/pipeconcord/comparators/deg.py +411 -0
- pipeconcord-0.2.0/pipeconcord/comparators/expression.py +228 -0
- pipeconcord-0.2.0/pipeconcord/comparators/fasta.py +232 -0
- pipeconcord-0.2.0/pipeconcord/comparators/table.py +266 -0
- pipeconcord-0.2.0/pipeconcord/comparators/vcf.py +393 -0
- pipeconcord-0.2.0/pipeconcord/core/__init__.py +5 -0
- pipeconcord-0.2.0/pipeconcord/core/batch.py +109 -0
- pipeconcord-0.2.0/pipeconcord/core/engine.py +36 -0
- pipeconcord-0.2.0/pipeconcord/core/registry.py +63 -0
- pipeconcord-0.2.0/pipeconcord/core/report.py +39 -0
- pipeconcord-0.2.0/pipeconcord/core/utils.py +113 -0
- pipeconcord-0.2.0/pipeconcord/detection/__init__.py +4 -0
- pipeconcord-0.2.0/pipeconcord/detection/dispatch.py +15 -0
- pipeconcord-0.2.0/pipeconcord/detection/filetype.py +88 -0
- pipeconcord-0.2.0/pipeconcord/io/__init__.py +23 -0
- pipeconcord-0.2.0/pipeconcord/io/report_writers.py +411 -0
- pipeconcord-0.2.0/pipeconcord.egg-info/PKG-INFO +232 -0
- pipeconcord-0.2.0/pipeconcord.egg-info/SOURCES.txt +47 -0
- pipeconcord-0.2.0/pipeconcord.egg-info/dependency_links.txt +1 -0
- pipeconcord-0.2.0/pipeconcord.egg-info/entry_points.txt +12 -0
- pipeconcord-0.2.0/pipeconcord.egg-info/requires.txt +5 -0
- pipeconcord-0.2.0/pipeconcord.egg-info/top_level.txt +1 -0
- pipeconcord-0.2.0/pyproject.toml +69 -0
- pipeconcord-0.2.0/setup.cfg +4 -0
- pipeconcord-0.2.0/tests/test_bam_stats_comparator.py +97 -0
- pipeconcord-0.2.0/tests/test_batch.py +98 -0
- pipeconcord-0.2.0/tests/test_bed_comparator.py +62 -0
- pipeconcord-0.2.0/tests/test_cli.py +68 -0
- pipeconcord-0.2.0/tests/test_counts_comparator.py +80 -0
- pipeconcord-0.2.0/tests/test_deg_comparator.py +69 -0
- pipeconcord-0.2.0/tests/test_engine_integration.py +22 -0
- pipeconcord-0.2.0/tests/test_expression_comparator.py +56 -0
- pipeconcord-0.2.0/tests/test_fasta_comparator.py +61 -0
- pipeconcord-0.2.0/tests/test_filetype.py +44 -0
- pipeconcord-0.2.0/tests/test_report_writers.py +58 -0
- pipeconcord-0.2.0/tests/test_table_comparator.py +50 -0
- pipeconcord-0.2.0/tests/test_vcf_comparator.py +140 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 JunhaoQiu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pipeconcord
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Format-aware concordance reports for bioinformatics pipeline outputs.
|
|
5
|
+
Author: PipeConcord contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://qchiujunhao.github.io/pipeconcord/
|
|
8
|
+
Project-URL: Repository, https://github.com/qchiujunhao/pipeconcord
|
|
9
|
+
Project-URL: Issues, https://github.com/qchiujunhao/pipeconcord/issues
|
|
10
|
+
Project-URL: Documentation, https://qchiujunhao.github.io/pipeconcord/
|
|
11
|
+
Keywords: bioinformatics,testing,concordance,pipeline,comparison
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: coverage[toml]>=7.0; extra == "dev"
|
|
24
|
+
Requires-Dist: mkdocs>=1.6; extra == "dev"
|
|
25
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# PipeConcord
|
|
29
|
+
|
|
30
|
+
`pipeconcord` is a Python toolkit for comparing bioinformatics pipeline outputs
|
|
31
|
+
with semantic, format-aware metrics. Instead of only checking whether files are
|
|
32
|
+
byte-for-byte identical, it measures whether two runs agree in biologically or
|
|
33
|
+
analytically meaningful ways.
|
|
34
|
+
|
|
35
|
+
Project website: <https://qchiujunhao.github.io/pipeconcord/>
|
|
36
|
+
|
|
37
|
+
Status: alpha. The core comparison model and initial comparators are usable, but
|
|
38
|
+
APIs and metrics may change as more bioinformatics formats and workflows are
|
|
39
|
+
validated.
|
|
40
|
+
|
|
41
|
+
Rename note: the first alpha release used the package name `biocompare`. Current
|
|
42
|
+
and future releases use `pipeconcord` to avoid confusion with an unrelated
|
|
43
|
+
life-science product directory.
|
|
44
|
+
|
|
45
|
+
This repository currently implements the Phase 1 vertical slice:
|
|
46
|
+
|
|
47
|
+
- a shared `ConcordanceReport` model
|
|
48
|
+
- comparator registry with plugin entry point support
|
|
49
|
+
- file type detection for common bioinformatics and tabular formats
|
|
50
|
+
- a differential expression result comparator
|
|
51
|
+
- a count/expression matrix comparator
|
|
52
|
+
- a normalized expression matrix comparator
|
|
53
|
+
- a BED interval comparator
|
|
54
|
+
- a FASTA/FASTQ sequence comparator
|
|
55
|
+
- a lightweight VCF comparator with ALT splitting and minimal allele trimming
|
|
56
|
+
- a `samtools flagstat`/`samtools stats` comparator
|
|
57
|
+
- a generic CSV/TSV table comparator
|
|
58
|
+
- JSON, text, HTML, and TSV report writers
|
|
59
|
+
- a command-line interface
|
|
60
|
+
- automated tests with `unittest`
|
|
61
|
+
|
|
62
|
+
## Quickstart
|
|
63
|
+
|
|
64
|
+
Install from PyPI:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
python3 -m pip install pipeconcord
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
The package is published at <https://pypi.org/project/pipeconcord/>.
|
|
71
|
+
PipeConcord requires Python 3.10 or newer.
|
|
72
|
+
|
|
73
|
+
To test unreleased changes from the default branch, install from GitHub:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
python3 -m pip install git+https://github.com/qchiujunhao/pipeconcord.git
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Create two small TSV files:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
cat > old.tsv <<'EOF'
|
|
83
|
+
gene_id value
|
|
84
|
+
A 1.0
|
|
85
|
+
B 2.0
|
|
86
|
+
EOF
|
|
87
|
+
|
|
88
|
+
cat > new.tsv <<'EOF'
|
|
89
|
+
gene_id value
|
|
90
|
+
A 1.1
|
|
91
|
+
B 2.0
|
|
92
|
+
EOF
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Compare them by `gene_id`:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
pipeconcord compare old.tsv new.tsv --key gene_id
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Write a report to disk:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pipeconcord compare old.tsv new.tsv \
|
|
105
|
+
--key gene_id \
|
|
106
|
+
--output report.json
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Compare differential expression tables:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
pipeconcord compare old_de.tsv new_de.tsv \
|
|
113
|
+
--type deg \
|
|
114
|
+
--alpha 0.05
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Compare count matrices:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
pipeconcord compare old_counts.tsv new_counts.tsv \
|
|
121
|
+
--type counts
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Compare normalized expression matrices:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
pipeconcord compare old_tpm.tsv new_tpm.tsv \
|
|
128
|
+
--type expression
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Compare BED intervals:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
pipeconcord compare old_peaks.bed new_peaks.bed \
|
|
135
|
+
--type bed \
|
|
136
|
+
--min-reciprocal-overlap 0.5
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Compare FASTA sequences:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
pipeconcord compare old_sequences.fa new_sequences.fa \
|
|
143
|
+
--type fasta
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Compare VCF calls:
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
pipeconcord compare old_calls.vcf new_calls.vcf \
|
|
150
|
+
--type vcf
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Optionally provide a reference FASTA for simple repeated-indel left alignment:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
pipeconcord compare calls_a.vcf calls_b.vcf \
|
|
157
|
+
--type vcf \
|
|
158
|
+
--reference-fasta reference.fa
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Compare alignment summary statistics:
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
pipeconcord compare old_flagstat.txt new_flagstat.txt \
|
|
165
|
+
--type bam_stats
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Run a batch comparison from a manifest:
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
pipeconcord batch manifest.tsv --format text
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
The manifest must contain `file_a` and `file_b` columns. Optional columns are
|
|
175
|
+
`label` and `type`.
|
|
176
|
+
|
|
177
|
+
Use `--min-concordance` in CI to fail when any comparison falls below a chosen
|
|
178
|
+
threshold:
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
pipeconcord batch manifest.tsv --min-concordance 0.95
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Write an HTML report:
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
pipeconcord compare old_peaks.bed new_peaks.bed \
|
|
188
|
+
--type bed \
|
|
189
|
+
--format html \
|
|
190
|
+
--output report.html
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Batch reports also support `--format html`.
|
|
194
|
+
|
|
195
|
+
## Development
|
|
196
|
+
|
|
197
|
+
Install the repository in editable mode with development tools:
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
python3 -m pip install -e ".[dev]"
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
Run the tests:
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
python3 -m unittest discover -s tests
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
Run lint and coverage:
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
python3 -m ruff check .
|
|
213
|
+
python3 -m coverage run -m unittest discover -s tests
|
|
214
|
+
python3 -m coverage report
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Plugin Model
|
|
218
|
+
|
|
219
|
+
Comparators subclass `pipeconcord.comparators.base.Comparator` and return a
|
|
220
|
+
`pipeconcord.core.report.ConcordanceReport`. Third-party packages can register
|
|
221
|
+
comparators with the `pipeconcord.comparators` entry point group.
|
|
222
|
+
|
|
223
|
+
## Documentation
|
|
224
|
+
|
|
225
|
+
Additional documentation is available on the project website and in `docs/`,
|
|
226
|
+
including API notes, design rationale, and tutorials for regression testing,
|
|
227
|
+
RNA-seq outputs, variant calls, and BED peak comparisons.
|
|
228
|
+
|
|
229
|
+
## Citation and Paper Draft
|
|
230
|
+
|
|
231
|
+
Citation metadata is available in `CITATION.cff`. A draft JOSS-style paper is
|
|
232
|
+
available under `paper/`.
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# PipeConcord
|
|
2
|
+
|
|
3
|
+
`pipeconcord` is a Python toolkit for comparing bioinformatics pipeline outputs
|
|
4
|
+
with semantic, format-aware metrics. Instead of only checking whether files are
|
|
5
|
+
byte-for-byte identical, it measures whether two runs agree in biologically or
|
|
6
|
+
analytically meaningful ways.
|
|
7
|
+
|
|
8
|
+
Project website: <https://qchiujunhao.github.io/pipeconcord/>
|
|
9
|
+
|
|
10
|
+
Status: alpha. The core comparison model and initial comparators are usable, but
|
|
11
|
+
APIs and metrics may change as more bioinformatics formats and workflows are
|
|
12
|
+
validated.
|
|
13
|
+
|
|
14
|
+
Rename note: the first alpha release used the package name `biocompare`. Current
|
|
15
|
+
and future releases use `pipeconcord` to avoid confusion with an unrelated
|
|
16
|
+
life-science product directory.
|
|
17
|
+
|
|
18
|
+
This repository currently implements the Phase 1 vertical slice:
|
|
19
|
+
|
|
20
|
+
- a shared `ConcordanceReport` model
|
|
21
|
+
- comparator registry with plugin entry point support
|
|
22
|
+
- file type detection for common bioinformatics and tabular formats
|
|
23
|
+
- a differential expression result comparator
|
|
24
|
+
- a count/expression matrix comparator
|
|
25
|
+
- a normalized expression matrix comparator
|
|
26
|
+
- a BED interval comparator
|
|
27
|
+
- a FASTA/FASTQ sequence comparator
|
|
28
|
+
- a lightweight VCF comparator with ALT splitting and minimal allele trimming
|
|
29
|
+
- a `samtools flagstat`/`samtools stats` comparator
|
|
30
|
+
- a generic CSV/TSV table comparator
|
|
31
|
+
- JSON, text, HTML, and TSV report writers
|
|
32
|
+
- a command-line interface
|
|
33
|
+
- automated tests with `unittest`
|
|
34
|
+
|
|
35
|
+
## Quickstart
|
|
36
|
+
|
|
37
|
+
Install from PyPI:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
python3 -m pip install pipeconcord
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
The package is published at <https://pypi.org/project/pipeconcord/>.
|
|
44
|
+
PipeConcord requires Python 3.10 or newer.
|
|
45
|
+
|
|
46
|
+
To test unreleased changes from the default branch, install from GitHub:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
python3 -m pip install git+https://github.com/qchiujunhao/pipeconcord.git
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Create two small TSV files:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
cat > old.tsv <<'EOF'
|
|
56
|
+
gene_id value
|
|
57
|
+
A 1.0
|
|
58
|
+
B 2.0
|
|
59
|
+
EOF
|
|
60
|
+
|
|
61
|
+
cat > new.tsv <<'EOF'
|
|
62
|
+
gene_id value
|
|
63
|
+
A 1.1
|
|
64
|
+
B 2.0
|
|
65
|
+
EOF
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Compare them by `gene_id`:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pipeconcord compare old.tsv new.tsv --key gene_id
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Write a report to disk:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pipeconcord compare old.tsv new.tsv \
|
|
78
|
+
--key gene_id \
|
|
79
|
+
--output report.json
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Compare differential expression tables:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pipeconcord compare old_de.tsv new_de.tsv \
|
|
86
|
+
--type deg \
|
|
87
|
+
--alpha 0.05
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Compare count matrices:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pipeconcord compare old_counts.tsv new_counts.tsv \
|
|
94
|
+
--type counts
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Compare normalized expression matrices:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
pipeconcord compare old_tpm.tsv new_tpm.tsv \
|
|
101
|
+
--type expression
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Compare BED intervals:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
pipeconcord compare old_peaks.bed new_peaks.bed \
|
|
108
|
+
--type bed \
|
|
109
|
+
--min-reciprocal-overlap 0.5
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Compare FASTA sequences:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
pipeconcord compare old_sequences.fa new_sequences.fa \
|
|
116
|
+
--type fasta
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Compare VCF calls:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
pipeconcord compare old_calls.vcf new_calls.vcf \
|
|
123
|
+
--type vcf
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Optionally provide a reference FASTA for simple repeated-indel left alignment:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
pipeconcord compare calls_a.vcf calls_b.vcf \
|
|
130
|
+
--type vcf \
|
|
131
|
+
--reference-fasta reference.fa
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Compare alignment summary statistics:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
pipeconcord compare old_flagstat.txt new_flagstat.txt \
|
|
138
|
+
--type bam_stats
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Run a batch comparison from a manifest:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
pipeconcord batch manifest.tsv --format text
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
The manifest must contain `file_a` and `file_b` columns. Optional columns are
|
|
148
|
+
`label` and `type`.
|
|
149
|
+
|
|
150
|
+
Use `--min-concordance` in CI to fail when any comparison falls below a chosen
|
|
151
|
+
threshold:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
pipeconcord batch manifest.tsv --min-concordance 0.95
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Write an HTML report:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
pipeconcord compare old_peaks.bed new_peaks.bed \
|
|
161
|
+
--type bed \
|
|
162
|
+
--format html \
|
|
163
|
+
--output report.html
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Batch reports also support `--format html`.
|
|
167
|
+
|
|
168
|
+
## Development
|
|
169
|
+
|
|
170
|
+
Install the repository in editable mode with development tools:
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
python3 -m pip install -e ".[dev]"
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Run the tests:
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
python3 -m unittest discover -s tests
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Run lint and coverage:
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
python3 -m ruff check .
|
|
186
|
+
python3 -m coverage run -m unittest discover -s tests
|
|
187
|
+
python3 -m coverage report
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Plugin Model
|
|
191
|
+
|
|
192
|
+
Comparators subclass `pipeconcord.comparators.base.Comparator` and return a
|
|
193
|
+
`pipeconcord.core.report.ConcordanceReport`. Third-party packages can register
|
|
194
|
+
comparators with the `pipeconcord.comparators` entry point group.
|
|
195
|
+
|
|
196
|
+
## Documentation
|
|
197
|
+
|
|
198
|
+
Additional documentation is available on the project website and in `docs/`,
|
|
199
|
+
including API notes, design rationale, and tutorials for regression testing,
|
|
200
|
+
RNA-seq outputs, variant calls, and BED peak comparisons.
|
|
201
|
+
|
|
202
|
+
## Citation and Paper Draft
|
|
203
|
+
|
|
204
|
+
Citation metadata is available in `CITATION.cff`. A draft JOSS-style paper is
|
|
205
|
+
available under `paper/`.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Semantic comparison of bioinformatics pipeline outputs."""
|
|
2
|
+
|
|
3
|
+
from pipeconcord._version import __version__
|
|
4
|
+
from pipeconcord.core.engine import ComparisonEngine
|
|
5
|
+
from pipeconcord.core.report import ConcordanceReport
|
|
6
|
+
|
|
7
|
+
__all__ = ["ComparisonEngine", "ConcordanceReport", "__version__"]
|
|
8
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.0"
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from pipeconcord.core.batch import run_batch
|
|
7
|
+
from pipeconcord.core.engine import ComparisonEngine
|
|
8
|
+
from pipeconcord.io.report_writers import (
|
|
9
|
+
batch_to_html,
|
|
10
|
+
batch_to_json,
|
|
11
|
+
batch_to_text,
|
|
12
|
+
batch_to_tsv,
|
|
13
|
+
report_to_html,
|
|
14
|
+
report_to_json,
|
|
15
|
+
report_to_text,
|
|
16
|
+
write_batch,
|
|
17
|
+
write_report,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def build_parser() -> argparse.ArgumentParser:
    """Assemble the ``pipeconcord`` argument parser with its two subcommands.

    Returns:
        A parser exposing ``compare`` (one pair of files) and ``batch``
        (pairs listed in a manifest). The selected subcommand is stored on
        the parsed namespace as ``command``.
    """
    root = argparse.ArgumentParser(
        description="Generate semantic concordance reports for bioinformatics outputs.",
        prog="pipeconcord",
    )
    commands = root.add_subparsers(dest="command")

    # Single-pair comparison; JSON is the default output format.
    single = commands.add_parser("compare", help="Compare one pair of files.")
    add_common_options(single)
    for positional, description in (
        ("file_a", "First output file to compare."),
        ("file_b", "Second output file to compare."),
    ):
        single.add_argument(positional, help=description)
    single.add_argument(
        "-o",
        "--output",
        help="Write the report to a file instead of stdout.",
    )
    single.add_argument(
        "--format",
        choices=["html", "json", "text"],
        default="json",
        help="Output format.",
    )

    # Manifest-driven comparison; TSV is the default output format.
    manifest_cmd = commands.add_parser(
        "batch",
        help="Compare file pairs listed in a CSV/TSV manifest.",
    )
    add_common_options(manifest_cmd)
    manifest_cmd.add_argument(
        "manifest",
        help="CSV/TSV manifest with file_a and file_b columns.",
    )
    manifest_cmd.add_argument(
        "--min-concordance",
        type=float,
        help="Fail if any successful comparison is below this threshold.",
    )
    manifest_cmd.add_argument(
        "--stop-on-error",
        action="store_true",
        help="Stop on the first failed comparison.",
    )
    manifest_cmd.add_argument(
        "-o",
        "--output",
        help="Write the batch report to a file instead of stdout.",
    )
    manifest_cmd.add_argument(
        "--format",
        choices=["html", "json", "tsv", "text"],
        default="tsv",
        help="Batch output format.",
    )

    return root
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def add_common_options(parser: argparse.ArgumentParser) -> None:
    """Register the comparator options shared by ``compare`` and ``batch``.

    Args:
        parser: The (sub)parser to extend in place. Every option is optional
            and parses to ``None`` when it is not supplied.
    """
    # (flags, add_argument keyword arguments), in the order shown in --help.
    option_table = (
        (
            ("-t", "--type"),
            {
                "dest": "file_type",
                "help": "Force a comparator/file type such as bam_stats, bed, counts, deg, expression, fasta, fastq, table, csv, tsv, or vcf.",
            },
        ),
        (
            ("--key",),
            {"dest": "key_column", "help": "Column to use for row alignment."},
        ),
        (
            ("--delimiter",),
            {"help": "Force a delimiter for tabular files."},
        ),
        (
            ("--alpha",),
            {"type": float, "help": "DEG adjusted p-value threshold. Default: 0.05."},
        ),
        (
            ("--lfc-threshold",),
            {"type": float, "help": "DEG absolute log-fold-change threshold. Default: 0.0."},
        ),
        (
            ("--top-n",),
            {"type": int, "help": "Number of top-ranked DEG genes to compare. Default: 50."},
        ),
        (
            ("--gene-column",),
            {"help": "Gene identifier column override for DEG/count matrices."},
        ),
        (
            ("--sample-columns",),
            {"help": "Counts comparator sample columns as a comma-separated list."},
        ),
        (
            ("--min-reciprocal-overlap",),
            {"type": float, "help": "BED interval match threshold. Default: 0.0 for any overlap."},
        ),
        (
            ("--reference-fasta",),
            {"help": "Reference FASTA for optional VCF indel left-alignment."},
        ),
        (
            ("--logfc-column",),
            {"help": "DEG log-fold-change column override."},
        ),
        (
            ("--padj-column",),
            {"help": "DEG adjusted p-value column override."},
        ),
        (
            ("--pvalue-column",),
            {"help": "DEG raw p-value column override when adjusted p-values are absent."},
        ),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``pipeconcord`` command line and return a process exit code.

    Args:
        argv: Arguments to parse; ``sys.argv[1:]`` is used when ``None``.

    Returns:
        The exit code of the selected subcommand, or ``2`` after printing the
        help text when no subcommand was selected.

    Any exception raised by a subcommand is reported through ``parser.exit``
    with status ``2``.
    """
    if argv is None:
        cli_args = list(sys.argv[1:])
    else:
        cli_args = list(argv)

    # A leading token that is neither a subcommand nor a help flag is taken
    # as the start of an implicit "compare" invocation.
    if cli_args and cli_args[0] not in {"compare", "batch", "-h", "--help"}:
        cli_args = ["compare"] + cli_args

    parser = build_parser()
    args = parser.parse_args(cli_args)
    if args.command is None:
        parser.print_help()
        return 2

    try:
        if args.command != "batch":
            return run_compare_command(args)
        return run_batch_command(args)
    except Exception as exc:
        parser.exit(2, f"pipeconcord: error: {exc}\n")
    return 0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def run_compare_command(args: argparse.Namespace) -> int:
    """Compare ``args.file_a`` with ``args.file_b`` and emit the report.

    The report is written to ``args.output`` when that is set. Otherwise it is
    printed to stdout in ``args.format`` (``html`` or ``text``; any other
    value is rendered as JSON).

    Returns:
        Always ``0``.
    """
    report = ComparisonEngine().compare(
        args.file_a,
        args.file_b,
        **comparison_kwargs(args),
    )

    if args.output:
        write_report(report, args.output, fmt=args.format)
        return 0

    if args.format == "html":
        rendered = report_to_html(report)
    elif args.format == "text":
        rendered = report_to_text(report)
    else:
        rendered = report_to_json(report)
    print(rendered)
    return 0
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def run_batch_command(args: argparse.Namespace) -> int:
    """Run every comparison listed in ``args.manifest`` and emit the batch report.

    The report is written to ``args.output`` when that is set. Otherwise it is
    printed to stdout in ``args.format`` (``html``, ``json`` or ``text``; any
    other value is rendered as TSV).

    Args:
        args: Parsed ``batch`` subcommand namespace.

    Returns:
        ``1`` if any result carries an error, or if ``args.min_concordance``
        is set and any available report has an ``overall_concordance`` below
        it; ``0`` otherwise.

    Raises:
        ValueError: If ``args.min_concordance`` is set but lies outside
            ``[0.0, 1.0]`` (NaN included).
    """
    threshold = args.min_concordance
    # Validate up front: an invalid threshold must not cost a full batch run
    # or leave a report on stdout/disk before the error is raised.
    if threshold is not None and not 0.0 <= threshold <= 1.0:
        raise ValueError("min-concordance must be between 0.0 and 1.0")

    results = run_batch(
        args.manifest,
        stop_on_error=args.stop_on_error,
        default_file_type=args.file_type,
        **comparison_kwargs(args, include_file_type=False),
    )

    if args.output:
        write_batch(results, args.output, fmt=args.format)
    elif args.format == "html":
        print(batch_to_html(results))
    elif args.format == "json":
        print(batch_to_json(results))
    elif args.format == "text":
        print(batch_to_text(results))
    else:
        print(batch_to_tsv(results))

    has_errors = any(result.error for result in results)
    # Results without a report are skipped for the threshold check.
    below_threshold = threshold is not None and any(
        result.report is not None and result.report.overall_concordance < threshold
        for result in results
    )
    return 1 if has_errors or below_threshold else 0
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def comparison_kwargs(args: argparse.Namespace, *, include_file_type: bool = True) -> dict[str, object]:
    """Collect the comparator options from a parsed namespace.

    Args:
        args: Namespace carrying the attributes registered by
            ``add_common_options``.
        include_file_type: When true, also forward ``args.file_type`` under
            the ``"file_type"`` key.

    Returns:
        A new dict mapping each option name to its value on ``args``. Unset
        options are included as-is rather than dropped.
    """
    forwarded = (
        "key_column",
        "delimiter",
        "alpha",
        "lfc_threshold",
        "top_n",
        "gene_column",
        "sample_columns",
        "min_reciprocal_overlap",
        "reference_fasta",
        "logfc_column",
        "padj_column",
        "pvalue_column",
    )
    kwargs: dict[str, object] = {name: getattr(args, name) for name in forwarded}
    if include_file_type:
        kwargs["file_type"] = args.file_type
    return kwargs
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
if __name__ == "__main__":
|
|
147
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pipeconcord.comparators.bam_stats import BAMStatsComparator
|
|
4
|
+
from pipeconcord.comparators.bed import BEDComparator
|
|
5
|
+
from pipeconcord.comparators.counts import CountsComparator
|
|
6
|
+
from pipeconcord.comparators.deg import DEGComparator
|
|
7
|
+
from pipeconcord.comparators.expression import ExpressionComparator
|
|
8
|
+
from pipeconcord.comparators.fasta import FASTAComparator
|
|
9
|
+
from pipeconcord.comparators.table import TableComparator
|
|
10
|
+
from pipeconcord.comparators.vcf import VCFComparator
|
|
11
|
+
from pipeconcord.core.registry import ComparatorRegistry
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def register_builtin_comparators(registry: type[ComparatorRegistry] = ComparatorRegistry) -> None:
    """Register every built-in comparator class with ``registry``.

    Args:
        registry: Registry class whose ``register`` is called once per
            comparator, in the order listed below. Defaults to
            ``ComparatorRegistry``.
    """
    builtin_comparators = (
        DEGComparator,
        ExpressionComparator,
        CountsComparator,
        BEDComparator,
        FASTAComparator,
        VCFComparator,
        BAMStatsComparator,
        TableComparator,
    )
    for comparator_cls in builtin_comparators:
        registry.register(comparator_cls)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
__all__ = ["BAMStatsComparator", "BEDComparator", "CountsComparator", "DEGComparator", "ExpressionComparator", "FASTAComparator", "TableComparator", "VCFComparator", "register_builtin_comparators"]
|