bioartifact 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. bioartifact-0.1.0/.gitattributes +3 -0
  2. bioartifact-0.1.0/.github/workflows/ci.yml +24 -0
  3. bioartifact-0.1.0/.github/workflows/release.yml +56 -0
  4. bioartifact-0.1.0/.gitignore +13 -0
  5. bioartifact-0.1.0/.pre-commit-config.yaml +8 -0
  6. bioartifact-0.1.0/CHANGELOG.md +20 -0
  7. bioartifact-0.1.0/CONTRIBUTING.md +67 -0
  8. bioartifact-0.1.0/LICENSE +21 -0
  9. bioartifact-0.1.0/PKG-INFO +389 -0
  10. bioartifact-0.1.0/README.md +336 -0
  11. bioartifact-0.1.0/SKILLS.md +224 -0
  12. bioartifact-0.1.0/SUPPORT.md +30 -0
  13. bioartifact-0.1.0/examples/de_table.tsv +4 -0
  14. bioartifact-0.1.0/examples/peaks.narrowPeak +3 -0
  15. bioartifact-0.1.0/examples/reads_R1.fastq +8 -0
  16. bioartifact-0.1.0/examples/reads_R2.fastq +8 -0
  17. bioartifact-0.1.0/examples/variants.vcf +4 -0
  18. bioartifact-0.1.0/pyproject.toml +84 -0
  19. bioartifact-0.1.0/schemas/artifact_result.schema.json +38 -0
  20. bioartifact-0.1.0/schemas/contract_result.schema.json +37 -0
  21. bioartifact-0.1.0/schemas/manifest_result.schema.json +40 -0
  22. bioartifact-0.1.0/src/bioartifact/__init__.py +25 -0
  23. bioartifact-0.1.0/src/bioartifact/__main__.py +4 -0
  24. bioartifact-0.1.0/src/bioartifact/cli/__init__.py +1 -0
  25. bioartifact-0.1.0/src/bioartifact/cli/main.py +149 -0
  26. bioartifact-0.1.0/src/bioartifact/contracts/__init__.py +52 -0
  27. bioartifact-0.1.0/src/bioartifact/contracts/alignment.py +100 -0
  28. bioartifact-0.1.0/src/bioartifact/contracts/common.py +26 -0
  29. bioartifact-0.1.0/src/bioartifact/contracts/fastq.py +174 -0
  30. bioartifact-0.1.0/src/bioartifact/contracts/intervals.py +72 -0
  31. bioartifact-0.1.0/src/bioartifact/contracts/tables.py +134 -0
  32. bioartifact-0.1.0/src/bioartifact/contracts/vcf.py +68 -0
  33. bioartifact-0.1.0/src/bioartifact/detection.py +55 -0
  34. bioartifact-0.1.0/src/bioartifact/exceptions.py +14 -0
  35. bioartifact-0.1.0/src/bioartifact/inspectors/__init__.py +64 -0
  36. bioartifact-0.1.0/src/bioartifact/inspectors/alignment.py +191 -0
  37. bioartifact-0.1.0/src/bioartifact/inspectors/bed.py +131 -0
  38. bioartifact-0.1.0/src/bioartifact/inspectors/fasta.py +66 -0
  39. bioartifact-0.1.0/src/bioartifact/inspectors/fastq.py +99 -0
  40. bioartifact-0.1.0/src/bioartifact/inspectors/gtf.py +92 -0
  41. bioartifact-0.1.0/src/bioartifact/inspectors/html.py +52 -0
  42. bioartifact-0.1.0/src/bioartifact/inspectors/tables.py +69 -0
  43. bioartifact-0.1.0/src/bioartifact/inspectors/vcf.py +84 -0
  44. bioartifact-0.1.0/src/bioartifact/io.py +29 -0
  45. bioartifact-0.1.0/src/bioartifact/json.py +8 -0
  46. bioartifact-0.1.0/src/bioartifact/manifest.py +211 -0
  47. bioartifact-0.1.0/src/bioartifact/metadata.py +144 -0
  48. bioartifact-0.1.0/src/bioartifact/models.py +89 -0
  49. bioartifact-0.1.0/src/bioartifact/summarize.py +59 -0
  50. bioartifact-0.1.0/tests/fixtures/README.md +34 -0
  51. bioartifact-0.1.0/tests/fixtures/aligned.sorted.bam +0 -0
  52. bioartifact-0.1.0/tests/fixtures/aligned.sorted.sam +6 -0
  53. bioartifact-0.1.0/tests/fixtures/annotation.gtf +6 -0
  54. bioartifact-0.1.0/tests/fixtures/de_table.tsv +5 -0
  55. bioartifact-0.1.0/tests/fixtures/invalid/bad.fastq +5 -0
  56. bioartifact-0.1.0/tests/fixtures/invalid/bad.narrowPeak +2 -0
  57. bioartifact-0.1.0/tests/fixtures/invalid/bad.vcf +3 -0
  58. bioartifact-0.1.0/tests/fixtures/invalid/bad_de_table.tsv +4 -0
  59. bioartifact-0.1.0/tests/fixtures/multiqc_report.html +12 -0
  60. bioartifact-0.1.0/tests/fixtures/peaks.narrowPeak +4 -0
  61. bioartifact-0.1.0/tests/fixtures/reads_R1.fastq +9 -0
  62. bioartifact-0.1.0/tests/fixtures/reads_R1.fastq.gz +0 -0
  63. bioartifact-0.1.0/tests/fixtures/reads_R2.fastq +9 -0
  64. bioartifact-0.1.0/tests/fixtures/reference.fa +5 -0
  65. bioartifact-0.1.0/tests/fixtures/regions.bed +4 -0
  66. bioartifact-0.1.0/tests/fixtures/scripts/build_binary_fixtures.py +71 -0
  67. bioartifact-0.1.0/tests/fixtures/variants.vcf +6 -0
  68. bioartifact-0.1.0/tests/fixtures/variants.vcf.gz +0 -0
  69. bioartifact-0.1.0/tests/fixtures/workflow_manifest.fail.json +22 -0
  70. bioartifact-0.1.0/tests/fixtures/workflow_manifest.pass.json +36 -0
  71. bioartifact-0.1.0/tests/test_cli_manifest_and_schemas.py +217 -0
  72. bioartifact-0.1.0/tests/test_detection.py +31 -0
  73. bioartifact-0.1.0/tests/test_fixture_suite.py +118 -0
  74. bioartifact-0.1.0/tests/test_inspection_and_contracts.py +114 -0
@@ -0,0 +1,3 @@
1
+ *.bam binary
2
+ *.gz binary
3
+
@@ -0,0 +1,24 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ test:
9
+ runs-on: ubuntu-latest
10
+ strategy:
11
+ fail-fast: false
12
+ matrix:
13
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: actions/setup-python@v5
17
+ with:
18
+ python-version: ${{ matrix.python-version }}
19
+ - name: Install
20
+ run: python -m pip install -e ".[dev]"
21
+ - name: Lint
22
+ run: ruff check .
23
+ - name: Test
24
+ run: python -m unittest discover -s tests
@@ -0,0 +1,56 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+ workflow_dispatch:
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ build:
14
+ name: Build distributions
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - uses: actions/setup-python@v5
20
+ with:
21
+ python-version: "3.12"
22
+
23
+ - name: Install build tools
24
+ run: python -m pip install --upgrade build twine
25
+
26
+ - name: Build
27
+ run: python -m build
28
+
29
+ - name: Check distributions
30
+ run: twine check dist/*
31
+
32
+ - name: Upload distributions
33
+ uses: actions/upload-artifact@v4
34
+ with:
35
+ name: python-distributions
36
+ path: dist/
37
+ if-no-files-found: error
38
+
39
+ publish-pypi:
40
+ name: Publish to PyPI
41
+ needs: build
42
+ runs-on: ubuntu-latest
43
+ environment:
44
+ name: pypi
45
+ url: https://pypi.org/project/bioartifact/
46
+ permissions:
47
+ id-token: write
48
+ steps:
49
+ - name: Download distributions
50
+ uses: actions/download-artifact@v4
51
+ with:
52
+ name: python-distributions
53
+ path: dist/
54
+
55
+ - name: Publish distributions to PyPI
56
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,13 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .pytest_cache/
5
+ .ruff_cache/
6
+ .venv/
7
+ build/
8
+ dist/
9
+ .DS_Store
10
+ design.md
11
+ DESIGN.md
12
+ to-do.md
13
+ TODO.md
@@ -0,0 +1,8 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.6.9
4
+ hooks:
5
+ - id: ruff
6
+ args: ["--fix"]
7
+ - id: ruff-format
8
+
@@ -0,0 +1,20 @@
1
+ # Changelog
2
+
3
+ All notable changes to `bioartifact` will be documented in this file.
4
+
5
+ ## 0.1.0 - 2026-05-10
6
+
7
+ Initial release:
8
+
9
+ - Dependency-free inspection core with JSON-serializable result models.
10
+ - CLI commands for `inspect`, `validate`, `summarize`, `contracts`, `types`,
11
+ and `validate-manifest`.
12
+ - JSON output by default for every command, with human-readable text available
13
+ through `--human` or `--output human`.
14
+ - Inspectors for FASTQ, FASTA, SAM, BAM headers, VCF, BED, narrowPeak, GTF/GFF,
15
+ CSV/TSV, and HTML reports.
16
+ - Contracts for FASTQ, paired FASTQ, sorted/indexed BAM, narrowPeak,
17
+ differential-expression tables, and valid VCF.
18
+ - Schema-versioned JSON outputs and JSON schemas.
19
+ - Deterministic fixture suite for tests, documentation, and future JOSS examples.
20
+ - Manifest-based workflow output validation.
@@ -0,0 +1,67 @@
1
+ # Contributing
2
+
3
+ `bioartifact` is early-stage software. Contributions should preserve the core
4
+ scope: lightweight, deterministic, machine-readable inspection and validation of
5
+ bioinformatics artifacts.
6
+
7
+ ## Development Setup
8
+
9
+ Create an isolated environment and install the package:
10
+
11
+ ```bash
12
+ python -m venv .venv
13
+ .venv/bin/python -m pip install -e ".[dev]"
14
+ ```
15
+
16
+ On systems where Python is not externally managed, `python -m pip install -e
17
+ ".[dev]"` is also sufficient.
18
+
19
+ ## Checks
20
+
21
+ Run tests:
22
+
23
+ ```bash
24
+ PYTHONPATH=src python -m unittest discover -s tests
25
+ ```
26
+
27
+ Run linting:
28
+
29
+ ```bash
30
+ ruff check .
31
+ ```
32
+
33
+ Run formatting before submitting broad edits:
34
+
35
+ ```bash
36
+ ruff format .
37
+ ```
38
+
39
+ ## Adding Inspectors
40
+
41
+ When adding an artifact type:
42
+
43
+ 1. Add conservative extension detection.
44
+ 2. Add an independent inspector module.
45
+ 3. Return an `ArtifactResult` with deterministic summary fields.
46
+ 4. Avoid heavy runtime dependencies unless they are optional extras.
47
+ 5. Add valid and invalid fixtures when practical.
48
+ 6. Add unit tests and CLI examples.
49
+
50
+ Inspectors should validate structure and compatibility; they should not attempt
51
+ scientific interpretation.
52
+
53
+ ## Adding Contracts
54
+
55
+ When adding a contract:
56
+
57
+ 1. Add a named validator under `src/bioartifact/contracts/`.
58
+ 2. Register it in the contract registry.
59
+ 3. Emit named `CheckResult` entries with useful messages and remediation hints.
60
+ 4. Add a fixture-backed passing test and at least one failing test.
61
+ 5. Document the contract in the README and `SKILLS.md`.
62
+
63
+ ## Pull Request Guidance
64
+
65
+ Keep pull requests focused. Include the command output for tests and linting in
66
+ the PR description. For behavior changes, include before/after JSON examples.
67
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Junhao Qiu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,389 @@
1
+ Metadata-Version: 2.4
2
+ Name: bioartifact
3
+ Version: 0.1.0
4
+ Summary: Agent-friendly inspection and validation of bioinformatics artifacts.
5
+ Project-URL: Homepage, https://github.com/qchiujunhao/bioartifact
6
+ Project-URL: Issues, https://github.com/qchiujunhao/bioartifact/issues
7
+ Project-URL: Source, https://github.com/qchiujunhao/bioartifact
8
+ Author: Junhao Qiu
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 Junhao Qiu
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: artifacts,bioinformatics,reproducibility,validation,workflow
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Science/Research
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Programming Language :: Python :: 3
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Programming Language :: Python :: 3.13
40
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
41
+ Requires-Python: >=3.10
42
+ Provides-Extra: bio
43
+ Requires-Dist: pysam>=0.22; extra == 'bio'
44
+ Provides-Extra: dev
45
+ Requires-Dist: pre-commit>=3.7; extra == 'dev'
46
+ Requires-Dist: pytest>=8.0; extra == 'dev'
47
+ Requires-Dist: ruff>=0.6; extra == 'dev'
48
+ Provides-Extra: manifest
49
+ Requires-Dist: pyyaml>=6.0; extra == 'manifest'
50
+ Provides-Extra: tables
51
+ Requires-Dist: pandas>=2.0; extra == 'tables'
52
+ Description-Content-Type: text/markdown
53
+
54
+ # bioartifact
55
+
56
+ `bioartifact` is a lightweight Python package and command-line tool for inspecting and
57
+ validating bioinformatics output files in deterministic, machine-readable form.
58
+
59
+ The project targets AI agents, workflow systems, benchmark platforms, and
60
+ reproducibility pipelines that need to answer practical questions:
61
+
62
+ - What kind of artifact was generated?
63
+ - Is the artifact structurally readable?
64
+ - What basic properties does it contain?
65
+ - Does it satisfy a declared contract?
66
+ - Is it usable as input to a downstream workflow step?
67
+
68
+ The package focuses on structure and compatibility, not biological interpretation.
69
+
70
+ ## Motivation
71
+
72
+ AI agents, workflow engines, and benchmark systems increasingly need to make
73
+ automated decisions about bioinformatics outputs. Traditional tools are strong
74
+ at format-specific parsing, command-line statistics, or human-readable QC
75
+ reports, but they do not provide a small unified layer for asking: "Was the
76
+ expected artifact produced, is it structurally usable, and does it satisfy the
77
+ contract for the next step?"
78
+
79
+ `bioartifact` fills that layer with deterministic JSON outputs and named
80
+ contracts. It is designed to complement tools such as `samtools`, `bcftools`,
81
+ FastQC, MultiQC, and workflow engines, not replace them.
82
+
83
+ ## Current Status
84
+
85
+ This repository contains:
86
+
87
+ - extension-based artifact detection
88
+ - structured dataclass result models
89
+ - JSON-serializable inspection results
90
+ - a dependency-free CLI built on `argparse`
91
+ - inspectors for FASTQ, FASTA, SAM, BAM headers, VCF, BED, narrowPeak, GTF/GFF,
92
+ generic CSV/TSV tables, and HTML reports
93
+ - contracts for FASTQ, paired FASTQ, sorted/indexed BAM, narrowPeak,
94
+ differential-expression tables, and valid VCF
95
+ - schema-versioned JSON outputs
96
+ - CLI discovery commands for supported artifact types and contracts
97
+ - manifest-based workflow output validation
98
+ - directory summarization
99
+ - unit tests and CI configuration
100
+
101
+ The core has no runtime dependencies. Optional extras can be added for richer
102
+ format support without making the base installation heavy.
103
+
104
+ ## Installation
105
+
106
+ From PyPI:
107
+
108
+ ```bash
109
+ pip install bioartifact
110
+ ```
111
+
112
+ From a checkout for local development:
113
+
114
+ ```bash
115
+ python -m pip install -e .
116
+ ```
117
+
118
+ For development:
119
+
120
+ ```bash
121
+ python -m pip install -e ".[dev]"
122
+ pre-commit install
123
+ ```
124
+
125
+ Optional richer BAM/SAM support:
126
+
127
+ ```bash
128
+ pip install "bioartifact[bio]"
129
+ ```
130
+
131
+ From a checkout, use `python -m pip install -e ".[bio]"`.
132
+
133
+ ## CLI Examples
134
+
135
+ Inspect an artifact:
136
+
137
+ ```bash
138
+ bioartifact inspect sample.fastq
139
+ ```
140
+
141
+ Validate a named contract:
142
+
143
+ ```bash
144
+ bioartifact validate peaks.narrowPeak --contract narrowpeak
145
+ ```
146
+
147
+ Validate paired FASTQ files:
148
+
149
+ ```bash
150
+ bioartifact validate sample_R1.fastq.gz --contract paired_fastq --mate sample_R2.fastq.gz
151
+ ```
152
+
153
+ Summarize a directory:
154
+
155
+ ```bash
156
+ bioartifact summarize outputs/ --recursive
157
+ ```
158
+
159
+ List supported contracts and artifact types:
160
+
161
+ ```bash
162
+ bioartifact contracts
163
+ bioartifact types
164
+ ```
165
+
166
+ Validate all expected outputs declared in a manifest:
167
+
168
+ ```bash
169
+ bioartifact validate-manifest workflow_manifest.json
170
+ ```
171
+
172
+ Agent-facing CLI usage guidance is available in [SKILLS.md](SKILLS.md).
173
+
174
+ ## Output Modes
175
+
176
+ The CLI emits structured JSON by default for every command. This is intentional:
177
+ `bioartifact` is agent-first, and the default output should be deterministic and
178
+ machine-readable regardless of whether a command runs in a terminal, a PTY, CI,
179
+ or a captured subprocess.
180
+
181
+ This means agents and workflow systems do not need to pass an output-format
182
+ flag. Humans can opt into text output when desired.
183
+
184
+ To opt into human-readable text output, pass either of these flags:
185
+
186
+ ```bash
187
+ bioartifact inspect sample.fastq --output human
188
+ bioartifact inspect sample.fastq --human
189
+ ```
190
+
191
+ The `--json` flag is still accepted for compatibility, but JSON is already the
192
+ default.
193
+
194
+ ## Quickstart With Fixtures
195
+
196
+ The repository includes small synthetic fixture files that can be used without
197
+ downloading external data:
198
+
199
+ ```bash
200
+ PYTHONPATH=src python -m bioartifact inspect tests/fixtures/variants.vcf.gz
201
+ PYTHONPATH=src python -m bioartifact validate tests/fixtures/peaks.narrowPeak --contract narrowpeak
202
+ PYTHONPATH=src python -m bioartifact validate tests/fixtures/reads_R1.fastq --contract paired_fastq --mate tests/fixtures/reads_R2.fastq
203
+ PYTHONPATH=src python -m bioartifact validate-manifest tests/fixtures/workflow_manifest.pass.json
204
+ ```
205
+
206
+ ## Python API
207
+
208
+ ```python
209
+ from bioartifact import inspect_artifact, validate_artifact
210
+
211
+ artifact = inspect_artifact("sample.vcf.gz")
212
+ print(artifact.to_dict())
213
+
214
+ contract = validate_artifact("peaks.narrowPeak", "narrowpeak")
215
+ print(contract.to_dict())
216
+ ```
217
+
218
+ ## Design Principles
219
+
220
+ - Agent-first: deterministic, structured, JSON-serializable output.
221
+ - Lightweight: useful with a single command and no workflow engine.
222
+ - Modular: each inspector and contract is independent.
223
+ - Contract-oriented: validate structure, compatibility, and required properties.
224
+ - Conservative: report limitations explicitly instead of inferring scientific meaning.
225
+
226
+ ## Supported Artifact Types
227
+
228
+ Initial detection and inspection support:
229
+
230
+ - FASTQ / FASTQ.GZ
231
+ - FASTA / FASTA.GZ
232
+ - SAM
233
+ - BAM header inspection, with optional `pysam` statistics when installed
234
+ - VCF / VCF.GZ
235
+ - BED
236
+ - narrowPeak
237
+ - GTF / GFF / GFF3
238
+ - CSV / TSV
239
+ - HTML / MultiQC HTML
240
+
241
+ ## Inspection Methods
242
+
243
+ `bioartifact inspect` first detects the artifact type from the filename extension,
244
+ then runs a format-specific structural inspector. Inspectors are conservative:
245
+ they report whether a file is readable and structurally compatible with the
246
+ expected format, but they do not infer biological correctness.
247
+
248
+ | Artifact type | Detection | Inspection approach | Summary fields |
249
+ | --- | --- | --- | --- |
250
+ | FASTQ / FASTQ.GZ | `.fastq`, `.fq`, `.fastq.gz`, `.fq.gz` | Opens plain text or gzip input, reads four-line FASTQ records, checks `@` headers, `+` separators, incomplete records, and sequence/quality length equality. | record count, base count, min/max/mean read length, gzip flag |
251
+ | FASTA / FASTA.GZ | `.fasta`, `.fa`, `.fna`, and gzip variants | Opens plain text or gzip input, checks that sequence data follows FASTA headers, counts records, and records sequence lengths. | sequence count, base count, min/max/mean sequence length, gzip flag |
252
+ | SAM | `.sam` | Parses SAM text headers and alignment rows, checks that alignment records have at least 11 columns, parses flags, extracts references from `@SQ`, and detects coordinate sorting from `@HD SO:coordinate`. | alignment count, mapped/unmapped counts, references, sort order, flag counts |
253
+ | BAM | `.bam` | Reads the BGZF/gzip BAM header directly, checks the `BAM\1` magic header, parses header text and reference dictionary, detects sort order, and checks for adjacent `.bai`/`.csi` indexes. If `pysam` is installed, indexed BAM read statistics are also attempted. | references, reference names, sort order, index presence, optional mapped/unmapped counts |
254
+ | VCF / VCF.GZ | `.vcf`, `.vcf.gz` | Opens plain text or gzip input, checks metadata/header structure, validates required first 8 VCF columns, detects sample columns, and checks basic record fields such as positive `POS`, non-empty `REF`, and non-empty `ALT`. | metadata line count, variant record count, sample names, sample count, gzip flag |
255
+ | BED | `.bed` | Reads tab-delimited interval rows, ignores comments and browser/track lines, checks at least 3 columns, integer coordinates, non-negative starts, and `end >= start`. | record count, chromosome count, per-chromosome counts, min/max interval width |
256
+ | narrowPeak | `.narrowPeak` | Applies BED coordinate checks plus ENCODE narrowPeak structure checks: at least 10 columns, integer score, valid strand, numeric signal/p/q columns, and integer peak offset. | record count, chromosome counts, min/max width, required column count |
257
+ | GTF | `.gtf` | Parses 9-column GTF rows, validates positive coordinates, summarizes feature types, and extracts `gene_id` and `transcript_id` attributes when present. | record count, feature counts, gene count, transcript count |
258
+ | GFF / GFF3 | `.gff`, `.gff3` | Parses 9-column GFF rows, validates positive coordinates, summarizes feature types, and extracts `ID` attributes for gene-like records where available. | record count, feature counts, gene count, transcript count |
259
+ | CSV / TSV | `.csv`, `.tsv`, `.tab` | Uses Python's CSV parser with delimiter inferred from extension, reads the header, counts rows/columns, tracks empty cells, and rejects rows with inconsistent column counts. | delimiter, row count, column names, column count, missing values, inconsistent rows |
260
+ | HTML / MultiQC HTML | `.html`, `.htm` | Samples the report text, checks for an HTML root/doctype marker, extracts the `<title>`, and detects MultiQC-like reports by searching for `multiqc`. | title, MultiQC flag, sampled byte count |
261
+
262
+ The current BAM inspector intentionally keeps the default installation light by
263
+ parsing the BAM header without requiring `pysam`. Installing `bioartifact[bio]`
264
+ enables optional `pysam`-based statistics for indexed BAM files.
265
+
266
+ ## Supported Contracts
267
+
268
+ - `fastq`
269
+ - `paired_fastq`
270
+ - `sorted_bam`
271
+ - `indexed_bam`
272
+ - `narrowpeak`
273
+ - `de_table`
274
+ - `valid_vcf`
275
+
276
+ ## Contract Reference
277
+
278
+ | Contract | Expected input | Behavior | Common limitation | Example |
279
+ | --- | --- | --- | --- | --- |
280
+ | `fastq` | FASTQ or FASTQ.GZ | Checks readability, gzip integrity when applicable, record presence, and sequence/quality length equality. | Does not run per-base quality QC. | `bioartifact validate reads.fastq.gz --contract fastq` |
281
+ | `paired_fastq` | R1 FASTQ plus `--mate` R2 FASTQ | Checks both files, read-count synchronization, and normalized read ID matching. | Does not infer mates automatically. | `bioartifact validate R1.fastq.gz --contract paired_fastq --mate R2.fastq.gz` |
282
+ | `sorted_bam` | BAM or SAM | Checks readability and whether the alignment header declares coordinate sorting. | Does not prove record-level sort order without deeper parsing. | `bioartifact validate aligned.bam --contract sorted_bam` |
283
+ | `indexed_bam` | BAM | Checks readability and adjacent `.bai` or `.csi` presence. | Does not verify full index correctness. | `bioartifact validate aligned.bam --contract indexed_bam` |
284
+ | `narrowpeak` | narrowPeak | Checks required 10-column structure and interval coordinates. | Does not judge peak-calling quality. | `bioartifact validate peaks.narrowPeak --contract narrowpeak` |
285
+ | `de_table` | CSV or TSV | Checks required DE columns, p-value ranges, and duplicate/empty genes. | Assumes exact column names. | `bioartifact validate de_table.tsv --contract de_table` |
286
+ | `valid_vcf` | VCF or VCF.GZ | Checks header, required columns, basic records, and sample column structure. | Does not replace `bcftools` validation. | `bioartifact validate variants.vcf.gz --contract valid_vcf` |
287
+
288
+ ## Manifest Validation
289
+
290
+ Use `validate-manifest` when a workflow run should produce multiple expected
291
+ artifacts. Relative paths are resolved against the manifest directory unless
292
+ `--base-dir` is provided.
293
+
294
+ Minimal JSON manifest:
295
+
296
+ ```json
297
+ {
298
+ "outputs": [
299
+ {
300
+ "name": "peaks",
301
+ "path": "peaks.narrowPeak",
302
+ "type": "narrowPeak",
303
+ "contract": "narrowpeak"
304
+ },
305
+ {
306
+ "name": "paired_reads",
307
+ "path": "reads_R1.fastq.gz",
308
+ "type": "fastq",
309
+ "contract": "paired_fastq",
310
+ "mate": "reads_R2.fastq.gz"
311
+ }
312
+ ]
313
+ }
314
+ ```
315
+
316
+ Run:
317
+
318
+ ```bash
319
+ bioartifact validate-manifest workflow_manifest.json
320
+ ```
321
+
322
+ YAML manifests are supported when `PyYAML` is installed, for example with
323
+ `pip install "bioartifact[manifest]"` or, from a checkout, `python -m pip install -e ".[manifest]"`.
324
+
325
+ ## JSON Output Reference
326
+
327
+ All structured CLI outputs include `schema_version`. The current schema version
328
+ is `1.0.0`.
329
+
330
+ - Artifact inspection schema: [schemas/artifact_result.schema.json](schemas/artifact_result.schema.json)
331
+ - Contract validation schema: [schemas/contract_result.schema.json](schemas/contract_result.schema.json)
332
+ - Manifest validation schema: [schemas/manifest_result.schema.json](schemas/manifest_result.schema.json)
333
+
334
+ Schema files are intended to be part of the user-facing interface for the `1.x`
335
+ schema line. Additive fields may be introduced in minor releases; breaking output
336
+ changes require a new major schema version.
337
+
338
+ ## CLI Exit Codes
339
+
340
+ - `0`: the requested inspection, validation, summary, discovery, or manifest
341
+ validation succeeded.
342
+ - `1`: inspection or validation failed, a manifest did not pass, or the input
343
+ path/manifest was invalid.
344
+
345
+ Warnings alone never cause a non-zero exit code; only a failed required
346
+ structural check or contract check does.
347
+
348
+ ## Development
349
+
350
+ Run the test suite with the standard-library `unittest` runner:
351
+
352
+ ```bash
353
+ PYTHONPATH=src python -m unittest discover -s tests
354
+ ```
355
+
356
+ Run the configured developer checks after installing dev dependencies:
357
+
358
+ ```bash
359
+ ruff check .
360
+ pytest
361
+ ```
362
+
363
+ ## Releasing
364
+
365
+ PyPI publishing is handled by GitHub Actions through PyPI Trusted Publishing.
366
+ For the first release, create a pending publisher on PyPI with:
367
+
368
+ - PyPI project name: `bioartifact`
369
+ - Owner: `qchiujunhao`
370
+ - Repository name: `bioartifact`
371
+ - Workflow name: `release.yml`
372
+ - Environment name: `pypi`
373
+
374
+ After the pending publisher is configured, run the `Release` workflow manually
375
+ from GitHub Actions or push a `v*` tag for future releases.
376
+
377
+ ## Reproducible Fixtures
378
+
379
+ The repository includes a small fixture suite under `tests/fixtures/` with
380
+ versioned FASTA, FASTQ, FASTQ.GZ, SAM, BAM, VCF, VCF.GZ, BED, narrowPeak, GTF,
381
+ TSV, and HTML report examples. These files are synthetic but structurally valid,
382
+ small enough for CI, and documented with provenance notes so they can support
383
+ examples in documentation and a future JOSS paper.
384
+
385
+ Binary fixtures are deterministic and can be regenerated with:
386
+
387
+ ```bash
388
+ python tests/fixtures/scripts/build_binary_fixtures.py
389
+ ```