samplesheet-parser 0.3.3__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/.gitignore +2 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/CHANGELOG.md +47 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/PKG-INFO +76 -2
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/README.md +75 -1
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/pyproject.toml +1 -1
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/__init__.py +2 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/cli.py +105 -1
- samplesheet_parser-0.3.4/samplesheet_parser/index_utils.py +184 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/merger.py +12 -2
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/validators.py +9 -1
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_cli.py +155 -0
- samplesheet_parser-0.3.4/tests/test_index_utils.py +163 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_validators/test_validators.py +57 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/.github/workflows/ci.yml +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/.github/workflows/copilot-instructions.md +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/.zenodo.json +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/CITATION.cff +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/CONTRIBUTING.md +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/LICENSE +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/demo_merger.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/parse_examples.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/ProjectAlpha_SampleSheet.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/ProjectBeta_SampleSheet.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/ProjectDelta_SampleSheet_collision.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/ProjectGamma_SampleSheet.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/README.md +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/combined_clean.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/combined_collision_forced.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/combined_mixed_formats.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v1_dual_index.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v1_multi_lane.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v1_single_index.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v1_with_lab_qc_settings.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v1_with_manifests.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_nextseq_single_index.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_novaseq_x_dual_index.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_with_cloud_settings.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_with_index_umi.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_with_pipeline_settings.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_with_read_umi.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/images/samplesheet_parser_arch_v03.png +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/images/samplesheet_parser_overview.png +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/converter.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/diff.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/enums.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/factory.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/parsers/__init__.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/parsers/v1.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/parsers/v2.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/writer.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/scripts/demo_converter.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/scripts/demo_diff.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/scripts/demo_writer.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/__init__.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/conftest.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/fixtures/SampleSheet_v1_dual_index.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/fixtures/SampleSheet_v2_dual_index.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/fixtures/SampleSheet_v2_modified.csv +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_converter.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_diff.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_factory.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_merger.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_parsers/__init__.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_parsers/test_v1.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_parsers/test_v2.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_validators/__init__.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_validators/test_hamming.py +0 -0
- {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_writer.py +0 -0
|
@@ -6,6 +6,53 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|
|
6
6
|
|
|
7
7
|
---
|
|
8
8
|
|
|
9
|
+
## [0.3.4] - 2026-04-04
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- **`samplesheet info` CLI command** — prints a concise summary of any V1 or
|
|
14
|
+
V2 sample sheet (format, sample count, lanes, index type, read lengths,
|
|
15
|
+
adapters, experiment name, instrument). Supports `--format json` for
|
|
16
|
+
machine-readable output; exits 0 on success, 2 on unreadable files.
|
|
17
|
+
|
|
18
|
+
- **Configurable Hamming distance threshold** — `SampleSheetValidator.validate()`
|
|
19
|
+
now accepts a `min_hamming_distance` keyword argument (default: 3) so labs
|
|
20
|
+
using longer indexes can enforce stricter thresholds without changing the
|
|
21
|
+
module-level constant.
|
|
22
|
+
- `SampleSheetMerger` accepts the same parameter in `__init__()` and applies
|
|
23
|
+
it to both the intra-sheet and cross-sheet Hamming checks as well as the
|
|
24
|
+
post-merge validation step.
|
|
25
|
+
- `samplesheet validate` exposes `--min-hamming N` (must be ≥ 1; exits 2 on
|
|
26
|
+
invalid input). The JSON output includes `min_hamming_distance` for
|
|
27
|
+
auditability.
|
|
28
|
+
|
|
29
|
+
- **`normalize_index_lengths()` utility** — normalizes index sequence lengths
|
|
30
|
+
across a list of sample dicts (output of `sheet.samples()`) to a consistent
|
|
31
|
+
length before merging sheets with mixed-length indexes.
|
|
32
|
+
- `strategy="trim"` — trims all indexes to the shortest sequence length.
|
|
33
|
+
- `strategy="pad"` — pads shorter indexes to the longest length using `"N"`
|
|
34
|
+
wildcard characters (supported by BCLConvert ≥ 3.9 and bcl2fastq ≥ 2.20).
|
|
35
|
+
- Auto-detects V1-style (`index`/`index2`) and V2-style (`Index`/`Index2`)
|
|
36
|
+
field names; explicit `index1_key`/`index2_key` overrides supported.
|
|
37
|
+
- Exported from the top-level package as `normalize_index_lengths`.
|
|
38
|
+
|
|
39
|
+
- **CI / pre-commit integration guide** in README — GitHub Actions workflow
|
|
40
|
+
and pre-commit hook configuration for automatic sample sheet validation on
|
|
41
|
+
every commit or pull request that touches a `SampleSheet.csv`.
|
|
42
|
+
|
|
43
|
+
### Fixed
|
|
44
|
+
|
|
45
|
+
- `_detect_key()` in `index_utils` now selects the key with at least one
|
|
46
|
+
non-empty value before falling back to key presence, preventing silent
|
|
47
|
+
normalization skip when a key exists but all its values are `None` or `""`.
|
|
48
|
+
|
|
49
|
+
### Changed
|
|
50
|
+
|
|
51
|
+
- `--min-hamming` CLI option default and help text are now derived from the
|
|
52
|
+
`MIN_HAMMING_DISTANCE` constant in `validators.py` to prevent drift.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
9
56
|
## [0.3.3] - 2026-03-13
|
|
10
57
|
|
|
11
58
|
### Documentation
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: samplesheet-parser
|
|
3
|
-
Version: 0.3.3
|
|
3
|
+
Version: 0.3.4
|
|
4
4
|
Summary: Format-agnostic parser for Illumina SampleSheet.csv files — supports IEM V1 and BCLConvert V2
|
|
5
5
|
Project-URL: Homepage, https://github.com/chaitanyakasaraneni/samplesheet-parser
|
|
6
6
|
Project-URL: Documentation, https://illumina-samplesheet.readthedocs.io
|
|
@@ -557,6 +557,80 @@ sheet.get_read_structure() # → ReadStructure dataclass
|
|
|
557
557
|
|
|
558
558
|
---
|
|
559
559
|
|
|
560
|
+
## CI / pre-commit integration
|
|
561
|
+
|
|
562
|
+
The CLI exits with meaningful codes (`0` = clean, `1` = issues, `2` = error), making it easy to wire into automated pipelines.
|
|
563
|
+
|
|
564
|
+
### GitHub Actions
|
|
565
|
+
|
|
566
|
+
Add a validation step to any workflow that touches `SampleSheet.csv`:
|
|
567
|
+
|
|
568
|
+
```yaml
|
|
569
|
+
# .github/workflows/validate-samplesheet.yml
|
|
570
|
+
name: Validate SampleSheet
|
|
571
|
+
|
|
572
|
+
on:
|
|
573
|
+
push:
|
|
574
|
+
paths:
|
|
575
|
+
- '**/SampleSheet.csv'
|
|
576
|
+
pull_request:
|
|
577
|
+
paths:
|
|
578
|
+
- '**/SampleSheet.csv'
|
|
579
|
+
|
|
580
|
+
jobs:
|
|
581
|
+
validate:
|
|
582
|
+
runs-on: ubuntu-latest
|
|
583
|
+
steps:
|
|
584
|
+
- uses: actions/checkout@v4
|
|
585
|
+
|
|
586
|
+
- uses: actions/setup-python@v5
|
|
587
|
+
with:
|
|
588
|
+
python-version: '3.12'
|
|
589
|
+
|
|
590
|
+
- run: pip install "samplesheet-parser[cli]"
|
|
591
|
+
|
|
592
|
+
- name: Validate SampleSheet
|
|
593
|
+
run: samplesheet validate SampleSheet.csv --format json
|
|
594
|
+
```
|
|
595
|
+
|
|
596
|
+
### pre-commit hook
|
|
597
|
+
|
|
598
|
+
Gate commits that touch any `SampleSheet.csv` in the repository:
|
|
599
|
+
|
|
600
|
+
```yaml
|
|
601
|
+
# .pre-commit-config.yaml
|
|
602
|
+
repos:
|
|
603
|
+
- repo: local
|
|
604
|
+
hooks:
|
|
605
|
+
- id: samplesheet-validate
|
|
606
|
+
name: Validate SampleSheet.csv
|
|
607
|
+
entry: samplesheet validate
|
|
608
|
+
language: python
|
|
609
|
+
additional_dependencies: ["samplesheet-parser[cli]"]
|
|
610
|
+
files: SampleSheet\.csv$
|
|
611
|
+
pass_filenames: true
|
|
612
|
+
```
|
|
613
|
+
|
|
614
|
+
Install and run once to verify:
|
|
615
|
+
|
|
616
|
+
```bash
|
|
617
|
+
pip install pre-commit
|
|
618
|
+
pre-commit install
|
|
619
|
+
pre-commit run samplesheet-validate --all-files
|
|
620
|
+
```
|
|
621
|
+
|
|
622
|
+
### Stricter Hamming distance in CI
|
|
623
|
+
|
|
624
|
+
If your lab uses longer indexes (10 bp+), raise the minimum Hamming distance threshold to catch borderline cases earlier:
|
|
625
|
+
|
|
626
|
+
```bash
|
|
627
|
+
samplesheet validate SampleSheet.csv --min-hamming 4
|
|
628
|
+
```
|
|
629
|
+
|
|
630
|
+
This is especially useful in CI where you want to prevent runs that will likely fail demultiplexing.
|
|
631
|
+
|
|
632
|
+
---
|
|
633
|
+
|
|
560
634
|
## Contributing
|
|
561
635
|
|
|
562
636
|
```bash
|
|
@@ -585,7 +659,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for the full local testing guide and PR c
|
|
|
585
659
|
title = {samplesheet-parser: Format-agnostic parser for Illumina SampleSheet.csv},
|
|
586
660
|
year = {2026},
|
|
587
661
|
url = {https://github.com/chaitanyakasaraneni/samplesheet-parser},
|
|
588
|
-
version = {0.3.3}
|
|
662
|
+
version = {0.3.4}
|
|
589
663
|
}
|
|
590
664
|
```
|
|
591
665
|
|
|
@@ -506,6 +506,80 @@ sheet.get_read_structure() # → ReadStructure dataclass
|
|
|
506
506
|
|
|
507
507
|
---
|
|
508
508
|
|
|
509
|
+
## CI / pre-commit integration
|
|
510
|
+
|
|
511
|
+
The CLI exits with meaningful codes (`0` = clean, `1` = issues, `2` = error), making it easy to wire into automated pipelines.
|
|
512
|
+
|
|
513
|
+
### GitHub Actions
|
|
514
|
+
|
|
515
|
+
Add a validation step to any workflow that touches `SampleSheet.csv`:
|
|
516
|
+
|
|
517
|
+
```yaml
|
|
518
|
+
# .github/workflows/validate-samplesheet.yml
|
|
519
|
+
name: Validate SampleSheet
|
|
520
|
+
|
|
521
|
+
on:
|
|
522
|
+
push:
|
|
523
|
+
paths:
|
|
524
|
+
- '**/SampleSheet.csv'
|
|
525
|
+
pull_request:
|
|
526
|
+
paths:
|
|
527
|
+
- '**/SampleSheet.csv'
|
|
528
|
+
|
|
529
|
+
jobs:
|
|
530
|
+
validate:
|
|
531
|
+
runs-on: ubuntu-latest
|
|
532
|
+
steps:
|
|
533
|
+
- uses: actions/checkout@v4
|
|
534
|
+
|
|
535
|
+
- uses: actions/setup-python@v5
|
|
536
|
+
with:
|
|
537
|
+
python-version: '3.12'
|
|
538
|
+
|
|
539
|
+
- run: pip install "samplesheet-parser[cli]"
|
|
540
|
+
|
|
541
|
+
- name: Validate SampleSheet
|
|
542
|
+
run: samplesheet validate SampleSheet.csv --format json
|
|
543
|
+
```
|
|
544
|
+
|
|
545
|
+
### pre-commit hook
|
|
546
|
+
|
|
547
|
+
Gate commits that touch any `SampleSheet.csv` in the repository:
|
|
548
|
+
|
|
549
|
+
```yaml
|
|
550
|
+
# .pre-commit-config.yaml
|
|
551
|
+
repos:
|
|
552
|
+
- repo: local
|
|
553
|
+
hooks:
|
|
554
|
+
- id: samplesheet-validate
|
|
555
|
+
name: Validate SampleSheet.csv
|
|
556
|
+
entry: samplesheet validate
|
|
557
|
+
language: python
|
|
558
|
+
additional_dependencies: ["samplesheet-parser[cli]"]
|
|
559
|
+
files: SampleSheet\.csv$
|
|
560
|
+
pass_filenames: true
|
|
561
|
+
```
|
|
562
|
+
|
|
563
|
+
Install and run once to verify:
|
|
564
|
+
|
|
565
|
+
```bash
|
|
566
|
+
pip install pre-commit
|
|
567
|
+
pre-commit install
|
|
568
|
+
pre-commit run samplesheet-validate --all-files
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
### Stricter Hamming distance in CI
|
|
572
|
+
|
|
573
|
+
If your lab uses longer indexes (10 bp+), raise the minimum Hamming distance threshold to catch borderline cases earlier:
|
|
574
|
+
|
|
575
|
+
```bash
|
|
576
|
+
samplesheet validate SampleSheet.csv --min-hamming 4
|
|
577
|
+
```
|
|
578
|
+
|
|
579
|
+
This is especially useful in CI where you want to prevent runs that will likely fail demultiplexing.
|
|
580
|
+
|
|
581
|
+
---
|
|
582
|
+
|
|
509
583
|
## Contributing
|
|
510
584
|
|
|
511
585
|
```bash
|
|
@@ -534,7 +608,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for the full local testing guide and PR c
|
|
|
534
608
|
title = {samplesheet-parser: Format-agnostic parser for Illumina SampleSheet.csv},
|
|
535
609
|
year = {2026},
|
|
536
610
|
url = {https://github.com/chaitanyakasaraneni/samplesheet-parser},
|
|
537
|
-
version = {0.3.3}
|
|
611
|
+
version = {0.3.4}
|
|
538
612
|
}
|
|
539
613
|
```
|
|
540
614
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "samplesheet-parser"
|
|
7
|
-
version = "0.3.3"
|
|
7
|
+
version = "0.3.4"
|
|
8
8
|
description = "Format-agnostic parser for Illumina SampleSheet.csv files — supports IEM V1 and BCLConvert V2"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -36,6 +36,7 @@ from samplesheet_parser.converter import SampleSheetConverter
|
|
|
36
36
|
from samplesheet_parser.diff import DiffResult, SampleSheetDiff
|
|
37
37
|
from samplesheet_parser.enums import IndexType, SampleSheetVersion
|
|
38
38
|
from samplesheet_parser.factory import SampleSheetFactory
|
|
39
|
+
from samplesheet_parser.index_utils import normalize_index_lengths
|
|
39
40
|
from samplesheet_parser.merger import MergeResult, SampleSheetMerger
|
|
40
41
|
from samplesheet_parser.parsers.v1 import SampleSheetV1
|
|
41
42
|
from samplesheet_parser.parsers.v2 import SampleSheetV2
|
|
@@ -56,5 +57,6 @@ __all__ = [
|
|
|
56
57
|
"SampleSheetWriter",
|
|
57
58
|
"SampleSheetMerger",
|
|
58
59
|
"MergeResult",
|
|
60
|
+
"normalize_index_lengths",
|
|
59
61
|
"__version__",
|
|
60
62
|
]
|
|
@@ -5,6 +5,7 @@ Entry point: ``samplesheet`` (configured in ``pyproject.toml``).
|
|
|
5
5
|
|
|
6
6
|
Commands
|
|
7
7
|
--------
|
|
8
|
+
info Show a quick summary of a sample sheet.
|
|
8
9
|
validate Validate a sheet — exit 0 if clean, exit 1 if errors.
|
|
9
10
|
convert Convert between V1 and V2 formats.
|
|
10
11
|
diff Diff two sheets — exit 1 if changes detected.
|
|
@@ -20,8 +21,12 @@ Usage
|
|
|
20
21
|
-----
|
|
21
22
|
::
|
|
22
23
|
|
|
24
|
+
samplesheet info SampleSheet.csv
|
|
25
|
+
samplesheet info SampleSheet.csv --format json
|
|
26
|
+
|
|
23
27
|
samplesheet validate SampleSheet.csv
|
|
24
28
|
samplesheet validate SampleSheet.csv --format json
|
|
29
|
+
samplesheet validate SampleSheet.csv --min-hamming 4
|
|
25
30
|
|
|
26
31
|
samplesheet convert SampleSheet_v1.csv --to v2 --output SampleSheet_v2.csv
|
|
27
32
|
samplesheet convert SampleSheet_v2.csv --to v1 --output SampleSheet_v1.csv
|
|
@@ -50,6 +55,7 @@ except ImportError: # pragma: no cover
|
|
|
50
55
|
_TYPER_AVAILABLE = False
|
|
51
56
|
|
|
52
57
|
from samplesheet_parser.enums import SampleSheetVersion
|
|
58
|
+
from samplesheet_parser.validators import MIN_HAMMING_DISTANCE as _MIN_HAMMING_DEFAULT
|
|
53
59
|
|
|
54
60
|
if _TYPER_AVAILABLE:
|
|
55
61
|
app = typer.Typer(
|
|
@@ -115,6 +121,87 @@ if _TYPER_AVAILABLE:
|
|
|
115
121
|
typer.echo(f"Error: unknown format '{fmt}'. Use 'text' or 'json'.", err=True)
|
|
116
122
|
raise typer.Exit(code=2)
|
|
117
123
|
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
# info
|
|
126
|
+
# ---------------------------------------------------------------------------
|
|
127
|
+
|
|
128
|
+
@app.command()
def info(
    path: Annotated[Path, typer.Argument(help="Path to SampleSheet.csv.", metavar="FILE")],
    fmt: _FormatOption = "text",
) -> None:
    """Display a quick summary of a sample sheet without full validation.

    Shows format version, sample count, lanes, index type, read lengths,
    and adapter sequences at a glance.

    Exits 0 on success, 2 on unreadable files.
    """
    # Imported lazily, inside the command, exactly as the original did.
    from samplesheet_parser.factory import SampleSheetFactory
    from samplesheet_parser.parsers.v1 import SampleSheetV1

    _validate_fmt(fmt)
    if not path.exists():
        typer.echo(f"Error: file not found: {path}", err=True)
        raise typer.Exit(code=2)

    # Any failure while building the parser is reported as exit code 2.
    try:
        factory = SampleSheetFactory()
        sheet = factory.create_parser(str(path), parse=True, clean=False)
    except Exception as exc:
        typer.echo(f"Error: could not parse {path}: {exc}", err=True)
        raise typer.Exit(code=2) from exc

    if factory.version is None:  # pragma: no cover
        raise RuntimeError("SampleSheetFactory.version must be set after create_parser")

    samples = sheet.samples()

    # Distinct non-empty lane labels, compared and sorted as strings.
    lane_labels = {str(s.get("lane") or "") for s in samples}
    lane_labels.discard("")
    lanes = sorted(lane_labels) or ["(none)"]

    index_type = sheet.index_type()
    adapters: list[str] = getattr(sheet, "adapters", []) or []
    experiment_name: str | None = getattr(sheet, "experiment_name", None)

    # V1 parsers are read via read_lengths / instrument_type; every other
    # parser is read via its reads mapping and instrument_platform.
    if isinstance(sheet, SampleSheetV1):
        read_lengths = [str(r) for r in (sheet.read_lengths or [])]
        instrument = sheet.instrument_type
    else:
        reads_dict = sheet.reads or {}
        read_lengths = []
        for cycles_key in ("Read1Cycles", "Read2Cycles"):
            if cycles_key in reads_dict:
                read_lengths.append(str(reads_dict[cycles_key]))
        instrument = sheet.instrument_platform

    if fmt == "json":
        summary = {
            "file": str(path),
            "format": factory.version.value,
            "sample_count": len(samples),
            "lanes": lanes,
            "index_type": index_type,
            "read_lengths": read_lengths,
            "adapters": adapters,
            "experiment_name": experiment_name,
            "instrument": instrument,
        }
        _print_json(summary)
        raise typer.Exit(code=0)

    # Text output: one labelled line per field; the last two are optional.
    typer.echo(f"File: {path}")
    typer.echo(f"Format: {factory.version.value}")
    typer.echo(f"Samples: {len(samples)}")
    typer.echo(f"Lanes: {', '.join(lanes)}")
    typer.echo(f"Index type: {index_type}")
    typer.echo(
        f"Read lengths: {' + '.join(read_lengths) if read_lengths else '(not set)'}"
    )
    typer.echo(f"Adapters: {', '.join(adapters) if adapters else '(none)'}")
    if experiment_name:
        typer.echo(f"Experiment: {experiment_name}")
    if instrument:
        typer.echo(f"Instrument: {instrument}")

    raise typer.Exit(code=0)
|
|
204
|
+
|
|
118
205
|
# ---------------------------------------------------------------------------
|
|
119
206
|
# validate
|
|
120
207
|
# ---------------------------------------------------------------------------
|
|
@@ -123,6 +210,17 @@ if _TYPER_AVAILABLE:
|
|
|
123
210
|
def validate(
|
|
124
211
|
path: Annotated[Path, typer.Argument(help="Path to SampleSheet.csv.", metavar="FILE")],
|
|
125
212
|
fmt: _FormatOption = "text",
|
|
213
|
+
min_hamming: Annotated[
|
|
214
|
+
int,
|
|
215
|
+
typer.Option(
|
|
216
|
+
"--min-hamming",
|
|
217
|
+
help=(
|
|
218
|
+
f"Minimum Hamming distance between indexes "
|
|
219
|
+
f"(default: {_MIN_HAMMING_DEFAULT}, must be >= 1)."
|
|
220
|
+
),
|
|
221
|
+
metavar="N",
|
|
222
|
+
),
|
|
223
|
+
] = _MIN_HAMMING_DEFAULT,
|
|
126
224
|
) -> None:
|
|
127
225
|
"""Validate a sample sheet for index, adapter, and structural issues.
|
|
128
226
|
|
|
@@ -134,6 +232,11 @@ if _TYPER_AVAILABLE:
|
|
|
134
232
|
from samplesheet_parser.validators import SampleSheetValidator
|
|
135
233
|
|
|
136
234
|
_validate_fmt(fmt)
|
|
235
|
+
if min_hamming < 1:
|
|
236
|
+
typer.echo(
|
|
237
|
+
f"Error: --min-hamming must be >= 1, got {min_hamming}.", err=True
|
|
238
|
+
)
|
|
239
|
+
raise typer.Exit(code=2)
|
|
137
240
|
if not path.exists():
|
|
138
241
|
typer.echo(f"Error: file not found: {path}", err=True)
|
|
139
242
|
raise typer.Exit(code=2)
|
|
@@ -149,13 +252,14 @@ if _TYPER_AVAILABLE:
|
|
|
149
252
|
raise RuntimeError("SampleSheetFactory.version must be set after create_parser")
|
|
150
253
|
version = factory.version
|
|
151
254
|
|
|
152
|
-
result = SampleSheetValidator().validate(sheet)
|
|
255
|
+
result = SampleSheetValidator().validate(sheet, min_hamming_distance=min_hamming)
|
|
153
256
|
|
|
154
257
|
if fmt == "json":
|
|
155
258
|
_print_json({
|
|
156
259
|
"file": str(path),
|
|
157
260
|
"version": version.value,
|
|
158
261
|
"is_valid": result.is_valid,
|
|
262
|
+
"min_hamming_distance": min_hamming,
|
|
159
263
|
"errors": [
|
|
160
264
|
{"code": e.code, "message": e.message, "context": e.context}
|
|
161
265
|
for e in result.errors
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Index normalization utilities for Illumina sample sheets.
|
|
3
|
+
|
|
4
|
+
When merging sheets from different projects, indexes may have been designed
|
|
5
|
+
with different lengths (e.g. 8 bp vs 10 bp). This can cause
|
|
6
|
+
``INDEX_DISTANCE_TOO_LOW`` or ``INDEX_COLLISION`` errors at merge time because
|
|
7
|
+
the comparison is length-aware (shorter sequence wins).
|
|
8
|
+
|
|
9
|
+
Two strategies are provided:
|
|
10
|
+
|
|
11
|
+
``"trim"``
|
|
12
|
+
Trim all index sequences to the length of the *shortest* index in the
|
|
13
|
+
sample list. Safe when the extra cycles are padding bases.
|
|
14
|
+
|
|
15
|
+
``"pad"``
|
|
16
|
+
Pad shorter indexes to the length of the *longest* index using ``"N"``
|
|
17
|
+
wildcard characters. ``"N"`` matches any base during demultiplexing in
|
|
18
|
+
BCLConvert ≥ 3.9 and bcl2fastq ≥ 2.20.
|
|
19
|
+
|
|
20
|
+
Examples
|
|
21
|
+
--------
|
|
22
|
+
>>> from samplesheet_parser import SampleSheetFactory
|
|
23
|
+
>>> from samplesheet_parser.index_utils import normalize_index_lengths
|
|
24
|
+
>>>
|
|
25
|
+
>>> sheet = SampleSheetFactory().create_parser("SampleSheet.csv", parse=True)
|
|
26
|
+
>>> normalized = normalize_index_lengths(sheet.samples(), strategy="trim")
|
|
27
|
+
>>> for s in normalized:
|
|
28
|
+
... print(s["sample_id"], s.get("index") or s.get("Index"))
|
|
29
|
+
|
|
30
|
+
Authors
|
|
31
|
+
-------
|
|
32
|
+
Chaitanya Kasaraneni
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
from typing import Any, Literal
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def normalize_index_lengths(
    samples: list[dict[str, Any]],
    strategy: Literal["trim", "pad"] = "trim",
    *,
    index1_key: str | None = None,
    index2_key: str | None = None,
) -> list[dict[str, Any]]:
    """Bring every index sequence in *samples* to one common length.

    The primary and secondary index columns are handled independently:
    each gets its own target length, computed only from the samples that
    actually carry a non-empty value in that column.

    Parameters
    ----------
    samples:
        Sample dicts, e.g. the output of ``sheet.samples()``. The input
        dicts are never mutated.
    strategy:
        ``"trim"`` (default) cuts every index down to the shortest length
        observed; ``"pad"`` extends shorter indexes to the longest length
        observed by appending ``"N"`` characters.
    index1_key:
        Dict key holding the primary index. When ``None`` the key is
        auto-detected from ``"index"`` / ``"Index"``.
    index2_key:
        Dict key holding the secondary index. When ``None`` the key is
        auto-detected from ``"index2"`` / ``"Index2"``. Passing ``""``
        disables normalization of the secondary index.

    Returns
    -------
    list[dict]
        One shallow copy per input sample, in input order, with the index
        values replaced where needed. Empty or missing index values are
        left as they are. An empty *samples* list yields ``[]``.

    Raises
    ------
    ValueError
        If *strategy* is neither ``"trim"`` nor ``"pad"``.

    Examples
    --------
    >>> rows = [{"index": "ACGTAC"}, {"index": "TTGGCCAA"}]
    >>> normalize_index_lengths(rows, strategy="trim")
    [{'index': 'ACGTAC'}, {'index': 'TTGGCC'}]
    >>> normalize_index_lengths(rows, strategy="pad")
    [{'index': 'ACGTACNN'}, {'index': 'TTGGCCAA'}]
    """
    if strategy not in ("trim", "pad"):
        raise ValueError(f"strategy must be 'trim' or 'pad', got {strategy!r}")
    if not samples:
        return []

    # ``None`` means "auto-detect"; any other value, including "", is used as given.
    key_i7 = _detect_key(samples, ("index", "Index")) if index1_key is None else index1_key
    key_i5 = _detect_key(samples, ("index2", "Index2")) if index2_key is None else index2_key

    def populated_lengths(key: str) -> list[int]:
        # An empty key means this column is not normalized at all.
        if not key:
            return []
        values = (row.get(key) for row in samples)
        return [len(value) for value in values if value]

    lengths_i7 = populated_lengths(key_i7)
    lengths_i5 = populated_lengths(key_i5)

    # Nothing to do when each column already has at most one distinct length.
    if len(set(lengths_i7)) <= 1 and len(set(lengths_i5)) <= 1:
        return [dict(row) for row in samples]

    # Trim towards the shortest observed length, pad towards the longest.
    # A target of 0 (no populated values) disables the column below.
    pick = min if strategy == "trim" else max
    plan = (
        (key_i7, pick(lengths_i7, default=0)),
        (key_i5, pick(lengths_i5, default=0)),
    )

    normalized: list[dict[str, Any]] = []
    for row in samples:
        clone = dict(row)
        for key, target in plan:
            if not (key and target):
                continue
            current = clone.get(key)
            if current:
                clone[key] = _apply(current, target, strategy)
        normalized.append(clone)
    return normalized
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# ---------------------------------------------------------------------------
|
|
158
|
+
# Helpers
|
|
159
|
+
# ---------------------------------------------------------------------------
|
|
160
|
+
|
|
161
|
+
def _detect_key(samples: list[dict[str, Any]], candidates: tuple[str, ...]) -> str:
|
|
162
|
+
"""Return the first candidate key that has at least one non-empty value.
|
|
163
|
+
|
|
164
|
+
Falls back to key *presence* (regardless of value) if no candidate has
|
|
165
|
+
any non-empty value, and finally to the first candidate name if none are
|
|
166
|
+
present at all.
|
|
167
|
+
"""
|
|
168
|
+
# Prefer a key that actually carries data
|
|
169
|
+
for key in candidates:
|
|
170
|
+
if any(s.get(key) for s in samples):
|
|
171
|
+
return key
|
|
172
|
+
# Fall back to key presence (all values empty/None but key exists)
|
|
173
|
+
for key in candidates:
|
|
174
|
+
if any(key in s for s in samples):
|
|
175
|
+
return key
|
|
176
|
+
return candidates[0]
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _apply(seq: str, target_length: int, strategy: str) -> str:
|
|
180
|
+
"""Trim or pad *seq* to *target_length*."""
|
|
181
|
+
if strategy == "trim":
|
|
182
|
+
return seq[:target_length]
|
|
183
|
+
# pad
|
|
184
|
+
return seq.ljust(target_length, "N")
|
|
@@ -167,6 +167,11 @@ class SampleSheetMerger:
|
|
|
167
167
|
Output format for the merged sheet. Defaults to
|
|
168
168
|
:attr:`SampleSheetVersion.V2`. If inputs are mixed V1/V2, all
|
|
169
169
|
are converted to this format.
|
|
170
|
+
min_hamming_distance:
|
|
171
|
+
Minimum Hamming distance required between any two index sequences
|
|
172
|
+
in the same lane, both within a sheet and across sheets. Pairs
|
|
173
|
+
below this threshold produce an ``INDEX_DISTANCE_TOO_LOW`` warning.
|
|
174
|
+
Defaults to :data:`MIN_HAMMING_DISTANCE` (3).
|
|
170
175
|
|
|
171
176
|
Examples
|
|
172
177
|
--------
|
|
@@ -181,8 +186,11 @@ class SampleSheetMerger:
|
|
|
181
186
|
def __init__(
|
|
182
187
|
self,
|
|
183
188
|
target_version: SampleSheetVersion = SampleSheetVersion.V2,
|
|
189
|
+
*,
|
|
190
|
+
min_hamming_distance: int = MIN_HAMMING_DISTANCE,
|
|
184
191
|
) -> None:
|
|
185
192
|
self.target_version = target_version
|
|
193
|
+
self.min_hamming_distance = min_hamming_distance
|
|
186
194
|
self._paths: list[Path] = []
|
|
187
195
|
|
|
188
196
|
# ------------------------------------------------------------------
|
|
@@ -267,7 +275,7 @@ class SampleSheetMerger:
|
|
|
267
275
|
self._check_read_lengths(parsed, result)
|
|
268
276
|
self._check_adapters(parsed, result)
|
|
269
277
|
self._check_index_collisions(parsed, result)
|
|
270
|
-
self._check_index_distances(parsed, result)
|
|
278
|
+
self._check_index_distances(parsed, result, min_distance=self.min_hamming_distance)
|
|
271
279
|
|
|
272
280
|
# ── 3. Abort early if hard conflicts found ───────────────────────
|
|
273
281
|
if result.has_conflicts and abort_on_conflicts:
|
|
@@ -688,7 +696,9 @@ class SampleSheetMerger:
|
|
|
688
696
|
sheet = SampleSheetFactory().create_parser(
|
|
689
697
|
tmp_path, parse=True, clean=False
|
|
690
698
|
)
|
|
691
|
-
vresult = SampleSheetValidator().validate(sheet)
|
|
699
|
+
vresult = SampleSheetValidator().validate(
|
|
700
|
+
sheet, min_hamming_distance=self.min_hamming_distance
|
|
701
|
+
)
|
|
692
702
|
except Exception as exc:
|
|
693
703
|
# Convert any parse/validation failure into a structured conflict
|
|
694
704
|
# so merge() always returns a MergeResult rather than raising.
|