samplesheet-parser 0.3.3__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/.gitignore +2 -0
  2. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/CHANGELOG.md +47 -0
  3. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/PKG-INFO +76 -2
  4. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/README.md +75 -1
  5. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/pyproject.toml +1 -1
  6. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/__init__.py +2 -0
  7. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/cli.py +105 -1
  8. samplesheet_parser-0.3.4/samplesheet_parser/index_utils.py +184 -0
  9. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/merger.py +12 -2
  10. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/validators.py +9 -1
  11. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_cli.py +155 -0
  12. samplesheet_parser-0.3.4/tests/test_index_utils.py +163 -0
  13. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_validators/test_validators.py +57 -0
  14. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/.github/workflows/ci.yml +0 -0
  15. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/.github/workflows/copilot-instructions.md +0 -0
  16. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/.zenodo.json +0 -0
  17. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/CITATION.cff +0 -0
  18. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/CONTRIBUTING.md +0 -0
  19. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/LICENSE +0 -0
  20. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/demo_merger.py +0 -0
  21. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/parse_examples.py +0 -0
  22. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/ProjectAlpha_SampleSheet.csv +0 -0
  23. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/ProjectBeta_SampleSheet.csv +0 -0
  24. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/ProjectDelta_SampleSheet_collision.csv +0 -0
  25. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/ProjectGamma_SampleSheet.csv +0 -0
  26. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/README.md +0 -0
  27. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/combined_clean.csv +0 -0
  28. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/combined_collision_forced.csv +0 -0
  29. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/combined_mixed_formats.csv +0 -0
  30. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v1_dual_index.csv +0 -0
  31. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v1_multi_lane.csv +0 -0
  32. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v1_single_index.csv +0 -0
  33. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v1_with_lab_qc_settings.csv +0 -0
  34. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v1_with_manifests.csv +0 -0
  35. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_nextseq_single_index.csv +0 -0
  36. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_novaseq_x_dual_index.csv +0 -0
  37. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_with_cloud_settings.csv +0 -0
  38. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_with_index_umi.csv +0 -0
  39. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_with_pipeline_settings.csv +0 -0
  40. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/examples/sample_sheets/v2_with_read_umi.csv +0 -0
  41. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/images/samplesheet_parser_arch_v03.png +0 -0
  42. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/images/samplesheet_parser_overview.png +0 -0
  43. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/converter.py +0 -0
  44. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/diff.py +0 -0
  45. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/enums.py +0 -0
  46. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/factory.py +0 -0
  47. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/parsers/__init__.py +0 -0
  48. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/parsers/v1.py +0 -0
  49. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/parsers/v2.py +0 -0
  50. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/samplesheet_parser/writer.py +0 -0
  51. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/scripts/demo_converter.py +0 -0
  52. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/scripts/demo_diff.py +0 -0
  53. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/scripts/demo_writer.py +0 -0
  54. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/__init__.py +0 -0
  55. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/conftest.py +0 -0
  56. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/fixtures/SampleSheet_v1_dual_index.csv +0 -0
  57. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/fixtures/SampleSheet_v2_dual_index.csv +0 -0
  58. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/fixtures/SampleSheet_v2_modified.csv +0 -0
  59. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_converter.py +0 -0
  60. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_diff.py +0 -0
  61. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_factory.py +0 -0
  62. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_merger.py +0 -0
  63. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_parsers/__init__.py +0 -0
  64. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_parsers/test_v1.py +0 -0
  65. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_parsers/test_v2.py +0 -0
  66. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_validators/__init__.py +0 -0
  67. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_validators/test_hamming.py +0 -0
  68. {samplesheet_parser-0.3.3 → samplesheet_parser-0.3.4}/tests/test_writer.py +0 -0
@@ -2,6 +2,8 @@
2
2
  BLOGPOST.md
3
3
  tests/fixtures/outputs/
4
4
  demo_output.txt
5
+ **/CSBJ Submission/
6
+ **/.claude/
5
7
 
6
8
  # Cache and build artifacts
7
9
  __pycache__/
@@ -6,6 +6,53 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
6
6
 
7
7
  ---
8
8
 
9
+ ## [0.3.4] - 2026-04-04
10
+
11
+ ### Added
12
+
13
+ - **`samplesheet info` CLI command** — prints a concise summary of any V1 or
14
+ V2 sample sheet (format, sample count, lanes, index type, read lengths,
15
+ adapters, experiment name, instrument). Supports `--format json` for
16
+ machine-readable output; exits 0 on success, 2 on unreadable files.
17
+
18
+ - **Configurable Hamming distance threshold** — `SampleSheetValidator.validate()`
19
+ now accepts a `min_hamming_distance` keyword argument (default: 3) so labs
20
+ using longer indexes can enforce stricter thresholds without changing the
21
+ module-level constant.
22
+ - `SampleSheetMerger` accepts the same parameter in `__init__()` and applies
23
+ it to both the intra-sheet and cross-sheet Hamming checks as well as the
24
+ post-merge validation step.
25
+ - `samplesheet validate` exposes `--min-hamming N` (must be ≥ 1; exits 2 on
26
+ invalid input). The JSON output includes `min_hamming_distance` for
27
+ auditability.
28
+
29
+ - **`normalize_index_lengths()` utility** — normalizes index sequence lengths
30
+ across a list of sample dicts (output of `sheet.samples()`) to a consistent
31
+ length before merging sheets with mixed-length indexes.
32
+ - `strategy="trim"` — trims all indexes to the shortest sequence length.
33
+ - `strategy="pad"` — pads shorter indexes to the longest length using `"N"`
34
+ wildcard characters (supported by BCLConvert ≥ 3.9 and bcl2fastq ≥ 2.20).
35
+ - Auto-detects V1-style (`index`/`index2`) and V2-style (`Index`/`Index2`)
36
+ field names; explicit `index1_key`/`index2_key` overrides supported.
37
+ - Exported from the top-level package as `normalize_index_lengths`.
38
+
39
+ - **CI / pre-commit integration guide** in README — GitHub Actions workflow
40
+ and pre-commit hook configuration for automatic sample sheet validation on
41
+ every commit or pull request that touches a `SampleSheet.csv`.
42
+
43
+ ### Fixed
44
+
45
+ - `_detect_key()` in `index_utils` now selects the key with at least one
46
+ non-empty value before falling back to key presence, so that normalization
47
+ is no longer silently skipped when a key exists but all its values are `None` or `""`.
48
+
49
+ ### Changed
50
+
51
+ - `--min-hamming` CLI option default and help text are now derived from the
52
+ `MIN_HAMMING_DISTANCE` constant in `validators.py` to prevent drift.
53
+
54
+ ---
55
+
9
56
  ## [0.3.3] - 2026-03-13
10
57
 
11
58
  ### Documentation
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: samplesheet-parser
3
- Version: 0.3.3
3
+ Version: 0.3.4
4
4
  Summary: Format-agnostic parser for Illumina SampleSheet.csv files — supports IEM V1 and BCLConvert V2
5
5
  Project-URL: Homepage, https://github.com/chaitanyakasaraneni/samplesheet-parser
6
6
  Project-URL: Documentation, https://illumina-samplesheet.readthedocs.io
@@ -557,6 +557,80 @@ sheet.get_read_structure() # → ReadStructure dataclass
557
557
 
558
558
  ---
559
559
 
560
+ ## CI / pre-commit integration
561
+
562
+ The CLI exits with meaningful codes (`0` = clean, `1` = issues, `2` = error), making it easy to wire into automated pipelines.
563
+
564
+ ### GitHub Actions
565
+
566
+ Add a validation step to any workflow that touches `SampleSheet.csv`:
567
+
568
+ ```yaml
569
+ # .github/workflows/validate-samplesheet.yml
570
+ name: Validate SampleSheet
571
+
572
+ on:
573
+ push:
574
+ paths:
575
+ - '**/SampleSheet.csv'
576
+ pull_request:
577
+ paths:
578
+ - '**/SampleSheet.csv'
579
+
580
+ jobs:
581
+ validate:
582
+ runs-on: ubuntu-latest
583
+ steps:
584
+ - uses: actions/checkout@v4
585
+
586
+ - uses: actions/setup-python@v5
587
+ with:
588
+ python-version: '3.12'
589
+
590
+ - run: pip install "samplesheet-parser[cli]"
591
+
592
+ - name: Validate SampleSheet
593
+ run: samplesheet validate SampleSheet.csv --format json
594
+ ```
595
+
596
+ ### pre-commit hook
597
+
598
+ Gate commits that touch any `SampleSheet.csv` in the repository:
599
+
600
+ ```yaml
601
+ # .pre-commit-config.yaml
602
+ repos:
603
+ - repo: local
604
+ hooks:
605
+ - id: samplesheet-validate
606
+ name: Validate SampleSheet.csv
607
+ entry: samplesheet validate
608
+ language: python
609
+ additional_dependencies: ["samplesheet-parser[cli]"]
610
+ files: SampleSheet\.csv$
611
+ pass_filenames: true
612
+ ```
613
+
614
+ Install and run once to verify:
615
+
616
+ ```bash
617
+ pip install pre-commit
618
+ pre-commit install
619
+ pre-commit run samplesheet-validate --all-files
620
+ ```
621
+
622
+ ### Stricter Hamming distance in CI
623
+
624
+ If your lab uses longer indexes (10 bp+), raise the minimum Hamming distance threshold to catch borderline cases earlier:
625
+
626
+ ```bash
627
+ samplesheet validate SampleSheet.csv --min-hamming 4
628
+ ```
629
+
630
+ This is especially useful in CI where you want to prevent runs that will likely fail demultiplexing.
631
+
632
+ ---
633
+
560
634
  ## Contributing
561
635
 
562
636
  ```bash
@@ -585,7 +659,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for the full local testing guide and PR c
585
659
  title = {samplesheet-parser: Format-agnostic parser for Illumina SampleSheet.csv},
586
660
  year = {2026},
587
661
  url = {https://github.com/chaitanyakasaraneni/samplesheet-parser},
588
- version = {0.3.3}
662
+ version = {0.3.4}
589
663
  }
590
664
  ```
591
665
 
@@ -506,6 +506,80 @@ sheet.get_read_structure() # → ReadStructure dataclass
506
506
 
507
507
  ---
508
508
 
509
+ ## CI / pre-commit integration
510
+
511
+ The CLI exits with meaningful codes (`0` = clean, `1` = issues, `2` = error), making it easy to wire into automated pipelines.
512
+
513
+ ### GitHub Actions
514
+
515
+ Add a validation step to any workflow that touches `SampleSheet.csv`:
516
+
517
+ ```yaml
518
+ # .github/workflows/validate-samplesheet.yml
519
+ name: Validate SampleSheet
520
+
521
+ on:
522
+ push:
523
+ paths:
524
+ - '**/SampleSheet.csv'
525
+ pull_request:
526
+ paths:
527
+ - '**/SampleSheet.csv'
528
+
529
+ jobs:
530
+ validate:
531
+ runs-on: ubuntu-latest
532
+ steps:
533
+ - uses: actions/checkout@v4
534
+
535
+ - uses: actions/setup-python@v5
536
+ with:
537
+ python-version: '3.12'
538
+
539
+ - run: pip install "samplesheet-parser[cli]"
540
+
541
+ - name: Validate SampleSheet
542
+ run: samplesheet validate SampleSheet.csv --format json
543
+ ```
544
+
545
+ ### pre-commit hook
546
+
547
+ Gate commits that touch any `SampleSheet.csv` in the repository:
548
+
549
+ ```yaml
550
+ # .pre-commit-config.yaml
551
+ repos:
552
+ - repo: local
553
+ hooks:
554
+ - id: samplesheet-validate
555
+ name: Validate SampleSheet.csv
556
+ entry: samplesheet validate
557
+ language: python
558
+ additional_dependencies: ["samplesheet-parser[cli]"]
559
+ files: SampleSheet\.csv$
560
+ pass_filenames: true
561
+ ```
562
+
563
+ Install and run once to verify:
564
+
565
+ ```bash
566
+ pip install pre-commit
567
+ pre-commit install
568
+ pre-commit run samplesheet-validate --all-files
569
+ ```
570
+
571
+ ### Stricter Hamming distance in CI
572
+
573
+ If your lab uses longer indexes (10 bp+), raise the minimum Hamming distance threshold to catch borderline cases earlier:
574
+
575
+ ```bash
576
+ samplesheet validate SampleSheet.csv --min-hamming 4
577
+ ```
578
+
579
+ This is especially useful in CI where you want to prevent runs that will likely fail demultiplexing.
580
+
581
+ ---
582
+
509
583
  ## Contributing
510
584
 
511
585
  ```bash
@@ -534,7 +608,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for the full local testing guide and PR c
534
608
  title = {samplesheet-parser: Format-agnostic parser for Illumina SampleSheet.csv},
535
609
  year = {2026},
536
610
  url = {https://github.com/chaitanyakasaraneni/samplesheet-parser},
537
- version = {0.3.3}
611
+ version = {0.3.4}
538
612
  }
539
613
  ```
540
614
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "samplesheet-parser"
7
- version = "0.3.3"
7
+ version = "0.3.4"
8
8
  description = "Format-agnostic parser for Illumina SampleSheet.csv files — supports IEM V1 and BCLConvert V2"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -36,6 +36,7 @@ from samplesheet_parser.converter import SampleSheetConverter
36
36
  from samplesheet_parser.diff import DiffResult, SampleSheetDiff
37
37
  from samplesheet_parser.enums import IndexType, SampleSheetVersion
38
38
  from samplesheet_parser.factory import SampleSheetFactory
39
+ from samplesheet_parser.index_utils import normalize_index_lengths
39
40
  from samplesheet_parser.merger import MergeResult, SampleSheetMerger
40
41
  from samplesheet_parser.parsers.v1 import SampleSheetV1
41
42
  from samplesheet_parser.parsers.v2 import SampleSheetV2
@@ -56,5 +57,6 @@ __all__ = [
56
57
  "SampleSheetWriter",
57
58
  "SampleSheetMerger",
58
59
  "MergeResult",
60
+ "normalize_index_lengths",
59
61
  "__version__",
60
62
  ]
@@ -5,6 +5,7 @@ Entry point: ``samplesheet`` (configured in ``pyproject.toml``).
5
5
 
6
6
  Commands
7
7
  --------
8
+ info Show a quick summary of a sample sheet.
8
9
  validate Validate a sheet — exit 0 if clean, exit 1 if errors.
9
10
  convert Convert between V1 and V2 formats.
10
11
  diff Diff two sheets — exit 1 if changes detected.
@@ -20,8 +21,12 @@ Usage
20
21
  -----
21
22
  ::
22
23
 
24
+ samplesheet info SampleSheet.csv
25
+ samplesheet info SampleSheet.csv --format json
26
+
23
27
  samplesheet validate SampleSheet.csv
24
28
  samplesheet validate SampleSheet.csv --format json
29
+ samplesheet validate SampleSheet.csv --min-hamming 4
25
30
 
26
31
  samplesheet convert SampleSheet_v1.csv --to v2 --output SampleSheet_v2.csv
27
32
  samplesheet convert SampleSheet_v2.csv --to v1 --output SampleSheet_v1.csv
@@ -50,6 +55,7 @@ except ImportError: # pragma: no cover
50
55
  _TYPER_AVAILABLE = False
51
56
 
52
57
  from samplesheet_parser.enums import SampleSheetVersion
58
+ from samplesheet_parser.validators import MIN_HAMMING_DISTANCE as _MIN_HAMMING_DEFAULT
53
59
 
54
60
  if _TYPER_AVAILABLE:
55
61
  app = typer.Typer(
@@ -115,6 +121,87 @@ if _TYPER_AVAILABLE:
115
121
  typer.echo(f"Error: unknown format '{fmt}'. Use 'text' or 'json'.", err=True)
116
122
  raise typer.Exit(code=2)
117
123
 
124
+ # ---------------------------------------------------------------------------
125
+ # info
126
+ # ---------------------------------------------------------------------------
127
+
128
@app.command()
def info(
    path: Annotated[Path, typer.Argument(help="Path to SampleSheet.csv.", metavar="FILE")],
    fmt: _FormatOption = "text",
) -> None:
    """Display a quick summary of a sample sheet without full validation.

    Shows format version, sample count, lanes, index type, read lengths,
    and adapter sequences at a glance.

    Exits 0 on success, 2 on unreadable files.
    """
    # NOTE: the docstring above is left untouched on purpose. Typer surfaces
    # command docstrings as help text, so it is user-facing output.

    # Imported inside the command rather than at module top level.
    from samplesheet_parser.factory import SampleSheetFactory
    from samplesheet_parser.parsers.v1 import SampleSheetV1

    # ── Guard clauses: reject a bad --format or a missing file ─────────────
    _validate_fmt(fmt)
    if not path.exists():
        typer.echo(f"Error: file not found: {path}", err=True)
        raise typer.Exit(code=2)

    # ── Parse; any failure is reported on stderr and mapped to exit 2 ──────
    try:
        factory = SampleSheetFactory()
        sheet = factory.create_parser(str(path), parse=True, clean=False)
    except Exception as exc:
        typer.echo(f"Error: could not parse {path}: {exc}", err=True)
        raise typer.Exit(code=2) from exc

    if factory.version is None:  # pragma: no cover
        raise RuntimeError("SampleSheetFactory.version must be set after create_parser")

    # ── Gather the summary fields ──────────────────────────────────────────
    samples = sheet.samples()

    # Distinct non-empty lane labels, sorted as strings.
    # NOTE(review): only the lowercase "lane" key is consulted — confirm that
    # V2 sample dicts expose the lane under the same key.
    lane_labels = {str(sample.get("lane") or "") for sample in samples}
    lane_labels.discard("")
    lanes = sorted(lane_labels) or ["(none)"]

    index_type = sheet.index_type()
    # adapters / experiment_name are read defensively; a parser that lacks
    # the attribute yields an empty list / None.
    adapters: list[str] = getattr(sheet, "adapters", []) or []
    experiment_name: str | None = getattr(sheet, "experiment_name", None)

    if isinstance(sheet, SampleSheetV1):
        read_lengths = [str(length) for length in (sheet.read_lengths or [])]
        instrument = sheet.instrument_type
    else:
        # Non-V1 sheets expose read cycles as a mapping; only Read1/Read2
        # cycle counts are reported, in that order.
        cycles = sheet.reads or {}
        read_lengths = []
        for cycle_key in ("Read1Cycles", "Read2Cycles"):
            if cycle_key in cycles:
                read_lengths.append(str(cycles[cycle_key]))
        instrument = sheet.instrument_platform

    # ── Emit ───────────────────────────────────────────────────────────────
    if fmt == "json":
        summary = {
            "file": str(path),
            "format": factory.version.value,
            "sample_count": len(samples),
            "lanes": lanes,
            "index_type": index_type,
            "read_lengths": read_lengths,
            "adapters": adapters,
            "experiment_name": experiment_name,
            "instrument": instrument,
        }
        _print_json(summary)
        raise typer.Exit(code=0)

    emit = typer.echo
    emit(f"File: {path}")
    emit(f"Format: {factory.version.value}")
    emit(f"Samples: {len(samples)}")
    emit(f"Lanes: {', '.join(lanes)}")
    emit(f"Index type: {index_type}")
    emit(f"Read lengths: {' + '.join(read_lengths) if read_lengths else '(not set)'}")
    emit(f"Adapters: {', '.join(adapters) if adapters else '(none)'}")
    # Optional fields are printed only when the sheet provides a value.
    if experiment_name:
        emit(f"Experiment: {experiment_name}")
    if instrument:
        emit(f"Instrument: {instrument}")

    raise typer.Exit(code=0)
204
+
118
205
  # ---------------------------------------------------------------------------
119
206
  # validate
120
207
  # ---------------------------------------------------------------------------
@@ -123,6 +210,17 @@ if _TYPER_AVAILABLE:
123
210
  def validate(
124
211
  path: Annotated[Path, typer.Argument(help="Path to SampleSheet.csv.", metavar="FILE")],
125
212
  fmt: _FormatOption = "text",
213
+ min_hamming: Annotated[
214
+ int,
215
+ typer.Option(
216
+ "--min-hamming",
217
+ help=(
218
+ f"Minimum Hamming distance between indexes "
219
+ f"(default: {_MIN_HAMMING_DEFAULT}, must be >= 1)."
220
+ ),
221
+ metavar="N",
222
+ ),
223
+ ] = _MIN_HAMMING_DEFAULT,
126
224
  ) -> None:
127
225
  """Validate a sample sheet for index, adapter, and structural issues.
128
226
 
@@ -134,6 +232,11 @@ if _TYPER_AVAILABLE:
134
232
  from samplesheet_parser.validators import SampleSheetValidator
135
233
 
136
234
  _validate_fmt(fmt)
235
+ if min_hamming < 1:
236
+ typer.echo(
237
+ f"Error: --min-hamming must be >= 1, got {min_hamming}.", err=True
238
+ )
239
+ raise typer.Exit(code=2)
137
240
  if not path.exists():
138
241
  typer.echo(f"Error: file not found: {path}", err=True)
139
242
  raise typer.Exit(code=2)
@@ -149,13 +252,14 @@ if _TYPER_AVAILABLE:
149
252
  raise RuntimeError("SampleSheetFactory.version must be set after create_parser")
150
253
  version = factory.version
151
254
 
152
- result = SampleSheetValidator().validate(sheet)
255
+ result = SampleSheetValidator().validate(sheet, min_hamming_distance=min_hamming)
153
256
 
154
257
  if fmt == "json":
155
258
  _print_json({
156
259
  "file": str(path),
157
260
  "version": version.value,
158
261
  "is_valid": result.is_valid,
262
+ "min_hamming_distance": min_hamming,
159
263
  "errors": [
160
264
  {"code": e.code, "message": e.message, "context": e.context}
161
265
  for e in result.errors
@@ -0,0 +1,184 @@
1
+ """
2
+ Index normalization utilities for Illumina sample sheets.
3
+
4
+ When merging sheets from different projects, indexes may have been designed
5
+ with different lengths (e.g. 8 bp vs 10 bp). This can cause
6
+ ``INDEX_DISTANCE_TOO_LOW`` or ``INDEX_COLLISION`` issues at merge time because
7
+ the comparison is length-aware (shorter sequence wins).
8
+
9
+ Two strategies are provided:
10
+
11
+ ``"trim"``
12
+ Trim all index sequences to the length of the *shortest* index in the
13
+ sample list. Safe when the extra cycles are padding bases.
14
+
15
+ ``"pad"``
16
+ Pad shorter indexes to the length of the *longest* index using ``"N"``
17
+ wildcard characters. ``"N"`` matches any base during demultiplexing in
18
+ BCLConvert ≥ 3.9 and bcl2fastq ≥ 2.20.
19
+
20
+ Examples
21
+ --------
22
+ >>> from samplesheet_parser import SampleSheetFactory
23
+ >>> from samplesheet_parser.index_utils import normalize_index_lengths
24
+ >>>
25
+ >>> sheet = SampleSheetFactory().create_parser("SampleSheet.csv", parse=True)
26
+ >>> normalized = normalize_index_lengths(sheet.samples(), strategy="trim")
27
+ >>> for s in normalized:
28
+ ... print(s["sample_id"], s.get("index") or s.get("Index"))
29
+
30
+ Authors
31
+ -------
32
+ Chaitanya Kasaraneni
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ from typing import Any, Literal
38
+
39
+
40
def normalize_index_lengths(
    samples: list[dict[str, Any]],
    strategy: Literal["trim", "pad"] = "trim",
    *,
    index1_key: str | None = None,
    index2_key: str | None = None,
) -> list[dict[str, Any]]:
    """Bring every sample's index sequences to one common length.

    The primary (I7) and secondary (I5) indexes are handled independently:
    each gets its own target length, computed only from the samples that
    actually carry a non-empty value under the corresponding key.

    Parameters
    ----------
    samples:
        Sample dicts, e.g. the output of ``sheet.samples()``. The input
        list and its dicts are left untouched.
    strategy:
        ``"trim"`` (default) — cut every index down to the shortest
        length observed in *samples*.

        ``"pad"`` — right-fill every index with ``"N"`` up to the longest
        length observed in *samples*.
    index1_key:
        Dict key holding the primary index. When ``None`` it is detected
        by scanning all samples: ``"index"`` is tried before ``"Index"``
        and the first one with at least one non-empty value is used.
    index2_key:
        Dict key holding the secondary index. When ``None`` it is
        detected the same way from ``"index2"`` / ``"Index2"``. Pass
        ``""`` to leave secondary indexes alone entirely.

    Returns
    -------
    list[dict]
        One shallow copy per input sample, in the original order, with
        the index values replaced where needed. Samples with no (or an
        empty) index value keep that value as-is. An empty *samples*
        yields an empty list.

    Raises
    ------
    ValueError
        If *strategy* is not ``"trim"`` or ``"pad"``.

    Examples
    --------
    >>> samples = [
    ...     {"sample_id": "S1", "index": "ATTACTCG", "index2": "TATAGCCT"},
    ...     {"sample_id": "S2", "index": "TCCGGAGAGG", "index2": "ATAGAGGCTA"},
    ... ]
    >>> [s["index"] for s in normalize_index_lengths(samples, strategy="trim")]
    ['ATTACTCG', 'TCCGGAGA']
    >>> [s["index"] for s in normalize_index_lengths(samples, strategy="pad")]
    ['ATTACTCGNN', 'TCCGGAGAGG']
    """
    if strategy != "trim" and strategy != "pad":
        raise ValueError(f"strategy must be 'trim' or 'pad', got {strategy!r}")

    if not samples:
        return []

    # Resolve which dict keys hold the two indexes (explicit override wins).
    key_i7 = _detect_key(samples, ("index", "Index")) if index1_key is None else index1_key
    key_i5 = _detect_key(samples, ("index2", "Index2")) if index2_key is None else index2_key

    def _observed_lengths(key: str) -> list[int]:
        # Lengths of all non-empty values stored under *key*; an empty key
        # (normalization suppressed) contributes nothing.
        if not key:
            return []
        values = (entry.get(key) for entry in samples)
        return [len(value) for value in values if value]

    lengths_i7 = _observed_lengths(key_i7)
    lengths_i5 = _observed_lengths(key_i5)

    # Nothing to do when each index already has at most one distinct length.
    if len(set(lengths_i7)) < 2 and len(set(lengths_i5)) < 2:
        return [dict(entry) for entry in samples]

    # Shortest length when trimming, longest when padding; 0 means
    # "no values seen for this index" and disables the rewrite below.
    choose = min if strategy == "trim" else max
    target_i7 = choose(lengths_i7, default=0)
    target_i5 = choose(lengths_i5, default=0)

    normalized: list[dict[str, Any]] = []
    for entry in samples:
        adjusted = dict(entry)
        for key, target in ((key_i7, target_i7), (key_i5, target_i5)):
            if not (key and target):
                continue
            current = adjusted.get(key)
            if current:
                adjusted[key] = _apply(current, target, strategy)
        normalized.append(adjusted)

    return normalized
155
+
156
+
157
+ # ---------------------------------------------------------------------------
158
+ # Helpers
159
+ # ---------------------------------------------------------------------------
160
+
161
+ def _detect_key(samples: list[dict[str, Any]], candidates: tuple[str, ...]) -> str:
162
+ """Return the first candidate key that has at least one non-empty value.
163
+
164
+ Falls back to key *presence* (regardless of value) if no candidate has
165
+ any non-empty value, and finally to the first candidate name if none are
166
+ present at all.
167
+ """
168
+ # Prefer a key that actually carries data
169
+ for key in candidates:
170
+ if any(s.get(key) for s in samples):
171
+ return key
172
+ # Fall back to key presence (all values empty/None but key exists)
173
+ for key in candidates:
174
+ if any(key in s for s in samples):
175
+ return key
176
+ return candidates[0]
177
+
178
+
179
+ def _apply(seq: str, target_length: int, strategy: str) -> str:
180
+ """Trim or pad *seq* to *target_length*."""
181
+ if strategy == "trim":
182
+ return seq[:target_length]
183
+ # pad
184
+ return seq.ljust(target_length, "N")
@@ -167,6 +167,11 @@ class SampleSheetMerger:
167
167
  Output format for the merged sheet. Defaults to
168
168
  :attr:`SampleSheetVersion.V2`. If inputs are mixed V1/V2, all
169
169
  are converted to this format.
170
+ min_hamming_distance:
171
+ Minimum Hamming distance required between any two index sequences
172
+ in the same lane, both within a sheet and across sheets. Pairs
173
+ below this threshold produce an ``INDEX_DISTANCE_TOO_LOW`` warning.
174
+ Defaults to :data:`MIN_HAMMING_DISTANCE` (3).
170
175
 
171
176
  Examples
172
177
  --------
@@ -181,8 +186,11 @@ class SampleSheetMerger:
181
186
  def __init__(
182
187
  self,
183
188
  target_version: SampleSheetVersion = SampleSheetVersion.V2,
189
+ *,
190
+ min_hamming_distance: int = MIN_HAMMING_DISTANCE,
184
191
  ) -> None:
185
192
  self.target_version = target_version
193
+ self.min_hamming_distance = min_hamming_distance
186
194
  self._paths: list[Path] = []
187
195
 
188
196
  # ------------------------------------------------------------------
@@ -267,7 +275,7 @@ class SampleSheetMerger:
267
275
  self._check_read_lengths(parsed, result)
268
276
  self._check_adapters(parsed, result)
269
277
  self._check_index_collisions(parsed, result)
270
- self._check_index_distances(parsed, result)
278
+ self._check_index_distances(parsed, result, min_distance=self.min_hamming_distance)
271
279
 
272
280
  # ── 3. Abort early if hard conflicts found ───────────────────────
273
281
  if result.has_conflicts and abort_on_conflicts:
@@ -688,7 +696,9 @@ class SampleSheetMerger:
688
696
  sheet = SampleSheetFactory().create_parser(
689
697
  tmp_path, parse=True, clean=False
690
698
  )
691
- vresult = SampleSheetValidator().validate(sheet)
699
+ vresult = SampleSheetValidator().validate(
700
+ sheet, min_hamming_distance=self.min_hamming_distance
701
+ )
692
702
  except Exception as exc:
693
703
  # Convert any parse/validation failure into a structured conflict
694
704
  # so merge() always returns a MergeResult rather than raising.