samplesheet-parser 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/.github/workflows/ci.yml +1 -1
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/PKG-INFO +2 -4
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/examples/parse_examples.py +31 -13
- samplesheet_parser-0.2.1/examples/sample_sheets/README.md +195 -0
- samplesheet_parser-0.2.1/examples/sample_sheets/v1_with_lab_qc_settings.csv +35 -0
- samplesheet_parser-0.2.1/examples/sample_sheets/v1_with_manifests.csv +32 -0
- samplesheet_parser-0.2.1/examples/sample_sheets/v2_with_cloud_settings.csv +32 -0
- samplesheet_parser-0.2.1/examples/sample_sheets/v2_with_pipeline_settings.csv +32 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/pyproject.toml +5 -7
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/samplesheet_parser/enums.py +5 -5
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/samplesheet_parser/factory.py +2 -1
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/samplesheet_parser/parsers/v1.py +188 -56
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/samplesheet_parser/parsers/v2.py +130 -7
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/samplesheet_parser/validators.py +10 -9
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/conftest.py +243 -1
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/test_parsers/test_v1.py +173 -3
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/test_parsers/test_v2.py +191 -0
- samplesheet_parser-0.2.0/examples/sample_sheets/README.md +0 -92
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/.github/workflows/copilot-instructions.md +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/.gitignore +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/CHANGELOG.md +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/CONTRIBUTING.md +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/LICENSE +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/README.md +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/examples/sample_sheets/v1_dual_index.csv +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/examples/sample_sheets/v1_multi_lane.csv +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/examples/sample_sheets/v1_single_index.csv +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/examples/sample_sheets/v2_nextseq_single_index.csv +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/examples/sample_sheets/v2_novaseq_x_dual_index.csv +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/examples/sample_sheets/v2_with_index_umi.csv +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/examples/sample_sheets/v2_with_read_umi.csv +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/images/samplesheet_parser_overview.png +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/samplesheet_parser/__init__.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/samplesheet_parser/converter.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/samplesheet_parser/diff.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/samplesheet_parser/parsers/__init__.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/samplesheet_parser/writer.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/scripts/demo_converter.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/scripts/demo_diff.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/scripts/demo_writer.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/__init__.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/fixtures/SampleSheet_v1_dual_index.csv +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/fixtures/SampleSheet_v2_dual_index.csv +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/fixtures/SampleSheet_v2_modified.csv +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/test_converter.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/test_diff.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/test_factory.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/test_parsers/__init__.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/test_validators/__init__.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/test_validators/test_hamming.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/test_validators/test_validators.py +0 -0
- {samplesheet_parser-0.2.0 → samplesheet_parser-0.2.1}/tests/test_writer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: samplesheet-parser
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Format-agnostic parser for Illumina SampleSheet.csv files — supports IEM V1 and BCLConvert V2
|
|
5
5
|
Project-URL: Homepage, https://github.com/chaitanyakasaraneni/samplesheet-parser
|
|
6
6
|
Project-URL: Documentation, https://illumina-samplesheet.readthedocs.io
|
|
@@ -33,12 +33,10 @@ Classifier: Intended Audience :: Developers
|
|
|
33
33
|
Classifier: Intended Audience :: Science/Research
|
|
34
34
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
35
35
|
Classifier: Programming Language :: Python :: 3
|
|
36
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
37
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
38
36
|
Classifier: Programming Language :: Python :: 3.12
|
|
39
37
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
40
38
|
Classifier: Typing :: Typed
|
|
41
|
-
Requires-Python: >=3.10
|
|
39
|
+
Requires-Python: >=3.12
|
|
42
40
|
Requires-Dist: loguru>=0.7
|
|
43
41
|
Provides-Extra: dev
|
|
44
42
|
Requires-Dist: black>=24.0; extra == 'dev'
|
|
@@ -6,7 +6,8 @@ Run from the repo root:
|
|
|
6
6
|
python examples/parse_examples.py
|
|
7
7
|
|
|
8
8
|
Demonstrates auto-detection, samples(), index_type(), UMI extraction,
|
|
9
|
-
and
|
|
9
|
+
validation, and custom section parsing for every example sheet in
|
|
10
|
+
examples/sample_sheets/.
|
|
10
11
|
"""
|
|
11
12
|
|
|
12
13
|
from __future__ import annotations
|
|
@@ -22,18 +23,23 @@ from samplesheet_parser import SampleSheetFactory, SampleSheetValidator
|
|
|
22
23
|
SHEETS_DIR = Path(__file__).parent / "sample_sheets"
|
|
23
24
|
|
|
24
25
|
# Ordered for readability: V1 first, then V2
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
"
|
|
28
|
-
"
|
|
29
|
-
"
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
"
|
|
26
|
+
# Each entry is (filename, list of custom section names to demo, or [])
|
|
27
|
+
EXAMPLE_FILES: list[tuple[str, list[str]]] = [
|
|
28
|
+
("v1_dual_index.csv", []),
|
|
29
|
+
("v1_single_index.csv", []),
|
|
30
|
+
("v1_multi_lane.csv", []),
|
|
31
|
+
("v1_with_manifests.csv", ["Manifests"]),
|
|
32
|
+
("v1_with_lab_qc_settings.csv", ["Lab_QC_Settings"]),
|
|
33
|
+
("v2_novaseq_x_dual_index.csv", []),
|
|
34
|
+
("v2_with_index_umi.csv", []),
|
|
35
|
+
("v2_with_read_umi.csv", []),
|
|
36
|
+
("v2_nextseq_single_index.csv", []),
|
|
37
|
+
("v2_with_cloud_settings.csv", ["Cloud_Settings"]),
|
|
38
|
+
("v2_with_pipeline_settings.csv", ["Pipeline_Settings"]),
|
|
33
39
|
]
|
|
34
40
|
|
|
35
41
|
|
|
36
|
-
def parse_sheet(path: Path) -> None:
|
|
42
|
+
def parse_sheet(path: Path, custom_sections: list[str]) -> None:
|
|
37
43
|
print(f"\n{'='*60}")
|
|
38
44
|
print(f" {path.name}")
|
|
39
45
|
print(f"{'='*60}")
|
|
@@ -70,6 +76,18 @@ def parse_sheet(path: Path) -> None:
|
|
|
70
76
|
print(f" UMI location : {rs.umi_location}")
|
|
71
77
|
print(f" Read structure : {rs.read_structure}")
|
|
72
78
|
|
|
79
|
+
# Custom sections
|
|
80
|
+
if custom_sections:
|
|
81
|
+
print("\n Custom sections:")
|
|
82
|
+
for section_name in custom_sections:
|
|
83
|
+
data = sheet.parse_custom_section(section_name)
|
|
84
|
+
if data:
|
|
85
|
+
print(f" [{section_name}]")
|
|
86
|
+
for key, value in data.items():
|
|
87
|
+
print(f" {key:<28} {value}")
|
|
88
|
+
else:
|
|
89
|
+
print(f" [{section_name}] — (empty or not present)")
|
|
90
|
+
|
|
73
91
|
# Samples table
|
|
74
92
|
samples = sheet.samples()
|
|
75
93
|
print(f"\n Samples ({len(samples)} total):")
|
|
@@ -97,14 +115,14 @@ def main() -> None:
|
|
|
97
115
|
print("samplesheet-parser — Example Sheet Demo")
|
|
98
116
|
print(f"Parsing {len(EXAMPLE_FILES)} example sheets from {SHEETS_DIR}\n")
|
|
99
117
|
|
|
100
|
-
missing = [f for f in EXAMPLE_FILES if not (SHEETS_DIR / f).exists()]
|
|
118
|
+
missing = [f for f, _ in EXAMPLE_FILES if not (SHEETS_DIR / f).exists()]
|
|
101
119
|
if missing:
|
|
102
120
|
print(f"Warning: missing files: {missing}")
|
|
103
121
|
|
|
104
|
-
for filename in EXAMPLE_FILES:
|
|
122
|
+
for filename, custom_sections in EXAMPLE_FILES:
|
|
105
123
|
path = SHEETS_DIR / filename
|
|
106
124
|
if path.exists():
|
|
107
|
-
parse_sheet(path)
|
|
125
|
+
parse_sheet(path, custom_sections)
|
|
108
126
|
|
|
109
127
|
print(f"\n{'='*60}")
|
|
110
128
|
print("Done.")
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# Example Sample Sheets
|
|
2
|
+
|
|
3
|
+
Reference sample sheets covering the full range of supported formats.
|
|
4
|
+
Each file is a valid, runnable example that can be parsed by `samplesheet-parser`.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## V1 — IEM / bcl2fastq format
|
|
9
|
+
|
|
10
|
+
Used with: NovaSeq 6000, HiSeq, NextSeq 500/550, MiSeq
|
|
11
|
+
Identified by: `IEMFileVersion` in `[Header]`
|
|
12
|
+
|
|
13
|
+
| File | Instrument | Indexes | Key feature |
|
|
14
|
+
|---|---|---|---|
|
|
15
|
+
| `v1_dual_index.csv` | NovaSeq 6000 | Dual (10+10 bp) | Multi-lane, TruSeq UD adapters |
|
|
16
|
+
| `v1_single_index.csv` | NextSeq 500 | Single (6 bp) | Small RNA, TruSeq Small RNA adapters |
|
|
17
|
+
| `v1_multi_lane.csv` | NovaSeq 6000 | Dual (10+10 bp) | 4 lanes, 2 projects, mixed assays |
|
|
18
|
+
| `v1_with_manifests.csv` | NovaSeq 6000 | Dual (10+10 bp) | Custom `[Manifests]` section — HyperCapture WES |
|
|
19
|
+
| `v1_with_lab_qc_settings.csv` | NovaSeq 6000 | Dual (10+10 bp) | Custom `[Lab_QC_Settings]` section — QC thresholds |
|
|
20
|
+
|
|
21
|
+
### V1 `[Settings]` adapter keys
|
|
22
|
+
|
|
23
|
+
The official IEM spec uses two separate keys — not `AdapterRead1`:
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
[Settings]
|
|
27
|
+
ReverseComplement,0
|
|
28
|
+
Adapter,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA ← Read 1
|
|
29
|
+
AdapterRead2,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ← Read 2
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
`ReverseComplement,1` is only for Nextera Mate Pair libraries.
|
|
33
|
+
`Chemistry,Amplicon` means dual-index. `Chemistry,Default` means no or single index.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## V2 — BCLConvert format
|
|
38
|
+
|
|
39
|
+
Used with: NovaSeq X, NovaSeq X Plus, NextSeq 1000/2000
|
|
40
|
+
Identified by: `FileFormatVersion` in `[Header]`, or `[BCLConvert_Settings]` / `[BCLConvert_Data]` section names
|
|
41
|
+
|
|
42
|
+
| File | Instrument | Indexes | UMI | Key feature |
|
|
43
|
+
|---|---|---|---|---|
|
|
44
|
+
| `v2_novaseq_x_dual_index.csv` | NovaSeq X | Dual (10+10 bp) | No | Standard multi-lane |
|
|
45
|
+
| `v2_with_index_umi.csv` | NovaSeq X | Dual (10+10 bp) | Yes — Index1 UMI (9 bp) | cfDNA / liquid biopsy |
|
|
46
|
+
| `v2_with_read_umi.csv` | NovaSeq X | Dual (8+8 bp) | Yes — read-level UMI (5 bp) | Duplex sequencing |
|
|
47
|
+
| `v2_nextseq_single_index.csv` | NextSeq 1000/2000 | Single (8 bp) | No | Amplicon panel, no Lane column |
|
|
48
|
+
| `v2_with_cloud_settings.csv` | NovaSeq X | Dual (10+10 bp) | No | Custom `[Cloud_Settings]` — BaseSpace upload config |
|
|
49
|
+
| `v2_with_pipeline_settings.csv` | NextSeq 1000/2000 | Single (8 bp) | No | Custom `[Pipeline_Settings]` — downstream pipeline config |
|
|
50
|
+
|
|
51
|
+
### V2 `OverrideCycles` format
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
Y151;I10;I10;Y151 — 151bp PE, 10bp dual index, no UMI
|
|
55
|
+
Y151;I10U9;I10;Y151 — same, with 9bp UMI appended to Index1
|
|
56
|
+
U5Y146;I8;I8;U5Y146 — 5bp UMI on both reads (read-level UMI)
|
|
57
|
+
Y151;I8;Y151 — single index, no Index2 cycle
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Segment order: Read1 ; Index1 ; Index2 ; Read2
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Custom sections
|
|
65
|
+
|
|
66
|
+
Both V1 and V2 sheets support non-standard sections. These are preserved verbatim
|
|
67
|
+
during parsing and accessible via `sheet.parse_custom_section(name)`.
|
|
68
|
+
|
|
69
|
+
### V1 — `[Manifests]`
|
|
70
|
+
|
|
71
|
+
Used by Illumina's HyperCapture and other enrichment workflows to specify the
|
|
72
|
+
target capture manifest files the demultiplexer or aligner should load.
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
[Manifests]
|
|
76
|
+
MFGmanifest,HyperCapture_ExomeV2_manifest.txt
|
|
77
|
+
PoolingManifest,pooling_batch3_v1.txt
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### V1 — `[Lab_QC_Settings]`
|
|
81
|
+
|
|
82
|
+
A lab-defined section for embedding QC thresholds and pipeline metadata
|
|
83
|
+
directly in the sample sheet, so downstream tools can read them without a
|
|
84
|
+
separate config file.
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
[Lab_QC_Settings]
|
|
88
|
+
MinQ30,85
|
|
89
|
+
TargetCoverage,100x
|
|
90
|
+
MinMappingRate,90
|
|
91
|
+
LibraryKit,TruSeq_Stranded_mRNA
|
|
92
|
+
SequencingCore,GenomicsCoreFacility
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### V2 — `[Cloud_Settings]`
|
|
96
|
+
|
|
97
|
+
Used by Illumina DRAGEN and BaseSpace to configure automated cloud upload
|
|
98
|
+
after demultiplexing. `UploadToBaseSpace,1` triggers the upload; `BaseSpaceProjectId`
|
|
99
|
+
routes the data to the correct project.
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
[Cloud_Settings]
|
|
103
|
+
GeneratedVersion,3.9.14
|
|
104
|
+
UploadToBaseSpace,1
|
|
105
|
+
BaseSpaceProjectId,bs-proj-240715-wgs
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### V2 — `[Pipeline_Settings]`
|
|
109
|
+
|
|
110
|
+
A lab-defined section for downstream pipeline configuration — reference genome,
|
|
111
|
+
variant caller, output format — bundled with the sample sheet so the compute
|
|
112
|
+
environment has everything it needs in one file.
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
[Pipeline_Settings]
|
|
116
|
+
PipelineVersion,2.1.0
|
|
117
|
+
ReferenceGenome,hg38
|
|
118
|
+
OutputFormat,CRAM
|
|
119
|
+
VariantCaller,DeepVariant
|
|
120
|
+
MinBaseQuality,20
|
|
121
|
+
MinMappingQuality,30
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Accessing custom sections in code
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from samplesheet_parser import SampleSheetFactory
|
|
128
|
+
|
|
129
|
+
sheet_with_manifests = SampleSheetFactory().create_parser(
|
|
130
|
+
"examples/sample_sheets/v1_with_manifests.csv", parse=True
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
# Returns {} if section is absent (default)
|
|
134
|
+
manifests = sheet_with_manifests.parse_custom_section("Manifests")
|
|
135
|
+
print(manifests)
|
|
136
|
+
# {'MFGmanifest': 'HyperCapture_ExomeV2_manifest.txt',
|
|
137
|
+
# 'PoolingManifest': 'pooling_batch3_v1.txt'}
|
|
138
|
+
|
|
139
|
+
# Raise if a section your pipeline depends on is missing
|
|
140
|
+
sheet_with_lab_qc_settings = SampleSheetFactory().create_parser(
|
|
141
|
+
"examples/sample_sheets/v1_with_lab_qc_settings.csv", parse=True
|
|
142
|
+
)
|
|
143
|
+
qc = sheet_with_lab_qc_settings.parse_custom_section("Lab_QC_Settings", required=True)
|
|
144
|
+
|
|
145
|
+
# Works identically on V2 sheets
|
|
146
|
+
sheet_v2 = SampleSheetFactory().create_parser(
|
|
147
|
+
"examples/sample_sheets/v2_with_cloud_settings.csv", parse=True
|
|
148
|
+
)
|
|
149
|
+
cloud = sheet_v2.parse_custom_section("Cloud_Settings")
|
|
150
|
+
print(cloud["UploadToBaseSpace"]) # '1'
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Asserting required sections before parsing
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
# parse() raises ValueError immediately if a required section is absent
|
|
157
|
+
sheet = SampleSheetFactory().create_parser("SampleSheet.csv", parse=False)
|
|
158
|
+
sheet.parse(required_sections=["Manifests", "Lab_QC_Settings"])
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## Parsing examples
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
from samplesheet_parser import SampleSheetFactory, SampleSheetValidator
|
|
167
|
+
|
|
168
|
+
# Works for any of the files above — format is auto-detected
|
|
169
|
+
factory = SampleSheetFactory()
|
|
170
|
+
sheet = factory.create_parser("examples/sample_sheets/v2_with_index_umi.csv", parse=True)
|
|
171
|
+
|
|
172
|
+
print(factory.version) # SampleSheetVersion.V2
|
|
173
|
+
print(sheet.index_type()) # "dual"
|
|
174
|
+
print(factory.get_umi_length()) # 9
|
|
175
|
+
|
|
176
|
+
for sample in sheet.samples():
|
|
177
|
+
print(sample["sample_id"], sample["index"])
|
|
178
|
+
|
|
179
|
+
result = SampleSheetValidator().validate(sheet)
|
|
180
|
+
print(result.summary()) # PASS — 0 error(s), 0 warning(s)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Notes on column capitalisation (V1)
|
|
186
|
+
|
|
187
|
+
From the Illumina IEM reference: **capitalisation in the `[Data]` header row matters.**
|
|
188
|
+
|
|
189
|
+
Standard capitalisation:
|
|
190
|
+
- `Sample_ID`, `Sample_Name`, `Sample_Plate`, `Sample_Well` — Title_Case with underscore
|
|
191
|
+
- `I7_Index_ID`, `I5_Index_ID` — uppercase I, mixed
|
|
192
|
+
- `index`, `index2` — **all lowercase**
|
|
193
|
+
- `Sample_Project`, `Description` — Title_Case
|
|
194
|
+
|
|
195
|
+
`index` and `index2` being lowercase is deliberate and required by bcl2fastq.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[Header]
|
|
2
|
+
IEMFileVersion,5
|
|
3
|
+
Experiment Name,240610_A00123_0112_BHJNMMDRX3
|
|
4
|
+
Date,2024-06-10
|
|
5
|
+
Workflow,GenerateFASTQ
|
|
6
|
+
Application,FASTQ Only
|
|
7
|
+
Instrument Type,NovaSeq 6000
|
|
8
|
+
Assay,TruSeq Stranded mRNA
|
|
9
|
+
Index Adapters,TruSeq RNA UD Indexes (96 Indexes)
|
|
10
|
+
Chemistry,Amplicon
|
|
11
|
+
|
|
12
|
+
[Reads]
|
|
13
|
+
151
|
|
14
|
+
151
|
|
15
|
+
|
|
16
|
+
[Settings]
|
|
17
|
+
ReverseComplement,0
|
|
18
|
+
Adapter,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
|
|
19
|
+
AdapterRead2,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
|
|
20
|
+
|
|
21
|
+
[Lab_QC_Settings]
|
|
22
|
+
MinQ30,85
|
|
23
|
+
TargetCoverage,100x
|
|
24
|
+
MinMappingRate,90
|
|
25
|
+
LibraryKit,TruSeq_Stranded_mRNA
|
|
26
|
+
SequencingCore,GenomicsCoreFacility
|
|
27
|
+
|
|
28
|
+
[Data]
|
|
29
|
+
Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
|
|
30
|
+
1,RNA_001,Control_Rep1,,A01,UDP0001,CAAGACAGAT,UDP0001,ACTATAGCCT,RNASeq_Project,mRNA_expression
|
|
31
|
+
1,RNA_002,Control_Rep2,,B01,UDP0002,TGAACCTGAT,UDP0002,TGATACGTCC,RNASeq_Project,mRNA_expression
|
|
32
|
+
1,RNA_003,Control_Rep3,,C01,UDP0003,GCACAACGTT,UDP0003,CATCTCACAG,RNASeq_Project,mRNA_expression
|
|
33
|
+
1,RNA_004,Treatment_Rep1,,D01,UDP0004,ATCGCCTGTT,UDP0004,GACTAGCATG,RNASeq_Project,mRNA_expression
|
|
34
|
+
1,RNA_005,Treatment_Rep2,,E01,UDP0005,CTTGTAGCAA,UDP0005,TGCGTCAGCC,RNASeq_Project,mRNA_expression
|
|
35
|
+
1,RNA_006,Treatment_Rep3,,F01,UDP0006,GATCCTAAGT,UDP0006,CATGCGGTTG,RNASeq_Project,mRNA_expression
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[Header]
|
|
2
|
+
IEMFileVersion,5
|
|
3
|
+
Experiment Name,240501_A01234_0088_AHJNYGDRX3
|
|
4
|
+
Date,2024-05-01
|
|
5
|
+
Workflow,GenerateFASTQ
|
|
6
|
+
Application,FASTQ Only
|
|
7
|
+
Instrument Type,NovaSeq 6000
|
|
8
|
+
Assay,Nextera DNA Flex for Enrichment
|
|
9
|
+
Index Adapters,IDT for Illumina DNA/RNA UD Indexes (96 Indexes)
|
|
10
|
+
Chemistry,Amplicon
|
|
11
|
+
|
|
12
|
+
[Reads]
|
|
13
|
+
151
|
|
14
|
+
151
|
|
15
|
+
|
|
16
|
+
[Settings]
|
|
17
|
+
ReverseComplement,0
|
|
18
|
+
Adapter,CTGTCTCTTATACACATCT
|
|
19
|
+
AdapterRead2,CTGTCTCTTATACACATCT
|
|
20
|
+
|
|
21
|
+
[Manifests]
|
|
22
|
+
MFGmanifest,HyperCapture_ExomeV2_manifest.txt
|
|
23
|
+
PoolingManifest,pooling_batch3_v1.txt
|
|
24
|
+
|
|
25
|
+
[Data]
|
|
26
|
+
Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
|
|
27
|
+
1,WES_001,TumorA_WES,,A01,UDP0001,CAAGACAGAT,UDP0001,ACTATAGCCT,WES_Project,tumor_normal_pair
|
|
28
|
+
1,WES_002,NormalA_WES,,B01,UDP0002,TGAACCTGAT,UDP0002,TGATACGTCC,WES_Project,tumor_normal_pair
|
|
29
|
+
1,WES_003,TumorB_WES,,C01,UDP0003,GCACAACGTT,UDP0003,CATCTCACAG,WES_Project,tumor_normal_pair
|
|
30
|
+
1,WES_004,NormalB_WES,,D01,UDP0004,ATCGCCTGTT,UDP0004,GACTAGCATG,WES_Project,tumor_normal_pair
|
|
31
|
+
2,WES_005,TumorC_WES,,E01,UDP0005,CTTGTAGCAA,UDP0005,TGCGTCAGCC,WES_Project,tumor_normal_pair
|
|
32
|
+
2,WES_006,NormalC_WES,,F01,UDP0006,GATCCTAAGT,UDP0006,CATGCGGTTG,WES_Project,tumor_normal_pair
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[Header]
|
|
2
|
+
FileFormatVersion,2
|
|
3
|
+
RunName,240715_LH00336_0078_A22TNHKLT3
|
|
4
|
+
InstrumentPlatform,NovaSeqXSeries
|
|
5
|
+
ExperimentName,WGS_CloudUpload_Batch7
|
|
6
|
+
|
|
7
|
+
[Reads]
|
|
8
|
+
Read1Cycles,151
|
|
9
|
+
Read2Cycles,151
|
|
10
|
+
Index1Cycles,10
|
|
11
|
+
Index2Cycles,10
|
|
12
|
+
|
|
13
|
+
[BCLConvert_Settings]
|
|
14
|
+
SoftwareVersion,3.9.3
|
|
15
|
+
AdapterRead1,CTGTCTCTTATACACATCT
|
|
16
|
+
AdapterRead2,CTGTCTCTTATACACATCT
|
|
17
|
+
OverrideCycles,Y151;I10;I10;Y151
|
|
18
|
+
BarcodeMismatchesIndex1,1
|
|
19
|
+
BarcodeMismatchesIndex2,1
|
|
20
|
+
|
|
21
|
+
[BCLConvert_Data]
|
|
22
|
+
Lane,Sample_ID,Sample_Name,Index,Index2,Sample_Project
|
|
23
|
+
1,WGS_001,SampleAlpha,ATTACTCGAT,TATAGCCTGT,WGS_Cloud
|
|
24
|
+
1,WGS_002,SampleBeta,TCCGGAGACC,ATAGAGGCAC,WGS_Cloud
|
|
25
|
+
1,WGS_003,SampleGamma,TAGGCATGCA,CCTATCCTAG,WGS_Cloud
|
|
26
|
+
2,WGS_004,SampleDelta,CTCTCTACGC,GGCTCTGAGA,WGS_Cloud
|
|
27
|
+
2,WGS_005,SampleEpsilon,CGGAGCCTAA,AGGCGAAGAG,WGS_Cloud
|
|
28
|
+
|
|
29
|
+
[Cloud_Settings]
|
|
30
|
+
GeneratedVersion,3.9.14
|
|
31
|
+
UploadToBaseSpace,1
|
|
32
|
+
BaseSpaceProjectId,bs-proj-240715-wgs
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[Header]
|
|
2
|
+
FileFormatVersion,2
|
|
3
|
+
RunName,240820_VH00123_0041_AACNPJKHV
|
|
4
|
+
InstrumentPlatform,NextSeq1000/2000
|
|
5
|
+
ExperimentName,AmpliSeq_Pipeline_Config_Run
|
|
6
|
+
|
|
7
|
+
[Reads]
|
|
8
|
+
Read1Cycles,151
|
|
9
|
+
Read2Cycles,151
|
|
10
|
+
Index1Cycles,8
|
|
11
|
+
|
|
12
|
+
[BCLConvert_Settings]
|
|
13
|
+
SoftwareVersion,3.9.3
|
|
14
|
+
AdapterRead1,CTGTCTCTTATACACATCT
|
|
15
|
+
OverrideCycles,Y151;I8;Y151
|
|
16
|
+
BarcodeMismatchesIndex1,1
|
|
17
|
+
|
|
18
|
+
[BCLConvert_Data]
|
|
19
|
+
Sample_ID,Sample_Name,Index,Sample_Project
|
|
20
|
+
Panel_001,CancerHotspot_Rep1,ATTACTCG,AmpliconPanel
|
|
21
|
+
Panel_002,CancerHotspot_Rep2,TCCGGAGA,AmpliconPanel
|
|
22
|
+
Panel_003,CancerHotspot_Rep3,TAGGCATG,AmpliconPanel
|
|
23
|
+
Panel_004,NormalControl_Rep1,CTCTCTAC,AmpliconPanel
|
|
24
|
+
Panel_005,NormalControl_Rep2,TAATCTTA,AmpliconPanel
|
|
25
|
+
|
|
26
|
+
[Pipeline_Settings]
|
|
27
|
+
PipelineVersion,2.1.0
|
|
28
|
+
ReferenceGenome,hg38
|
|
29
|
+
OutputFormat,CRAM
|
|
30
|
+
VariantCaller,DeepVariant
|
|
31
|
+
MinBaseQuality,20
|
|
32
|
+
MinMappingQuality,30
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "samplesheet-parser"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.2.1"
|
|
8
8
|
description = "Format-agnostic parser for Illumina SampleSheet.csv files — supports IEM V1 and BCLConvert V2"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -19,13 +19,11 @@ classifiers = [
|
|
|
19
19
|
"Intended Audience :: Developers",
|
|
20
20
|
"License :: OSI Approved :: Apache Software License",
|
|
21
21
|
"Programming Language :: Python :: 3",
|
|
22
|
-
"Programming Language :: Python :: 3.10",
|
|
23
|
-
"Programming Language :: Python :: 3.11",
|
|
24
22
|
"Programming Language :: Python :: 3.12",
|
|
25
23
|
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
26
24
|
"Typing :: Typed",
|
|
27
25
|
]
|
|
28
|
-
requires-python = ">=3.10"
|
|
26
|
+
requires-python = ">=3.12"
|
|
29
27
|
dependencies = [
|
|
30
28
|
"loguru>=0.7",
|
|
31
29
|
]
|
|
@@ -50,17 +48,17 @@ packages = ["samplesheet_parser"]
|
|
|
50
48
|
|
|
51
49
|
[tool.black]
|
|
52
50
|
line-length = 100
|
|
53
|
-
target-version = ["
|
|
51
|
+
target-version = ["py312"]
|
|
54
52
|
|
|
55
53
|
[tool.ruff]
|
|
56
54
|
line-length = 100
|
|
57
|
-
target-version = "py310"
|
|
55
|
+
target-version = "py312"
|
|
58
56
|
|
|
59
57
|
[tool.ruff.lint]
|
|
60
58
|
select = ["E", "F", "I", "W", "UP", "B"]
|
|
61
59
|
|
|
62
60
|
[tool.mypy]
|
|
63
|
-
python_version = "3.10"
|
|
61
|
+
python_version = "3.12"
|
|
64
62
|
strict = true
|
|
65
63
|
ignore_missing_imports = true
|
|
66
64
|
|
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
Enumerations for samplesheet-parser.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from enum import Enum
|
|
5
|
+
from enum import StrEnum
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
class SampleSheetVersion(str, Enum):
|
|
8
|
+
class SampleSheetVersion(StrEnum):
|
|
9
9
|
"""Illumina sample sheet format version.
|
|
10
10
|
|
|
11
11
|
V1 — Illumina Experiment Manager (IEM) format, used with bcl2fastq.
|
|
@@ -21,7 +21,7 @@ class SampleSheetVersion(str, Enum):
|
|
|
21
21
|
V2 = "V2"
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
class IndexType(str, Enum):
|
|
24
|
+
class IndexType(StrEnum):
|
|
25
25
|
"""Sequencing index configuration.
|
|
26
26
|
|
|
27
27
|
SINGLE — I7 index only (single-index libraries).
|
|
@@ -33,7 +33,7 @@ class IndexType(str, Enum):
|
|
|
33
33
|
NONE = "none"
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
class InstrumentPlatform(str, Enum):
|
|
36
|
+
class InstrumentPlatform(StrEnum):
|
|
37
37
|
"""Standard Illumina instrument platform identifiers used in V2 sample sheets."""
|
|
38
38
|
NOVASEQ_6000 = "NovaSeq6000"
|
|
39
39
|
NOVASEQ_X_SERIES = "NovaSeqXSeries"
|
|
@@ -43,7 +43,7 @@ class InstrumentPlatform(str, Enum):
|
|
|
43
43
|
HISEQ_X = "HiSeqX"
|
|
44
44
|
|
|
45
45
|
|
|
46
|
-
class UMILocation(str, Enum):
|
|
46
|
+
class UMILocation(StrEnum):
|
|
47
47
|
"""Where the UMI is encoded in the read structure (OverrideCycles string)."""
|
|
48
48
|
READ1 = "read1"
|
|
49
49
|
READ2 = "read2"
|
|
@@ -37,6 +37,7 @@ Examples
|
|
|
37
37
|
from __future__ import annotations
|
|
38
38
|
|
|
39
39
|
from pathlib import Path
|
|
40
|
+
from typing import Any
|
|
40
41
|
|
|
41
42
|
from loguru import logger
|
|
42
43
|
|
|
@@ -121,7 +122,7 @@ class SampleSheetFactory:
|
|
|
121
122
|
detected = self._detect_version(path)
|
|
122
123
|
|
|
123
124
|
self.version = detected
|
|
124
|
-
kwargs: dict = dict(clean=clean, experiment_id=experiment_id, parse=parse)
|
|
125
|
+
kwargs: dict[str, Any] = dict(clean=clean, experiment_id=experiment_id, parse=parse)
|
|
125
126
|
|
|
126
127
|
if detected == SampleSheetVersion.V2:
|
|
127
128
|
logger.info("Detected BCLConvert V2 format — using SampleSheetV2")
|