samplesheet-parser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- samplesheet_parser/__init__.py +50 -0
- samplesheet_parser/enums.py +54 -0
- samplesheet_parser/factory.py +240 -0
- samplesheet_parser/parsers/__init__.py +6 -0
- samplesheet_parser/parsers/v1.py +536 -0
- samplesheet_parser/parsers/v2.py +622 -0
- samplesheet_parser/validators.py +323 -0
- samplesheet_parser-0.1.0.dist-info/METADATA +384 -0
- samplesheet_parser-0.1.0.dist-info/RECORD +11 -0
- samplesheet_parser-0.1.0.dist-info/WHEEL +4 -0
- samplesheet_parser-0.1.0.dist-info/licenses/LICENSE +19 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""
|
|
2
|
+
samplesheet-parser
|
|
3
|
+
====================
|
|
4
|
+
|
|
5
|
+
Format-agnostic parser for Illumina SampleSheet.csv files.
|
|
6
|
+
|
|
7
|
+
Supports:
|
|
8
|
+
- Illumina Experiment Manager (IEM) V1 format — bcl2fastq era
|
|
9
|
+
- BCLConvert V2 format — NovaSeq X / modern era
|
|
10
|
+
|
|
11
|
+
Quickstart
|
|
12
|
+
----------
|
|
13
|
+
>>> from samplesheet_parser import SampleSheetFactory
|
|
14
|
+
>>> sheet = SampleSheetFactory().create_parser("SampleSheet.csv")
|
|
15
|
+
>>> sheet.parse()
|
|
16
|
+
>>> for sample in sheet.samples():
|
|
17
|
+
... print(sample["sample_id"], sample["index"])
|
|
18
|
+
|
|
19
|
+
Or use the version-specific parsers directly:
|
|
20
|
+
|
|
21
|
+
>>> from samplesheet_parser import SampleSheetV1, SampleSheetV2
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
25
|
+
|
|
26
|
+
# Static package metadata; these do not depend on the installed distribution.
__author__ = "Chaitanya Kasaraneni"
__email__ = "kc.kasaraneni@gmail.com"
__license__ = "Apache 2.0"

# Ask the installed distribution metadata for the version. When the
# distribution cannot be found (importlib.metadata raises
# PackageNotFoundError), fall back to a fixed development marker so that
# ``__version__`` is always defined.
try:
    __version__ = version("samplesheet-parser")
except PackageNotFoundError:
    __version__ = "0.1.0-dev"
|
|
34
|
+
|
|
35
|
+
from samplesheet_parser.enums import IndexType, SampleSheetVersion
|
|
36
|
+
from samplesheet_parser.factory import SampleSheetFactory
|
|
37
|
+
from samplesheet_parser.parsers.v1 import SampleSheetV1
|
|
38
|
+
from samplesheet_parser.parsers.v2 import SampleSheetV2
|
|
39
|
+
from samplesheet_parser.validators import SampleSheetValidator, ValidationResult
|
|
40
|
+
|
|
41
|
+
# Public API of the package: the names exported by
# ``from samplesheet_parser import *``. Order is kept stable.
__all__ = [
    # Version-specific parsers and the auto-detecting factory
    "SampleSheetV1", "SampleSheetV2", "SampleSheetFactory",
    # Enumerations
    "SampleSheetVersion", "IndexType",
    # Validation helpers
    "SampleSheetValidator", "ValidationResult",
    # Package metadata
    "__version__",
]
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Enumerations for samplesheet-parser.
|
|
3
|
+
|
|
4
|
+
All enums use standard Illumina terminology from public documentation.
|
|
5
|
+
No proprietary assay codes are included.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from enum import Enum
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SampleSheetVersion(str, Enum):
    """Format version of an Illumina sample sheet.

    Members are also ``str`` instances, so they compare equal to their
    plain string values (``SampleSheetVersion.V1 == "V1"``).

    ``V1``
        Illumina Experiment Manager (IEM) format, used with bcl2fastq.
        Recognised by an ``IEMFileVersion`` key in the ``[Header]`` section.

    ``V2``
        BCLConvert format, used with BCLConvert and required for NovaSeq X
        series instruments. Recognised by a ``FileFormatVersion`` key in
        the ``[Header]`` section, or by the presence of
        ``[BCLConvert_Settings]`` / ``[BCLConvert_Data]`` sections.
    """

    V1 = "V1"  # IEM / bcl2fastq
    V2 = "V2"  # BCLConvert
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class IndexType(str, Enum):
    """Index (barcode) configuration of a sequencing library.

    Members are also ``str`` instances and compare equal to their
    lowercase string values.

    ``SINGLE``
        I7 index only (single-index libraries).
    ``DUAL``
        I7 + I5 indexes (dual-index libraries, standard for modern
        workflows).
    ``NONE``
        No index (rare; full-lane libraries).
    """

    SINGLE = "single"  # I7 only
    DUAL = "dual"      # I7 + I5
    NONE = "none"      # no index
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class InstrumentPlatform(str, Enum):
    """Illumina instrument platform identifiers as used in V2 sample sheets.

    Each member's value is the identifier string itself; members are also
    ``str`` instances and compare equal to that string.
    """

    NOVASEQ_6000 = "NovaSeq6000"
    NOVASEQ_X_SERIES = "NovaSeqXSeries"
    NEXTSEQ_1000_2000 = "NextSeq1000/2000"
    NEXTSEQ_550 = "NextSeq550"
    MISEQ = "MiSeq"
    HISEQ_X = "HiSeqX"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class UMILocation(str, Enum):
    """Read-structure segment (OverrideCycles string) that carries the UMI.

    Members are also ``str`` instances and compare equal to their
    lowercase string values.
    """

    READ1 = "read1"
    READ2 = "read2"
    INDEX1 = "index1"
    INDEX2 = "index2"
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Format-detection factory for Illumina sample sheets.
|
|
3
|
+
|
|
4
|
+
The factory inspects the ``[Header]`` section and section names of a
|
|
5
|
+
``SampleSheet.csv`` to select the correct parser — :class:`SampleSheetV1`
|
|
6
|
+
for classic IEM / bcl2fastq files and :class:`SampleSheetV2` for
|
|
7
|
+
BCLConvert files — without requiring the caller to know the format
|
|
8
|
+
upfront.
|
|
9
|
+
|
|
10
|
+
Detection logic
|
|
11
|
+
---------------
|
|
12
|
+
1. Read the ``[Header]`` section and look for a version discriminator:
|
|
13
|
+
- ``FileFormatVersion`` → V2 (BCLConvert)
|
|
14
|
+
- ``IEMFileVersion`` → V1 (IEM / bcl2fastq)
|
|
15
|
+
|
|
16
|
+
2. If no header discriminator is found, scan the full file for
|
|
17
|
+
BCLConvert-specific section names (``[BCLConvert_Settings]``,
|
|
18
|
+
``[BCLConvert_Data]``) and fall back to V2 if found.
|
|
19
|
+
|
|
20
|
+
3. If nothing matches, default to V1 (broadest compatibility).
|
|
21
|
+
|
|
22
|
+
Examples
|
|
23
|
+
--------
|
|
24
|
+
>>> from samplesheet_parser import SampleSheetFactory
|
|
25
|
+
>>>
|
|
26
|
+
>>> # Auto-detect format
|
|
27
|
+
>>> sheet = SampleSheetFactory().create_parser("SampleSheet.csv")
|
|
28
|
+
>>> sheet.parse()
|
|
29
|
+
>>> print(sheet.samples())
|
|
30
|
+
>>>
|
|
31
|
+
>>> # Check what was detected
|
|
32
|
+
>>> factory = SampleSheetFactory()
|
|
33
|
+
>>> sheet = factory.create_parser("SampleSheet.csv")
|
|
34
|
+
>>> print(factory.version) # SampleSheetVersion.V2
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from __future__ import annotations
|
|
38
|
+
|
|
39
|
+
from pathlib import Path
|
|
40
|
+
|
|
41
|
+
from loguru import logger
|
|
42
|
+
|
|
43
|
+
from samplesheet_parser.enums import SampleSheetVersion
|
|
44
|
+
from samplesheet_parser.parsers.v1 import SampleSheetV1
|
|
45
|
+
from samplesheet_parser.parsers.v2 import SampleSheetV2
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class SampleSheetFactory:
    """
    Auto-detecting factory for Illumina sample sheet parsers.

    After calling :meth:`create_parser`, the detected version is
    available as ``factory.version`` and the parser as
    ``factory.parser``.

    Parameters
    ----------
    None — the factory is stateless until :meth:`create_parser` is called.

    Examples
    --------
    >>> factory = SampleSheetFactory()
    >>> sheet = factory.create_parser("SampleSheet.csv", parse=True)
    >>> print(factory.version)   # SampleSheetVersion.V1 or .V2
    >>> print(sheet.samples())
    """

    def __init__(self) -> None:
        # Both attributes stay ``None`` until create_parser() has run.
        self.version: SampleSheetVersion | None = None
        self.parser: SampleSheetV1 | SampleSheetV2 | None = None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def create_parser(
        self,
        path: str | Path,
        *,
        clean: bool = True,
        experiment_id: str | None = None,
        parse: bool | None = None,
    ) -> SampleSheetV1 | SampleSheetV2:
        """Detect the sample sheet format and return the appropriate parser.

        The returned parser shares the same interface:
        - :meth:`parse` — load and parse all sections
        - :meth:`samples` — return a list of sample records
        - :meth:`index_type` — return ``"dual"``, ``"single"``, or ``"none"``

        Parameters
        ----------
        path:
            Path to the ``SampleSheet.csv`` file.
        clean:
            Passed to the underlying parser's ``clean`` parameter.
        experiment_id:
            Override the experiment/run name in the header.
        parse:
            Forwarded unchanged to the parser constructor. ``True``
            requests an immediate ``parse()``. The default is ``None``,
            which leaves the decision to the parser's own handling of
            that value.

        Returns
        -------
        SampleSheetV1 | SampleSheetV2
            The version-appropriate parser instance (also stored as
            ``self.parser``).

        Raises
        ------
        FileNotFoundError
            If the given path does not exist.
        ValueError
            If the file cannot be decoded while detecting the format
            (``UnicodeDecodeError`` is a ``ValueError`` subclass). The
            parser constructor may raise further errors of its own.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"Sample sheet not found: {path}")

        logger.info(f"Detecting sample sheet format for: {path}")
        detected = self._detect_version(path)

        # Record the detected version before building the parser so it
        # stays inspectable even if parser construction fails.
        self.version = detected

        if detected == SampleSheetVersion.V2:
            logger.info("Detected BCLConvert V2 format — using SampleSheetV2")
            parser_cls = SampleSheetV2
        else:
            logger.info("Detected IEM V1 format — using SampleSheetV1")
            parser_cls = SampleSheetV1

        self.parser = parser_cls(
            path,
            clean=clean,
            experiment_id=experiment_id,
            parse=parse,
        )
        return self.parser

    def get_umi_length(self) -> int:
        """Return the UMI length for the currently selected parser.

        Delegates to ``parser.get_umi_length()`` for V2, or reads
        ``IndexUMILength`` from the V1 header if present.

        Returns
        -------
        int
            UMI length in bases. ``0`` if no UMI is present, or if the
            V1 ``IndexUMILength`` value is not an integer.

        Raises
        ------
        RuntimeError
            If called before :meth:`create_parser`.
        """
        if self.parser is None:
            raise RuntimeError("Call create_parser() before get_umi_length().")

        if self.version == SampleSheetVersion.V2:
            return self.parser.get_umi_length()  # type: ignore[union-attr]

        # V1: UMI length is occasionally stored as IndexUMILength in [Header]
        if isinstance(self.parser, SampleSheetV1) and self.parser.header:
            raw_length = self.parser.header.get("IndexUMILength", 0)
            try:
                return int(raw_length)
            except (ValueError, TypeError):
                # Best effort: a malformed value is reported, then treated
                # as "no UMI" instead of failing the caller.
                logger.warning(
                    f"Ignoring non-integer IndexUMILength in [Header]: {raw_length!r}"
                )
        return 0

    # ------------------------------------------------------------------
    # Format detection
    # ------------------------------------------------------------------

    def _detect_version(self, path: Path) -> SampleSheetVersion:
        """Inspect the file and return the appropriate SampleSheetVersion.

        Reads only as much of the file as needed:
        1. Scan [Header] for FileFormatVersion / IEMFileVersion.
        2. If undetermined, read the rest of the file and look for the
           BCLConvert section names anywhere in it.
        3. Default to V1.

        Parameters
        ----------
        path:
            Path to the sample sheet.

        Returns
        -------
        SampleSheetVersion
            Detected version enum value.
        """
        scanned: list[str] = []      # every line read so far, unmodified
        header_keys: list[str] = []  # first CSV field of each [Header] row

        with open(path, encoding="utf-8-sig") as fh:
            # --- Phase 1: check [Header] section only ------------------
            # Stop at the first section that follows [Header] so the
            # common case does not read the whole file.
            in_header = False
            for line in fh:
                scanned.append(line)
                stripped = line.strip()

                if stripped.lower().startswith("[header]"):
                    in_header = True
                    continue
                if not in_header:
                    continue
                if stripped.startswith("["):
                    # Leaving the header section
                    break
                if stripped:
                    header_keys.append(stripped.split(",")[0].strip())

            for key in header_keys:
                if key == "FileFormatVersion":
                    logger.debug("Discriminator: FileFormatVersion → V2")
                    return SampleSheetVersion.V2
                if key == "IEMFileVersion":
                    logger.debug("Discriminator: IEMFileVersion → V1")
                    return SampleSheetVersion.V1

            # --- Phase 2 needs the full file ---------------------------
            # Phase 1 may have stopped early; pull in the remaining lines
            # from the same handle so sections after the one that ended
            # [Header] are included in the scan.
            scanned.extend(fh)

        # --- Phase 2: scan for BCLConvert section names ----------------
        content = "".join(scanned)
        if "[BCLConvert_Settings]" in content or "[BCLConvert_Data]" in content:
            logger.debug("Discriminator: BCLConvert section names → V2")
            return SampleSheetVersion.V2

        # --- Phase 3: default to V1 ------------------------------------
        logger.debug("No discriminator found — defaulting to V1")
        return SampleSheetVersion.V1

    # ------------------------------------------------------------------
    # Dunder
    # ------------------------------------------------------------------

    def __repr__(self) -> str:
        """Return a debug representation with the version and parser class name."""
        # Compare against None explicitly: a parser object could be falsy
        # (e.g. if it defines __len__) and must still be reported.
        parser_name = None if self.parser is None else type(self.parser).__name__
        return f"SampleSheetFactory(version={self.version!r}, parser={parser_name})"
|