samplesheet-parser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,50 @@
1
+ """
2
+ samplesheet-parser
3
+ ====================
4
+
5
+ Format-agnostic parser for Illumina SampleSheet.csv files.
6
+
7
+ Supports:
8
+ - Illumina Experiment Manager (IEM) V1 format — bcl2fastq era
9
+ - BCLConvert V2 format — NovaSeq X / modern era
10
+
11
+ Quickstart
12
+ ----------
13
+ >>> from samplesheet_parser import SampleSheetFactory
14
+ >>> sheet = SampleSheetFactory().create_parser("SampleSheet.csv")
15
+ >>> sheet.parse()
16
+ >>> for sample in sheet.samples():
17
+ ... print(sample["sample_id"], sample["index"])
18
+
19
+ Or use the version-specific parsers directly:
20
+
21
+ >>> from samplesheet_parser import SampleSheetV1, SampleSheetV2
22
+ """
23
+
24
+ from importlib.metadata import PackageNotFoundError, version
25
+
26
# Package version: start from the development marker and replace it with the
# installed distribution's version when the metadata lookup succeeds.
__version__ = "0.1.0-dev"
try:
    __version__ = version("samplesheet-parser")
except PackageNotFoundError:
    # No installed distribution metadata — keep the development marker.
    pass

# Static package metadata.
__author__ = "Chaitanya Kasaraneni"
__email__ = "kc.kasaraneni@gmail.com"
__license__ = "Apache 2.0"
34
+
35
+ from samplesheet_parser.enums import IndexType, SampleSheetVersion
36
+ from samplesheet_parser.factory import SampleSheetFactory
37
+ from samplesheet_parser.parsers.v1 import SampleSheetV1
38
+ from samplesheet_parser.parsers.v2 import SampleSheetV2
39
+ from samplesheet_parser.validators import SampleSheetValidator, ValidationResult
40
+
41
# Names exported by ``from samplesheet_parser import *``.
__all__ = [
    # Version-specific parsers
    "SampleSheetV1",
    "SampleSheetV2",
    # Auto-detecting factory
    "SampleSheetFactory",
    # Enumerations
    "SampleSheetVersion",
    "IndexType",
    # Validation
    "SampleSheetValidator",
    "ValidationResult",
    # Package metadata
    "__version__",
]
@@ -0,0 +1,54 @@
1
+ """
2
+ Enumerations for samplesheet-parser.
3
+
4
+ All enums use standard Illumina terminology from public documentation.
5
+ No proprietary assay codes are included.
6
+ """
7
+
8
+ from enum import Enum
9
+
10
+
11
class SampleSheetVersion(str, Enum):
    """Format version of an Illumina sample sheet.

    Members are ``str`` subclasses, so they compare equal to their plain
    string values (``SampleSheetVersion.V1 == "V1"``).

    Members
    -------
    V1
        Illumina Experiment Manager (IEM) format, used with bcl2fastq.
        Identified by ``IEMFileVersion`` in the ``[Header]`` section.
    V2
        BCLConvert format, used with BCLConvert and required for NovaSeq X
        series instruments. Identified by ``FileFormatVersion`` in the
        ``[Header]`` section, or by the presence of
        ``[BCLConvert_Settings]`` / ``[BCLConvert_Data]`` sections.
    """

    V1 = "V1"  # IEM / bcl2fastq
    V2 = "V2"  # BCLConvert
25
+
26
+
27
class IndexType(str, Enum):
    """Index configuration of a sequencing library.

    Members are ``str`` subclasses, so they compare equal to their plain
    string values (``IndexType.DUAL == "dual"``).

    Members
    -------
    SINGLE
        I7 index only (single-index libraries).
    DUAL
        I7 + I5 indexes (dual-index libraries, standard for modern
        workflows).
    NONE
        No index (rare; full-lane libraries).
    """

    SINGLE = "single"  # I7 only
    DUAL = "dual"  # I7 + I5
    NONE = "none"  # no index
37
+
38
+
39
class InstrumentPlatform(str, Enum):
    """Illumina instrument platform identifiers as used in V2 sample sheets.

    Each member's value is the platform identifier string; members are
    ``str`` subclasses and compare equal to that string.
    """

    # NovaSeq family
    NOVASEQ_6000 = "NovaSeq6000"
    NOVASEQ_X_SERIES = "NovaSeqXSeries"
    # NextSeq family
    NEXTSEQ_1000_2000 = "NextSeq1000/2000"
    NEXTSEQ_550 = "NextSeq550"
    # MiSeq / HiSeq
    MISEQ = "MiSeq"
    HISEQ_X = "HiSeqX"
47
+
48
+
49
class UMILocation(str, Enum):
    """Read-structure segment that carries the UMI.

    Describes where the UMI is encoded in the read structure
    (the ``OverrideCycles`` string). Members are ``str`` subclasses and
    compare equal to their plain string values.
    """

    # Sequencing reads
    READ1 = "read1"
    READ2 = "read2"
    # Index reads
    INDEX1 = "index1"
    INDEX2 = "index2"
@@ -0,0 +1,240 @@
1
+ """
2
+ Format-detection factory for Illumina sample sheets.
3
+
4
+ The factory inspects the ``[Header]`` section and section names of a
5
+ ``SampleSheet.csv`` to select the correct parser — :class:`SampleSheetV1`
6
+ for classic IEM / bcl2fastq files and :class:`SampleSheetV2` for
7
+ BCLConvert files — without requiring the caller to know the format
8
+ upfront.
9
+
10
+ Detection logic
11
+ ---------------
12
+ 1. Read the ``[Header]`` section and look for a version discriminator:
13
+ - ``FileFormatVersion`` → V2 (BCLConvert)
14
+ - ``IEMFileVersion`` → V1 (IEM / bcl2fastq)
15
+
16
+ 2. If no header discriminator is found, scan the full file for
17
+ BCLConvert-specific section names (``[BCLConvert_Settings]``,
18
+ ``[BCLConvert_Data]``) and fall back to V2 if found.
19
+
20
+ 3. If nothing matches, default to V1 (broadest compatibility).
21
+
22
+ Examples
23
+ --------
24
+ >>> from samplesheet_parser import SampleSheetFactory
25
+ >>>
26
+ >>> # Auto-detect format
27
+ >>> sheet = SampleSheetFactory().create_parser("SampleSheet.csv")
28
+ >>> sheet.parse()
29
+ >>> print(sheet.samples())
30
+ >>>
31
+ >>> # Check what was detected
32
+ >>> factory = SampleSheetFactory()
33
+ >>> sheet = factory.create_parser("SampleSheet.csv")
34
+ >>> print(factory.version) # SampleSheetVersion.V2
35
+ """
36
+
37
from __future__ import annotations

from collections.abc import Iterable
from pathlib import Path

from loguru import logger

from samplesheet_parser.enums import SampleSheetVersion
from samplesheet_parser.parsers.v1 import SampleSheetV1
from samplesheet_parser.parsers.v2 import SampleSheetV2
46
+
47
+
48
class SampleSheetFactory:
    """
    Auto-detecting factory for Illumina sample sheet parsers.

    After calling :meth:`create_parser`, the detected version is
    available as ``factory.version`` and the parser as
    ``factory.parser``.

    Parameters
    ----------
    None — the factory is stateless until :meth:`create_parser` is called.

    Examples
    --------
    >>> factory = SampleSheetFactory()
    >>> sheet = factory.create_parser("SampleSheet.csv", parse=True)
    >>> print(factory.version)  # SampleSheetVersion.V1 or .V2
    >>> print(sheet.samples())
    """

    # [Header] keys that identify the format on their own.
    _V1_HEADER_KEY = "IEMFileVersion"
    _V2_HEADER_KEY = "FileFormatVersion"
    # BCLConvert-specific section names used as the fallback discriminator.
    _V2_SECTION_NAMES = ("[BCLConvert_Settings]", "[BCLConvert_Data]")

    def __init__(self) -> None:
        # Both attributes stay ``None`` until create_parser() has run.
        self.version: SampleSheetVersion | None = None
        self.parser: SampleSheetV1 | SampleSheetV2 | None = None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def create_parser(
        self,
        path: str | Path,
        *,
        clean: bool = True,
        experiment_id: str | None = None,
        parse: bool | None = None,
    ) -> SampleSheetV1 | SampleSheetV2:
        """Detect the sample sheet format and return the appropriate parser.

        The detected version is stored on ``self.version`` and the parser
        on ``self.parser`` before it is returned.

        The returned parser shares the same interface:
          - :meth:`parse`      — load and parse all sections
          - :meth:`samples`    — return a list of sample records
          - :meth:`index_type` — return ``"dual"``, ``"single"``, or ``"none"``

        Parameters
        ----------
        path:
            Path to the ``SampleSheet.csv`` file.
        clean:
            Passed to the underlying parser's ``clean`` parameter.
        experiment_id:
            Override the experiment/run name in the header.
        parse:
            If ``True``, call ``parse()`` immediately on the returned
            parser. If ``False``, defer until the caller calls ``parse()``
            explicitly. ``None`` (the default) is forwarded to the parser
            unchanged, so the parser decides how to treat it.

        Returns
        -------
        SampleSheetV1 | SampleSheetV2
            The version-appropriate parser instance.

        Raises
        ------
        FileNotFoundError
            If the given path does not exist.
        ValueError
            Not raised by this method directly; presumably raised by the
            selected parser when the file is not a valid sample sheet.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"Sample sheet not found: {path}")

        logger.info(f"Detecting sample sheet format for: {path}")
        detected = self._detect_version(path)

        self.version = detected
        kwargs: dict = {
            "clean": clean,
            "experiment_id": experiment_id,
            "parse": parse,
        }

        if detected == SampleSheetVersion.V2:
            logger.info("Detected BCLConvert V2 format — using SampleSheetV2")
            self.parser = SampleSheetV2(path, **kwargs)
        else:
            logger.info("Detected IEM V1 format — using SampleSheetV1")
            self.parser = SampleSheetV1(path, **kwargs)

        return self.parser

    def get_umi_length(self) -> int:
        """Return the UMI length for the currently selected parser.

        Delegates to ``parser.get_umi_length()`` for V2, or reads
        ``IndexUMILength`` from the V1 header if present. A V1 header
        value that cannot be converted to ``int`` is treated as "no UMI".

        Returns
        -------
        int
            UMI length in bases. ``0`` if no UMI is present.

        Raises
        ------
        RuntimeError
            If called before :meth:`create_parser`.
        """
        if self.parser is None:
            raise RuntimeError("Call create_parser() before get_umi_length().")

        if self.version == SampleSheetVersion.V2:
            return self.parser.get_umi_length()  # type: ignore[union-attr]

        # V1: UMI length is occasionally stored as IndexUMILength in [Header]
        if isinstance(self.parser, SampleSheetV1):
            if self.parser.header:
                try:
                    return int(self.parser.header.get("IndexUMILength", 0))
                except (ValueError, TypeError):
                    # Best effort: an unparsable value falls through to 0.
                    pass
        return 0

    # ------------------------------------------------------------------
    # Format detection
    # ------------------------------------------------------------------

    @classmethod
    def _find_discriminator(cls, lines: Iterable[str]) -> str | None:
        """Return the first format discriminator found in *lines*.

        A version key inside the ``[Header]`` section always wins, even if
        a BCLConvert section name appeared on an earlier line. Scanning
        continues past the end of ``[Header]`` so that BCLConvert sections
        placed after other sections (for example ``[Reads]``) are still
        found.

        Parameters
        ----------
        lines:
            Lines of the sample sheet, in file order. An open text file
            handle works; it is consumed only as far as needed.

        Returns
        -------
        str | None
            ``"FileFormatVersion"`` or ``"IEMFileVersion"`` when that key
            is present in ``[Header]``; otherwise the first BCLConvert
            section name seen; otherwise ``None``.
        """
        header_keys = (cls._V2_HEADER_KEY, cls._V1_HEADER_KEY)
        in_header = False
        header_closed = False
        section_hit: str | None = None

        for line in lines:
            if section_hit is None:
                section_hit = next(
                    (name for name in cls._V2_SECTION_NAMES if name in line),
                    None,
                )

            if header_closed:
                # Header gave no answer; stop as soon as a section matches.
                if section_hit is not None:
                    break
                continue

            stripped = line.strip()
            if stripped.lower().startswith("[header]"):
                in_header = True
            elif in_header and stripped.startswith("["):
                # Leaving the header section.
                header_closed = True
            elif in_header:
                key = stripped.split(",")[0].strip()
                if key in header_keys:
                    return key

        return section_hit

    def _detect_version(self, path: Path) -> SampleSheetVersion:
        """Inspect the file and return the appropriate SampleSheetVersion.

        Reads only as much of the file as needed:
          1. Scan [Header] for FileFormatVersion / IEMFileVersion.
          2. If undetermined, scan the rest of the file for BCLConvert
             section names.
          3. Default to V1.

        Parameters
        ----------
        path:
            Path to the sample sheet.

        Returns
        -------
        SampleSheetVersion
            Detected version enum value.
        """
        # utf-8-sig drops a leading BOM so "[Header]" matches on line one.
        with open(path, encoding="utf-8-sig") as fh:
            marker = self._find_discriminator(fh)

        if marker == self._V2_HEADER_KEY:
            logger.debug("Discriminator: FileFormatVersion → V2")
            return SampleSheetVersion.V2
        if marker == self._V1_HEADER_KEY:
            logger.debug("Discriminator: IEMFileVersion → V1")
            return SampleSheetVersion.V1
        if marker is not None:
            logger.debug("Discriminator: BCLConvert section names → V2")
            return SampleSheetVersion.V2

        logger.debug("No discriminator found — defaulting to V1")
        return SampleSheetVersion.V1

    # ------------------------------------------------------------------
    # Dunder
    # ------------------------------------------------------------------

    def __repr__(self) -> str:
        """Return a short description showing the version and parser class."""
        return (
            f"SampleSheetFactory("
            f"version={self.version!r}, "
            f"parser={type(self.parser).__name__ if self.parser else None})"
        )
@@ -0,0 +1,6 @@
1
+ """Version-specific Illumina sample sheet parsers."""
2
+
3
+ from samplesheet_parser.parsers.v1 import SampleSheetV1
4
+ from samplesheet_parser.parsers.v2 import SampleSheetV2
5
+
6
# Names exported by ``from samplesheet_parser.parsers import *``.
__all__ = [
    "SampleSheetV1",
    "SampleSheetV2",
]