convert-genome 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ .venv/
2
+ .pytest_cache/
3
+ dist/
4
+ build/
5
+ **/__pycache__/
6
+ *.egg-info/
@@ -0,0 +1,150 @@
1
+ Metadata-Version: 2.4
2
+ Name: convert_genome
3
+ Version: 0.1.0
4
+ Summary: Python wrapper for SauersML/convert_genome (DTC → VCF/BCF/PLINK conversion).
5
+ Project-URL: Homepage, https://github.com/SauersML/convert_genome
6
+ Project-URL: Issues, https://github.com/SauersML/convert_genome/issues
7
+ Author: SauersML
8
+ License: MIT
9
+ Keywords: 23andme,ancestry,bcf,bioinformatics,dtc,genomics,plink,vcf
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
16
+ Requires-Python: >=3.9
17
+ Provides-Extra: test
18
+ Requires-Dist: pytest>=7; extra == 'test'
19
+ Description-Content-Type: text/markdown
20
+
21
+ # convert_genome (Python)
22
+
23
+ Python wrapper for the
24
+ [`SauersML/convert_genome`](https://github.com/SauersML/convert_genome) CLI.
25
+ Convert direct-to-consumer dumps (23andMe, AncestryDNA, MyHeritage,
26
+ deCODEme) and standard VCF/BCF into compliant VCF, BCF, or PLINK 1.9
27
+ binary — with build detection, sex inference, liftover, and panel
28
+ harmonisation, all controllable from kwargs.
29
+
30
+ ```python
31
+ from convert_genome import convert, OutputFormat
32
+
33
+ result = convert(
34
+ input="23andme.txt",
35
+ output="out.vcf",
36
+ format=OutputFormat.VCF,
37
+ assembly="hg38",
38
+ standardize=True,
39
+ )
40
+
41
+ result.statistics.emitted_records # int
42
+ result.sample.sex_inferred # bool
43
+ result.build_detection.detected_build # 'GRCh37' / 'GRCh38' / ...
44
+ result.report_path # path to <stem>_report.json
45
+ result.output_paths # files that actually exist on disk
46
+ result.yield_rate # emitted / total
47
+ ```
48
+
49
+ The wrapper runs the Rust binary, parses the sidecar
50
+ `<stem>_report.json` into typed frozen dataclasses, and returns a
51
+ single `ConversionResult`.
52
+
53
+ ## Install
54
+
55
+ ```bash
56
+ pip install convert_genome
57
+ # the Rust binary:
58
+ cargo install convert_genome
59
+ ```
60
+
61
+ Binary located via `binary=` or PATH. No env-var indirection — if
62
+ the binary isn't on PATH, pass `binary=` explicitly. Missing binary
63
+ → `ConvertGenomeBinaryNotFound` with the suggested install command.
64
+
65
+ ## Shortcuts: skip every auto-discovery step
66
+
67
+ By default the CLI downloads or auto-detects anything you don't supply.
68
+ To skip those steps, pass the values in directly:
69
+
70
+ ```python
71
+ convert(
72
+ input="raw.txt",
73
+ output="out.vcf",
74
+ reference="/cache/hg38.fa", # skip FASTA download
75
+ reference_fai="/cache/hg38.fa.fai", # skip .fai indexing
76
+ input_build="hg19", # skip build detection
77
+ assembly="GRCh38", # target build (still does liftover)
78
+ panel="/cache/1kg_panel.vcf", # supply harmonisation panel
79
+ sex="female", # skip sex inference
80
+ standardize=True,
81
+ )
82
+ ```
83
+
84
+ `sex` is lenient: passing `"unknown"` or `"indeterminate"` (e.g. when
85
+ chaining out of `infer_sex`) silently omits the `--sex` flag and lets
86
+ the CLI run its own inference.
87
+
88
+ ## Builder
89
+
90
+ `Converter` is a frozen dataclass; every `with_*` returns a new
91
+ instance, so branching is safe.
92
+
93
+ ```python
94
+ from convert_genome import Converter, Sex, OutputFormat
95
+
96
+ plan = (
97
+ Converter(input="raw.txt", output_dir="out/", format=OutputFormat.PLINK)
98
+ .with_assembly("GRCh38")
99
+ .with_reference("/cache/hg38.fa", "/cache/hg38.fa.fai")
100
+ .with_panel("/data/1kg_panel.vcf.gz")
101
+ .with_standardize()
102
+ .with_sex(Sex.MALE)
103
+ )
104
+
105
+ print(plan.argv()) # exact argv that would be passed to the CLI
106
+ result = plan.run()
107
+ ```
108
+
109
+ ## Enums
110
+
111
+ ```python
112
+ InputFormat.AUTO / .DTC / .VCF / .BCF
113
+ OutputFormat.VCF / .BCF / .PLINK
114
+ Sex.MALE / .FEMALE
115
+ Assembly.GRCH37 / .GRCH38 # plus a `.parse()` classmethod that
116
+ # accepts 'hg19' / 'hg38' / 'build38' / ...
117
+ ```
118
+
119
+ ## Output
120
+
121
+ The Rust tool writes `<stem>_report.json` alongside the main output.
122
+ The wrapper loads it into `ConversionResult`, with sub-dataclasses for
123
+ each section:
124
+
125
+ ```python
126
+ result.input # InputInfo (path, format, origin)
127
+ result.output # OutputInfo (path, format)
128
+ result.reference # ReferenceInfo (path, origin, assembly)
129
+ result.panel # PanelInfo | None
130
+ result.sample # SampleInfo (id, sex, sex_inferred)
131
+ result.build_detection # BuildDetection | None (detected_build, match rates)
132
+ result.statistics # Statistics (total / emitted / variant / ... records)
133
+ result.report_path # path to the JSON sidecar
134
+ result.output_paths # tuple[Path, ...] — files that actually exist on disk
135
+ ```
136
+
137
+ For PLINK output, `output_paths` includes the `.bed/.bim/.fam` trio. For
138
+ `output_dir` with a panel, it includes `panel.vcf`. Non-existent paths
139
+ are filtered out automatically.
140
+
141
+ ## Errors
142
+
143
+ * `ConvertGenomeBinaryNotFound` — CLI not installed / not on PATH.
144
+ * `InvalidConfig` — argument combination rejected before launching
145
+ (e.g. missing input file, conflicting output/output_dir).
146
+ * `ConvertGenomeFailed` — CLI exited non-zero. The exception carries
147
+ `stdout`, `stderr`, `returncode`.
148
+ * `ReportNotFound` — CLI ran clean but didn't write a JSON sidecar.
149
+
150
+ All subclass `ConvertGenomeError`.
@@ -0,0 +1,130 @@
1
+ # convert_genome (Python)
2
+
3
+ Python wrapper for the
4
+ [`SauersML/convert_genome`](https://github.com/SauersML/convert_genome) CLI.
5
+ Convert direct-to-consumer dumps (23andMe, AncestryDNA, MyHeritage,
6
+ deCODEme) and standard VCF/BCF into compliant VCF, BCF, or PLINK 1.9
7
+ binary — with build detection, sex inference, liftover, and panel
8
+ harmonisation, all controllable from kwargs.
9
+
10
+ ```python
11
+ from convert_genome import convert, OutputFormat
12
+
13
+ result = convert(
14
+ input="23andme.txt",
15
+ output="out.vcf",
16
+ format=OutputFormat.VCF,
17
+ assembly="hg38",
18
+ standardize=True,
19
+ )
20
+
21
+ result.statistics.emitted_records # int
22
+ result.sample.sex_inferred # bool
23
+ result.build_detection.detected_build # 'GRCh37' / 'GRCh38' / ...
24
+ result.report_path # path to <stem>_report.json
25
+ result.output_paths # files that actually exist on disk
26
+ result.yield_rate # emitted / total
27
+ ```
28
+
29
+ The wrapper runs the Rust binary, parses the sidecar
30
+ `<stem>_report.json` into typed frozen dataclasses, and returns a
31
+ single `ConversionResult`.
32
+
33
+ ## Install
34
+
35
+ ```bash
36
+ pip install convert_genome
37
+ # the Rust binary:
38
+ cargo install convert_genome
39
+ ```
40
+
41
+ Binary located via `binary=` or PATH. No env-var indirection — if
42
+ the binary isn't on PATH, pass `binary=` explicitly. Missing binary
43
+ → `ConvertGenomeBinaryNotFound` with the suggested install command.
44
+
45
+ ## Shortcuts: skip every auto-discovery step
46
+
47
+ By default the CLI downloads or auto-detects anything you don't supply.
48
+ To skip those steps, pass the values in directly:
49
+
50
+ ```python
51
+ convert(
52
+ input="raw.txt",
53
+ output="out.vcf",
54
+ reference="/cache/hg38.fa", # skip FASTA download
55
+ reference_fai="/cache/hg38.fa.fai", # skip .fai indexing
56
+ input_build="hg19", # skip build detection
57
+ assembly="GRCh38", # target build (still does liftover)
58
+ panel="/cache/1kg_panel.vcf", # supply harmonisation panel
59
+ sex="female", # skip sex inference
60
+ standardize=True,
61
+ )
62
+ ```
63
+
64
+ `sex` is lenient: passing `"unknown"` or `"indeterminate"` (e.g. when
65
+ chaining out of `infer_sex`) silently omits the `--sex` flag and lets
66
+ the CLI run its own inference.
67
+
68
+ ## Builder
69
+
70
+ `Converter` is a frozen dataclass; every `with_*` returns a new
71
+ instance, so branching is safe.
72
+
73
+ ```python
74
+ from convert_genome import Converter, Sex, OutputFormat
75
+
76
+ plan = (
77
+ Converter(input="raw.txt", output_dir="out/", format=OutputFormat.PLINK)
78
+ .with_assembly("GRCh38")
79
+ .with_reference("/cache/hg38.fa", "/cache/hg38.fa.fai")
80
+ .with_panel("/data/1kg_panel.vcf.gz")
81
+ .with_standardize()
82
+ .with_sex(Sex.MALE)
83
+ )
84
+
85
+ print(plan.argv()) # exact argv that would be passed to the CLI
86
+ result = plan.run()
87
+ ```
88
+
89
+ ## Enums
90
+
91
+ ```python
92
+ InputFormat.AUTO / .DTC / .VCF / .BCF
93
+ OutputFormat.VCF / .BCF / .PLINK
94
+ Sex.MALE / .FEMALE
95
+ Assembly.GRCH37 / .GRCH38 # plus a `.parse()` classmethod that
96
+ # accepts 'hg19' / 'hg38' / 'build38' / ...
97
+ ```
98
+
99
+ ## Output
100
+
101
+ The Rust tool writes `<stem>_report.json` alongside the main output.
102
+ The wrapper loads it into `ConversionResult`, with sub-dataclasses for
103
+ each section:
104
+
105
+ ```python
106
+ result.input # InputInfo (path, format, origin)
107
+ result.output # OutputInfo (path, format)
108
+ result.reference # ReferenceInfo (path, origin, assembly)
109
+ result.panel # PanelInfo | None
110
+ result.sample # SampleInfo (id, sex, sex_inferred)
111
+ result.build_detection # BuildDetection | None (detected_build, match rates)
112
+ result.statistics # Statistics (total / emitted / variant / ... records)
113
+ result.report_path # path to the JSON sidecar
114
+ result.output_paths # tuple[Path, ...] — files that actually exist on disk
115
+ ```
116
+
117
+ For PLINK output, `output_paths` includes the `.bed/.bim/.fam` trio. For
118
+ `output_dir` with a panel, it includes `panel.vcf`. Non-existent paths
119
+ are filtered out automatically.
120
+
121
+ ## Errors
122
+
123
+ * `ConvertGenomeBinaryNotFound` — CLI not installed / not on PATH.
124
+ * `InvalidConfig` — argument combination rejected before launching
125
+ (e.g. missing input file, conflicting output/output_dir).
126
+ * `ConvertGenomeFailed` — CLI exited non-zero. The exception carries
127
+ `stdout`, `stderr`, `returncode`.
128
+ * `ReportNotFound` — CLI ran clean but didn't write a JSON sidecar.
129
+
130
+ All subclass `ConvertGenomeError`.
@@ -0,0 +1,76 @@
1
+ """convert_genome — Python bindings for the SauersML/convert_genome CLI.
2
+
3
+ Convert direct-to-consumer (23andMe, AncestryDNA, ...) and standard
4
+ VCF/BCF inputs into compliant VCF, BCF, or PLINK 1.9 binary, with build
5
+ detection, sex inference, liftover, and panel harmonisation.
6
+
7
+ This package shells out to the `convert_genome` Rust binary and parses
8
+ its sidecar `_report.json` into typed dataclasses. The Python API is
9
+ typed kwargs end-to-end; you never need to remember CLI flag names.
10
+
11
+ Quick start
12
+ -----------
13
+
14
+ >>> from convert_genome import convert, OutputFormat
15
+ >>> result = convert(
16
+ ... input="23andme.txt",
17
+ ... output="out.vcf",
18
+ ... format=OutputFormat.VCF,
19
+ ... assembly="GRCh38",
20
+ ... standardize=True,
21
+ ... )
22
+ >>> result.sample.sex_inferred
23
+ True
24
+ >>> result.statistics.emitted_records
25
+ 612345
26
+ >>> result.build_detection.detected_build
27
+ 'GRCh37'
28
+ """
29
+
30
+ from ._api import (
31
+ convert,
32
+ Converter,
33
+ ConversionResult,
34
+ InputInfo,
35
+ OutputInfo,
36
+ ReferenceInfo,
37
+ PanelInfo,
38
+ SampleInfo,
39
+ BuildDetection,
40
+ Statistics,
41
+ InputFormat,
42
+ OutputFormat,
43
+ Sex,
44
+ Assembly,
45
+ ConvertGenomeError,
46
+ ConvertGenomeBinaryNotFound,
47
+ ConvertGenomeFailed,
48
+ InvalidConfig,
49
+ ReportNotFound,
50
+ locate_binary,
51
+ )
52
+
53
+ __all__ = [
54
+ "convert",
55
+ "Converter",
56
+ "ConversionResult",
57
+ "InputInfo",
58
+ "OutputInfo",
59
+ "ReferenceInfo",
60
+ "PanelInfo",
61
+ "SampleInfo",
62
+ "BuildDetection",
63
+ "Statistics",
64
+ "InputFormat",
65
+ "OutputFormat",
66
+ "Sex",
67
+ "Assembly",
68
+ "ConvertGenomeError",
69
+ "ConvertGenomeBinaryNotFound",
70
+ "ConvertGenomeFailed",
71
+ "InvalidConfig",
72
+ "ReportNotFound",
73
+ "locate_binary",
74
+ ]
75
+
76
+ __version__ = "0.1.0"
@@ -0,0 +1,560 @@
1
+ """Pythonic wrapper around the convert_genome CLI.
2
+
3
+ Design
4
+ ------
5
+ The CLI already writes a structured ``<stem>_report.json`` next to each
6
+ output, so we don't parse stdout — we run the binary, wait for it to
7
+ finish, then load that JSON into typed frozen dataclasses.
8
+
9
+ The ``Converter`` class is an immutable builder. The top-level
10
+ ``convert(...)`` is the one-shot convenience.
11
+
12
+ We deliberately don't shadow the CLI's auto-detection logic: pass
13
+ ``input_format=InputFormat.AUTO`` (the default) and let the Rust tool
14
+ sniff. Where we *do* validate eagerly is on parameter combinations that
15
+ the CLI rejects late and noisily — e.g. ``--output`` xor ``--output-dir``,
16
+ ``--reference-fai`` requiring ``--reference``.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import enum
22
+ import json
23
+ import os
24
+ import re
25
+ import shutil
26
+ import subprocess
27
+ from dataclasses import dataclass, field, replace
28
+ from pathlib import Path
29
+ from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union
30
+
31
+ PathLike = Union[str, os.PathLike]
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Enums (mirror src/cli.rs)
36
+ # ---------------------------------------------------------------------------
37
+
38
+
39
class InputFormat(str, enum.Enum):
    """Input file kinds accepted by the CLI.

    The member value is the string ``Converter.argv`` passes after
    ``--input-format``.
    """

    AUTO = "auto"  # default for ``Converter.input_format``
    DTC = "dtc"
    VCF = "vcf"
    BCF = "bcf"
44
+
45
+
46
class OutputFormat(str, enum.Enum):
    """Output kinds the CLI can write.

    The member value is the string ``Converter.argv`` passes after
    ``--format``.
    """

    VCF = "vcf"  # default for ``Converter.format``
    BCF = "bcf"
    PLINK = "plink"
50
+
51
+
52
class Sex(str, enum.Enum):
    """Explicit sex override.

    The member value is passed after ``--sex`` by ``Converter.argv``; when
    the converter's ``sex`` is ``None`` the flag is not emitted at all.
    """

    MALE = "male"
    FEMALE = "female"
55
+
56
+
57
class Assembly(str, enum.Enum):
    """Canonical genome-build names understood by this wrapper."""

    GRCH37 = "GRCh37"
    GRCH38 = "GRCh38"

    @classmethod
    def parse(cls, value: Union[str, "Assembly", None]) -> Optional[str]:
        """Normalise a build name to its canonical spelling.

        ``None`` is returned unchanged and an ``Assembly`` member yields its
        value. Any other input is stringified and stripped, then matched
        case-insensitively (ignoring ``-`` and ``_``) against the known
        aliases. Unrecognised names are returned stripped but otherwise
        untouched, so the caller can still forward them.
        """
        if value is None:
            return None
        if isinstance(value, cls):
            return value.value

        stripped = str(value).strip()
        # Comparison key: lower-case with hyphens/underscores removed.
        key = "".join(ch for ch in stripped.lower() if ch not in "-_")

        nicknames = (
            ("GRCh37", ("grch37", "hg19", "build37")),
            ("GRCh38", ("grch38", "hg38", "build38")),
        )
        lookup = {
            alias: canonical
            for canonical, names in nicknames
            for alias in names
        }
        if key in lookup:
            return lookup[key]
        return stripped
78
+
79
+
80
+ # ---------------------------------------------------------------------------
81
+ # Errors
82
+ # ---------------------------------------------------------------------------
83
+
84
+
85
class ConvertGenomeError(Exception):
    """Base class for all convert_genome wrapper errors.

    Every exception class defined in this module derives from it, so
    catching this type handles any error the wrapper raises deliberately.
    """
87
+
88
+
89
class ConvertGenomeBinaryNotFound(ConvertGenomeError, FileNotFoundError):
    """The `convert_genome` binary could not be located.

    Raised by ``locate_binary`` and by ``Converter.run`` when launching the
    process fails with ``FileNotFoundError``. Also a ``FileNotFoundError``,
    so generic handlers for that type keep working.
    """
91
+
92
+
93
class InvalidConfig(ConvertGenomeError, ValueError):
    """A combination of arguments is mutually exclusive or incomplete.

    Raised while a ``Converter`` is being constructed (before any process
    is launched) and when a value cannot be coerced to one of the enums.
    """
95
+
96
+
97
class ConvertGenomeFailed(ConvertGenomeError, RuntimeError):
    """The binary ran but returned a non-zero exit code.

    The keyword-only constructor arguments are stored verbatim as the
    ``stdout``, ``stderr`` and ``returncode`` attributes so callers can
    inspect what the process produced.
    """

    def __init__(
        self,
        message: str,
        *,
        stdout: str = "",
        stderr: str = "",
        returncode: int = 0,
    ):
        super().__init__(message)
        self.stdout, self.stderr, self.returncode = stdout, stderr, returncode
105
+
106
+
107
class ReportNotFound(ConvertGenomeError, FileNotFoundError):
    """The binary exited 0 but produced no sidecar `_report.json`.

    Raised by ``Converter.run`` when the resolved report path does not
    exist after a successful run.
    """
109
+
110
+
111
+ # ---------------------------------------------------------------------------
112
+ # Result dataclasses — mirror src/report.rs
113
+ # ---------------------------------------------------------------------------
114
+
115
+
116
@dataclass(frozen=True)
class InputInfo:
    """The ``input`` section of the run report, one field per JSON key."""

    path: str
    format: str
    origin: str
121
+
122
+
123
@dataclass(frozen=True)
class OutputInfo:
    """The ``output`` section of the run report, one field per JSON key."""

    path: str  # exposed as a ``Path`` through ``ConversionResult.main_output``
    format: str
127
+
128
+
129
@dataclass(frozen=True)
class ReferenceInfo:
    """The ``reference`` section of the run report, one field per JSON key."""

    path: str
    origin: str
    assembly: str
134
+
135
+
136
@dataclass(frozen=True)
class PanelInfo:
    """The optional ``panel`` section of the run report.

    Only built when the report carries a non-empty ``panel`` entry;
    otherwise ``ConversionResult.panel`` is ``None``.
    """

    path: str
    total_sites: int
    modified_sites: int
    novel_sites: int
142
+
143
+
144
@dataclass(frozen=True)
class SampleInfo:
    """The ``sample`` section of the run report, one field per JSON key."""

    id: str
    sex: str
    sex_inferred: bool
149
+
150
+
151
@dataclass(frozen=True)
class BuildDetection:
    """The optional ``build_detection`` section of the run report.

    Only built when the report carries a non-empty ``build_detection``
    entry; otherwise ``ConversionResult.build_detection`` is ``None``.
    """

    detected_build: str
    hg19_match_rate: float
    hg38_match_rate: float
156
+
157
+
158
@dataclass(frozen=True)
class Statistics:
    """The ``statistics`` section of the run report: per-run record counters.

    Field names match the JSON keys exactly, because the report parser
    builds this class with ``Statistics(**data["statistics"])``.
    """

    total_records: int  # denominator of ``ConversionResult.yield_rate``
    emitted_records: int  # numerator of ``ConversionResult.yield_rate``
    variant_records: int
    reference_records: int
    missing_genotype_records: int
    skipped_reference_sites: int
    unknown_chromosomes: int
    reference_failures: int
    invalid_genotypes: int
    symbolic_allele_records: int
    parse_errors: int
171
+
172
+
173
@dataclass(frozen=True)
class ConversionResult:
    """The full run report parsed from ``<stem>_report.json``.

    The first eight fields come straight from the report. ``panel`` and
    ``build_detection`` are ``None`` when the report omits them. The
    remaining fields are filled in by the wrapper: where the report was
    found, which output files exist, and the captured process streams.
    """

    version: str
    timestamp: str
    input: InputInfo
    output: OutputInfo
    reference: ReferenceInfo
    standardize: bool
    sample: SampleInfo
    statistics: Statistics
    panel: Optional[PanelInfo] = None
    build_detection: Optional[BuildDetection] = None
    report_path: Optional[Path] = None
    output_paths: Tuple[Path, ...] = field(default_factory=tuple)
    stdout: str = ""
    stderr: str = ""

    @property
    def main_output(self) -> Path:
        """The report's ``output.path`` as a ``Path``."""
        reported = self.output.path
        return Path(reported)

    @property
    def emitted_records(self) -> int:
        """Shortcut for ``statistics.emitted_records``."""
        counters = self.statistics
        return counters.emitted_records

    @property
    def total_records(self) -> int:
        """Shortcut for ``statistics.total_records``."""
        counters = self.statistics
        return counters.total_records

    @property
    def yield_rate(self) -> float:
        """Emitted records divided by total records; ``0.0`` when total is zero."""
        total = self.statistics.total_records
        if total != 0:
            return self.statistics.emitted_records / total
        return 0.0
209
+
210
+
211
+ # ---------------------------------------------------------------------------
212
+ # Binary location
213
+ # ---------------------------------------------------------------------------
214
+
215
+
216
def locate_binary(override: Optional[PathLike] = None) -> Path:
    """Locate `convert_genome` or raise `ConvertGenomeBinaryNotFound`.

    Resolution: explicit ``override`` → ``convert_genome`` on PATH.
    No environment-variable indirection.

    Args:
        override: Explicit path to the binary. When given, PATH is not
            consulted at all.

    Returns:
        The path to the binary.

    Raises:
        ConvertGenomeBinaryNotFound: if ``override`` is not an existing
            file, or if no ``convert_genome`` is found on PATH.
    """
    if override is not None:
        p = Path(override)
        # is_file() rather than exists(): a directory would pass exists()
        # and only fail later inside subprocess with a raw PermissionError
        # that is not part of the ConvertGenomeError hierarchy.
        if not p.is_file():
            raise ConvertGenomeBinaryNotFound(f"convert_genome binary not at {p}")
        return p
    which = shutil.which("convert_genome")
    if which:
        return Path(which)
    raise ConvertGenomeBinaryNotFound(
        "convert_genome not found. Install with: cargo install convert_genome, "
        "or pass binary=... explicitly."
    )
234
+
235
+
236
+ # ---------------------------------------------------------------------------
237
+ # Converter
238
+ # ---------------------------------------------------------------------------
239
+
240
+
241
@dataclass(frozen=True)
class Converter:
    """Immutable conversion plan. Call ``.run()`` to execute.

    Each field maps onto one CLI argument (see ``argv``). Path-like fields
    may be given as ``str`` or ``os.PathLike``; they are normalised to
    ``pathlib.Path`` on construction. Every ``with_*`` helper returns a new
    instance through ``dataclasses.replace``, so validation re-runs on each
    step.

    Raises:
        InvalidConfig: if neither or both of ``output`` / ``output_dir``
            are given, if ``reference_fai`` is given without ``reference``,
            or if ``input`` does not exist on disk.
    """

    input: Path
    output: Optional[Path] = None
    output_dir: Optional[Path] = None
    format: OutputFormat = OutputFormat.VCF
    input_format: InputFormat = InputFormat.AUTO
    assembly: str = "GRCh38"
    input_build: Optional[str] = None
    reference: Optional[Path] = None
    reference_fai: Optional[Path] = None
    panel: Optional[Path] = None
    sample: Optional[str] = None
    sex: Optional[Sex] = None
    standardize: bool = False
    variants_only: bool = False
    log_level: str = "info"
    binary: Optional[Path] = None
    timeout: Optional[float] = None
    extra_args: Tuple[str, ...] = field(default_factory=tuple)

    def __post_init__(self) -> None:
        # Callers may construct the plan directly with plain strings. The
        # helpers below rely on Path methods (.stem, .with_suffix, "/"), so
        # normalise here. The class is frozen, hence object.__setattr__.
        for name in (
            "input",
            "output",
            "output_dir",
            "reference",
            "reference_fai",
            "panel",
            "binary",
        ):
            current = getattr(self, name)
            if current is not None and not isinstance(current, Path):
                object.__setattr__(self, name, Path(current))

        if self.output is None and self.output_dir is None:
            raise InvalidConfig("Either output= or output_dir= must be provided.")
        if self.output is not None and self.output_dir is not None:
            raise InvalidConfig("output= and output_dir= are mutually exclusive.")
        if self.reference_fai is not None and self.reference is None:
            raise InvalidConfig("reference_fai= requires reference=")
        if not Path(self.input).exists():
            raise InvalidConfig(f"Input file does not exist: {self.input}")

    # Builder helpers — return a new Converter with the field replaced.

    def with_output(self, output: PathLike) -> "Converter":
        """Write to a single output file; clears ``output_dir``."""
        return replace(self, output=Path(output), output_dir=None)

    def with_output_dir(self, output_dir: PathLike) -> "Converter":
        """Write into a directory; clears ``output``."""
        return replace(self, output_dir=Path(output_dir), output=None)

    def with_reference(self, ref: PathLike, fai: Optional[PathLike] = None) -> "Converter":
        """Set the reference and, optionally, its index (reset to ``None`` if omitted)."""
        return replace(self, reference=Path(ref), reference_fai=Path(fai) if fai else None)

    def with_panel(self, panel: PathLike) -> "Converter":
        """Set the panel file."""
        return replace(self, panel=Path(panel))

    def with_sex(self, sex: Sex) -> "Converter":
        """Set the explicit sex override."""
        return replace(self, sex=sex)

    def with_sample(self, sample: str) -> "Converter":
        """Set the value passed as ``--sample``."""
        return replace(self, sample=sample)

    def with_standardize(self, on: bool = True) -> "Converter":
        """Toggle the ``--standardize`` flag."""
        return replace(self, standardize=on)

    def with_variants_only(self, on: bool = True) -> "Converter":
        """Toggle the ``--variants-only`` flag."""
        return replace(self, variants_only=on)

    def with_assembly(self, assembly: str) -> "Converter":
        """Set the output build; aliases are normalised, empty falls back to GRCh38."""
        return replace(self, assembly=Assembly.parse(assembly) or "GRCh38")

    def with_input_build(self, build: Optional[str]) -> "Converter":
        """Set the input build (aliases normalised); ``None`` removes the flag."""
        return replace(self, input_build=Assembly.parse(build))

    def with_binary(self, path: PathLike) -> "Converter":
        """Use an explicit binary instead of searching PATH."""
        return replace(self, binary=Path(path))

    def with_timeout(self, seconds: Optional[float]) -> "Converter":
        """Set the subprocess timeout in seconds; ``None`` disables it."""
        return replace(self, timeout=seconds)

    def with_log_level(self, level: str) -> "Converter":
        """Set the CLI log level; ``"info"`` emits no flag."""
        return replace(self, log_level=level)

    def with_extra_args(self, args: Iterable[str]) -> "Converter":
        """Replace the raw extra arguments placed before the positionals."""
        return replace(self, extra_args=tuple(args))

    # --- Execution ---------------------------------------------------------

    def argv(self) -> List[str]:
        """Compute the argv that would be invoked. Useful for tests / dry-runs.

        Raises:
            ConvertGenomeBinaryNotFound: if the binary cannot be located.
        """
        binary = locate_binary(self.binary)
        argv: List[str] = [str(binary)]

        argv += ["--input-format", self.input_format.value]
        argv += ["--format", self.format.value]
        argv += ["--output-build", Assembly.parse(self.assembly) or self.assembly]

        if self.reference is not None:
            argv += ["--reference", str(self.reference)]
        if self.reference_fai is not None:
            argv += ["--reference-fai", str(self.reference_fai)]
        if self.panel is not None:
            argv += ["--panel", str(self.panel)]
        if self.sample is not None:
            argv += ["--sample", self.sample]
        if self.input_build is not None:
            # Normalise aliases the same way --output-build is, so a plan
            # built directly with input_build="hg19" sends the canonical name.
            argv += ["--input-build", Assembly.parse(self.input_build) or self.input_build]
        if self.sex is not None:
            argv += ["--sex", self.sex.value]
        if self.standardize:
            argv += ["--standardize"]
        if self.variants_only:
            argv += ["--variants-only"]
        if self.log_level and self.log_level != "info":
            argv += ["--log-level", self.log_level]

        if self.output_dir is not None:
            argv += ["--output-dir", str(self.output_dir)]

        argv += list(self.extra_args)

        # positional: INPUT first, then OUTPUT if not --output-dir
        argv.append(str(self.input))
        if self.output is not None:
            argv.append(str(self.output))
        return argv

    def run(self, *, capture: bool = True) -> ConversionResult:
        """Execute the plan and return the parsed run report.

        Args:
            capture: When true, stdout/stderr are captured and attached to
                the result or the raised error; otherwise they are
                inherited from the parent process and reported as ``""``.

        Raises:
            ConvertGenomeBinaryNotFound: the binary could not be located
                or launched.
            ConvertGenomeFailed: the process exited non-zero, or the
                report did not match the expected schema.
            ReportNotFound: the process succeeded but no report exists.
            subprocess.TimeoutExpired: ``timeout`` elapsed (not wrapped).
        """
        argv = self.argv()
        try:
            completed = subprocess.run(
                argv,
                capture_output=capture,
                text=True,
                timeout=self.timeout,
                check=False,
            )
        except FileNotFoundError as e:
            raise ConvertGenomeBinaryNotFound(str(e)) from e

        # With capture=False both streams are None; normalise once.
        stdout = completed.stdout or ""
        stderr = completed.stderr or ""

        if completed.returncode != 0:
            raise ConvertGenomeFailed(
                f"convert_genome exited with status {completed.returncode}",
                stdout=stdout,
                stderr=stderr,
                returncode=completed.returncode,
            )

        report_path = self._resolve_report_path(stdout, stderr)
        if not report_path.exists():
            raise ReportNotFound(f"Expected report at {report_path} but it is missing.")

        # JSON is UTF-8 by specification; do not depend on the locale's
        # default encoding when decoding the report.
        with open(report_path, encoding="utf-8") as f:
            data = json.load(f)
        return _result_from_report(
            data,
            report_path=report_path,
            output_paths=self._resolve_outputs(),
            stdout=stdout,
            stderr=stderr,
        )

    # --- helpers -----------------------------------------------------------

    def _resolve_outputs(self) -> Tuple[Path, ...]:
        """Return the candidate output files that actually exist on disk."""
        outs: List[Path] = []
        if self.output is not None:
            outs.append(self.output)
            if self.format is OutputFormat.PLINK:
                # NOTE(review): this strips the last suffix and then swaps
                # whatever suffix remains, so a dotted stem such as
                # "a.b.plink" yields "a.bed" — confirm against the CLI's
                # own naming.
                base = self.output.with_suffix("")
                outs += [base.with_suffix(s) for s in (".bed", ".bim", ".fam")]
        if self.output_dir is not None:
            d = self.output_dir
            fname = {
                OutputFormat.VCF: "genotypes.vcf",
                OutputFormat.BCF: "genotypes.bcf",
                OutputFormat.PLINK: "genotypes",
            }[self.format]
            primary = d / fname
            outs.append(primary)
            if self.format is OutputFormat.PLINK:
                outs += [d / f"genotypes{s}" for s in (".bed", ".bim", ".fam")]
            if self.panel is not None:
                outs.append(d / "panel.vcf")
        return tuple(p for p in outs if p.exists())

    def _resolve_report_path(self, stdout: str, stderr: str) -> Path:
        """Find the report: prefer the path the CLI printed, else derive it."""
        m = re.search(r"Wrote run report to ([^\r\n]+)", stdout + "\n" + stderr)
        if m:
            return Path(m.group(1).strip())

        if self.output is not None:
            stem = self.output.stem
            return self.output.with_name(f"{stem}_report.json")
        # output_dir case (validated in __post_init__)
        d = self.output_dir
        assert d is not None
        return d / "genotypes_report.json"
430
+
431
+
432
+ # ---------------------------------------------------------------------------
433
+ # Top-level convenience
434
+ # ---------------------------------------------------------------------------
435
+
436
+
437
def convert(
    *,
    input: PathLike,
    output: Optional[PathLike] = None,
    output_dir: Optional[PathLike] = None,
    format: Union[OutputFormat, str] = OutputFormat.VCF,
    input_format: Union[InputFormat, str] = InputFormat.AUTO,
    assembly: str = "GRCh38",
    input_build: Optional[str] = None,
    reference: Optional[PathLike] = None,
    reference_fai: Optional[PathLike] = None,
    panel: Optional[PathLike] = None,
    sample: Optional[str] = None,
    sex: Optional[Union[Sex, str]] = None,
    standardize: bool = False,
    variants_only: bool = False,
    log_level: str = "info",
    binary: Optional[PathLike] = None,
    timeout: Optional[float] = None,
    extra_args: Optional[Iterable[str]] = None,
    capture: bool = True,
) -> ConversionResult:
    """Run one conversion. Returns the parsed run report.

    One-shot, keyword-only convenience: coerces the loosely typed
    arguments (strings for enums and paths, build aliases) into a
    ``Converter`` and calls its ``run``.

    Raises:
        InvalidConfig: an enum-like argument cannot be coerced, or the
            resulting ``Converter`` fails validation.
        ConvertGenomeError: anything ``Converter.run`` raises.
    """
    # Lenient sex coercion: callers chaining results out of infer_sex
    # may pass `InferredSex.INDETERMINATE.value` ("indeterminate") or
    # `gnomon`'s "unknown". Neither is a valid convert_genome --sex
    # value, but the most useful behaviour is "no override — let the
    # CLI run its own inference", which is the same as `sex=None`.
    sex_coerced: Optional[Sex]
    if sex is None:
        sex_coerced = None
    elif isinstance(sex, Sex):
        sex_coerced = sex
    elif isinstance(sex, str):
        # Strip once and use the stripped text for BOTH the sentinel check
        # and the enum coercion, so "male " is handled like "unknown ".
        cleaned = sex.strip()
        if cleaned.lower() in {"unknown", "indeterminate", ""}:
            sex_coerced = None
        else:
            sex_coerced = _coerce_enum(cleaned, Sex)
    else:
        sex_coerced = _coerce_enum(sex, Sex)

    converter = Converter(
        input=Path(input),
        output=Path(output) if output else None,
        output_dir=Path(output_dir) if output_dir else None,
        format=_coerce_enum(format, OutputFormat),
        input_format=_coerce_enum(input_format, InputFormat),
        assembly=Assembly.parse(assembly) or "GRCh38",
        input_build=Assembly.parse(input_build),
        reference=Path(reference) if reference else None,
        reference_fai=Path(reference_fai) if reference_fai else None,
        panel=Path(panel) if panel else None,
        sample=sample,
        sex=sex_coerced,
        standardize=standardize,
        variants_only=variants_only,
        log_level=log_level,
        binary=Path(binary) if binary else None,
        timeout=timeout,
        extra_args=tuple(extra_args) if extra_args else (),
    )
    return converter.run(capture=capture)
496
+
497
+
498
+ def _coerce_enum(value, enum_cls):
499
+ if isinstance(value, enum_cls):
500
+ return value
501
+ if isinstance(value, str):
502
+ try:
503
+ return enum_cls(value.lower())
504
+ except ValueError:
505
+ pass
506
+ for member in enum_cls:
507
+ if member.name.lower() == value.lower():
508
+ return member
509
+ raise InvalidConfig(f"Cannot coerce {value!r} to {enum_cls.__name__}")
510
+
511
+
512
+ # ---------------------------------------------------------------------------
513
+ # Report parsing
514
+ # ---------------------------------------------------------------------------
515
+
516
+
517
def _result_from_report(
    data: Mapping[str, Any],
    *,
    report_path: Path,
    output_paths: Tuple[Path, ...],
    stdout: str,
    stderr: str,
) -> ConversionResult:
    """Build a ``ConversionResult`` from the decoded report JSON.

    Each report section is expanded into its dataclass with ``**``, so the
    JSON keys must match the dataclass field names exactly. ``panel`` and
    ``build_detection`` are optional and become ``None`` when absent or
    empty.

    Raises:
        ConvertGenomeFailed: a required key is missing (``KeyError``) or a
            section does not fit its dataclass (``TypeError``). The
            original exception is chained as the cause.
    """
    try:
        # Required sections, looked up in a fixed order.
        statistics = Statistics(**data["statistics"])
        source = InputInfo(**data["input"])
        target = OutputInfo(**data["output"])
        reference = ReferenceInfo(**data["reference"])
        sample = SampleInfo(**data["sample"])

        # Optional sections.
        panel = PanelInfo(**data["panel"]) if data.get("panel") else None
        detection = None
        if data.get("build_detection"):
            detection = BuildDetection(**data["build_detection"])

        return ConversionResult(
            version=data["version"],
            timestamp=data["timestamp"],
            input=source,
            output=target,
            reference=reference,
            standardize=data["standardize"],
            sample=sample,
            statistics=statistics,
            panel=panel,
            build_detection=detection,
            report_path=report_path,
            output_paths=output_paths,
            stdout=stdout,
            stderr=stderr,
        )
    except (KeyError, TypeError) as err:
        if isinstance(err, KeyError):
            message = f"Report {report_path} missing expected field: {err}"
        else:
            message = f"Report {report_path} has an unexpected schema: {err}"
        raise ConvertGenomeFailed(message, stdout=stdout, stderr=stderr) from err
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.18"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "convert_genome"
7
+ version = "0.1.0"
8
+ description = "Python wrapper for SauersML/convert_genome (DTC → VCF/BCF/PLINK conversion)."
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ authors = [{ name = "SauersML" }]
12
+ requires-python = ">=3.9"
13
+ dependencies = []
14
+ keywords = ["genomics", "bioinformatics", "vcf", "bcf", "plink", "23andme", "ancestry", "dtc"]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Science/Research",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3 :: Only",
21
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
22
+ ]
23
+
24
+ [project.urls]
25
+ Homepage = "https://github.com/SauersML/convert_genome"
26
+ Issues = "https://github.com/SauersML/convert_genome/issues"
27
+
28
+ [project.optional-dependencies]
29
+ test = ["pytest>=7"]
30
+
31
+ [tool.hatch.build.targets.wheel]
32
+ packages = ["convert_genome"]
@@ -0,0 +1,407 @@
1
+ """Tests for the convert_genome Python wrapper.
2
+
3
+ We don't run the real Rust binary in CI. Tests build a fake binary as a
4
+ small Python script that:
5
+
6
+ 1. Echoes argv (so we can assert flag mapping).
7
+ 2. Writes a minimal but valid run-report JSON wherever the wrapper
8
+ expects to find it.
9
+ 3. Exits 0 (or fails on demand, to test error paths).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import stat
16
+ import textwrap
17
+ from pathlib import Path
18
+
19
+ import pytest
20
+
21
+ from convert_genome import (
22
+ Assembly,
23
+ ConversionResult,
24
+ Converter,
25
+ ConvertGenomeBinaryNotFound,
26
+ ConvertGenomeError,
27
+ ConvertGenomeFailed,
28
+ InvalidConfig,
29
+ OutputFormat,
30
+ ReportNotFound,
31
+ Sex,
32
+ convert,
33
+ locate_binary,
34
+ )
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Helpers
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
+ def _make_input(tmp_path: Path, name: str = "in.txt") -> Path:
43
+ p = tmp_path / name
44
+ p.write_text("# rsid\tchromosome\tposition\tgenotype\nrs1\t1\t12345\tAG\n")
45
+ return p
46
+
47
+
48
+ _FAKE_FAILING = textwrap.dedent(
49
+ """\
50
+ #!/usr/bin/env python3
51
+ import sys
52
+ sys.stderr.write('ERROR: synthetic failure\\n')
53
+ sys.exit(2)
54
+ """
55
+ )
56
+
57
+
58
+ _FAKE_NOREPORT = textwrap.dedent(
59
+ """\
60
+ #!/usr/bin/env python3
61
+ print('Did some work, but forgot to write a report.')
62
+ """
63
+ )
64
+
65
+
66
+ def _good_fake_body(report: dict, *, log_line: str = "") -> str:
67
+ """A fake CLI that writes ``report`` to the wrapper's expected path."""
68
+ return textwrap.dedent(
69
+ f"""\
70
+ #!/usr/bin/env python3
71
+ import json, sys, pathlib
72
+
73
+ argv = sys.argv[1:]
74
+ log_argv = pathlib.Path(sys.argv[0]).parent / 'argv.json'
75
+ log_argv.write_text(json.dumps(argv))
76
+
77
+ # Figure out where the wrapper expects the report.
78
+ report = {json.dumps(report)!r}
79
+ if '--output-dir' in argv:
80
+ d = pathlib.Path(argv[argv.index('--output-dir') + 1])
81
+ d.mkdir(parents=True, exist_ok=True)
82
+ report_path = d / 'genotypes_report.json'
83
+ else:
84
+ # Positional layout: ... INPUT OUTPUT
85
+ output_path = pathlib.Path(argv[-1])
86
+ stem = output_path.stem
87
+ report_path = output_path.with_name(stem + '_report.json')
88
+
89
+ # Also write a sentinel output file so output_paths picks it up.
90
+ if '--output-dir' in argv:
91
+ (d / 'genotypes.vcf').write_text('##fake\\n')
92
+ else:
93
+ output_path.write_text('##fake\\n')
94
+
95
+ report_path.write_text(report)
96
+ sys.stdout.write({log_line!r})
97
+ """
98
+ )
99
+
100
+
101
+ def _make_fake(tmp_path: Path, body: str) -> Path:
102
+ p = tmp_path / "fake_convert_genome"
103
+ p.write_text(body)
104
+ p.chmod(p.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
105
+ return p
106
+
107
+
108
+ def _minimal_report() -> dict:
109
+ return {
110
+ "version": "0.1.2",
111
+ "timestamp": "2026-05-19T00:00:00Z",
112
+ "input": {"path": "/in.txt", "format": "DTC", "origin": "local"},
113
+ "output": {"path": "/out.vcf", "format": "VCF"},
114
+ "reference": {"path": "/ref.fa", "origin": "downloaded", "assembly": "GRCh38"},
115
+ "standardize": True,
116
+ "sample": {"id": "S1", "sex": "Female", "sex_inferred": True},
117
+ "build_detection": {
118
+ "detected_build": "GRCh38",
119
+ "hg19_match_rate": 12.3,
120
+ "hg38_match_rate": 98.7,
121
+ },
122
+ "statistics": {
123
+ "total_records": 1000,
124
+ "emitted_records": 990,
125
+ "variant_records": 800,
126
+ "reference_records": 190,
127
+ "missing_genotype_records": 10,
128
+ "skipped_reference_sites": 0,
129
+ "unknown_chromosomes": 0,
130
+ "reference_failures": 0,
131
+ "invalid_genotypes": 0,
132
+ "symbolic_allele_records": 0,
133
+ "parse_errors": 0,
134
+ },
135
+ }
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # Locator
140
+ # ---------------------------------------------------------------------------
141
+
142
+
143
def test_locate_binary_not_on_path(monkeypatch, tmp_path):
    """``locate_binary`` raises when PATH holds only an empty directory."""
    empty_dir = str(tmp_path)
    monkeypatch.setenv("PATH", empty_dir)
    with pytest.raises(ConvertGenomeBinaryNotFound):
        locate_binary()
147
+
148
+
149
def test_locate_binary_override(tmp_path):
    """An explicit, existing override path is returned as-is."""
    override = _make_fake(tmp_path, "#!/usr/bin/env python3\nprint('x')\n")
    located = locate_binary(override)
    assert located == override
152
+
153
+
154
def test_locate_binary_override_missing_raises(tmp_path):
    """An override path that does not exist is rejected."""
    missing = tmp_path / "no-such-binary"
    with pytest.raises(ConvertGenomeBinaryNotFound):
        locate_binary(missing)
157
+
158
+
159
+ # ---------------------------------------------------------------------------
160
+ # Eager validation
161
+ # ---------------------------------------------------------------------------
162
+
163
+
164
def test_requires_output_or_output_dir(tmp_path):
    """Constructing a Converter with neither output nor output_dir fails."""
    source = _make_input(tmp_path)
    with pytest.raises(InvalidConfig):
        Converter(input=source)
168
+
169
+
170
def test_output_xor_output_dir(tmp_path):
    """Supplying both output and output_dir is rejected."""
    source = _make_input(tmp_path)
    out_file = tmp_path / "out.vcf"
    out_dir = tmp_path / "d"
    with pytest.raises(InvalidConfig):
        Converter(input=source, output=out_file, output_dir=out_dir)
174
+
175
+
176
def test_reference_fai_requires_reference(tmp_path):
    """A reference_fai given without a reference is rejected."""
    source = _make_input(tmp_path)
    index = tmp_path / "x.fai"
    index.write_text("")
    target = tmp_path / "o.vcf"
    with pytest.raises(InvalidConfig):
        Converter(input=source, output=target, reference_fai=index)
182
+
183
+
184
def test_input_must_exist(tmp_path):
    """A non-existent input path is rejected at construction time."""
    missing_input = tmp_path / "nope.txt"
    target = tmp_path / "o.vcf"
    with pytest.raises(InvalidConfig):
        Converter(input=missing_input, output=target)
187
+
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # Assembly alias normalisation
191
+ # ---------------------------------------------------------------------------
192
+
193
+
194
def test_assembly_parses_aliases():
    """``Assembly.parse`` maps known aliases and passes unknown names through."""
    expected = {
        "hg19": "GRCh37",
        "GRCh38": "GRCh38",
        "build38": "GRCh38",
        "Hg-38": "GRCh38",
        # Unknown strings pass through (CLI will decide).
        "CHM13": "CHM13",
    }
    for alias, canonical in expected.items():
        assert Assembly.parse(alias) == canonical
201
+
202
+
203
+ # ---------------------------------------------------------------------------
204
+ # argv generation
205
+ # ---------------------------------------------------------------------------
206
+
207
+
208
def test_argv_for_simple_vcf(tmp_path):
    """A plain VCF conversion yields binary, flags, then INPUT and OUTPUT."""
    source = _make_input(tmp_path)
    binary = _make_fake(tmp_path, "")
    target = tmp_path / "o.vcf"
    converter = Converter(input=source, output=target, binary=binary)
    argv = converter.with_assembly("hg38").argv()

    def value_after(flag):
        return argv[argv.index(flag) + 1]

    assert argv[0] == str(binary)
    assert "--format" in argv and value_after("--format") == "vcf"
    assert value_after("--output-build") == "GRCh38"
    assert argv[-2] == str(source)
    assert argv[-1] == str(target)
    # No --output-dir.
    assert "--output-dir" not in argv
223
+
224
+
225
def test_argv_for_plink_output_dir(tmp_path):
    """PLINK output into a directory emits the expected flags and no OUTPUT."""
    source = _make_input(tmp_path)
    binary = _make_fake(tmp_path, "")
    destination = tmp_path / "d"
    converter = Converter(
        input=source,
        output_dir=destination,
        format=OutputFormat.PLINK,
        binary=binary,
    )
    argv = (
        converter.with_standardize()
        .with_variants_only()
        .with_sex(Sex.MALE)
        .argv()
    )

    def value_after(flag):
        return argv[argv.index(flag) + 1]

    for switch in ("--standardize", "--variants-only"):
        assert switch in argv
    assert value_after("--sex") == "male"
    assert value_after("--format") == "plink"
    assert value_after("--output-dir") == str(destination)
    # Last positional is INPUT, no trailing OUTPUT since --output-dir was set.
    assert argv[-1] == str(source)
248
+
249
+
250
def test_argv_includes_reference_and_panel(tmp_path):
    """Reference, index, panel, sample and input build each reach argv."""
    source = _make_input(tmp_path)
    reference = tmp_path / "ref.fa"
    index = tmp_path / "ref.fa.fai"
    panel = tmp_path / "panel.vcf"
    for empty_file in (reference, index, panel):
        empty_file.write_text("")
    binary = _make_fake(tmp_path, "")

    converter = Converter(input=source, output=tmp_path / "o.vcf", binary=binary)
    argv = (
        converter.with_reference(reference, index)
        .with_panel(panel)
        .with_sample("ID1")
        .with_input_build("hg19")
        .argv()
    )

    expected = {
        "--reference": str(reference),
        "--reference-fai": str(index),
        "--panel": str(panel),
        "--sample": "ID1",
        "--input-build": "GRCh37",
    }
    for flag, value in expected.items():
        assert argv[argv.index(flag) + 1] == value
271
+
272
+
273
+ # ---------------------------------------------------------------------------
274
+ # Run + JSON parsing
275
+ # ---------------------------------------------------------------------------
276
+
277
+
278
def test_run_parses_report(tmp_path):
    """A successful run is parsed into a fully populated ``ConversionResult``."""
    source = _make_input(tmp_path)
    target = tmp_path / "out.vcf"
    binary = _make_fake(tmp_path, _good_fake_body(_minimal_report()))

    result = convert(
        input=source,
        output=target,
        binary=binary,
        standardize=True,
        assembly="hg38",
        sex="female",
    )

    assert isinstance(result, ConversionResult)
    stats = result.statistics
    assert stats.emitted_records == 990
    assert stats.total_records == 1000
    assert result.yield_rate == pytest.approx(0.99)
    assert result.sample.sex_inferred is True
    assert result.build_detection is not None
    assert result.build_detection.detected_build == "GRCh38"
    assert result.report_path is not None
    assert result.report_path.exists()
    assert any(path.suffix == ".vcf" for path in result.output_paths)
301
+
302
+
303
def test_run_with_output_dir(tmp_path):
    """With ``output_dir`` the report and outputs are found in that directory."""
    source = _make_input(tmp_path)
    destination = tmp_path / "outdir"
    binary = _make_fake(tmp_path, _good_fake_body(_minimal_report()))

    result = convert(
        input=source,
        output_dir=destination,
        format=OutputFormat.VCF,
        binary=binary,
    )

    expected_report = destination / "genotypes_report.json"
    expected_output = destination / "genotypes.vcf"
    assert result.report_path == expected_report
    assert expected_output in result.output_paths
316
+
317
+
318
def test_run_locates_report_via_log_line(tmp_path):
    """A report path announced as 'Wrote run report to <path>' is the one used."""
    source = _make_input(tmp_path)
    target = tmp_path / "out.vcf"
    # Put the report in an unusual location, then advertise it via the log.
    side_report = tmp_path / "side_report.json"
    report_literal = repr(_minimal_report())
    script_lines = [
        "#!/usr/bin/env python3",
        "import json, pathlib, sys",
        f"pathlib.Path({str(side_report)!r}).write_text(json.dumps({report_literal}))",
        f"pathlib.Path({str(target)!r}).write_text('##')",
        f"print('Wrote run report to', {str(side_report)!r})",
        "",  # keeps the trailing newline
    ]
    binary = _make_fake(tmp_path, "\n".join(script_lines))

    result = convert(input=source, output=target, binary=binary)
    assert result.report_path == side_report
336
+
337
+
338
def test_failure_exit_code(tmp_path):
    """A non-zero exit surfaces as ``ConvertGenomeFailed`` with code and stderr."""
    source = _make_input(tmp_path)
    binary = _make_fake(tmp_path, _FAKE_FAILING)
    with pytest.raises(ConvertGenomeFailed) as excinfo:
        convert(input=source, output=tmp_path / "o.vcf", binary=binary)
    failure = excinfo.value
    assert failure.returncode == 2
    assert "synthetic failure" in failure.stderr
345
+
346
+
347
def test_missing_report_raises(tmp_path):
    """A run that exits cleanly but writes no report raises ``ReportNotFound``."""
    source = _make_input(tmp_path)
    binary = _make_fake(tmp_path, _FAKE_NOREPORT)
    target = tmp_path / "o.vcf"
    with pytest.raises(ReportNotFound):
        convert(input=source, output=target, binary=binary)
352
+
353
+
354
def test_unexpected_schema_raises(tmp_path):
    """An unknown statistics field in the report raises ``ConvertGenomeFailed``."""
    source = _make_input(tmp_path)
    report = _minimal_report()
    report["statistics"]["mystery_new_field"] = 42
    binary = _make_fake(tmp_path, _good_fake_body(report))
    target = tmp_path / "o.vcf"
    with pytest.raises(ConvertGenomeFailed):
        convert(input=source, output=target, binary=binary)
361
+
362
+
363
def test_argv_log_actually_reflects_invocation(tmp_path):
    """The argv recorded by the fake binary carries the kwargs-derived flags."""
    source = _make_input(tmp_path)
    binary = _make_fake(tmp_path, _good_fake_body(_minimal_report()))
    convert(
        input=source,
        output=tmp_path / "o.vcf",
        binary=binary,
        format="bcf",
        standardize=True,
        variants_only=True,
        extra_args=["--log-level", "warn"],
    )

    recorded = json.loads((tmp_path / "argv.json").read_text())

    def value_after(flag):
        return recorded[recorded.index(flag) + 1]

    assert value_after("--format") == "bcf"
    assert "--standardize" in recorded
    assert "--variants-only" in recorded
    assert value_after("--log-level") == "warn"
380
+
381
+
382
def test_error_hierarchy():
    """Every package exception derives from ``ConvertGenomeError``."""
    derived = (
        ConvertGenomeBinaryNotFound,
        ConvertGenomeFailed,
        InvalidConfig,
        ReportNotFound,
    )
    for exc_type in derived:
        assert issubclass(exc_type, ConvertGenomeError)
387
+
388
+
389
def test_sex_indeterminate_maps_to_no_flag(tmp_path):
    """Regression: 'indeterminate'/'unknown' sex values must not emit --sex.

    Callers chaining through infer_sex may pass these values. Neither is a
    valid --sex value, so the flag is omitted and the CLI infers the sex.
    """
    source = _make_input(tmp_path)
    binary = _make_fake(tmp_path, _good_fake_body(_minimal_report()))
    target = tmp_path / "o.vcf"
    argv_log = tmp_path / "argv.json"
    non_values = ("indeterminate", "unknown", "INDETERMINATE", " unknown ", "")
    for value in non_values:
        convert(input=source, output=target, binary=binary, sex=value)
        recorded = json.loads(argv_log.read_text())
        assert "--sex" not in recorded, f"sex={value!r} should not produce --sex flag"
399
+
400
+
401
def test_converter_is_immutable(tmp_path):
    """``with_*`` builders return a new Converter and leave the original alone."""
    source = _make_input(tmp_path)
    original = Converter(input=source, output=tmp_path / "o.vcf")
    derived = original.with_standardize()
    assert original.standardize is False
    assert derived.standardize is True
    assert derived is not original