PyPI - convert-genome - Versions diffs - 0.1.0__tar.gz - Mend

convert-genome 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

convert_genome-0.1.0/.gitignore +6 -0
convert_genome-0.1.0/PKG-INFO +150 -0
convert_genome-0.1.0/README.md +130 -0
convert_genome-0.1.0/convert_genome/__init__.py +76 -0
convert_genome-0.1.0/convert_genome/_api.py +560 -0
convert_genome-0.1.0/pyproject.toml +32 -0
convert_genome-0.1.0/tests/test_convert_genome.py +407 -0

convert_genome-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,6 @@
+.venv/
+.pytest_cache/
+dist/
+build/
+**/__pycache__/
+*.egg-info/

convert_genome-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,150 @@
+Metadata-Version: 2.4
+Name: convert_genome
+Version: 0.1.0
+Summary: Python wrapper for SauersML/convert_genome (DTC → VCF/BCF/PLINK conversion).
+Project-URL: Homepage, https://github.com/SauersML/convert_genome
+Project-URL: Issues, https://github.com/SauersML/convert_genome/issues
+Author: SauersML
+License: MIT
+Keywords: 23andme,ancestry,bcf,bioinformatics,dtc,genomics,plink,vcf
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Requires-Python: >=3.9
+Provides-Extra: test
+Requires-Dist: pytest>=7; extra == 'test'
+Description-Content-Type: text/markdown
+# convert_genome (Python)
+Python wrapper for the
+[`SauersML/convert_genome`](https://github.com/SauersML/convert_genome) CLI.
+Convert direct-to-consumer dumps (23andMe, AncestryDNA, MyHeritage,
+deCODEme) and standard VCF/BCF into compliant VCF, BCF, or PLINK 1.9
+binary — with build detection, sex inference, liftover, and panel
+harmonisation, all controllable from kwargs.
+```python
+from convert_genome import convert, OutputFormat
+result = convert(
+    input="23andme.txt",
+    output="out.vcf",
+    format=OutputFormat.VCF,
+    assembly="hg38",
+    standardize=True,
+)
+result.statistics.emitted_records       # int
+result.sample.sex_inferred              # bool
+result.build_detection.detected_build   # 'GRCh37' / 'GRCh38' / ...
+result.report_path                      # path to <stem>_report.json
+result.output_paths                     # files that actually exist on disk
+result.yield_rate                       # emitted / total
+```
+The wrapper runs the Rust binary, parses the sidecar
+`<stem>_report.json` into typed frozen dataclasses, and returns a
+single `ConversionResult`.
+## Install
+```bash
+pip install convert_genome
+# the Rust binary:
+cargo install convert_genome
+```
+Binary located via `binary=` or PATH. No env-var indirection — if
+the binary isn't on PATH, pass `binary=` explicitly. Missing binary
+→ `ConvertGenomeBinaryNotFound` with the suggested install command.
+## Shortcuts: skip every auto-discovery step
+The CLI will download/auto-detect things it doesn't need to. Pass them
+in directly:
+```python
+convert(
+    input="raw.txt",
+    output="out.vcf",
+    reference="/cache/hg38.fa",         # skip FASTA download
+    reference_fai="/cache/hg38.fa.fai", # skip .fai indexing
+    input_build="hg19",                  # skip build detection
+    assembly="GRCh38",                   # target build (still does liftover)
+    panel="/cache/1kg_panel.vcf",        # supply harmonisation panel
+    sex="female",                        # skip sex inference
+    standardize=True,
+)
+```
+`sex` is lenient: passing `"unknown"` or `"indeterminate"` (e.g. when
+chaining out of `infer_sex`) silently omits the `--sex` flag and lets
+the CLI run its own inference.
+## Builder
+`Converter` is a frozen dataclass; every `with_*` returns a new
+instance, so branching is safe.
+```python
+from convert_genome import Converter, Sex, OutputFormat
+plan = (
+    Converter(input="raw.txt", output_dir="out/", format=OutputFormat.PLINK)
+        .with_assembly("GRCh38")
+        .with_reference("/cache/hg38.fa", "/cache/hg38.fa.fai")
+        .with_panel("/data/1kg_panel.vcf.gz")
+        .with_standardize()
+        .with_sex(Sex.MALE)
+)
+print(plan.argv())   # exact argv that would be passed to the CLI
+result = plan.run()
+```
+## Enums
+```python
+InputFormat.AUTO / .DTC / .VCF / .BCF
+OutputFormat.VCF / .BCF / .PLINK
+Sex.MALE / .FEMALE
+Assembly.GRCH37 / .GRCH38     # plus a `.parse()` classmethod that
+                              # accepts 'hg19' / 'hg38' / 'build38' / ...
+```
+## Output
+The Rust tool writes `<stem>_report.json` alongside the main output.
+The wrapper loads it into `ConversionResult`, with sub-dataclasses for
+each section:
+```python
+result.input         # InputInfo (path, format, origin)
+result.output        # OutputInfo (path, format)
+result.reference     # ReferenceInfo (path, origin, assembly)
+result.panel         # PanelInfo | None
+result.sample        # SampleInfo (id, sex, sex_inferred)
+result.build_detection  # BuildDetection | None (detected_build, match rates)
+result.statistics    # Statistics (total / emitted / variant / ... records)
+result.report_path   # path to the JSON sidecar
+result.output_paths  # tuple[Path] — files that actually exist on disk
+```
+For PLINK output, `output_paths` includes the `.bed/.bim/.fam` trio. For
+`output_dir` with a panel, it includes `panel.vcf`. Non-existent paths
+are filtered out automatically.
+## Errors
+* `ConvertGenomeBinaryNotFound` — CLI not installed / not on PATH.
+* `InvalidConfig` — argument combination rejected before launching
+  (e.g. missing input file, conflicting output/output_dir).
+* `ConvertGenomeFailed` — CLI exited non-zero. The exception carries
+  `stdout`, `stderr`, `returncode`.
+* `ReportNotFound` — CLI ran clean but didn't write a JSON sidecar.
+All subclass `ConvertGenomeError`.

convert_genome-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,130 @@
+# convert_genome (Python)
+Python wrapper for the
+[`SauersML/convert_genome`](https://github.com/SauersML/convert_genome) CLI.
+Convert direct-to-consumer dumps (23andMe, AncestryDNA, MyHeritage,
+deCODEme) and standard VCF/BCF into compliant VCF, BCF, or PLINK 1.9
+binary — with build detection, sex inference, liftover, and panel
+harmonisation, all controllable from kwargs.
+```python
+from convert_genome import convert, OutputFormat
+result = convert(
+    input="23andme.txt",
+    output="out.vcf",
+    format=OutputFormat.VCF,
+    assembly="hg38",
+    standardize=True,
+)
+result.statistics.emitted_records       # int
+result.sample.sex_inferred              # bool
+result.build_detection.detected_build   # 'GRCh37' / 'GRCh38' / ...
+result.report_path                      # path to <stem>_report.json
+result.output_paths                     # files that actually exist on disk
+result.yield_rate                       # emitted / total
+```
+The wrapper runs the Rust binary, parses the sidecar
+`<stem>_report.json` into typed frozen dataclasses, and returns a
+single `ConversionResult`.
+## Install
+```bash
+pip install convert_genome
+# the Rust binary:
+cargo install convert_genome
+```
+The binary is located via the `binary=` argument or on `PATH`. There is
+no environment-variable indirection — if the binary isn't on `PATH`,
+pass `binary=` explicitly. A missing binary raises
+`ConvertGenomeBinaryNotFound` (with the suggested install command when
+the `PATH` lookup fails).
+## Shortcuts: skip every auto-discovery step
+By default the CLI downloads or auto-detects whatever it is not given.
+If you already have these inputs, pass them in directly:
+```python
+convert(
+    input="raw.txt",
+    output="out.vcf",
+    reference="/cache/hg38.fa",         # skip FASTA download
+    reference_fai="/cache/hg38.fa.fai", # skip .fai indexing
+    input_build="hg19",                  # skip build detection
+    assembly="GRCh38",                   # target build (still does liftover)
+    panel="/cache/1kg_panel.vcf",        # supply harmonisation panel
+    sex="female",                        # skip sex inference
+    standardize=True,
+)
+```
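+In `convert()`, `sex` is lenient: passing `"unknown"` or
+`"indeterminate"` (e.g. when chaining out of `infer_sex`) silently omits
+the `--sex` flag and lets the CLI run its own inference. The `Converter`
+builder does not apply this coercion; give it a `Sex` member or `None`.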
+## Builder
+`Converter` is a frozen dataclass; every `with_*` returns a new
+instance, so branching is safe.
+```python
+from convert_genome import Converter, Sex, OutputFormat
+plan = (
+    Converter(input="raw.txt", output_dir="out/", format=OutputFormat.PLINK)
+        .with_assembly("GRCh38")
+        .with_reference("/cache/hg38.fa", "/cache/hg38.fa.fai")
+        .with_panel("/data/1kg_panel.vcf.gz")
+        .with_standardize()
+        .with_sex(Sex.MALE)
+)
+print(plan.argv())   # exact argv that would be passed to the CLI
+result = plan.run()
+```
+## Enums
+```python
+InputFormat.AUTO / .DTC / .VCF / .BCF
+OutputFormat.VCF / .BCF / .PLINK
+Sex.MALE / .FEMALE
+Assembly.GRCH37 / .GRCH38     # plus a `.parse()` classmethod that
+                              # accepts 'hg19' / 'hg38' / 'build38' / ...
+```
+## Output
+The Rust tool writes `<stem>_report.json` alongside the main output.
+The wrapper loads it into `ConversionResult`, with sub-dataclasses for
+each section:
+```python
+result.input         # InputInfo (path, format, origin)
+result.output        # OutputInfo (path, format)
+result.reference     # ReferenceInfo (path, origin, assembly)
+result.panel         # PanelInfo | None
+result.sample        # SampleInfo (id, sex, sex_inferred)
+result.build_detection  # BuildDetection | None (detected_build, match rates)
+result.statistics    # Statistics (total / emitted / variant / ... records)
+result.report_path   # path to the JSON sidecar
+result.output_paths  # tuple[Path] — files that actually exist on disk
+```
+For PLINK output, `output_paths` includes the `.bed/.bim/.fam` trio. For
+`output_dir` with a panel, it includes `panel.vcf`. Non-existent paths
+are filtered out automatically.
+## Errors
+* `ConvertGenomeBinaryNotFound` — CLI not installed / not on PATH.
+* `InvalidConfig` — argument combination rejected before launching
+  (e.g. missing input file, conflicting output/output_dir).
+* `ConvertGenomeFailed` — CLI exited non-zero. The exception carries
+  `stdout`, `stderr`, `returncode`.
+* `ReportNotFound` — CLI ran clean but didn't write a JSON sidecar.
+All of these subclass `ConvertGenomeError`.

convert_genome-0.1.0/convert_genome/__init__.py ADDED Viewed

@@ -0,0 +1,76 @@
+"""convert_genome — Python bindings for the SauersML/convert_genome CLI.
+Convert direct-to-consumer (23andMe, AncestryDNA, ...) and standard
+VCF/BCF inputs into compliant VCF, BCF, or PLINK 1.9 binary, with build
+detection, sex inference, liftover, and panel harmonisation.
+This package shells out to the `convert_genome` Rust binary and parses
+its sidecar `_report.json` into typed dataclasses. The Python API is
+typed kwargs end-to-end; you never need to remember CLI flag names.
+Quick start
+-----------
+>>> from convert_genome import convert, OutputFormat
+>>> result = convert(
+...     input="23andme.txt",
+...     output="out.vcf",
+...     format=OutputFormat.VCF,
+...     assembly="GRCh38",
+...     standardize=True,
+... )
+>>> result.sample.sex_inferred
+True
+>>> result.statistics.emitted_records
+612345
+>>> result.build_detection.detected_build
+'GRCh37'
+"""
+from ._api import (
+    convert,
+    Converter,
+    ConversionResult,
+    InputInfo,
+    OutputInfo,
+    ReferenceInfo,
+    PanelInfo,
+    SampleInfo,
+    BuildDetection,
+    Statistics,
+    InputFormat,
+    OutputFormat,
+    Sex,
+    Assembly,
+    ConvertGenomeError,
+    ConvertGenomeBinaryNotFound,
+    ConvertGenomeFailed,
+    InvalidConfig,
+    ReportNotFound,
+    locate_binary,
+)
+__all__ = [
+    "convert",
+    "Converter",
+    "ConversionResult",
+    "InputInfo",
+    "OutputInfo",
+    "ReferenceInfo",
+    "PanelInfo",
+    "SampleInfo",
+    "BuildDetection",
+    "Statistics",
+    "InputFormat",
+    "OutputFormat",
+    "Sex",
+    "Assembly",
+    "ConvertGenomeError",
+    "ConvertGenomeBinaryNotFound",
+    "ConvertGenomeFailed",
+    "InvalidConfig",
+    "ReportNotFound",
+    "locate_binary",
+]
+__version__ = "0.1.0"

convert_genome-0.1.0/convert_genome/_api.py ADDED Viewed

@@ -0,0 +1,560 @@
+"""Pythonic wrapper around the convert_genome CLI.
+Design
+------
+The CLI already writes a structured ``<stem>_report.json`` next to each
+output, so we don't parse stdout — we run the binary, wait for it to
+finish, then load that JSON into typed frozen dataclasses.
+The ``Converter`` class is an immutable builder. The top-level
+``convert(...)`` is the one-shot convenience.
+We deliberately don't shadow the CLI's auto-detection logic: pass
+``input_format=InputFormat.AUTO`` (the default) and let the Rust tool
+sniff. Where we *do* validate eagerly is on parameter combinations that
+the CLI rejects late and noisily — e.g. ``--output`` xor ``--output-dir``,
+``--reference-fai`` requiring ``--reference``.
+"""
+from __future__ import annotations
+import enum
+import json
+import os
+import re
+import shutil
+import subprocess
+from dataclasses import dataclass, field, replace
+from pathlib import Path
+from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union
+PathLike = Union[str, os.PathLike]
+# ---------------------------------------------------------------------------
+# Enums (mirror src/cli.rs)
+# ---------------------------------------------------------------------------
+class InputFormat(str, enum.Enum):
+    """Input format selector; the member value is passed to ``--input-format``."""
+    AUTO = "auto"
+    DTC = "dtc"
+    VCF = "vcf"
+    BCF = "bcf"
+class OutputFormat(str, enum.Enum):
+    """Output format selector; the member value is passed to ``--format``."""
+    VCF = "vcf"
+    BCF = "bcf"
+    PLINK = "plink"
+class Sex(str, enum.Enum):
+    """Explicit sex override; the member value is passed to ``--sex``."""
+    MALE = "male"
+    FEMALE = "female"
+class Assembly(str, enum.Enum):
+    """Canonical genome build names, plus a lenient alias parser."""
+    GRCH37 = "GRCh37"
+    GRCH38 = "GRCh38"
+    @classmethod
+    def parse(cls, value: Union[str, "Assembly", None]) -> Optional[str]:
+        """Normalise a build name to its canonical string.
+        ``None`` is returned unchanged and an ``Assembly`` member yields its
+        value. Any other input is stringified and stripped, then looked up
+        case-insensitively (ignoring ``-`` and ``_``) in the alias table.
+        Unrecognised names are NOT rejected: the stripped text is returned
+        as-is, so the caller passes it through untouched.
+        """
+        if value is None:
+            return None
+        if isinstance(value, cls):
+            return value.value
+        norm = str(value).strip()
+        # Lookup key only; ``norm`` keeps the caller's original casing for the fallback.
+        low = norm.lower().replace("-", "").replace("_", "")
+        aliases = {
+            "grch37": "GRCh37",
+            "hg19": "GRCh37",
+            "build37": "GRCh37",
+            "grch38": "GRCh38",
+            "hg38": "GRCh38",
+            "build38": "GRCh38",
+        }
+        return aliases.get(low, norm)
+# ---------------------------------------------------------------------------
+# Errors
+# ---------------------------------------------------------------------------
+class ConvertGenomeError(Exception):
+    """Base class for all convert_genome wrapper errors."""
+    # Every exception class defined in this module derives from this one.
+class ConvertGenomeBinaryNotFound(ConvertGenomeError, FileNotFoundError):
+    """The `convert_genome` binary could not be located.
+    Also a ``FileNotFoundError``, so ``except FileNotFoundError`` catches it.
+    Raised by ``locate_binary`` and by ``Converter.run`` when spawning the
+    process raises ``FileNotFoundError``.
+    """
+class InvalidConfig(ConvertGenomeError, ValueError):
+    """A combination of arguments is mutually exclusive or incomplete.
+    Also a ``ValueError``. Raised from ``Converter.__post_init__`` and from
+    ``_coerce_enum`` when a value cannot be mapped to an enum member.
+    """
+class ConvertGenomeFailed(ConvertGenomeError, RuntimeError):
+    """The binary ran but returned a non-zero exit code.
+    Also raised by ``_result_from_report`` when the report JSON is missing
+    a field or has an unexpected schema; in that case ``returncode`` keeps
+    its default of 0.
+    Attributes: ``stdout``, ``stderr`` (captured text, possibly empty) and
+    ``returncode``.
+    """
+    def __init__(self, message: str, *, stdout: str = "", stderr: str = "", returncode: int = 0):
+        super().__init__(message)
+        self.stdout = stdout
+        self.stderr = stderr
+        self.returncode = returncode
+class ReportNotFound(ConvertGenomeError, FileNotFoundError):
+    """The binary exited 0 but produced no sidecar `_report.json`.
+    Raised by ``Converter.run`` when the resolved report path does not exist.
+    """
+# ---------------------------------------------------------------------------
+# Result dataclasses — mirror src/report.rs
+# ---------------------------------------------------------------------------
+@dataclass(frozen=True)
+class InputInfo:
+    """The report's ``input`` section; field names must match its JSON keys."""
+    path: str
+    format: str
+    origin: str
+@dataclass(frozen=True)
+class OutputInfo:
+    """The report's ``output`` section; field names must match its JSON keys."""
+    path: str
+    format: str
+@dataclass(frozen=True)
+class ReferenceInfo:
+    """The report's ``reference`` section; field names must match its JSON keys."""
+    path: str
+    origin: str
+    assembly: str
+@dataclass(frozen=True)
+class PanelInfo:
+    """The report's optional ``panel`` section (absent or falsy -> no ``PanelInfo``)."""
+    path: str
+    total_sites: int
+    modified_sites: int
+    novel_sites: int
+@dataclass(frozen=True)
+class SampleInfo:
+    """The report's ``sample`` section; field names must match its JSON keys."""
+    id: str
+    sex: str
+    sex_inferred: bool
+@dataclass(frozen=True)
+class BuildDetection:
+    """The report's optional ``build_detection`` section (absent or falsy -> ``None``)."""
+    detected_build: str
+    hg19_match_rate: float
+    hg38_match_rate: float
+@dataclass(frozen=True)
+class Statistics:
+    """Record counters from the report's ``statistics`` section.
+    The section is expanded straight into this constructor, so the field
+    names here must match the JSON keys exactly.
+    """
+    total_records: int
+    emitted_records: int
+    variant_records: int
+    reference_records: int
+    missing_genotype_records: int
+    skipped_reference_sites: int
+    unknown_chromosomes: int
+    reference_failures: int
+    invalid_genotypes: int
+    symbolic_allele_records: int
+    parse_errors: int
+@dataclass(frozen=True)
+class ConversionResult:
+    """The full run report parsed from ``<stem>_report.json``.
+    ``version`` through ``build_detection`` come from the JSON report.
+    ``report_path``, ``output_paths``, ``stdout`` and ``stderr`` are supplied
+    by the wrapper rather than read from the report.
+    """
+    version: str
+    timestamp: str
+    input: InputInfo
+    output: OutputInfo
+    reference: ReferenceInfo
+    standardize: bool
+    sample: SampleInfo
+    statistics: Statistics
+    panel: Optional[PanelInfo] = None
+    build_detection: Optional[BuildDetection] = None
+    report_path: Optional[Path] = None
+    output_paths: Tuple[Path, ...] = field(default_factory=tuple)
+    stdout: str = ""
+    stderr: str = ""
+    @property
+    def main_output(self) -> Path:
+        """The output path recorded in the report, as a ``Path``."""
+        return Path(self.output.path)
+    @property
+    def emitted_records(self) -> int:
+        """Shortcut for ``statistics.emitted_records``."""
+        return self.statistics.emitted_records
+    @property
+    def total_records(self) -> int:
+        """Shortcut for ``statistics.total_records``."""
+        return self.statistics.total_records
+    @property
+    def yield_rate(self) -> float:
+        """``emitted_records / total_records``, or 0.0 when there were no records."""
+        if self.statistics.total_records == 0:
+            return 0.0
+        return self.statistics.emitted_records / self.statistics.total_records
+# ---------------------------------------------------------------------------
+# Binary location
+# ---------------------------------------------------------------------------
+def locate_binary(override: Optional[PathLike] = None) -> Path:
+    """Locate `convert_genome` or raise `ConvertGenomeBinaryNotFound`.
+    Resolution: explicit ``override`` → ``convert_genome`` on PATH.
+    No environment-variable indirection.
+    """
+    if override is not None:
+        p = Path(override)
+        if not p.exists():
+            raise ConvertGenomeBinaryNotFound(f"convert_genome binary not at {p}")
+        return p
+    which = shutil.which("convert_genome")
+    if which:
+        return Path(which)
+    raise ConvertGenomeBinaryNotFound(
+        "convert_genome not found. Install with: cargo install convert_genome, "
+        "or pass binary=... explicitly."
+    )
+# ---------------------------------------------------------------------------
+# Converter
+# ---------------------------------------------------------------------------
+@dataclass(frozen=True)
+class Converter:
+    """Immutable conversion plan. Call ``.run()`` to execute."""
+    input: Path
+    output: Optional[Path] = None
+    output_dir: Optional[Path] = None
+    format: OutputFormat = OutputFormat.VCF
+    input_format: InputFormat = InputFormat.AUTO
+    assembly: str = "GRCh38"
+    input_build: Optional[str] = None
+    reference: Optional[Path] = None
+    reference_fai: Optional[Path] = None
+    panel: Optional[Path] = None
+    sample: Optional[str] = None
+    sex: Optional[Sex] = None
+    standardize: bool = False
+    variants_only: bool = False
+    log_level: str = "info"
+    binary: Optional[Path] = None
+    timeout: Optional[float] = None
+    extra_args: Tuple[str, ...] = field(default_factory=tuple)
+    def __post_init__(self) -> None:
+        if self.output is None and self.output_dir is None:
+            raise InvalidConfig("Either output= or output_dir= must be provided.")
+        if self.output is not None and self.output_dir is not None:
+            raise InvalidConfig("output= and output_dir= are mutually exclusive.")
+        if self.reference_fai is not None and self.reference is None:
+            raise InvalidConfig("reference_fai= requires reference=")
+        if not Path(self.input).exists():
+            raise InvalidConfig(f"Input file does not exist: {self.input}")
+    # Builder helpers — return a new Converter with the field replaced.
+    def with_output(self, output: PathLike) -> "Converter":
+        return replace(self, output=Path(output), output_dir=None)
+    def with_output_dir(self, output_dir: PathLike) -> "Converter":
+        return replace(self, output_dir=Path(output_dir), output=None)
+    def with_reference(self, ref: PathLike, fai: Optional[PathLike] = None) -> "Converter":
+        return replace(self, reference=Path(ref), reference_fai=Path(fai) if fai else None)
+    def with_panel(self, panel: PathLike) -> "Converter":
+        return replace(self, panel=Path(panel))
+    def with_sex(self, sex: Sex) -> "Converter":
+        return replace(self, sex=sex)
+    def with_sample(self, sample: str) -> "Converter":
+        return replace(self, sample=sample)
+    def with_standardize(self, on: bool = True) -> "Converter":
+        return replace(self, standardize=on)
+    def with_variants_only(self, on: bool = True) -> "Converter":
+        return replace(self, variants_only=on)
+    def with_assembly(self, assembly: str) -> "Converter":
+        return replace(self, assembly=Assembly.parse(assembly) or "GRCh38")
+    def with_input_build(self, build: Optional[str]) -> "Converter":
+        return replace(self, input_build=Assembly.parse(build))
+    def with_binary(self, path: PathLike) -> "Converter":
+        return replace(self, binary=Path(path))
+    def with_timeout(self, seconds: Optional[float]) -> "Converter":
+        return replace(self, timeout=seconds)
+    def with_log_level(self, level: str) -> "Converter":
+        return replace(self, log_level=level)
+    def with_extra_args(self, args: Iterable[str]) -> "Converter":
+        return replace(self, extra_args=tuple(args))
+    # --- Execution ---------------------------------------------------------
+    def argv(self) -> List[str]:
+        """Compute the argv that would be invoked. Useful for tests / dry-runs."""
+        binary = locate_binary(self.binary)
+        argv: List[str] = [str(binary)]
+        argv += ["--input-format", self.input_format.value]
+        argv += ["--format", self.format.value]
+        argv += ["--output-build", Assembly.parse(self.assembly) or self.assembly]
+        if self.reference is not None:
+            argv += ["--reference", str(self.reference)]
+        if self.reference_fai is not None:
+            argv += ["--reference-fai", str(self.reference_fai)]
+        if self.panel is not None:
+            argv += ["--panel", str(self.panel)]
+        if self.sample is not None:
+            argv += ["--sample", self.sample]
+        if self.input_build is not None:
+            argv += ["--input-build", self.input_build]
+        if self.sex is not None:
+            argv += ["--sex", self.sex.value]
+        if self.standardize:
+            argv += ["--standardize"]
+        if self.variants_only:
+            argv += ["--variants-only"]
+        if self.log_level and self.log_level != "info":
+            argv += ["--log-level", self.log_level]
+        if self.output_dir is not None:
+            argv += ["--output-dir", str(self.output_dir)]
+        argv += list(self.extra_args)
+        # positional: INPUT first, then OUTPUT if not --output-dir
+        argv.append(str(self.input))
+        if self.output is not None:
+            argv.append(str(self.output))
+        return argv
+    def run(self, *, capture: bool = True) -> ConversionResult:
+        argv = self.argv()
+        try:
+            completed = subprocess.run(
+                argv,
+                capture_output=capture,
+                text=True,
+                timeout=self.timeout,
+                check=False,
+            )
+        except FileNotFoundError as e:
+            raise ConvertGenomeBinaryNotFound(str(e)) from e
+        if completed.returncode != 0:
+            raise ConvertGenomeFailed(
+                f"convert_genome exited with status {completed.returncode}",
+                stdout=completed.stdout or "",
+                stderr=completed.stderr or "",
+                returncode=completed.returncode,
+            )
+        report_path = self._resolve_report_path(completed.stdout or "", completed.stderr or "")
+        if not report_path.exists():
+            raise ReportNotFound(f"Expected report at {report_path} but it is missing.")
+        with open(report_path) as f:
+            data = json.load(f)
+        return _result_from_report(
+            data,
+            report_path=report_path,
+            output_paths=self._resolve_outputs(),
+            stdout=completed.stdout or "",
+            stderr=completed.stderr or "",
+        )
+    # --- helpers -----------------------------------------------------------
+    def _resolve_outputs(self) -> Tuple[Path, ...]:
+        outs: List[Path] = []
+        if self.output is not None:
+            outs.append(self.output)
+            if self.format is OutputFormat.PLINK:
+                base = self.output.with_suffix("")
+                outs += [base.with_suffix(s) for s in (".bed", ".bim", ".fam")]
+        if self.output_dir is not None:
+            d = self.output_dir
+            fname = {
+                OutputFormat.VCF: "genotypes.vcf",
+                OutputFormat.BCF: "genotypes.bcf",
+                OutputFormat.PLINK: "genotypes",
+            }[self.format]
+            primary = d / fname
+            outs.append(primary)
+            if self.format is OutputFormat.PLINK:
+                outs += [d / f"genotypes{s}" for s in (".bed", ".bim", ".fam")]
+            if self.panel is not None:
+                outs.append(d / "panel.vcf")
+        return tuple(p for p in outs if p.exists())
+    def _resolve_report_path(self, stdout: str, stderr: str) -> Path:
+        m = re.search(r"Wrote run report to ([^\r\n]+)", stdout + "\n" + stderr)
+        if m:
+            return Path(m.group(1).strip())
+        if self.output is not None:
+            stem = self.output.stem
+            return self.output.with_name(f"{stem}_report.json")
+        # output_dir case (validated in __post_init__)
+        d = self.output_dir
+        assert d is not None
+        return d / "genotypes_report.json"
+# ---------------------------------------------------------------------------
+# Top-level convenience
+# ---------------------------------------------------------------------------
+def convert(
+    *,
+    input: PathLike,
+    output: Optional[PathLike] = None,
+    output_dir: Optional[PathLike] = None,
+    format: Union[OutputFormat, str] = OutputFormat.VCF,
+    input_format: Union[InputFormat, str] = InputFormat.AUTO,
+    assembly: str = "GRCh38",
+    input_build: Optional[str] = None,
+    reference: Optional[PathLike] = None,
+    reference_fai: Optional[PathLike] = None,
+    panel: Optional[PathLike] = None,
+    sample: Optional[str] = None,
+    sex: Optional[Union[Sex, str]] = None,
+    standardize: bool = False,
+    variants_only: bool = False,
+    log_level: str = "info",
+    binary: Optional[PathLike] = None,
+    timeout: Optional[float] = None,
+    extra_args: Optional[Iterable[str]] = None,
+    capture: bool = True,
+) -> ConversionResult:
+    """Run one conversion. Returns the parsed run report.
+    All arguments are keyword-only. Path-like arguments are wrapped in
+    ``Path`` (falsy values become ``None``), ``format`` / ``input_format`` /
+    ``sex`` accept enum members or their string names/values, and
+    ``assembly`` / ``input_build`` accept the aliases understood by
+    ``Assembly.parse``. The arguments are assembled into a ``Converter``,
+    which is then run with ``capture``.
+    Raises ``InvalidConfig`` for a bad argument combination or an
+    un-coercible enum value, plus anything ``Converter.run`` raises.
+    """
+    # Lenient sex coercion: callers chaining results out of infer_sex
+    # may pass `InferredSex.INDETERMINATE.value` ("indeterminate") or
+    # `gnomon`'s "unknown". Neither is a valid convert_genome --sex
+    # value, but the most useful behaviour is "no override — let the
+    # CLI run its own inference", which is the same as `sex=None`.
+    sex_coerced: Optional[Sex]
+    if sex is None:
+        sex_coerced = None
+    elif isinstance(sex, str) and sex.strip().lower() in {"unknown", "indeterminate", ""}:
+        sex_coerced = None
+    elif isinstance(sex, Sex):
+        sex_coerced = sex
+    else:
+        sex_coerced = _coerce_enum(sex, Sex)
+    converter = Converter(
+        input=Path(input),
+        output=Path(output) if output else None,
+        output_dir=Path(output_dir) if output_dir else None,
+        format=_coerce_enum(format, OutputFormat),
+        input_format=_coerce_enum(input_format, InputFormat),
+        # An empty or None assembly falls back to the GRCh38 default.
+        assembly=Assembly.parse(assembly) or "GRCh38",
+        input_build=Assembly.parse(input_build),
+        reference=Path(reference) if reference else None,
+        reference_fai=Path(reference_fai) if reference_fai else None,
+        panel=Path(panel) if panel else None,
+        sample=sample,
+        sex=sex_coerced,
+        standardize=standardize,
+        variants_only=variants_only,
+        log_level=log_level,
+        binary=Path(binary) if binary else None,
+        timeout=timeout,
+        extra_args=tuple(extra_args) if extra_args else (),
+    )
+    return converter.run(capture=capture)
+def _coerce_enum(value, enum_cls):
+    if isinstance(value, enum_cls):
+        return value
+    if isinstance(value, str):
+        try:
+            return enum_cls(value.lower())
+        except ValueError:
+            pass
+        for member in enum_cls:
+            if member.name.lower() == value.lower():
+                return member
+    raise InvalidConfig(f"Cannot coerce {value!r} to {enum_cls.__name__}")
+# ---------------------------------------------------------------------------
+# Report parsing
+# ---------------------------------------------------------------------------
+def _result_from_report(
+    data: Mapping[str, Any],
+    *,
+    report_path: Path,
+    output_paths: Tuple[Path, ...],
+    stdout: str,
+    stderr: str,
+) -> ConversionResult:
+    """Build a ``ConversionResult`` from the decoded report JSON.
+    Each report section is expanded directly into its dataclass, so the
+    section's keys must match the dataclass fields exactly: a missing key
+    or an extra, unknown key both fail. ``panel`` and ``build_detection``
+    are optional and become ``None`` when absent or falsy.
+    Raises ``ConvertGenomeFailed`` (with ``returncode`` left at its default
+    of 0) when a required field is missing or a section does not fit its
+    dataclass.
+    """
+    try:
+        stats = Statistics(**data["statistics"])
+        input_info = InputInfo(**data["input"])
+        output_info = OutputInfo(**data["output"])
+        reference_info = ReferenceInfo(**data["reference"])
+        sample_info = SampleInfo(**data["sample"])
+        panel_info = PanelInfo(**data["panel"]) if data.get("panel") else None
+        build = BuildDetection(**data["build_detection"]) if data.get("build_detection") else None
+        return ConversionResult(
+            version=data["version"],
+            timestamp=data["timestamp"],
+            input=input_info,
+            output=output_info,
+            reference=reference_info,
+            standardize=data["standardize"],
+            sample=sample_info,
+            statistics=stats,
+            panel=panel_info,
+            build_detection=build,
+            report_path=report_path,
+            output_paths=output_paths,
+            stdout=stdout,
+            stderr=stderr,
+        )
+    # KeyError: a top-level key looked up with data[...] is absent.
+    except KeyError as e:
+        raise ConvertGenomeFailed(
+            f"Report {report_path} missing expected field: {e}",
+            stdout=stdout,
+            stderr=stderr,
+        ) from e
+    # TypeError: a section has missing/unexpected keys or is not a mapping.
+    except TypeError as e:
+        raise ConvertGenomeFailed(
+            f"Report {report_path} has an unexpected schema: {e}",
+            stdout=stdout,
+            stderr=stderr,
+        ) from e

convert_genome-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,32 @@
+[build-system]
+requires = ["hatchling>=1.18"]
+build-backend = "hatchling.build"
+[project]
+name = "convert_genome"
+version = "0.1.0"
+description = "Python wrapper for SauersML/convert_genome (DTC → VCF/BCF/PLINK conversion)."
+readme = "README.md"
+license = { text = "MIT" }
+authors = [{ name = "SauersML" }]
+requires-python = ">=3.9"
+dependencies = []
+keywords = ["genomics", "bioinformatics", "vcf", "bcf", "plink", "23andme", "ancestry", "dtc"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+[project.urls]
+Homepage = "https://github.com/SauersML/convert_genome"
+Issues = "https://github.com/SauersML/convert_genome/issues"
+[project.optional-dependencies]
+test = ["pytest>=7"]
+[tool.hatch.build.targets.wheel]
+packages = ["convert_genome"]

convert_genome-0.1.0/tests/test_convert_genome.py ADDED Viewed

@@ -0,0 +1,407 @@
+"""Tests for the convert_genome Python wrapper.
+We don't run the real Rust binary in CI. Tests build a fake binary as a
+small Python script that:
+  1. Echoes argv (so we can assert flag mapping).
+  2. Writes a minimal but valid run-report JSON wherever the wrapper
+     expects to find it.
+  3. Exits 0 (or fails on demand, to test error paths).
+"""
+from __future__ import annotations
+import json
+import stat
+import textwrap
+from pathlib import Path
+import pytest
+from convert_genome import (
+    Assembly,
+    ConversionResult,
+    Converter,
+    ConvertGenomeBinaryNotFound,
+    ConvertGenomeError,
+    ConvertGenomeFailed,
+    InvalidConfig,
+    OutputFormat,
+    ReportNotFound,
+    Sex,
+    convert,
+    locate_binary,
+)
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _make_input(tmp_path: Path, name: str = "in.txt") -> Path:
+    p = tmp_path / name
+    p.write_text("# rsid\tchromosome\tposition\tgenotype\nrs1\t1\t12345\tAG\n")
+    return p
+_FAKE_FAILING = textwrap.dedent(
+    """\
+    #!/usr/bin/env python3
+    import sys
+    sys.stderr.write('ERROR: synthetic failure\\n')
+    sys.exit(2)
+    """
+)
+_FAKE_NOREPORT = textwrap.dedent(
+    """\
+    #!/usr/bin/env python3
+    print('Did some work, but forgot to write a report.')
+    """
+)
+def _good_fake_body(report: dict, *, log_line: str = "") -> str:
+    """A fake CLI that writes ``report`` to the wrapper's expected path."""
+    return textwrap.dedent(
+        f"""\
+        #!/usr/bin/env python3
+        import json, sys, pathlib
+        argv = sys.argv[1:]
+        log_argv = pathlib.Path(sys.argv[0]).parent / 'argv.json'
+        log_argv.write_text(json.dumps(argv))
+        # Figure out where the wrapper expects the report.
+        report = {json.dumps(report)!r}
+        if '--output-dir' in argv:
+            d = pathlib.Path(argv[argv.index('--output-dir') + 1])
+            d.mkdir(parents=True, exist_ok=True)
+            report_path = d / 'genotypes_report.json'
+        else:
+            # Positional layout: ... INPUT OUTPUT
+            output_path = pathlib.Path(argv[-1])
+            stem = output_path.stem
+            report_path = output_path.with_name(stem + '_report.json')
+        # Also write a sentinel output file so output_paths picks it up.
+        if '--output-dir' in argv:
+            (d / 'genotypes.vcf').write_text('##fake\\n')
+        else:
+            output_path.write_text('##fake\\n')
+        report_path.write_text(report)
+        sys.stdout.write({log_line!r})
+        """
+    )
+def _make_fake(tmp_path: Path, body: str) -> Path:
+    p = tmp_path / "fake_convert_genome"
+    p.write_text(body)
+    p.chmod(p.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
+    return p
+def _minimal_report() -> dict:
+    return {
+        "version": "0.1.2",
+        "timestamp": "2026-05-19T00:00:00Z",
+        "input": {"path": "/in.txt", "format": "DTC", "origin": "local"},
+        "output": {"path": "/out.vcf", "format": "VCF"},
+        "reference": {"path": "/ref.fa", "origin": "downloaded", "assembly": "GRCh38"},
+        "standardize": True,
+        "sample": {"id": "S1", "sex": "Female", "sex_inferred": True},
+        "build_detection": {
+            "detected_build": "GRCh38",
+            "hg19_match_rate": 12.3,
+            "hg38_match_rate": 98.7,
+        },
+        "statistics": {
+            "total_records": 1000,
+            "emitted_records": 990,
+            "variant_records": 800,
+            "reference_records": 190,
+            "missing_genotype_records": 10,
+            "skipped_reference_sites": 0,
+            "unknown_chromosomes": 0,
+            "reference_failures": 0,
+            "invalid_genotypes": 0,
+            "symbolic_allele_records": 0,
+            "parse_errors": 0,
+        },
+    }
+# ---------------------------------------------------------------------------
+# Locator
+# ---------------------------------------------------------------------------
+def test_locate_binary_not_on_path(monkeypatch, tmp_path):
+    monkeypatch.setenv("PATH", str(tmp_path))  # empty PATH
+    with pytest.raises(ConvertGenomeBinaryNotFound):
+        locate_binary()
+def test_locate_binary_override(tmp_path):
+    fake = _make_fake(tmp_path, "#!/usr/bin/env python3\nprint('x')\n")
+    assert locate_binary(fake) == fake
+def test_locate_binary_override_missing_raises(tmp_path):
+    with pytest.raises(ConvertGenomeBinaryNotFound):
+        locate_binary(tmp_path / "no-such-binary")
+# ---------------------------------------------------------------------------
+# Eager validation
+# ---------------------------------------------------------------------------
+def test_requires_output_or_output_dir(tmp_path):
+    in_ = _make_input(tmp_path)
+    with pytest.raises(InvalidConfig):
+        Converter(input=in_)
+def test_output_xor_output_dir(tmp_path):
+    in_ = _make_input(tmp_path)
+    with pytest.raises(InvalidConfig):
+        Converter(input=in_, output=tmp_path / "out.vcf", output_dir=tmp_path / "d")
+def test_reference_fai_requires_reference(tmp_path):
+    in_ = _make_input(tmp_path)
+    fai = tmp_path / "x.fai"
+    fai.write_text("")
+    with pytest.raises(InvalidConfig):
+        Converter(input=in_, output=tmp_path / "o.vcf", reference_fai=fai)
+def test_input_must_exist(tmp_path):
+    with pytest.raises(InvalidConfig):
+        Converter(input=tmp_path / "nope.txt", output=tmp_path / "o.vcf")
+# ---------------------------------------------------------------------------
+# Assembly alias normalisation
+# ---------------------------------------------------------------------------
+def test_assembly_parses_aliases():
+    assert Assembly.parse("hg19") == "GRCh37"
+    assert Assembly.parse("GRCh38") == "GRCh38"
+    assert Assembly.parse("build38") == "GRCh38"
+    assert Assembly.parse("Hg-38") == "GRCh38"
+    # Unknown strings pass through (CLI will decide).
+    assert Assembly.parse("CHM13") == "CHM13"
+# ---------------------------------------------------------------------------
+# argv generation
+# ---------------------------------------------------------------------------
+def test_argv_for_simple_vcf(tmp_path):
+    in_ = _make_input(tmp_path)
+    fake = _make_fake(tmp_path, "")
+    argv = (
+        Converter(input=in_, output=tmp_path / "o.vcf", binary=fake)
+        .with_assembly("hg38")
+        .argv()
+    )
+    assert argv[0] == str(fake)
+    assert "--format" in argv and argv[argv.index("--format") + 1] == "vcf"
+    assert argv[argv.index("--output-build") + 1] == "GRCh38"
+    assert argv[-2] == str(in_)
+    assert argv[-1] == str(tmp_path / "o.vcf")
+    # No --output-dir.
+    assert "--output-dir" not in argv
+def test_argv_for_plink_output_dir(tmp_path):
+    in_ = _make_input(tmp_path)
+    fake = _make_fake(tmp_path, "")
+    out_dir = tmp_path / "d"
+    argv = (
+        Converter(
+            input=in_,
+            output_dir=out_dir,
+            format=OutputFormat.PLINK,
+            binary=fake,
+        )
+        .with_standardize()
+        .with_variants_only()
+        .with_sex(Sex.MALE)
+        .argv()
+    )
+    assert "--standardize" in argv
+    assert "--variants-only" in argv
+    assert argv[argv.index("--sex") + 1] == "male"
+    assert argv[argv.index("--format") + 1] == "plink"
+    assert argv[argv.index("--output-dir") + 1] == str(out_dir)
+    # Last positional is INPUT, no trailing OUTPUT since --output-dir was set.
+    assert argv[-1] == str(in_)
+def test_argv_includes_reference_and_panel(tmp_path):
+    in_ = _make_input(tmp_path)
+    ref = tmp_path / "ref.fa"
+    fai = tmp_path / "ref.fa.fai"
+    panel = tmp_path / "panel.vcf"
+    for p in (ref, fai, panel):
+        p.write_text("")
+    fake = _make_fake(tmp_path, "")
+    argv = (
+        Converter(input=in_, output=tmp_path / "o.vcf", binary=fake)
+        .with_reference(ref, fai)
+        .with_panel(panel)
+        .with_sample("ID1")
+        .with_input_build("hg19")
+        .argv()
+    )
+    assert argv[argv.index("--reference") + 1] == str(ref)
+    assert argv[argv.index("--reference-fai") + 1] == str(fai)
+    assert argv[argv.index("--panel") + 1] == str(panel)
+    assert argv[argv.index("--sample") + 1] == "ID1"
+    assert argv[argv.index("--input-build") + 1] == "GRCh37"
+# ---------------------------------------------------------------------------
+# Run + JSON parsing
+# ---------------------------------------------------------------------------
+def test_run_parses_report(tmp_path):
+    in_ = _make_input(tmp_path)
+    out = tmp_path / "out.vcf"
+    fake = _make_fake(tmp_path, _good_fake_body(_minimal_report()))
+    result = convert(
+        input=in_,
+        output=out,
+        binary=fake,
+        standardize=True,
+        assembly="hg38",
+        sex="female",
+    )
+    assert isinstance(result, ConversionResult)
+    assert result.statistics.emitted_records == 990
+    assert result.statistics.total_records == 1000
+    assert result.yield_rate == pytest.approx(0.99)
+    assert result.sample.sex_inferred is True
+    assert result.build_detection is not None
+    assert result.build_detection.detected_build == "GRCh38"
+    assert result.report_path is not None
+    assert result.report_path.exists()
+    assert any(p.suffix == ".vcf" for p in result.output_paths)
+def test_run_with_output_dir(tmp_path):
+    in_ = _make_input(tmp_path)
+    out_dir = tmp_path / "outdir"
+    fake = _make_fake(tmp_path, _good_fake_body(_minimal_report()))
+    result = convert(
+        input=in_,
+        output_dir=out_dir,
+        format=OutputFormat.VCF,
+        binary=fake,
+    )
+    assert result.report_path == out_dir / "genotypes_report.json"
+    assert (out_dir / "genotypes.vcf") in result.output_paths
+def test_run_locates_report_via_log_line(tmp_path):
+    """The CLI logs 'Wrote run report to <path>' — prefer that over guessing."""
+    in_ = _make_input(tmp_path)
+    out = tmp_path / "out.vcf"
+    # Move the report to an unusual location, then advertise it via log.
+    alt = tmp_path / "side_report.json"
+    body = textwrap.dedent(
+        f"""\
+        #!/usr/bin/env python3
+        import json, pathlib, sys
+        pathlib.Path({str(alt)!r}).write_text(json.dumps({_minimal_report()!r}))
+        pathlib.Path({str(out)!r}).write_text('##')
+        print('Wrote run report to', {str(alt)!r})
+        """
+    )
+    fake = _make_fake(tmp_path, body)
+    result = convert(input=in_, output=out, binary=fake)
+    assert result.report_path == alt
+def test_failure_exit_code(tmp_path):
+    in_ = _make_input(tmp_path)
+    fake = _make_fake(tmp_path, _FAKE_FAILING)
+    with pytest.raises(ConvertGenomeFailed) as ei:
+        convert(input=in_, output=tmp_path / "o.vcf", binary=fake)
+    assert ei.value.returncode == 2
+    assert "synthetic failure" in ei.value.stderr
+def test_missing_report_raises(tmp_path):
+    in_ = _make_input(tmp_path)
+    fake = _make_fake(tmp_path, _FAKE_NOREPORT)
+    with pytest.raises(ReportNotFound):
+        convert(input=in_, output=tmp_path / "o.vcf", binary=fake)
+def test_unexpected_schema_raises(tmp_path):
+    in_ = _make_input(tmp_path)
+    bad = _minimal_report()
+    bad["statistics"]["mystery_new_field"] = 42
+    fake = _make_fake(tmp_path, _good_fake_body(bad))
+    with pytest.raises(ConvertGenomeFailed):
+        convert(input=in_, output=tmp_path / "o.vcf", binary=fake)
+def test_argv_log_actually_reflects_invocation(tmp_path):
+    in_ = _make_input(tmp_path)
+    fake = _make_fake(tmp_path, _good_fake_body(_minimal_report()))
+    convert(
+        input=in_,
+        output=tmp_path / "o.vcf",
+        binary=fake,
+        format="bcf",
+        standardize=True,
+        variants_only=True,
+        extra_args=["--log-level", "warn"],
+    )
+    argv = json.loads((tmp_path / "argv.json").read_text())
+    assert argv[argv.index("--format") + 1] == "bcf"
+    assert "--standardize" in argv
+    assert "--variants-only" in argv
+    assert argv[argv.index("--log-level") + 1] == "warn"
+def test_error_hierarchy():
+    assert issubclass(ConvertGenomeBinaryNotFound, ConvertGenomeError)
+    assert issubclass(ConvertGenomeFailed, ConvertGenomeError)
+    assert issubclass(InvalidConfig, ConvertGenomeError)
+    assert issubclass(ReportNotFound, ConvertGenomeError)
+def test_sex_indeterminate_maps_to_no_flag(tmp_path):
+    """Regression: callers chaining through infer_sex may pass
+    'indeterminate' or 'unknown'. Neither is a valid --sex value, but
+    omitting the flag and letting the CLI infer is the right behaviour."""
+    in_ = _make_input(tmp_path)
+    fake = _make_fake(tmp_path, _good_fake_body(_minimal_report()))
+    for value in ("indeterminate", "unknown", "INDETERMINATE", "  unknown  ", ""):
+        convert(input=in_, output=tmp_path / "o.vcf", binary=fake, sex=value)
+        argv = json.loads((tmp_path / "argv.json").read_text())
+        assert "--sex" not in argv, f"sex={value!r} should not produce --sex flag"
+def test_converter_is_immutable(tmp_path):
+    in_ = _make_input(tmp_path)
+    base = Converter(input=in_, output=tmp_path / "o.vcf")
+    new = base.with_standardize()
+    assert base.standardize is False
+    assert new.standardize is True
+    assert new is not base