bio2zarr 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
bio2zarr/plink.py CHANGED
@@ -1,207 +1,334 @@
+import dataclasses
 import logging
+import pathlib
 
-import bed_reader
-import humanfriendly
-import numcodecs
 import numpy as np
-import zarr
+import pandas as pd
 
-from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
-
-from . import core
+from bio2zarr import constants, core, vcz
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME
 
 logger = logging.getLogger(__name__)
 
 
-def encode_genotypes_slice(bed_path, zarr_path, start, stop):
-    # We need to count the A2 alleles here if we want to keep the
-    # alleles reported as allele_1, allele_2. It's obvious here what
-    # the correct approach is, but it is important to note that the
-    # 0th allele is *not* necessarily the REF for these datasets.
-    bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
-    root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
-    gt = core.BufferedArray(root["call_genotype"], start)
-    gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
-    gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
-    variants_chunk_size = gt.array.chunks[0]
-    assert start % variants_chunk_size == 0
-
-    logger.debug(f"Reading slice {start}:{stop}")
-    chunk_start = start
-    while chunk_start < stop:
-        chunk_stop = min(chunk_start + variants_chunk_size, stop)
-        logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
-        bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
-        logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
-        # Probably should do this without iterating over rows, but it's a bit
-        # simpler and lines up better with the array buffering API. The bottleneck
-        # is in the encoding anyway.
-        for values in bed_chunk:
-            j = gt.next_buffer_row()
-            g = np.zeros_like(gt.buff[j])
-            g[values == -127] = -1
-            g[values == 2] = 1
-            g[values == 1, 0] = 1
-            gt.buff[j] = g
-            j = gt_phased.next_buffer_row()
-            gt_phased.buff[j] = False
-            j = gt_mask.next_buffer_row()
-            gt_mask.buff[j] = gt.buff[j] == -1
-        chunk_start = chunk_stop
-    gt.flush()
-    gt_phased.flush()
-    gt_mask.flush()
-    logger.debug(f"GT slice {start}:{stop} done")
+FAM_FIELDS = [
+    ("family_id", str, "U"),
+    ("individual_id", str, "U"),
+    ("paternal_id", str, "U"),
+    ("maternal_id", str, "U"),
+    ("sex", str, "int8"),
+    ("phenotype", str, "int8"),
+]
+FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS])
+FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS])
+
+BIM_FIELDS = [
+    ("contig", str, "U"),
+    ("variant_id", str, "U"),
+    ("cm_position", "float32", "float32"),
+    ("position", "int32", "int32"),
+    ("allele_1", str, "S"),
+    ("allele_2", str, "S"),
+]
+BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
+BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
+
+
+# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion
+# on the parameters to Pandas here.
+def read_fam(path):
+    # See: https://www.cog-genomics.org/plink/1.9/formats#fam
+    names = [f[0] for f in FAM_FIELDS]
+    df = pd.read_csv(path, sep=None, names=names, dtype=FAM_DF_DTYPE, engine="python")
+    return df
+
+
+def read_bim(path):
+    # See: https://www.cog-genomics.org/plink/1.9/formats#bim
+    names = [f[0] for f in BIM_FIELDS]
+    df = pd.read_csv(path, sep=None, names=names, dtype=BIM_DF_DTYPE, engine="python")
+    return df
+
+
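As a side note on the parsing above: `sep=None` with `engine="python"` lets pandas sniff the delimiter, which is useful because .fam/.bim files may be space- or tab-delimited. A minimal sketch of what `read_fam` accepts (not part of the diff; the two-line fileset is invented, and the column names mirror `FAM_FIELDS` above):

```python
import io

import pandas as pd

# A whitespace-delimited, headerless .fam fragment (invented data).
fam_text = """\
FAM1 IND1 0 0 1 -9
FAM1 IND2 0 0 2 -9
"""
names = ["family_id", "individual_id", "paternal_id", "maternal_id", "sex", "phenotype"]
df = pd.read_csv(io.StringIO(fam_text), sep=None, names=names, dtype=str, engine="python")
print(df.individual_id.tolist())  # ['IND1', 'IND2']
```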
+@dataclasses.dataclass
+class PlinkPaths:
+    bed_path: str
+    bim_path: str
+    fam_path: str
+
+
+class BedReader:
+    def __init__(self, path, num_variants, num_samples):
+        self.num_variants = num_variants
+        self.num_samples = num_samples
+        self.path = path
+        # bytes per variant: 1 byte per 4 samples, rounded up
+        self.bytes_per_variant = (self.num_samples + 3) // 4
+
+        # TODO open this as a persistent file and support reading from a
+        # stream
+        with open(self.path, "rb") as f:
+            magic = f.read(3)
+            if magic != b"\x6c\x1b\x01":
+                raise ValueError("Invalid BED file magic bytes")
+
+        # We could check the size of the bed file here, but that would
+        # mean we can't work with streams.
+
+        # Initialize the lookup table with shape (256, 4, 2):
+        # 256 possible byte values, 4 samples per byte, 2 alleles per sample
+        lookup = np.zeros((256, 4, 2), dtype=np.int8)
+
+        # For each possible byte value (0-255)
+        for byte in range(256):
+            # For each of the 4 samples encoded in this byte
+            for sample in range(4):
+                # Extract the 2 bits for this sample
+                bits = (byte >> (sample * 2)) & 0b11
+                # Convert PLINK's bit encoding to genotype values
+                if bits == 0b00:
+                    lookup[byte, sample] = [1, 1]
+                elif bits == 0b01:
+                    lookup[byte, sample] = [-1, -1]
+                elif bits == 0b10:
+                    lookup[byte, sample] = [0, 1]
+                elif bits == 0b11:
+                    lookup[byte, sample] = [0, 0]
+
+        self.byte_lookup = lookup
+
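To make the bit layout concrete, here is a worked example (not part of the diff) that rebuilds the same table and decodes one byte. PLINK stores sample 0 in the lowest-order bit pair, and since `allele_2` is treated as REF in this module, `0b00` (homozygous for allele 1) maps to `[1, 1]`:

```python
import numpy as np

# Rebuild the same lookup table as BedReader.__init__ above.
lookup = np.zeros((256, 4, 2), dtype=np.int8)
for byte in range(256):
    for sample in range(4):
        bits = (byte >> (sample * 2)) & 0b11
        if bits == 0b00:
            lookup[byte, sample] = [1, 1]    # homozygous allele_1 (ALT)
        elif bits == 0b01:
            lookup[byte, sample] = [-1, -1]  # missing genotype
        elif bits == 0b10:
            lookup[byte, sample] = [0, 1]    # heterozygous
        else:
            lookup[byte, sample] = [0, 0]    # homozygous allele_2 (REF)

# Byte 0b11011000 packs four samples, sample 0 in the two lowest bits:
# sample 0 -> 0b00, sample 1 -> 0b10, sample 2 -> 0b01, sample 3 -> 0b11
assert lookup[0b11011000].tolist() == [[1, 1], [0, 1], [-1, -1], [0, 0]]
```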
+    def iter_decode(self, start, stop, buffer_size=None):
+        """
+        Iterate over the variants in the specified window
+        with the specified approximate buffer size in bytes (default=10MiB).
+        """
+        if buffer_size is None:
+            buffer_size = 10 * 1024 * 1024
+        variants_per_read = max(1, int(buffer_size / self.bytes_per_variant))
+        for off in range(start, stop, variants_per_read):
+            genotypes = self.decode(off, min(off + variants_per_read, stop))
+            yield from genotypes
+
+    def decode(self, start, stop):
+        chunk_size = stop - start
+
+        # Calculate file offsets for the required data:
+        # 3 bytes for the magic number at the beginning of the file
+        start_offset = 3 + (start * self.bytes_per_variant)
+        bytes_to_read = chunk_size * self.bytes_per_variant
+
+        logger.debug(
+            f"Reading {chunk_size} variants ({bytes_to_read} bytes) "
+            f"from {self.path}"
+        )
+
+        # TODO make it possible to read sequentially from the same file handle,
+        # seeking only when necessary.
+        with open(self.path, "rb") as f:
+            f.seek(start_offset)
+            chunk_data = f.read(bytes_to_read)
+
+        data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
+        data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_variant)
+
+        # Apply the lookup table to get genotypes.
+        # Shape becomes: (chunk_size, bytes_per_variant, 4, 2)
+        all_genotypes = self.byte_lookup[data_matrix]
+
+        # Reshape to get all samples in one dimension:
+        # (chunk_size, bytes_per_variant * 4, 2)
+        samples_padded = self.bytes_per_variant * 4
+        genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
+
+        return genotypes_reshaped[:, : self.num_samples]
+
+
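A small usage sketch of the two read paths (not part of the diff; the fileset and its dimensions are invented):

```python
# Assumes example.bed was written by PLINK for 100 variants and 10 samples.
reader = BedReader("example.bed", num_variants=100, num_samples=10)

# One bulk read: an int8 array of shape (20, 10, 2) for variants 0..20.
block = reader.decode(0, 20)

# Or stream one variant at a time, reading at most ~1 KiB per file read.
for genotypes in reader.iter_decode(0, 100, buffer_size=1024):
    assert genotypes.shape == (10, 2)
```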
+class PlinkFormat(vcz.Source):
+    def __init__(self, prefix):
+        # TODO we will need to support multiple chromosomes here to join
+        # plinks into one big zarr. So, these will require multiple
+        # bed and bim files, but should share a .fam
+        self.prefix = str(prefix)
+        self.paths = PlinkPaths(
+            self.prefix + ".bed",
+            self.prefix + ".bim",
+            self.prefix + ".fam",
+        )
+        self.bim = read_bim(self.paths.bim_path)
+        self.fam = read_fam(self.paths.fam_path)
+        self._num_records = self.bim.shape[0]
+        self._num_samples = self.fam.shape[0]
+        self.bed_reader = BedReader(
+            self.paths.bed_path, self.num_records, self.num_samples
+        )
+
+    @property
+    def path(self):
+        return self.prefix
+
+    @property
+    def num_records(self):
+        return self._num_records
+
+    @property
+    def num_samples(self):
+        return self._num_samples
+
+    @property
+    def samples(self):
+        return [vcz.Sample(id=iid) for iid in self.fam.individual_id]
+
+    @property
+    def contigs(self):
+        return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]
+
+    def iter_contig(self, start, stop):
+        chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
+        for chrom in self.bim.contig[start:stop]:
+            yield chrom_to_contig_index[str(chrom)]
+
+    def iter_field(self, field_name, shape, start, stop):
+        assert field_name == "position"  # Only the position field is supported from plink
+        yield from self.bim.position[start:stop]
+
+    def iter_id(self, start, stop):
+        yield from self.bim.variant_id[start:stop]
+
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        alt_iter = self.bim.allele_1.values[start:stop]
+        ref_iter = self.bim.allele_2.values[start:stop]
+        gt_iter = self.bed_reader.iter_decode(start, stop)
+        for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
+            alleles[0] = ref
+            alleles[1 : 1 + len(alt)] = alt
+            phased = np.zeros(gt.shape[0], dtype=bool)
+            # rlen is the length of the REF in PLINK as there are no END annotations
+            yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
+
+    def generate_schema(
+        self,
+        variants_chunk_size=None,
+        samples_chunk_size=None,
+    ):
+        n = self.num_samples
+        m = self.num_records
+        logging.info(f"Scanned plink with {n} samples and {m} variants")
+        dimensions = vcz.standard_dimensions(
+            variants_size=m,
+            variants_chunk_size=variants_chunk_size,
+            samples_size=n,
+            samples_chunk_size=samples_chunk_size,
+            ploidy_size=2,
+            alleles_size=2,
+        )
+        schema_instance = vcz.VcfZarrSchema(
+            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
+            dimensions=dimensions,
+            fields=[],
+        )
+
+        logger.info(
+            "Generating schema with chunks="
+            f"variants={dimensions['variants'].chunk_size}, "
+            f"samples={dimensions['samples'].chunk_size}"
+        )
+        # If we don't have SVLEN or END annotations, the rlen field is defined
+        # as the length of the REF.
+        # Explicitly cast to a fixed size array to support pandas 2.x and 3.x
+        allele_2_array = self.bim.allele_2.values.astype("S")
+        max_len = allele_2_array.itemsize
+        array_specs = [
+            vcz.ZarrArraySpec(
+                source="position",
+                name="variant_position",
+                dtype="i4",
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_allele",
+                dtype=STRING_DTYPE_NAME,
+                dimensions=["variants", "alleles"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_id",
+                dtype=STRING_DTYPE_NAME,
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_id_mask",
+                dtype="bool",
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                source=None,
+                name="variant_length",
+                dtype=core.min_int_dtype(0, max_len),
+                dimensions=["variants"],
+                description="Length of each variant",
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_contig",
+                dtype=core.min_int_dtype(0, len(np.unique(self.bim.contig))),
+                dimensions=["variants"],
+                description="Contig/chromosome index for each variant",
+            ),
+            vcz.ZarrArraySpec(
+                name="call_genotype_phased",
+                dtype="bool",
+                dimensions=["variants", "samples"],
+                description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+            ),
+            vcz.ZarrArraySpec(
+                name="call_genotype",
+                dtype="i1",
+                dimensions=["variants", "samples", "ploidy"],
+                description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
+            ),
+            vcz.ZarrArraySpec(
+                name="call_genotype_mask",
+                dtype="bool",
+                dimensions=["variants", "samples", "ploidy"],
+                description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+            ),
+        ]
+        schema_instance.fields = array_specs
+        return schema_instance
 
 
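One subtlety in `generate_schema` above: casting the `allele_2` column with `astype("S")` yields a fixed-width bytes array, so its `itemsize` is the byte length of the longest REF allele, which then bounds the `variant_length` dtype via `core.min_int_dtype`. A quick sketch (not part of the diff; the alleles are invented):

```python
import numpy as np

alleles = np.array(["A", "TTG", "CA"], dtype=object)
fixed = alleles.astype("S")  # fixed-width bytes array
print(fixed.dtype)     # |S3 -- width of the longest entry
print(fixed.itemsize)  # 3  -- the max REF length used for variant_length
# core.min_int_dtype(0, 3) can then pick the smallest integer dtype
# that holds values in [0, 3] (presumably int8).
```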
 def convert(
-    bed_path,
-    zarr_path,
+    prefix,
+    out,
     *,
-    show_progress=False,
-    worker_processes=1,
     variants_chunk_size=None,
     samples_chunk_size=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
 ):
-    bed = bed_reader.open_bed(bed_path, num_threads=1)
-    n = bed.iid_count
-    m = bed.sid_count
-    logging.info(f"Scanned plink with {n} samples and {m} variants")
-
-    # FIXME
-    if samples_chunk_size is None:
-        samples_chunk_size = 1000
-    if variants_chunk_size is None:
-        variants_chunk_size = 10_000
-
-    root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
-
-    ploidy = 2
-    shape = [m, n]
-    chunks = [variants_chunk_size, samples_chunk_size]
-    dimensions = ["variants", "samples"]
-
-    # TODO we should be reusing some logic from vcfzarr here on laying
-    # out the basic dataset, and using the schema generator. Currently
-    # we're not using the best Blosc settings for genotypes here.
-    default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
-
-    a = root.array(
-        "sample_id",
-        data=bed.iid,
-        shape=bed.iid.shape,
-        dtype="str",
-        compressor=default_compressor,
-        chunks=(samples_chunk_size,),
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-    logger.debug("Encoded samples")
-
-    # TODO encode these in slices - but read them in one go to avoid
-    # fetching repeatedly from bim file
-    a = root.array(
-        "variant_position",
-        data=bed.bp_position,
-        shape=bed.bp_position.shape,
-        dtype=np.int32,
-        compressor=default_compressor,
-        chunks=(variants_chunk_size,),
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
-    logger.debug("encoded variant_position")
-
-    alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
-    a = root.array(
-        "variant_allele",
-        data=alleles,
-        shape=alleles.shape,
-        dtype="str",
-        compressor=default_compressor,
-        chunks=(variants_chunk_size, alleles.shape[1]),
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
-    logger.debug("encoded variant_allele")
-
-    # TODO remove this?
-    a = root.empty(
-        name="call_genotype_phased",
-        dtype="bool",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-        **ZARR_FORMAT_KWARGS,
+    plink_format = PlinkFormat(prefix)
+    schema_instance = plink_format.generate_schema(
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
     )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    shape += [ploidy]
-    dimensions += ["ploidy"]
-    a = root.empty(
-        name="call_genotype",
-        dtype="i1",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-        **ZARR_FORMAT_KWARGS,
+    zarr_path = pathlib.Path(out)
+    vzw = vcz.VcfZarrWriter(PlinkFormat, zarr_path)
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    vzw.init(
+        plink_format,
+        target_num_partitions=target_num_partitions,
+        schema=schema_instance,
     )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    a = root.empty(
-        name="call_genotype_mask",
-        dtype="bool",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-        **ZARR_FORMAT_KWARGS,
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    del bed
-
-    num_slices = max(1, worker_processes * 4)
-    slices = core.chunk_aligned_slices(a, num_slices)
-
-    total_chunks = sum(a.nchunks for _, a in root.arrays())
-
-    progress_config = core.ProgressConfig(
-        total=total_chunks, title="Convert", units="chunks", show=show_progress
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
     )
-    with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-        for start, stop in slices:
-            pwm.submit(encode_genotypes_slice, bed_path, zarr_path, start, stop)
-
-    # TODO also add atomic swap like VCF. Should be abstracted to
-    # share basic code for setting up the variation dataset zarr
-    zarr.consolidate_metadata(zarr_path)
-
-
-# FIXME do this more efficiently - currently reading the whole thing
-# in for convenience, and also comparing call-by-call
-def validate(bed_path, zarr_path):
-    root = zarr.open(store=zarr_path, mode="r")
-    call_genotype = root["call_genotype"][:]
-
-    bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
-
-    assert call_genotype.shape[0] == bed.sid_count
-    assert call_genotype.shape[1] == bed.iid_count
-    bed_genotypes = bed.read(dtype="int8").T
-    assert call_genotype.shape[0] == bed_genotypes.shape[0]
-    assert call_genotype.shape[1] == bed_genotypes.shape[1]
-    assert call_genotype.shape[2] == 2
-
-    row_id = 0
-    for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
-        # print("ROW", row_id)
-        # print(bed_row, zarr_row)
-        row_id += 1
-        for bed_call, zarr_call in zip(bed_row, zarr_row):
-            if bed_call == -127:
-                assert list(zarr_call) == [-1, -1]
-            elif bed_call == 0:
-                assert list(zarr_call) == [0, 0]
-            elif bed_call == 1:
-                assert list(zarr_call) == [1, 0]
-            elif bed_call == 2:
-                assert list(zarr_call) == [1, 1]
-            else:  # pragma no cover
-                raise AssertionError(f"Unexpected bed call {bed_call}")
+    vzw.finalise(show_progress)
+    vzw.create_index()
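Finally, a usage sketch of the reworked entry point (not part of the diff; the paths are invented, and a data/example.{bed,bim,fam} fileset is assumed to exist):

```python
from bio2zarr import plink

# Converts data/example.{bed,bim,fam} into a VCF Zarr store at data/example.vcz.
plink.convert(
    "data/example",
    "data/example.vcz",
    variants_chunk_size=10_000,
    samples_chunk_size=1_000,
    worker_processes=4,
    show_progress=True,
)
```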