bio2zarr 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

bio2zarr/plink.py CHANGED
@@ -4,6 +4,7 @@ import humanfriendly
4
4
  import numpy as np
5
5
  import zarr
6
6
  import bed_reader
7
+ import numcodecs
7
8
 
8
9
  from . import core
9
10
 
@@ -22,14 +23,14 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
22
23
  gt = core.BufferedArray(root["call_genotype"], start)
23
24
  gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
24
25
  gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
25
- chunk_length = gt.array.chunks[0]
26
+ variants_chunk_size = gt.array.chunks[0]
26
27
  n = gt.array.shape[1]
27
- assert start % chunk_length == 0
28
+ assert start % variants_chunk_size == 0
28
29
 
29
30
  logger.debug(f"Reading slice {start}:{stop}")
30
31
  chunk_start = start
31
32
  while chunk_start < stop:
32
- chunk_stop = min(chunk_start + chunk_length, stop)
33
+ chunk_stop = min(chunk_start + variants_chunk_size, stop)
33
34
  logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
34
35
  bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
35
36
  logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
@@ -60,8 +61,8 @@ def convert(
60
61
  *,
61
62
  show_progress=False,
62
63
  worker_processes=1,
63
- chunk_length=None,
64
- chunk_width=None,
64
+ variants_chunk_size=None,
65
+ samples_chunk_size=None,
65
66
  ):
66
67
  bed = bed_reader.open_bed(bed_path, num_threads=1)
67
68
  n = bed.iid_count
@@ -69,25 +70,30 @@ def convert(
69
70
  logging.info(f"Scanned plink with {n} samples and {m} variants")
70
71
 
71
72
  # FIXME
72
- if chunk_width is None:
73
- chunk_width = 1000
74
- if chunk_length is None:
75
- chunk_length = 10_000
73
+ if samples_chunk_size is None:
74
+ samples_chunk_size = 1000
75
+ if variants_chunk_size is None:
76
+ variants_chunk_size = 10_000
76
77
 
77
78
  store = zarr.DirectoryStore(zarr_path)
78
79
  root = zarr.group(store=store, overwrite=True)
79
80
 
80
81
  ploidy = 2
81
82
  shape = [m, n]
82
- chunks = [chunk_length, chunk_width]
83
+ chunks = [variants_chunk_size, samples_chunk_size]
83
84
  dimensions = ["variants", "samples"]
84
85
 
86
+ # TODO we should be reusing some logic from vcfzarr here on laying
87
+ # out the basic dataset, and using the schema generator. Currently
88
+ # we're not using the best Blosc settings for genotypes here.
89
+ default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
90
+
85
91
  a = root.array(
86
92
  "sample_id",
87
93
  bed.iid,
88
94
  dtype="str",
89
- compressor=core.default_compressor,
90
- chunks=(chunk_width,),
95
+ compressor=default_compressor,
96
+ chunks=(samples_chunk_size,),
91
97
  )
92
98
  a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
93
99
  logger.debug(f"Encoded samples")
@@ -98,8 +104,8 @@ def convert(
98
104
  "variant_position",
99
105
  bed.bp_position,
100
106
  dtype=np.int32,
101
- compressor=core.default_compressor,
102
- chunks=(chunk_length,),
107
+ compressor=default_compressor,
108
+ chunks=(variants_chunk_size,),
103
109
  )
104
110
  a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
105
111
  logger.debug(f"encoded variant_position")
@@ -109,8 +115,8 @@ def convert(
109
115
  "variant_allele",
110
116
  alleles,
111
117
  dtype="str",
112
- compressor=core.default_compressor,
113
- chunks=(chunk_length,),
118
+ compressor=default_compressor,
119
+ chunks=(variants_chunk_size,),
114
120
  )
115
121
  a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
116
122
  logger.debug(f"encoded variant_allele")
@@ -121,7 +127,7 @@ def convert(
121
127
  dtype="bool",
122
128
  shape=list(shape),
123
129
  chunks=list(chunks),
124
- compressor=core.default_compressor,
130
+ compressor=default_compressor,
125
131
  )
126
132
  a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
127
133
 
@@ -132,7 +138,7 @@ def convert(
132
138
  dtype="i1",
133
139
  shape=list(shape),
134
140
  chunks=list(chunks),
135
- compressor=core.default_compressor,
141
+ compressor=default_compressor,
136
142
  )
137
143
  a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
138
144
 
@@ -141,7 +147,7 @@ def convert(
141
147
  dtype="bool",
142
148
  shape=list(shape),
143
149
  chunks=list(chunks),
144
- compressor=core.default_compressor,
150
+ compressor=default_compressor,
145
151
  )
146
152
  a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
147
153