bio2zarr 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. See the package registry's advisory page for more details.

bio2zarr/__init__.py ADDED
@@ -0,0 +1 @@
1
from .provenance import __version__
bio2zarr/__main__.py ADDED
@@ -0,0 +1,20 @@
1
import click

from . import cli


# Single top-level entry point that aggregates every tool's sub-commands.
@cli.version
@click.group()
def bio2zarr():
    pass


# Provide a single top-level interface to all of the functionality.
# This probably isn't the recommended way of interacting, as we
# install individual commands as console scripts. However, this
# is handy for development and for those whose PATHs aren't set
# up in the right way.
bio2zarr.add_command(cli.vcf2zarr)
bio2zarr.add_command(cli.plink2zarr)
bio2zarr.add_command(cli.vcf_partition)

if __name__ == "__main__":
    bio2zarr()
bio2zarr/_version.py ADDED
@@ -0,0 +1,16 @@
1
# file generated by setuptools_scm
# don't change, don't track in version control

# The TYPE_CHECKING dance gives static checkers a precise tuple type while
# keeping the runtime module free of any typing import.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple, Union
    VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
    VERSION_TUPLE = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

__version__ = version = '0.0.1'
__version_tuple__ = version_tuple = (0, 0, 1)
bio2zarr/cli.py ADDED
@@ -0,0 +1,229 @@
1
+ import click
2
+ import tabulate
3
+ import coloredlogs
4
+
5
+ from . import vcf
6
+ from . import vcf_utils
7
+ from . import plink
8
+ from . import provenance
9
+
10
# Common arguments/options
# Reusable click decorators shared by the sub-commands below so that flag
# names, defaults, and help text stay consistent across tools.
verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")

worker_processes = click.option(
    "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
)

# TODO help text
chunk_length = click.option(
    "-l",
    "--chunk-length",
    type=int,
    default=None,
    help="Chunk size in the variants dimension",
)

chunk_width = click.option(
    "-w",
    "--chunk-width",
    type=int,
    default=None,
    help="Chunk size in the samples dimension",
)

# Shared --version decorator reporting the package version.
version = click.version_option(version=f"bio2zarr {provenance.__version__}")
35
+
36
+
37
# Note: logging hasn't been implemented in the code at all, this is just
# a first pass to try out some ways of doing things to see what works.
def setup_logging(verbosity):
    """Install coloredlogs at a level derived from the -v count."""
    if verbosity >= 2:
        level = "DEBUG"
    elif verbosity == 1:
        level = "INFO"
    else:
        level = "WARNING"
    # NOTE: coloredlogs is already installed by cyvcf2, so trying it out
    # here; may need revisiting once threads/processes are in the mix and
    # logging gets more complicated.
    coloredlogs.install(level=level)
50
+
51
+
52
@click.command
@click.argument("vcfs", nargs=-1, required=True)
@click.argument("out_path", type=click.Path())
@verbose
@worker_processes
@click.option("-c", "--column-chunk-size", type=int, default=64)
def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
    """
    Convert VCF(s) to columnar intermediate format
    """
    setup_logging(verbose)
    # Thin wrapper over vcf.explode; progress is always shown from the CLI.
    vcf.explode(
        vcfs,
        out_path,
        worker_processes=worker_processes,
        column_chunk_size=column_chunk_size,
        show_progress=True,
    )
70
+
71
+
72
@click.command
@click.argument("if_path", type=click.Path())
@verbose
def inspect(if_path, verbose):
    """
    Inspect an intermediate format file
    """
    setup_logging(verbose)
    data = vcf.inspect(if_path)
    # Render the summary returned by vcf.inspect as an aligned text table.
    click.echo(tabulate.tabulate(data, headers="keys"))
82
+
83
+
84
@click.command
@click.argument("if_path", type=click.Path())
def mkschema(if_path):
    """
    Generate a schema for zarr encoding
    """
    # Write to stdout so the schema can be redirected to a file, edited,
    # and passed back to `encode -s`.
    stream = click.get_text_stream("stdout")
    vcf.mkschema(if_path, stream)
92
+
93
+
94
@click.command
@click.argument("if_path", type=click.Path())
@click.argument("zarr_path", type=click.Path())
@verbose
@click.option("-s", "--schema", default=None, type=click.Path(exists=True))
@chunk_length
@chunk_width
@click.option(
    "-V",
    "--max-variant-chunks",
    type=int,
    default=None,
    help=(
        "Truncate the output in the variants dimension to have "
        "this number of chunks. Mainly intended to help with "
        "schema tuning."
    ),
)
@worker_processes
def encode(
    if_path,
    zarr_path,
    verbose,
    schema,
    chunk_length,
    chunk_width,
    max_variant_chunks,
    worker_processes,
):
    """
    Encode intermediate format (see explode) to vcfzarr
    """
    setup_logging(verbose)
    # NOTE: the CLI flag is --max-variant-chunks but the library keyword
    # is max_v_chunks.
    vcf.encode(
        if_path,
        zarr_path,
        schema,
        chunk_length=chunk_length,
        chunk_width=chunk_width,
        max_v_chunks=max_variant_chunks,
        worker_processes=worker_processes,
        show_progress=True,
    )
137
+
138
+
139
@click.command(name="convert")
@click.argument("vcfs", nargs=-1, required=True)
@click.argument("out_path", type=click.Path())
@chunk_length
@chunk_width
@verbose
@worker_processes
def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_processes):
    """
    Convert input VCF(s) directly to vcfzarr (not recommended for large files)
    """
    setup_logging(verbose)
    # One-shot conversion, skipping the explode/encode intermediate stages.
    vcf.convert(
        vcfs,
        out_path,
        chunk_length=chunk_length,
        chunk_width=chunk_width,
        show_progress=True,
        worker_processes=worker_processes,
    )
159
+
160
+
161
@click.command
@click.argument("vcfs", nargs=-1, required=True)
@click.argument("out_path", type=click.Path())
def validate(vcfs, out_path):
    """
    Development only, do not use. Will be removed before release.
    """
    # vcf.validate compares a single VCF against the zarr store; fail
    # loudly instead of silently ignoring any additional files passed.
    if len(vcfs) > 1:
        raise click.UsageError("Only a single VCF can be validated at a time")
    vcf.validate(vcfs[0], out_path, show_progress=True)
170
+
171
+
172
# Command group collecting the multi-step VCF -> zarr pipeline commands.
@version
@click.group()
def vcf2zarr():
    pass


# TODO figure out how to get click to list these in the given order.
vcf2zarr.add_command(explode)
vcf2zarr.add_command(inspect)
vcf2zarr.add_command(mkschema)
vcf2zarr.add_command(encode)
vcf2zarr.add_command(convert_vcf)
vcf2zarr.add_command(validate)
185
+
186
+
187
@click.command(name="convert")
@click.argument("in_path", type=click.Path())
@click.argument("out_path", type=click.Path())
@worker_processes
@verbose
@chunk_length
@chunk_width
def convert_plink(
    in_path, out_path, verbose, worker_processes, chunk_length, chunk_width
):
    """
    In development; DO NOT USE!
    """
    setup_logging(verbose)
    # Thin wrapper over plink.convert, mirroring the VCF convert command.
    plink.convert(
        in_path,
        out_path,
        show_progress=True,
        worker_processes=worker_processes,
        chunk_width=chunk_width,
        chunk_length=chunk_length,
    )
209
+
210
+
211
# Command group for plink -> zarr conversion (currently a single command).
@version
@click.group()
def plink2zarr():
    pass


plink2zarr.add_command(convert_plink)
218
+
219
+
220
@click.command
@version
@click.argument("vcf_path", type=click.Path())
@click.option("-i", "--index", type=click.Path(), default=None)
@click.option("-n", "--num-parts", type=int, default=None)
# @click.option("-s", "--part-size", type=int, default=None)
def vcf_partition(vcf_path, index, num_parts):
    # Split the indexed VCF into roughly equal regions and print one
    # region per line. The index path is inferred when -i is omitted
    # (default None is passed through to IndexedVcf).
    indexed_vcf = vcf_utils.IndexedVcf(vcf_path, index)
    regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
    click.echo("\n".join(map(str, regions)))
bio2zarr/core.py ADDED
@@ -0,0 +1,235 @@
1
+ import dataclasses
2
+ import contextlib
3
+ import concurrent.futures as cf
4
+ import multiprocessing
5
+ import threading
6
+ import logging
7
+ import time
8
+
9
+ import zarr
10
+ import numpy as np
11
+ import tqdm
12
+ import numcodecs
13
+
14
+
15
logger = logging.getLogger(__name__)

# Disable blosc's internal threading; presumably because parallelism here
# is managed with worker processes instead — confirm.
numcodecs.blosc.use_threads = False

# TODO this should probably go in another module where we abstract
# out the zarr defaults
default_compressor = numcodecs.Blosc(
    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
)
24
+
25
+
26
def chunk_aligned_slices(z, n, max_chunks=None):
    """
    Returns at most n slices in the specified zarr array, aligned
    with its chunks.

    Each slice is a (start, stop) index pair; every boundary except
    possibly the final stop falls on a chunk boundary, so the slices
    can be processed concurrently without touching the same chunk.
    An empty array yields an empty list.
    """
    chunk_size = z.chunks[0]
    num_chunks = int(np.ceil(z.shape[0] / chunk_size))
    if max_chunks is not None:
        num_chunks = min(num_chunks, max_chunks)
    if num_chunks == 0:
        # np.array_split requires at least one section; an empty array
        # has no chunks to slice over.
        return []
    slices = []
    splits = np.array_split(np.arange(num_chunks), min(n, num_chunks))
    for split in splits:
        start = split[0] * chunk_size
        stop = (split[-1] + 1) * chunk_size
        stop = min(stop, z.shape[0])
        slices.append((start, stop))
    return slices
43
+
44
+
45
class SynchronousExecutor(cf.Executor):
    """
    An Executor that runs each submitted callable immediately in the
    caller's thread. Used when worker_processes <= 0 to make debugging
    and testing deterministic.
    """

    def submit(self, fn, /, *args, **kwargs):
        future = cf.Future()
        # Match the real Executor contract: a callable that raises must
        # surface its exception via the returned future, not at submit()
        # time, so callers using future.exception()/result() behave the
        # same as with a ProcessPoolExecutor.
        try:
            future.set_result(fn(*args, **kwargs))
        except Exception as exc:
            future.set_exception(exc)
        return future
50
+
51
+
52
def wait_on_futures(futures):
    """Block until every future has finished, raising the first failure seen."""
    for done in cf.as_completed(futures):
        # result() re-raises any stored exception, propagating failures
        # (including cancellation) to the caller.
        done.result()
57
+
58
+
59
def cancel_futures(futures):
    """Best-effort cancellation: already running/finished futures are left as-is."""
    for fut in futures:
        fut.cancel()
62
+
63
+
64
@dataclasses.dataclass
class BufferedArray:
    """
    A one-chunk in-memory write buffer in front of a zarr array.

    Rows are claimed with next_buffer_row() and written into buff; when
    the buffer fills (or flush() is called) the rows are written to the
    backing array starting at array_offset.
    """

    array: zarr.Array
    array_offset: int  # next write position in the backing array
    buff: np.ndarray  # staging buffer, one chunk long in dimension 0
    buffer_row: int  # next free row within buff

    def __init__(self, array, offset):
        self.array = array
        self.array_offset = offset
        # Writers must start on a chunk boundary so concurrent slices
        # never share a chunk.
        assert offset % array.chunks[0] == 0
        dims = list(array.shape)
        # Buffer exactly one chunk (or the whole array if smaller).
        dims[0] = min(array.chunks[0], array.shape[0])
        self.buff = np.zeros(dims, dtype=array.dtype)
        self.buffer_row = 0

    @property
    def chunk_length(self):
        # Number of rows the buffer holds (== the array's dim-0 chunk size).
        return self.buff.shape[0]

    def next_buffer_row(self):
        # Flush first when full, so the returned row index is always valid.
        if self.buffer_row == self.chunk_length:
            self.flush()
        row = self.buffer_row
        self.buffer_row += 1
        return row

    def flush(self):
        # No-op when the buffer is empty.
        if self.buffer_row != 0:
            if len(self.array.chunks) <= 1:
                sync_flush_1d_array(
                    self.buff[: self.buffer_row], self.array, self.array_offset
                )
            else:
                sync_flush_2d_array(
                    self.buff[: self.buffer_row], self.array, self.array_offset
                )
            logger.debug(
                f"Flushed <{self.array.name} {self.array.shape} "
                f"{self.array.dtype}> "
                f"{self.array_offset}:{self.array_offset + self.buffer_row}"
                f"{self.buff.nbytes / 2**20: .2f}Mb"
            )
            self.array_offset += self.chunk_length
            self.buffer_row = 0
109
+
110
+
111
def sync_flush_1d_array(np_buffer, zarr_array, offset):
    # Write the buffered rows in a single assignment; counts as one
    # progress unit.
    zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
    update_progress(1)
114
+
115
+
116
def sync_flush_2d_array(np_buffer, zarr_array, offset):
    """
    Write np_buffer into zarr_array starting at row `offset`, one
    chunk-width column band at a time.

    Writing the second dimension chunk by chunk keeps progress updates
    incremental and avoids large memcopies in the underlying encoder
    implementations.
    """
    rows = slice(offset, offset + np_buffer.shape[0])
    width = zarr_array.shape[1]
    step = zarr_array.chunks[1]
    for band_start in range(0, width, step):
        band_stop = min(band_start + step, width)
        zarr_array[rows, band_start:band_stop] = np_buffer[:, band_start:band_stop]
        update_progress(1)
129
+
130
+
131
@dataclasses.dataclass
class ProgressConfig:
    # Expected number of progress units (flushes/chunks) for the bar total.
    total: int = 0
    # Unit label shown by tqdm (e.g. "B", "chunks").
    units: str = ""
    # Description shown to the left of the bar.
    title: str = ""
    # Whether to display the bar at all.
    show: bool = False
    # Seconds between polls of the shared progress counter.
    poll_interval: float = 0.01
138
+
139
+
140
# NOTE: this approach means that we cannot have more than one
# progressable thing happening per source process. This is
# probably fine in practise, but there could be corner cases
# where it's not. Something to watch out for.
# "Q" = unsigned 64-bit; the Value is shared so increments made in
# worker processes are visible to the parent's progress thread.
_progress_counter = multiprocessing.Value("Q", 0)
145
+
146
+
147
def update_progress(inc):
    # Atomically add inc to the shared cross-process counter.
    with _progress_counter.get_lock():
        _progress_counter.value += inc
150
+
151
+
152
def get_progress():
    # Snapshot the shared counter under its lock.
    with _progress_counter.get_lock():
        val = _progress_counter.value
    return val
156
+
157
+
158
def set_progress(value):
    # Reset/initialise the shared counter (done at manager start-up).
    with _progress_counter.get_lock():
        _progress_counter.value = value
161
+
162
+
163
class ParallelWorkManager(contextlib.AbstractContextManager):
    """
    Context manager that fans work out to a process pool and drives a
    tqdm progress bar from the shared cross-process progress counter.

    worker_processes <= 0 selects the in-thread SynchronousExecutor
    (testing only). The bar is updated by a daemon thread that polls
    get_progress() every progress_config.poll_interval seconds.
    """

    def __init__(self, worker_processes=1, progress_config=None):
        if worker_processes <= 0:
            # NOTE: this is only for testing, not for production use!
            self.executor = SynchronousExecutor()
        else:
            self.executor = cf.ProcessPoolExecutor(
                max_workers=worker_processes,
            )
        self.futures = []

        # Zero the shared counter so stale values from a previous run
        # don't skew this bar.
        set_progress(0)
        if progress_config is None:
            progress_config = ProgressConfig()
        self.progress_config = progress_config
        self.progress_bar = tqdm.tqdm(
            total=progress_config.total,
            desc=f"{progress_config.title:>9}",
            unit_scale=True,
            unit=progress_config.units,
            smoothing=0.1,
            disable=not progress_config.show,
        )
        self.completed = False
        self.completed_lock = threading.Lock()
        self.progress_thread = threading.Thread(
            target=self._update_progress_worker,
            name="progress-update",
            daemon=True,  # Avoids deadlock on exit in awkward error conditions
        )
        self.progress_thread.start()

    def _update_progress(self):
        # Advance the bar by however much the shared counter has moved
        # since the last poll.
        current = get_progress()
        inc = current - self.progress_bar.n
        # print("UPDATE PROGRESS: current = ", current, self.progress_config.total, inc)
        self.progress_bar.update(inc)

    def _update_progress_worker(self):
        # Poll until __exit__ flips self.completed under the lock.
        completed = False
        while not completed:
            self._update_progress()
            time.sleep(self.progress_config.poll_interval)
            with self.completed_lock:
                completed = self.completed
        logger.debug("Exit progress thread")

    def submit(self, *args, **kwargs):
        # Track every future so __exit__ can wait on (or cancel) them.
        self.futures.append(self.executor.submit(*args, **kwargs))

    def results_as_completed(self):
        # Yield results in completion order; result() re-raises failures.
        for future in cf.as_completed(self.futures):
            yield future.result()

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None:
            wait_on_futures(self.futures)
        else:
            cancel_futures(self.futures)
        # There's probably a much cleaner way of doing this with a Condition
        # or something, but this seems to work OK for now. This setup might
        # make small conversions a bit laggy as we wait on the sleep interval
        # though.
        with self.completed_lock:
            self.completed = True
        self.executor.shutdown(wait=False)
        # FIXME there's currently some thing weird happening at the end of
        # Encode 1D for 1kg-p3. The progress bar disappears, like we're
        # setting a total of zero or something.
        self.progress_thread.join()
        # One final catch-up so work finished after the last poll is shown.
        self._update_progress()
        self.progress_bar.close()
        return False
bio2zarr/plink.py ADDED
@@ -0,0 +1,198 @@
1
+ import logging
2
+
3
+ import humanfriendly
4
+ import numpy as np
5
+ import zarr
6
+ import bed_reader
7
+
8
+ from . import core
9
+
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def encode_genotypes_slice(bed_path, zarr_path, start, stop):
    """
    Encode genotypes for variants [start, stop) from the bed file into
    the call_genotype* arrays of an existing zarr store at zarr_path.

    start must be chunk-aligned so that concurrent slice writers never
    touch the same zarr chunk.
    """
    # We need to count the A2 alleles here if we want to keep the
    # alleles reported as allele_1, allele_2. It's obvious here what
    # the correct approach is, but it is important to note that the
    # 0th allele is *not* necessarily the REF for these datasets.
    bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
    store = zarr.DirectoryStore(zarr_path)
    root = zarr.group(store=store)
    gt = core.BufferedArray(root["call_genotype"], start)
    gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
    gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
    chunk_length = gt.array.chunks[0]
    assert start % chunk_length == 0

    logger.debug(f"Reading slice {start}:{stop}")
    chunk_start = start
    while chunk_start < stop:
        chunk_stop = min(chunk_start + chunk_length, stop)
        logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
        bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
        logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
        # Probably should do this without iterating over rows, but it's a bit
        # simpler and lines up better with the array buffering API. The bottleneck
        # is in the encoding anyway.
        for values in bed_chunk:
            j = gt.next_buffer_row()
            # Per-call mapping (see validate): -127 -> [-1, -1] (missing),
            # 0 -> [0, 0], 1 -> [1, 0], 2 -> [1, 1].
            g = np.zeros_like(gt.buff[j])
            g[values == -127] = -1
            g[values == 2] = 1
            g[values == 1, 0] = 1
            gt.buff[j] = g
            j = gt_phased.next_buffer_row()
            gt_phased.buff[j] = False
            j = gt_mask.next_buffer_row()
            gt_mask.buff[j] = gt.buff[j] == -1
        chunk_start = chunk_stop
    gt.flush()
    gt_phased.flush()
    gt_mask.flush()
    logger.debug(f"GT slice {start}:{stop} done")
55
+
56
+
57
def convert(
    bed_path,
    zarr_path,
    *,
    show_progress=False,
    worker_processes=1,
    chunk_length=None,
    chunk_width=None,
):
    """
    Convert the plink dataset at bed_path to a zarr store at zarr_path.

    Sample and variant metadata are written up front; the genotype
    matrix is then encoded in chunk-aligned slices by worker processes.
    """
    bed = bed_reader.open_bed(bed_path, num_threads=1)
    n = bed.iid_count
    m = bed.sid_count
    # Use the module logger rather than the root logger so messages
    # respect this package's logging configuration.
    logger.info(f"Scanned plink with {n} samples and {m} variants")

    # FIXME these chunk-size defaults need proper consideration
    if chunk_width is None:
        chunk_width = 1000
    if chunk_length is None:
        chunk_length = 10_000

    store = zarr.DirectoryStore(zarr_path)
    root = zarr.group(store=store, overwrite=True)

    ploidy = 2
    shape = [m, n]
    chunks = [chunk_length, chunk_width]
    dimensions = ["variants", "samples"]

    a = root.array(
        "sample_id",
        bed.iid,
        dtype="str",
        compressor=core.default_compressor,
        chunks=(chunk_width,),
    )
    a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
    logger.debug("Encoded samples")

    # TODO encode these in slices - but read them in one go to avoid
    # fetching repeatedly from bim file
    a = root.array(
        "variant_position",
        bed.bp_position,
        dtype=np.int32,
        compressor=core.default_compressor,
        chunks=(chunk_length,),
    )
    a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
    logger.debug("encoded variant_position")

    alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
    a = root.array(
        "variant_allele",
        alleles,
        dtype="str",
        compressor=core.default_compressor,
        chunks=(chunk_length,),
    )
    a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
    logger.debug("encoded variant_allele")

    # TODO remove this?
    a = root.empty(
        "call_genotype_phased",
        dtype="bool",
        shape=list(shape),
        chunks=list(chunks),
        compressor=core.default_compressor,
    )
    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)

    shape += [ploidy]
    dimensions += ["ploidy"]
    a = root.empty(
        "call_genotype",
        dtype="i1",
        shape=list(shape),
        chunks=list(chunks),
        compressor=core.default_compressor,
    )
    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)

    a = root.empty(
        "call_genotype_mask",
        dtype="bool",
        shape=list(shape),
        chunks=list(chunks),
        compressor=core.default_compressor,
    )
    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)

    del bed

    num_slices = max(1, worker_processes * 4)
    # `a` is the last genotype array created above; all three
    # call_genotype* arrays share the same variant chunking, so any of
    # them yields the same chunk-aligned slices.
    slices = core.chunk_aligned_slices(a, num_slices)

    total_chunks = sum(arr.nchunks for arr in root.values())

    progress_config = core.ProgressConfig(
        total=total_chunks, title="Convert", units="chunks", show=show_progress
    )
    with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
        for start, stop in slices:
            pwm.submit(encode_genotypes_slice, bed_path, zarr_path, start, stop)

    # TODO also add atomic swap like VCF. Should be abstracted to
    # share basic code for setting up the variation dataset zarr
    zarr.consolidate_metadata(zarr_path)
165
+
166
+
167
+ # FIXME do this more efficiently - currently reading the whole thing
168
+ # in for convenience, and also comparing call-by-call
169
def validate(bed_path, zarr_path):
    """Assert that every genotype call in the zarr store matches the bed file."""
    store = zarr.DirectoryStore(zarr_path)
    root = zarr.group(store=store)
    call_genotype = root["call_genotype"][:]

    bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)

    assert call_genotype.shape[0] == bed.sid_count
    assert call_genotype.shape[1] == bed.iid_count
    bed_genotypes = bed.read(dtype="int8").T
    assert call_genotype.shape[0] == bed_genotypes.shape[0]
    assert call_genotype.shape[1] == bed_genotypes.shape[1]
    assert call_genotype.shape[2] == 2

    # Expected diploid call for each bed-encoded value (count_A1=False);
    # -127 is the missing marker.
    expected = {
        -127: [-1, -1],
        0: [0, 0],
        1: [1, 0],
        2: [1, 1],
    }
    for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
        for bed_call, zarr_call in zip(bed_row, zarr_row):
            assert bed_call in expected  # pragma: no cover
            assert list(zarr_call) == expected[bed_call]
bio2zarr/provenance.py ADDED
@@ -0,0 +1,7 @@
1
# Fall back to "undefined" when the generated _version module is absent
# (e.g. running from a source tree before setuptools_scm has written it).
__version__ = "undefined"
try:
    from . import _version

    __version__ = _version.version
except ImportError:  # pragma: nocover
    pass