bio2zarr 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of bio2zarr might be problematic.

bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.0.1'
- __version_tuple__ = version_tuple = (0, 0, 1)
+ __version__ = version = '0.0.2'
+ __version_tuple__ = version_tuple = (0, 0, 2)
bio2zarr/cli.py CHANGED
@@ -7,35 +7,52 @@ from . import vcf_utils
  from . import plink
  from . import provenance

+
+ class NaturalOrderGroup(click.Group):
+     """
+     List commands in the order they are provided in the help text.
+     """
+
+     def list_commands(self, ctx):
+         return self.commands.keys()
+
+
  # Common arguments/options
  verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")

+ version = click.version_option(version=f"{provenance.__version__}")
+
  worker_processes = click.option(
      "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
  )

- # TODO help text
- chunk_length = click.option(
+ column_chunk_size = click.option(
+     "-c",
+     "--column-chunk-size",
+     type=int,
+     default=64,
+     help="Approximate uncompressed size of exploded column chunks in MiB",
+ )
+
+ # Note: -l and -w were chosen when these were called "width" and "length".
+ # possibly there are better letters now.
+ variants_chunk_size = click.option(
      "-l",
-     "--chunk-length",
+     "--variants-chunk-size",
      type=int,
      default=None,
      help="Chunk size in the variants dimension",
  )

- chunk_width = click.option(
+ samples_chunk_size = click.option(
      "-w",
-     "--chunk-width",
+     "--samples-chunk-size",
      type=int,
      default=None,
      help="Chunk size in the samples dimension",
  )

- version = click.version_option(version=f"bio2zarr {provenance.__version__}")

-
- # Note: logging hasn't been implemented in the code at all, this is just
- # a first pass to try out some ways of doing things to see what works.
  def setup_logging(verbosity):
      level = "WARNING"
      if verbosity == 1:
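The NaturalOrderGroup added in this hunk works because self.commands is a dict, and dicts preserve insertion order; click's default Group sorts command names alphabetically in --help. A minimal standalone sketch (the cli, zeta, and alpha names are illustrative, not part of bio2zarr):

import click


class NaturalOrderGroup(click.Group):
    """List commands in registration order, not alphabetically."""

    def list_commands(self, ctx):
        # self.commands is a dict, so this preserves insertion order.
        return self.commands.keys()


@click.group(cls=NaturalOrderGroup)
def cli():
    pass


@cli.command()
def zeta():
    "Registered first, so listed first in --help."


@cli.command()
def alpha():
    "Registered second, so listed second despite sorting earlier."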
@@ -43,26 +60,24 @@ def setup_logging(verbosity):
      elif verbosity >= 2:
          level = "DEBUG"
      # NOTE: I'm not that excited about coloredlogs, just trying it out
-     # as it is installed by cyvcf2 anyway. We will have some complicated
-     # stuff doing on with threads and processes, to logs might not work
-     # so well anyway.
+     # as it is installed by cyvcf2 anyway.
      coloredlogs.install(level=level)


  @click.command
  @click.argument("vcfs", nargs=-1, required=True)
- @click.argument("out_path", type=click.Path())
+ @click.argument("zarr_path", type=click.Path())
  @verbose
  @worker_processes
- @click.option("-c", "--column-chunk-size", type=int, default=64)
- def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
+ @column_chunk_size
+ def explode(vcfs, zarr_path, verbose, worker_processes, column_chunk_size):
      """
-     Convert VCF(s) to columnar intermediate format
+     Convert VCF(s) to intermediate columnar format
      """
      setup_logging(verbose)
      vcf.explode(
          vcfs,
-         out_path,
+         zarr_path,
          worker_processes=worker_processes,
          column_chunk_size=column_chunk_size,
          show_progress=True,
@@ -70,34 +85,85 @@ def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):


  @click.command
- @click.argument("if_path", type=click.Path())
+ @click.argument("vcfs", nargs=-1, required=True)
+ @click.argument("icf_path", type=click.Path())
+ @click.argument("num_partitions", type=int)
+ @column_chunk_size
  @verbose
- def inspect(if_path, verbose):
+ @worker_processes
+ def dexplode_init(
+     vcfs, icf_path, num_partitions, column_chunk_size, verbose, worker_processes
+ ):
      """
-     Inspect an intermediate format file
+     Initial step for parallel conversion of VCF(s) to intermediate columnar format
+     over the requested number of partitions.
      """
      setup_logging(verbose)
-     data = vcf.inspect(if_path)
+     num_partitions = vcf.explode_init(
+         icf_path,
+         vcfs,
+         target_num_partitions=num_partitions,
+         column_chunk_size=column_chunk_size,
+         worker_processes=worker_processes,
+         show_progress=True,
+     )
+     click.echo(num_partitions)
+
+
+ @click.command
+ @click.argument("icf_path", type=click.Path())
+ @click.argument("partition", type=int)
+ @verbose
+ def dexplode_partition(icf_path, partition, verbose):
+     """
+     Convert a VCF partition into intermediate columnar format. Must be called *after*
+     the ICF path has been initialised with dexplode_init. Partition indexes must be
+     from 0 (inclusive) to the number of partitions returned by dexplode_init (exclusive).
+     """
+     setup_logging(verbose)
+     vcf.explode_partition(icf_path, partition, show_progress=True)
+
+
+ @click.command
+ @click.argument("path", type=click.Path(), required=True)
+ @verbose
+ def dexplode_finalise(path, verbose):
+     """
+     Final step for parallel conversion of VCF(s) to intermediate columnar format
+     """
+     setup_logging(verbose)
+     vcf.explode_finalise(path)
+
+
+ @click.command
+ @click.argument("icf_path", type=click.Path())
+ @verbose
+ def inspect(icf_path, verbose):
+     """
+     Inspect an intermediate format or Zarr path.
+     """
+     setup_logging(verbose)
+     data = vcf.inspect(icf_path)
      click.echo(tabulate.tabulate(data, headers="keys"))


  @click.command
- @click.argument("if_path", type=click.Path())
- def mkschema(if_path):
+ @click.argument("icf_path", type=click.Path())
+ def mkschema(icf_path):
      """
      Generate a schema for zarr encoding
      """
      stream = click.get_text_stream("stdout")
-     vcf.mkschema(if_path, stream)
+     vcf.mkschema(icf_path, stream)


  @click.command
- @click.argument("if_path", type=click.Path())
+ @click.argument("icf_path", type=click.Path())
  @click.argument("zarr_path", type=click.Path())
  @verbose
  @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
- @chunk_length
- @chunk_width
+ @variants_chunk_size
+ @samples_chunk_size
  @click.option(
      "-V",
      "--max-variant-chunks",
@@ -109,50 +175,61 @@ def mkschema(if_path):
          "schema tuning."
      ),
  )
+ @click.option(
+     "-M",
+     "--max-memory",
+     type=int,
+     default=None,
+     help="An approximate bound on overall memory usage in megabytes",
+ )
  @worker_processes
  def encode(
-     if_path,
+     icf_path,
      zarr_path,
      verbose,
      schema,
-     chunk_length,
-     chunk_width,
+     variants_chunk_size,
+     samples_chunk_size,
      max_variant_chunks,
+     max_memory,
      worker_processes,
  ):
      """
-     Encode intermediate format (see explode) to vcfzarr
+     Encode intermediate columnar format (see explode) to vcfzarr.
      """
      setup_logging(verbose)
      vcf.encode(
-         if_path,
+         icf_path,
          zarr_path,
          schema,
-         chunk_length=chunk_length,
-         chunk_width=chunk_width,
+         variants_chunk_size=variants_chunk_size,
+         samples_chunk_size=samples_chunk_size,
          max_v_chunks=max_variant_chunks,
          worker_processes=worker_processes,
+         max_memory=max_memory,
          show_progress=True,
      )


  @click.command(name="convert")
  @click.argument("vcfs", nargs=-1, required=True)
- @click.argument("out_path", type=click.Path())
- @chunk_length
- @chunk_width
+ @click.argument("zarr_path", type=click.Path())
+ @variants_chunk_size
+ @samples_chunk_size
  @verbose
  @worker_processes
- def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_processes):
+ def convert_vcf(
+     vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
+ ):
      """
-     Convert input VCF(s) directly to vcfzarr (not recommended for large files)
+     Convert input VCF(s) directly to vcfzarr (not recommended for large files).
      """
      setup_logging(verbose)
      vcf.convert(
          vcfs,
-         out_path,
-         chunk_length=chunk_length,
-         chunk_width=chunk_width,
+         zarr_path,
+         variants_chunk_size=variants_chunk_size,
+         samples_chunk_size=samples_chunk_size,
          show_progress=True,
          worker_processes=worker_processes,
      )
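The encode plumbing above passes the renamed chunk-size keywords and the new max_memory bound straight through to vcf.encode. A sketch of the two-pass flow from Python, with illustrative paths and sizes:

from bio2zarr import vcf

# Pass 1: explode the VCF(s) into the intermediate columnar format.
vcf.explode(
    ["sample.vcf.gz"],
    "sample.icf",
    worker_processes=4,
    column_chunk_size=64,  # approximate uncompressed column chunk size, MiB
    show_progress=True,
)

# Pass 2: encode the ICF to vcfzarr with explicit chunk sizes and an
# approximate memory bound (megabytes).
vcf.encode(
    "sample.icf",
    "sample.vcf.zarr",
    None,  # no schema: use the generated default
    variants_chunk_size=10_000,
    samples_chunk_size=1_000,
    worker_processes=4,
    max_memory=4096,
    show_progress=True,
)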
@@ -160,39 +237,95 @@ def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_proce

  @click.command
  @click.argument("vcfs", nargs=-1, required=True)
- @click.argument("out_path", type=click.Path())
- def validate(vcfs, out_path):
+ @click.argument("zarr_path", type=click.Path())
+ def validate(vcfs, zarr_path):
      """
      Development only, do not use. Will be removed before release.
      """
      # FIXME! Will silently not look at remaining VCFs
-     vcf.validate(vcfs[0], out_path, show_progress=True)
+     vcf.validate(vcfs[0], zarr_path, show_progress=True)


  @version
- @click.group()
+ @click.group(cls=NaturalOrderGroup)
  def vcf2zarr():
-     pass
+     """
+     Convert VCF file(s) to the vcfzarr format.
+
+     The simplest usage is:
+
+     $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
+
+     This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
+     step. As this writes the intermediate columnar format to a temporary directory,
+     we only recommend this approach for small files (< 1GB, say).
+
+     The recommended approach is to run the conversion in two passes, and
+     to keep the intermediate columnar format ("exploded") around to facilitate
+     experimentation with chunk sizes and compression settings:
+
+     \b
+     $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
+     $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
+
+     The inspect command provides a way to view contents of an exploded ICF
+     or Zarr:
+
+     $ vcf2zarr inspect [PATH]
+
+     This is useful when tweaking chunk sizes and compression settings to suit
+     your dataset, using the mkschema command and --schema option to encode:
+
+     \b
+     $ vcf2zarr mkschema [ICF_PATH] > schema.json
+     $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
+
+     By editing the schema.json file you can drop columns that are not of interest
+     and edit column specific compression settings. The --max-variant-chunks option
+     to encode allows you to try out these options on small subsets, hopefully
+     arriving at settings with the desired balance of compression and query
+     performance.
+
+     ADVANCED USAGE
+
+     For very large datasets (terabyte scale) it may be necessary to distribute the
+     explode and encode steps across a cluster:
+
+     \b
+     $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
+     $ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
+     $ vcf2zarr dexplode-finalise [ICF_PATH]
+
+     See the online documentation at [FIXME] for more details on distributed explode.
+     """


  # TODO figure out how to get click to list these in the given order.
- vcf2zarr.add_command(explode)
+ vcf2zarr.add_command(convert_vcf)
  vcf2zarr.add_command(inspect)
+ vcf2zarr.add_command(explode)
  vcf2zarr.add_command(mkschema)
  vcf2zarr.add_command(encode)
- vcf2zarr.add_command(convert_vcf)
+ vcf2zarr.add_command(dexplode_init)
+ vcf2zarr.add_command(dexplode_partition)
+ vcf2zarr.add_command(dexplode_finalise)
  vcf2zarr.add_command(validate)


  @click.command(name="convert")
  @click.argument("in_path", type=click.Path())
- @click.argument("out_path", type=click.Path())
+ @click.argument("zarr_path", type=click.Path())
  @worker_processes
  @verbose
- @chunk_length
- @chunk_width
+ @variants_chunk_size
+ @samples_chunk_size
  def convert_plink(
-     in_path, out_path, verbose, worker_processes, chunk_length, chunk_width
+     in_path,
+     zarr_path,
+     verbose,
+     worker_processes,
+     variants_chunk_size,
+     samples_chunk_size,
  ):
      """
      In development; DO NOT USE!
@@ -200,11 +333,11 @@ def convert_plink(
      setup_logging(verbose)
      plink.convert(
          in_path,
-         out_path,
+         zarr_path,
          show_progress=True,
          worker_processes=worker_processes,
-         chunk_width=chunk_width,
-         chunk_length=chunk_length,
+         samples_chunk_size=samples_chunk_size,
+         variants_chunk_size=variants_chunk_size,
      )


bio2zarr/core.py CHANGED
@@ -16,12 +16,6 @@ logger = logging.getLogger(__name__)

  numcodecs.blosc.use_threads = False

- # TODO this should probably go in another module where we abstract
- # out the zarr defaults
- default_compressor = numcodecs.Blosc(
-     cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
- )
-

  def chunk_aligned_slices(z, n, max_chunks=None):
      """
@@ -53,7 +47,12 @@ def wait_on_futures(futures):
      for future in cf.as_completed(futures):
          exception = future.exception()
          if exception is not None:
-             raise exception
+             cancel_futures(futures)
+             if isinstance(exception, cf.process.BrokenProcessPool):
+                 raise RuntimeError(
+                     "Worker process died: you may have run out of memory") from exception
+             else:
+                 raise exception


  def cancel_futures(futures):
@@ -74,15 +73,18 @@ class BufferedArray:
          assert offset % array.chunks[0] == 0
          dims = list(array.shape)
          dims[0] = min(array.chunks[0], array.shape[0])
-         self.buff = np.zeros(dims, dtype=array.dtype)
+         self.buff = np.empty(dims, dtype=array.dtype)
+         # Explicitly fill with zeros here to make any out-of-memory errors happen
+         # quickly.
+         self.buff[:] = 0
          self.buffer_row = 0

      @property
-     def chunk_length(self):
+     def variants_chunk_size(self):
          return self.buff.shape[0]

      def next_buffer_row(self):
-         if self.buffer_row == self.chunk_length:
+         if self.buffer_row == self.variants_chunk_size:
              self.flush()
          row = self.buffer_row
          self.buffer_row += 1
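The switch from np.zeros to np.empty plus an explicit fill is about when allocation failures surface: zeroed pages can be committed lazily by the OS, so an out-of-memory kill might otherwise happen much later, mid-encode. The pattern in isolation (shape and dtype illustrative):

import numpy as np

# Allocate uninitialised memory, then touch every page immediately so any
# out-of-memory failure happens here, at a predictable point, rather than
# later in the encode loop.
buff = np.empty((10_000, 1_000), dtype=np.int8)
buff[:] = 0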
@@ -104,13 +106,13 @@ class BufferedArray:
              f"{self.array_offset}:{self.array_offset + self.buffer_row}"
              f"{self.buff.nbytes / 2**20: .2f}Mb"
          )
-         self.array_offset += self.chunk_length
+         self.array_offset += self.variants_chunk_size
          self.buffer_row = 0


  def sync_flush_1d_array(np_buffer, zarr_array, offset):
      zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
-     update_progress(1)
+     update_progress(np_buffer.nbytes)


  def sync_flush_2d_array(np_buffer, zarr_array, offset):
@@ -118,13 +120,16 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
      # incremental, and to avoid large memcopies in the underlying
      # encoder implementations.
      s = slice(offset, offset + np_buffer.shape[0])
-     chunk_width = zarr_array.chunks[1]
+     samples_chunk_size = zarr_array.chunks[1]
+     # TODO use zarr chunks here to support non-uniform chunking later
+     # and for simplicity
      zarr_array_width = zarr_array.shape[1]
      start = 0
      while start < zarr_array_width:
-         stop = min(start + chunk_width, zarr_array_width)
-         zarr_array[s, start:stop] = np_buffer[:, start:stop]
-         update_progress(1)
+         stop = min(start + samples_chunk_size, zarr_array_width)
+         chunk_buffer = np_buffer[:, start:stop]
+         zarr_array[s, start:stop] = chunk_buffer
+         update_progress(chunk_buffer.nbytes)
          start = stop

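The rewritten flush loop writes one samples-chunk at a time, keeping each zarr chunk write incremental, and now reports progress in bytes written rather than one tick per chunk. A standalone sketch against a toy array (shapes illustrative):

import numpy as np
import zarr

z = zarr.zeros((8, 100), chunks=(8, 25), dtype="i1")
np_buffer = np.ones((8, 100), dtype="i1")

# Write one samples-chunk at a time so each zarr chunk is compressed
# incrementally, reporting progress in bytes actually written.
start = 0
while start < z.shape[1]:
    stop = min(start + z.chunks[1], z.shape[1])
    chunk_buffer = np_buffer[:, start:stop]
    z[:, start:stop] = chunk_buffer
    print(f"flushed {chunk_buffer.nbytes} bytes")  # stands in for update_progress
    start = stop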
 
@@ -169,7 +174,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
          self.executor = cf.ProcessPoolExecutor(
              max_workers=worker_processes,
          )
-         self.futures = []
+         self.futures = set()

          set_progress(0)
          if progress_config is None:
@@ -177,7 +182,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
          self.progress_config = progress_config
          self.progress_bar = tqdm.tqdm(
              total=progress_config.total,
-             desc=f"{progress_config.title:>9}",
+             desc=f"{progress_config.title:>7}",
              unit_scale=True,
              unit=progress_config.units,
              smoothing=0.1,
@@ -208,7 +213,19 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
          logger.debug("Exit progress thread")

      def submit(self, *args, **kwargs):
-         self.futures.append(self.executor.submit(*args, **kwargs))
+         future = self.executor.submit(*args, **kwargs)
+         self.futures.add(future)
+         return future
+
+     def wait_for_completed(self, timeout=None):
+         done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
+         for future in done:
+             exception = future.exception()
+             # TODO do the check for BrokenProcessPool here
+             if exception is not None:
+                 raise exception
+         self.futures = not_done
+         return done

      def results_as_completed(self):
          for future in cf.as_completed(self.futures):
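The BrokenProcessPool handling introduced in wait_on_futures above can be exercised with the standard library alone; a sketch of the same failure-handling shape (cancel outstanding work, then re-raise with a clearer message when the pool itself broke), independent of bio2zarr:

import concurrent.futures as cf
from concurrent.futures.process import BrokenProcessPool


def wait_on_futures(futures):
    # Same shape as the patched bio2zarr.core.wait_on_futures.
    for future in cf.as_completed(futures):
        exception = future.exception()
        if exception is not None:
            for f in futures:
                f.cancel()
            if isinstance(exception, BrokenProcessPool):
                raise RuntimeError(
                    "Worker process died: you may have run out of memory"
                ) from exception
            raise exception


if __name__ == "__main__":
    with cf.ProcessPoolExecutor(max_workers=2) as executor:
        futures = {executor.submit(pow, 2, n) for n in range(8)}
        wait_on_futures(futures)  # raises RuntimeError if a worker is killed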
bio2zarr/plink.py CHANGED
@@ -4,6 +4,7 @@ import humanfriendly
  import numpy as np
  import zarr
  import bed_reader
+ import numcodecs

  from . import core

@@ -22,14 +23,14 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
      gt = core.BufferedArray(root["call_genotype"], start)
      gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
      gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
-     chunk_length = gt.array.chunks[0]
+     variants_chunk_size = gt.array.chunks[0]
      n = gt.array.shape[1]
-     assert start % chunk_length == 0
+     assert start % variants_chunk_size == 0

      logger.debug(f"Reading slice {start}:{stop}")
      chunk_start = start
      while chunk_start < stop:
-         chunk_stop = min(chunk_start + chunk_length, stop)
+         chunk_stop = min(chunk_start + variants_chunk_size, stop)
          logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
          bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
          logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
@@ -60,8 +61,8 @@ def convert(
      *,
      show_progress=False,
      worker_processes=1,
-     chunk_length=None,
-     chunk_width=None,
+     variants_chunk_size=None,
+     samples_chunk_size=None,
  ):
      bed = bed_reader.open_bed(bed_path, num_threads=1)
      n = bed.iid_count
@@ -69,25 +70,30 @@ def convert(
      logging.info(f"Scanned plink with {n} samples and {m} variants")

      # FIXME
-     if chunk_width is None:
-         chunk_width = 1000
-     if chunk_length is None:
-         chunk_length = 10_000
+     if samples_chunk_size is None:
+         samples_chunk_size = 1000
+     if variants_chunk_size is None:
+         variants_chunk_size = 10_000

      store = zarr.DirectoryStore(zarr_path)
      root = zarr.group(store=store, overwrite=True)

      ploidy = 2
      shape = [m, n]
-     chunks = [chunk_length, chunk_width]
+     chunks = [variants_chunk_size, samples_chunk_size]
      dimensions = ["variants", "samples"]

+     # TODO we should be reusing some logic from vcfzarr here on laying
+     # out the basic dataset, and using the schema generator. Currently
+     # we're not using the best Blosc settings for genotypes here.
+     default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
+
      a = root.array(
          "sample_id",
          bed.iid,
          dtype="str",
-         compressor=core.default_compressor,
-         chunks=(chunk_width,),
+         compressor=default_compressor,
+         chunks=(samples_chunk_size,),
      )
      a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
      logger.debug(f"Encoded samples")
@@ -98,8 +104,8 @@ def convert(
          "variant_position",
          bed.bp_position,
          dtype=np.int32,
-         compressor=core.default_compressor,
-         chunks=(chunk_length,),
+         compressor=default_compressor,
+         chunks=(variants_chunk_size,),
      )
      a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
      logger.debug(f"encoded variant_position")
@@ -109,8 +115,8 @@ def convert(
          "variant_allele",
          alleles,
          dtype="str",
-         compressor=core.default_compressor,
-         chunks=(chunk_length,),
+         compressor=default_compressor,
+         chunks=(variants_chunk_size,),
      )
      a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
      logger.debug(f"encoded variant_allele")
@@ -121,7 +127,7 @@ def convert(
          dtype="bool",
          shape=list(shape),
          chunks=list(chunks),
-         compressor=core.default_compressor,
+         compressor=default_compressor,
      )
      a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)

@@ -132,7 +138,7 @@ def convert(
          dtype="i1",
          shape=list(shape),
          chunks=list(chunks),
-         compressor=core.default_compressor,
+         compressor=default_compressor,
      )
      a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)

@@ -141,7 +147,7 @@ def convert(
          dtype="bool",
          shape=list(shape),
          chunks=list(chunks),
-         compressor=core.default_compressor,
+         compressor=default_compressor,
      )
      a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
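Finally, plink.convert now takes the renamed chunking parameters, exactly as wired up in the CLI above; a sketch assuming a plink fileset at data.bed (paths and sizes illustrative, and note the CLI wrapper is still marked "DO NOT USE"):

from bio2zarr import plink

plink.convert(
    "data.bed",
    "data.zarr",
    show_progress=True,
    worker_processes=4,
    variants_chunk_size=10_000,  # these defaults are applied when None
    samples_chunk_size=1_000,
)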