bio2zarr 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.


bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.1.0'
- __version_tuple__ = version_tuple = (0, 1, 0)
+ __version__ = version = '0.1.2'
+ __version_tuple__ = version_tuple = (0, 1, 2)
bio2zarr/cli.py CHANGED
@@ -149,6 +149,13 @@ max_memory = click.option(
      help="An approximate bound on overall memory usage (e.g. 10G),",
  )
 
+ local_alleles = click.option(
+     "--local-alleles/--no-local-alleles",
+     show_default=True,
+     default=False,
+     help="Use local allele fields to reduce the storage requirements of the output.",
+ )
+
 
  def setup_logging(verbosity):
      level = "WARNING"
@@ -312,7 +319,7 @@ def dexplode_finalise(icf_path, verbose):
 
 
  @click.command
- @click.argument("path", type=click.Path())
+ @click.argument("path", type=click.Path(exists=True))
  @verbose
  def inspect(path, verbose):
      """
@@ -325,12 +332,26 @@ def inspect(path, verbose):
 
  @click.command
  @icf_path
- def mkschema(icf_path):
+ @variants_chunk_size
+ @samples_chunk_size
+ @local_alleles
+ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
      """
      Generate a schema for zarr encoding
      """
+     if local_alleles:
+         click.echo(
+             "WARNING: Local alleles support is preliminary; please use with caution.",
+             err=True,
+         )
      stream = click.get_text_stream("stdout")
-     vcf2zarr.mkschema(icf_path, stream)
+     vcf2zarr.mkschema(
+         icf_path,
+         stream,
+         variants_chunk_size=variants_chunk_size,
+         samples_chunk_size=samples_chunk_size,
+         local_alleles=local_alleles,
+     )
 
 
  @click.command
@@ -469,6 +490,7 @@ def dencode_finalise(zarr_path, verbose, progress):
  @verbose
  @progress
  @worker_processes
+ @local_alleles
  def convert_vcf(
      vcfs,
      zarr_path,
@@ -478,6 +500,7 @@ def convert_vcf(
      verbose,
      progress,
      worker_processes,
+     local_alleles,
  ):
      """
      Convert input VCF(s) directly to vcfzarr (not recommended for large files).
@@ -491,6 +514,7 @@ def convert_vcf(
          samples_chunk_size=samples_chunk_size,
          show_progress=progress,
          worker_processes=worker_processes,
+         local_alleles=local_alleles,
      )
 
 
@@ -560,7 +584,7 @@ plink2zarr.add_command(convert_plink)
 
  @click.command
  @version
- @click.argument("vcf_path", type=click.Path(exists=True, dir_okay=False))
+ @vcfs
  @verbose
  @num_partitions
  @click.option(
@@ -570,12 +594,16 @@ plink2zarr.add_command(convert_plink)
      default=None,
      help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
  )
- def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
+ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
      """
-     Output bcftools region strings that partition an indexed VCF/BCF file
+     Output bcftools region strings that partition the indexed VCF/BCF files
      into either an approximate number of parts (-n), or parts of approximately
      a given size (-s). One of -n or -s must be supplied.
 
+     If multiple VCF/BCF files are provided, the number of parts (-n) is
+     interpreted as the total number of partitions across all the files,
+     and the partitions are distributed evenly among the files.
+
      Note that both the number of partitions and sizes are a target, and the
      returned number of partitions may not exactly correspond. In particular,
      there is a maximum level of granularity determined by the associated index
@@ -590,9 +618,15 @@ def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
              "Either --num-partitions or --partition-size must be specified"
          )
 
-     indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
-     regions = indexed_vcf.partition_into_regions(
-         num_parts=num_partitions, target_part_size=partition_size
-     )
-     for region in regions:
-         click.echo(f"{region}\t{vcf_path}")
+     if num_partitions is None:
+         num_parts_per_path = None
+     else:
+         num_parts_per_path = max(1, num_partitions // len(vcfs))
+
+     for vcf_path in vcfs:
+         indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
+         regions = indexed_vcf.partition_into_regions(
+             num_parts=num_parts_per_path, target_part_size=partition_size
+         )
+         for region in regions:
+             click.echo(f"{region}\t{vcf_path}")
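
Editor's note: the multi-file budget split in vcfpartition above is plain integer division, which is one reason the docstring stresses that the partition count is a target rather than an exact result. A minimal sketch of the arithmetic (the file names and -n value are made-up examples):

    # Sketch of the -n budget split in vcfpartition; file names are hypothetical.
    vcfs = ["a.vcf.gz", "b.vcf.gz", "c.vcf.gz"]
    num_partitions = 10

    # Each file gets an equal integer share, but never less than one partition.
    num_parts_per_path = max(1, num_partitions // len(vcfs))
    print(num_parts_per_path)  # 3 -> roughly 9 partitions across all files
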
bio2zarr/core.py CHANGED
@@ -63,6 +63,27 @@ def chunk_aligned_slices(z, n, max_chunks=None):
      return slices
 
 
+ def first_dim_slice_iter(z, start, stop):
+     """
+     Efficiently iterate over the specified slice of the first dimension of the zarr
+     array z.
+     """
+     chunk_size = z.chunks[0]
+     first_chunk = start // chunk_size
+     last_chunk = (stop // chunk_size) + (stop % chunk_size != 0)
+     for chunk in range(first_chunk, last_chunk):
+         Z = z.blocks[chunk]
+         chunk_start = chunk * chunk_size
+         chunk_stop = chunk_start + chunk_size
+         slice_start = None
+         if start > chunk_start:
+             slice_start = start - chunk_start
+         slice_stop = None
+         if stop < chunk_stop:
+             slice_stop = stop - chunk_start
+         yield from Z[slice_start:slice_stop]
+
+
  def du(path):
      """
      Return the total bytes stored at this path.
@@ -113,13 +134,16 @@ def cancel_futures(futures):
  class BufferedArray:
      array: zarr.Array
      array_offset: int
+     name: str
      buff: np.ndarray
      buffer_row: int
+     max_buff_size: int = 0
 
-     def __init__(self, array, offset):
+     def __init__(self, array, offset, name="Unknown"):
          self.array = array
          self.array_offset = offset
          assert offset % array.chunks[0] == 0
+         self.name = name
          dims = list(array.shape)
          dims[0] = min(array.chunks[0], array.shape[0])
          self.buff = np.empty(dims, dtype=array.dtype)
@@ -150,11 +174,17 @@ class BufferedArray:
              self.buff[: self.buffer_row], self.array, self.array_offset
          )
          logger.debug(
-             f"Flushed <{self.array.name} {self.array.shape} "
+             f"Flushed <{self.name} {self.array.shape} "
              f"{self.array.dtype}> "
              f"{self.array_offset}:{self.array_offset + self.buffer_row}"
              f"{self.buff.nbytes / 2**20: .2f}Mb"
          )
+         # Note this is inaccurate for string data as we're just reporting the
+         # size of the container. When we switch to the numpy 2 StringDtype this
+         # should improve and we can get more visibility on how memory
+         # is being used.
+         # https://github.com/sgkit-dev/bio2zarr/issues/30
+         self.max_buff_size = max(self.max_buff_size, self.buff.nbytes)
          self.array_offset += self.variants_chunk_size
          self.buffer_row = 0
 
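
Editor's note: a minimal usage sketch of the new first_dim_slice_iter helper, assuming a zarr-python version that provides the blocks accessor the implementation relies on; the array contents are a made-up example. Only the chunks overlapping the requested range are fetched, one block at a time:

    import numpy as np
    import zarr

    from bio2zarr.core import first_dim_slice_iter

    # 10 values stored in chunks of 2, so the range [3, 8) touches
    # chunks 1, 2 and 3 only; chunks 0 and 4 are never read.
    z = zarr.array(np.arange(10), chunks=(2,))
    assert list(first_dim_slice_iter(z, 3, 8)) == [3, 4, 5, 6, 7]
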
bio2zarr/plink.py CHANGED
@@ -6,6 +6,8 @@ import numcodecs
  import numpy as np
  import zarr
 
+ from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
+
  from . import core
 
  logger = logging.getLogger(__name__)
@@ -17,8 +19,7 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
      # the correct approach is, but it is important to note that the
      # 0th allele is *not* necessarily the REF for these datasets.
      bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
-     store = zarr.DirectoryStore(zarr_path)
-     root = zarr.group(store=store)
+     root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
      gt = core.BufferedArray(root["call_genotype"], start)
      gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
      gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
@@ -73,8 +74,7 @@ def convert(
      if variants_chunk_size is None:
          variants_chunk_size = 10_000
 
-     store = zarr.DirectoryStore(zarr_path)
-     root = zarr.group(store=store, overwrite=True)
+     root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
 
      ploidy = 2
      shape = [m, n]
@@ -88,7 +88,8 @@ def convert(
 
      a = root.array(
          "sample_id",
-         bed.iid,
+         data=bed.iid,
+         shape=bed.iid.shape,
          dtype="str",
          compressor=default_compressor,
          chunks=(samples_chunk_size,),
@@ -100,7 +101,8 @@ def convert(
      # fetching repeatedly from bim file
      a = root.array(
          "variant_position",
-         bed.bp_position,
+         data=bed.bp_position,
+         shape=bed.bp_position.shape,
          dtype=np.int32,
          compressor=default_compressor,
          chunks=(variants_chunk_size,),
@@ -111,41 +113,45 @@ def convert(
      alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
      a = root.array(
          "variant_allele",
-         alleles,
+         data=alleles,
+         shape=alleles.shape,
          dtype="str",
          compressor=default_compressor,
-         chunks=(variants_chunk_size,),
+         chunks=(variants_chunk_size, alleles.shape[1]),
      )
      a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
      logger.debug("encoded variant_allele")
 
      # TODO remove this?
      a = root.empty(
-         "call_genotype_phased",
+         name="call_genotype_phased",
          dtype="bool",
          shape=list(shape),
          chunks=list(chunks),
          compressor=default_compressor,
+         **ZARR_FORMAT_KWARGS,
      )
      a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
      shape += [ploidy]
      dimensions += ["ploidy"]
      a = root.empty(
-         "call_genotype",
+         name="call_genotype",
          dtype="i1",
          shape=list(shape),
          chunks=list(chunks),
          compressor=default_compressor,
+         **ZARR_FORMAT_KWARGS,
      )
      a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
      a = root.empty(
-         "call_genotype_mask",
+         name="call_genotype_mask",
          dtype="bool",
          shape=list(shape),
          chunks=list(chunks),
          compressor=default_compressor,
+         **ZARR_FORMAT_KWARGS,
      )
      a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -154,7 +160,7 @@ def convert(
      num_slices = max(1, worker_processes * 4)
      slices = core.chunk_aligned_slices(a, num_slices)
 
-     total_chunks = sum(a.nchunks for a in root.values())
+     total_chunks = sum(a.nchunks for _, a in root.arrays())
 
      progress_config = core.ProgressConfig(
          total=total_chunks, title="Convert", units="chunks", show=show_progress
@@ -171,8 +177,7 @@ def convert(
  # FIXME do this more efficiently - currently reading the whole thing
  # in for convenience, and also comparing call-by-call
  def validate(bed_path, zarr_path):
-     store = zarr.DirectoryStore(zarr_path)
-     root = zarr.group(store=store)
+     root = zarr.open(store=zarr_path, mode="r")
      call_genotype = root["call_genotype"][:]
 
      bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
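
Editor's note: the ZARR_FORMAT_KWARGS constant threaded through these changes is not shown in the diff; presumably it pins the on-disk format when running under zarr-python 3, whose open/create calls accept a zarr_format argument. A rough sketch of the idea, where the dict contents are an assumption rather than the actual bio2zarr.zarr_utils code:

    import zarr

    # Assumed shape of bio2zarr.zarr_utils.ZARR_FORMAT_KWARGS: pin the v2
    # on-disk format under zarr-python 3; zarr-python 2 only writes v2 and
    # accepts no such argument, so the dict is empty there.
    ZARR_FORMAT_KWARGS = (
        {"zarr_format": 2} if zarr.__version__.startswith("3") else {}
    )

    root = zarr.open_group(store="example.zarr", mode="w", **ZARR_FORMAT_KWARGS)
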
bio2zarr/vcf2zarr/icf.py CHANGED
@@ -110,7 +110,7 @@ class VcfPartition:
      num_records: int = -1
 
 
- ICF_METADATA_FORMAT_VERSION = "0.3"
+ ICF_METADATA_FORMAT_VERSION = "0.4"
  ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
      cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
  )
@@ -212,6 +212,7 @@ def fixed_vcf_field_definitions():
          make_field_def("FILTERS", "String", "."),
          make_field_def("REF", "String", "1"),
          make_field_def("ALT", "String", "."),
+         make_field_def("rlen", "Integer", "1"),  # computed field
      ]
      return fields
 
@@ -240,7 +241,7 @@ def scan_vcf(path, target_num_partitions):
      for h in vcf.header_iter():
          if h["HeaderType"] in ["INFO", "FORMAT"]:
              field = VcfField.from_header(h)
-             if field.name == "GT":
+             if h["HeaderType"] == "FORMAT" and field.name == "GT":
                  field.vcf_type = "Integer"
                  field.vcf_number = "."
              fields.append(field)
@@ -300,7 +301,11 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
      )
      with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
          for path in paths:
-             pwm.submit(scan_vcf, path, max(1, target_num_partitions // len(paths)))
+             pwm.submit(
+                 scan_vcf,
+                 path,
+                 max(1, target_num_partitions // len(paths)),
+             )
      results = list(pwm.results_as_completed())
 
      # Sort to make the ordering deterministic
@@ -408,7 +413,7 @@ def sanitise_value_float_1d(buff, j, value):
      if value is None:
          buff[j] = constants.FLOAT32_MISSING
      else:
-         value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
+         value = np.array(value, ndmin=1, dtype=buff.dtype, copy=True)
          # numpy will map None values to Nan, but we need a
          # specific NaN
          value[np.isnan(value)] = constants.FLOAT32_MISSING
@@ -422,7 +427,7 @@ def sanitise_value_float_2d(buff, j, value):
          buff[j] = constants.FLOAT32_MISSING
      else:
          # print("value = ", value)
-         value = np.array(value, ndmin=2, dtype=buff.dtype, copy=False)
+         value = np.array(value, ndmin=2, dtype=buff.dtype, copy=True)
          buff[j] = constants.FLOAT32_FILL
          buff[j, :, : value.shape[1]] = value
 
@@ -432,7 +437,7 @@ def sanitise_int_array(value, ndmin, dtype):
      value = [
          constants.VCF_INT_MISSING if x is None else x for x in value
      ]  # NEEDS TEST
-     value = np.array(value, ndmin=ndmin, copy=False)
+     value = np.array(value, ndmin=ndmin, copy=True)
      value[value == constants.VCF_INT_MISSING] = -1
      value[value == constants.VCF_INT_FILL] = -2
      # TODO watch out for clipping here!
@@ -494,15 +499,15 @@ class VcfValueTransformer:
      def transform(self, vcf_value):
          if isinstance(vcf_value, tuple):
              vcf_value = [self.missing if v is None else v for v in vcf_value]
-         value = np.array(vcf_value, ndmin=self.dimension, copy=False)
+         value = np.array(vcf_value, ndmin=self.dimension, copy=True)
          return value
 
      def transform_and_update_bounds(self, vcf_value):
          if vcf_value is None:
              return None
+         # print(self, self.field.full_name, "T", vcf_value)
          value = self.transform(vcf_value)
          self.update_bounds(value)
-         # print(self.field.full_name, "T", vcf_value, "->", value)
          return value
 
 
@@ -531,13 +536,15 @@ class FloatValueTransformer(VcfValueTransformer):
  class StringValueTransformer(VcfValueTransformer):
      def update_bounds(self, value):
          summary = self.field.summary
-         number = value.shape[-1]
+         if self.field.category == "FORMAT":
+             number = max(len(v) for v in value)
+         else:
+             number = value.shape[-1]
          # TODO would be nice to report string lengths, but not
          # really necessary.
          summary.max_number = max(summary.max_number, number)
 
      def transform(self, vcf_value):
-         # print("transform", vcf_value)
          if self.dimension == 1:
              value = np.array(list(vcf_value.split(",")))
          else:
@@ -853,11 +860,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
      def summary_table(self):
          data = []
-         for name, col in self.fields.items():
-             summary = col.vcf_field.summary
+         for name, icf_field in self.fields.items():
+             summary = icf_field.vcf_field.summary
              d = {
                  "name": name,
-                 "type": col.vcf_field.vcf_type,
+                 "type": icf_field.vcf_field.vcf_type,
                  "chunks": summary.num_chunks,
                  "size": core.display_size(summary.uncompressed_size),
                  "compressed": core.display_size(summary.compressed_size),
@@ -962,7 +969,7 @@ class IntermediateColumnarFormatWriter:
          compressor=None,
      ):
          if self.path.exists():
-             raise ValueError("ICF path already exists")
+             raise ValueError(f"ICF path already exists: {self.path}")
          if compressor is None:
              compressor = ICF_DEFAULT_COMPRESSOR
          vcfs = [pathlib.Path(vcf) for vcf in vcfs]
@@ -1009,8 +1016,8 @@ class IntermediateColumnarFormatWriter:
          self.path.mkdir()
          self.wip_path.mkdir()
          for field in self.metadata.fields:
-             col_path = get_vcf_field_path(self.path, field)
-             col_path.mkdir(parents=True)
+             field_path = get_vcf_field_path(self.path, field)
+             field_path.mkdir(parents=True)
 
      def load_partition_summaries(self):
          summaries = []
@@ -1074,13 +1081,19 @@ class IntermediateColumnarFormatWriter:
              tcw.append("FILTERS", variant.FILTERS)
              tcw.append("REF", variant.REF)
              tcw.append("ALT", variant.ALT)
+             tcw.append("rlen", variant.end - variant.start)
              for field in info_fields:
                  tcw.append(field.full_name, variant.INFO.get(field.name, None))
              if has_gt:
-                 tcw.append("FORMAT/GT", variant.genotype.array())
+                 if variant.genotype is None:
+                     val = None
+                 else:
+                     val = variant.genotype.array()
+                 tcw.append("FORMAT/GT", val)
              for field in format_fields:
                  val = variant.format(field.name)
                  tcw.append(field.full_name, val)
+
              # Note: an issue with updating the progress per variant here like
              # this is that we get a significant pause at the end of the counter
              # while all the "small" fields get flushed. Possibly not much to be
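
Editor's note: the new rlen entry above is a computed field rather than one read from a VCF tag. Assuming cyvcf2's 0-based, half-open variant.start/variant.end coordinates, end - start is the record's length on the reference. A worked example with made-up coordinates:

    # Hypothetical 4 bp deletion at POS=100: REF="ACGT", ALT="A".
    # cyvcf2 reports 0-based, half-open coordinates for the REF allele.
    start, end = 99, 103
    rlen = end - start  # 4 == len("ACGT")
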