bio2zarr 0.0.2-py3-none-any.whl → 0.0.4-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, exactly as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of bio2zarr might be problematic.
bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.0.2'
- __version_tuple__ = version_tuple = (0, 0, 2)
+ __version__ = version = '0.0.4'
+ __version_tuple__ = version_tuple = (0, 0, 4)
bio2zarr/cli.py CHANGED
@@ -1,6 +1,12 @@
+ import logging
+ import os
+ import pathlib
+ import shutil
+
  import click
  import tabulate
  import coloredlogs
+ import numcodecs
 
  from . import vcf
  from . import vcf_utils
@@ -8,6 +14,9 @@ from . import plink
  from . import provenance
 
 
+ logger = logging.getLogger(__name__)
+
+
  class NaturalOrderGroup(click.Group):
      """
      List commands in the order they are provided in the help text.
@@ -18,8 +27,32 @@ class NaturalOrderGroup(click.Group):
 
 
  # Common arguments/options
+ vcfs = click.argument(
+     "vcfs", nargs=-1, required=True, type=click.Path(exists=True, dir_okay=False)
+ )
+
+ new_icf_path = click.argument(
+     "icf_path", type=click.Path(file_okay=False, dir_okay=True)
+ )
+
+ icf_path = click.argument(
+     "icf_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
+ )
+
+ new_zarr_path = click.argument(
+     "zarr_path", type=click.Path(file_okay=False, dir_okay=True)
+ )
+
  verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
 
+ force = click.option(
+     "-f",
+     "--force",
+     is_flag=True,
+     flag_value=True,
+     help="Force overwriting of existing directories",
+ )
+
  version = click.version_option(version=f"{provenance.__version__}")
 
  worker_processes = click.option(
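Because click decorators are plain callables, these shared argument and option definitions are bound once at module level and then stacked onto each command, so every subcommand validates its paths the same way. A minimal sketch of the pattern (the demo command is hypothetical; the definitions mirror the ones added above):

    import click

    icf_path = click.argument(
        "icf_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
    )
    verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")

    @click.command
    @icf_path
    @verbose
    def demo(icf_path, verbose):
        """Hypothetical command reusing the shared parameter definitions."""
        click.echo(f"inspecting {icf_path} (verbosity={verbose})")

Each application of the stored decorator attaches a fresh parameter object to the command, so the same definition can safely be reused across any number of commands.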
@@ -34,6 +67,17 @@ column_chunk_size = click.option(
      help="Approximate uncompressed size of exploded column chunks in MiB",
  )
 
+ # We could provide the full flexiblity of numcodecs/Blosc here, but there
+ # doesn't seem much point. Can always add more arguments here to control
+ # compression level, etc.
+ compressor = click.option(
+     "-C",
+     "--compressor",
+     type=click.Choice(["lz4", "zstd"]),
+     default=None,
+     help="Codec to use for compressing column chunks (Default=zstd)."
+ )
+
  # Note: -l and -w were chosen when these were called "width" and "length".
  # possibly there are better letters now.
  variants_chunk_size = click.option(
@@ -64,59 +108,101 @@ def setup_logging(verbosity):
      coloredlogs.install(level=level)
 
 
+ def check_overwrite_dir(path, force):
+     path = pathlib.Path(path)
+     if path.exists():
+         if not force:
+             click.confirm(
+                 f"Do you want to overwrite {path}? (use --force to skip this check)",
+                 abort=True,
+             )
+         # These trees can be mondo-big and on slow file systems, so it's entirely
+         # feasible that the delete would fail or be killed. This makes it less likely
+         # that partially deleted paths are mistaken for good paths.
+         tmp_delete_path = path.with_suffix(f"{path.suffix}.{os.getpid()}.DELETING")
+         logger.info(f"Deleting {path} (renamed to {tmp_delete_path} while in progress)")
+         os.rename(path, tmp_delete_path)
+         shutil.rmtree(tmp_delete_path)
+
+
+ def get_compressor(cname):
+     if cname is None:
+         return None
+     config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
+     config["cname"] = cname
+     return numcodecs.get_codec(config)
+
+
  @click.command
- @click.argument("vcfs", nargs=-1, required=True)
- @click.argument("zarr_path", type=click.Path())
+ @vcfs
+ @new_icf_path
+ @force
  @verbose
- @worker_processes
  @column_chunk_size
- def explode(vcfs, zarr_path, verbose, worker_processes, column_chunk_size):
+ @compressor
+ @worker_processes
+ def explode(
+     vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
+ ):
      """
      Convert VCF(s) to intermediate columnar format
      """
      setup_logging(verbose)
+     check_overwrite_dir(icf_path, force)
      vcf.explode(
+         icf_path,
          vcfs,
-         zarr_path,
          worker_processes=worker_processes,
          column_chunk_size=column_chunk_size,
+         compressor=get_compressor(compressor),
          show_progress=True,
      )
 
 
  @click.command
- @click.argument("vcfs", nargs=-1, required=True)
- @click.argument("icf_path", type=click.Path())
- @click.argument("num_partitions", type=int)
+ @vcfs
+ @new_icf_path
+ @click.argument("num_partitions", type=click.IntRange(min=1))
+ @force
  @column_chunk_size
+ @compressor
  @verbose
  @worker_processes
  def dexplode_init(
-     vcfs, icf_path, num_partitions, column_chunk_size, verbose, worker_processes
+     vcfs,
+     icf_path,
+     num_partitions,
+     force,
+     column_chunk_size,
+     compressor,
+     verbose,
+     worker_processes,
  ):
      """
-     Initial step for parallel conversion of VCF(s) to intermediate columnar format
+     Initial step for distributed conversion of VCF(s) to intermediate columnar format
      over the requested number of paritions.
      """
      setup_logging(verbose)
+     check_overwrite_dir(icf_path, force)
      num_partitions = vcf.explode_init(
          icf_path,
          vcfs,
          target_num_partitions=num_partitions,
          column_chunk_size=column_chunk_size,
          worker_processes=worker_processes,
+         compressor=get_compressor(compressor),
          show_progress=True,
      )
      click.echo(num_partitions)
 
 
  @click.command
- @click.argument("icf_path", type=click.Path())
- @click.argument("partition", type=int)
+ @icf_path
+ @click.argument("partition", type=click.IntRange(min=0))
  @verbose
  def dexplode_partition(icf_path, partition, verbose):
      """
-     Convert a VCF partition into intermediate columnar format. Must be called *after*
+     Convert a VCF partition to intermediate columnar format. Must be called *after*
      the ICF path has been initialised with dexplode_init. Partition indexes must be
      from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
      """
@@ -129,26 +215,26 @@ def dexplode_partition(icf_path, partition, verbose):
  @verbose
  def dexplode_finalise(path, verbose):
      """
-     Final step for parallel conversion of VCF(s) to intermediate columnar format
+     Final step for distributed conversion of VCF(s) to intermediate columnar format.
      """
      setup_logging(verbose)
      vcf.explode_finalise(path)
 
 
  @click.command
- @click.argument("icf_path", type=click.Path())
+ @click.argument("path", type=click.Path())
  @verbose
- def inspect(icf_path, verbose):
+ def inspect(path, verbose):
      """
-     Inspect an intermediate format or Zarr path.
+     Inspect an intermediate columnar format or Zarr path.
      """
      setup_logging(verbose)
-     data = vcf.inspect(icf_path)
+     data = vcf.inspect(path)
      click.echo(tabulate.tabulate(data, headers="keys"))
 
 
  @click.command
- @click.argument("icf_path", type=click.Path())
+ @icf_path
  def mkschema(icf_path):
      """
      Generate a schema for zarr encoding
@@ -158,8 +244,9 @@ def mkschema(icf_path):
 
 
  @click.command
- @click.argument("icf_path", type=click.Path())
- @click.argument("zarr_path", type=click.Path())
+ @icf_path
+ @new_zarr_path
+ @force
  @verbose
  @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
  @variants_chunk_size
@@ -178,14 +265,14 @@
  @click.option(
      "-M",
      "--max-memory",
-     type=int,
      default=None,
-     help="An approximate bound on overall memory usage in megabytes",
+     help="An approximate bound on overall memory usage (e.g. 10G),",
  )
  @worker_processes
  def encode(
      icf_path,
      zarr_path,
+     force,
      verbose,
      schema,
      variants_chunk_size,
@@ -195,13 +282,14 @@
      worker_processes,
  ):
      """
-     Encode intermediate columnar format (see explode) to vcfzarr.
+     Convert intermediate columnar format to vcfzarr.
      """
      setup_logging(verbose)
+     check_overwrite_dir(zarr_path, force)
      vcf.encode(
          icf_path,
          zarr_path,
-         schema,
+         schema_path=schema,
          variants_chunk_size=variants_chunk_size,
          samples_chunk_size=samples_chunk_size,
          max_v_chunks=max_variant_chunks,
@@ -212,8 +300,8 @@
 
 
  @click.command(name="convert")
- @click.argument("vcfs", nargs=-1, required=True)
- @click.argument("zarr_path", type=click.Path())
+ @vcfs
+ @new_zarr_path
  @variants_chunk_size
  @samples_chunk_size
  @verbose
@@ -235,17 +323,6 @@ def convert_vcf(
      )
 
 
- @click.command
- @click.argument("vcfs", nargs=-1, required=True)
- @click.argument("zarr_path", type=click.Path())
- def validate(vcfs, zarr_path):
-     """
-     Development only, do not use. Will be removed before release.
-     """
-     # FIXME! Will silently not look at remaining VCFs
-     vcf.validate(vcfs[0], zarr_path, show_progress=True)
-
-
  @version
  @click.group(cls=NaturalOrderGroup)
  def vcf2zarr():
@@ -309,7 +386,6 @@ vcf2zarr.add_command(encode)
  vcf2zarr.add_command(dexplode_init)
  vcf2zarr.add_command(dexplode_partition)
  vcf2zarr.add_command(dexplode_finalise)
- vcf2zarr.add_command(validate)
 
 
  @click.command(name="convert")
bio2zarr/core.py CHANGED
@@ -50,7 +50,8 @@ def wait_on_futures(futures):
              cancel_futures(futures)
              if isinstance(exception, cf.process.BrokenProcessPool):
                  raise RuntimeError(
-                     "Worker process died: you may have run out of memory") from exception
+                     "Worker process died: you may have run out of memory"
+                 ) from exception
              else:
                  raise exception
 
bio2zarr/vcf.py CHANGED
@@ -151,8 +151,8 @@ class VcfPartition:
 
  ICF_METADATA_FORMAT_VERSION = "0.2"
  ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
-     cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
- ).get_config()
+     cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
+ )
 
 
  @dataclasses.dataclass
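Note that ICF_DEFAULT_COMPRESSOR is now kept as a live codec object rather than a config dict (and the default codec switches from lz4 to zstd). Keeping the object around is what lets the CLI's get_compressor clone the default configuration and swap in the user's chosen codec name. A sketch of that round trip, assuming the defaults above:

    import numcodecs

    default = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE)
    config = default.get_config()        # {'id': 'blosc', 'cname': 'zstd', 'clevel': 7, ...}
    config["cname"] = "lz4"              # e.g. the user passed "-C lz4"
    codec = numcodecs.get_codec(config)  # Blosc(cname='lz4', clevel=7, ...)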
@@ -284,9 +284,7 @@ def scan_vcf(path, target_num_partitions):
      return metadata, vcf.raw_header
 
 
- def scan_vcfs(
-     paths, show_progress, target_num_partitions, column_chunk_size, worker_processes=1
- ):
+ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
      logger.info(
          f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
      )
@@ -334,12 +332,6 @@ def scan_vcfs(
          key=lambda x: (contig_index_map[x.region.contig], x.region.start)
      )
      icf_metadata.partitions = all_partitions
-     icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
-     icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
-     icf_metadata.column_chunk_size = column_chunk_size
-     # Bare minimum here for provenance - would be nice to include versions of key
-     # dependencies as well.
-     icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
      logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
      return icf_metadata, header
 
@@ -824,13 +816,7 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
          return False
 
 
- # TODO rename to IntermediateColumnarFormat and move to icf.py
-
-
  class IntermediateColumnarFormat(collections.abc.Mapping):
-     # TODO Check if other compressors would give reasonable compression
-     # with significantly faster times
-
      def __init__(self, path):
          self.path = pathlib.Path(path)
          # TODO raise a more informative error here telling people this
@@ -904,6 +890,15 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
          return len(self.columns)
 
 
+
+ def mkdir_with_progress(path):
+     logger.debug(f"mkdir f{path}")
+     # NOTE we may have race-conditions here, I'm not sure. Hopefully allowing
+     # parents=True will take care of it.
+     path.mkdir(parents=True)
+     core.update_progress(1)
+
+
  class IntermediateColumnarFormatWriter:
      def __init__(self, path):
          self.path = pathlib.Path(path)
@@ -922,9 +917,12 @@ class IntermediateColumnarFormatWriter:
          worker_processes=1,
          target_num_partitions=None,
          show_progress=False,
+         compressor=None,
      ):
          if self.path.exists():
-             shutil.rmtree(self.path)
+             raise ValueError("ICF path already exists")
+         if compressor is None:
+             compressor = ICF_DEFAULT_COMPRESSOR
          vcfs = [pathlib.Path(vcf) for vcf in vcfs]
          target_num_partitions = max(target_num_partitions, len(vcfs))
 
@@ -934,14 +932,19 @@ class IntermediateColumnarFormatWriter:
              worker_processes=worker_processes,
              show_progress=show_progress,
              target_num_partitions=target_num_partitions,
-             column_chunk_size=column_chunk_size,
          )
          self.metadata = icf_metadata
+         self.metadata.format_version = ICF_METADATA_FORMAT_VERSION
+         self.metadata.compressor = compressor.get_config()
+         self.metadata.column_chunk_size = column_chunk_size
+         # Bare minimum here for provenance - would be nice to include versions of key
+         # dependencies as well.
+         self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
 
-         self.mkdirs()
+         self.mkdirs(worker_processes, show_progress=show_progress)
 
          # Note: this is needed for the current version of the vcfzarr spec, but it's
-         # probably goint to be dropped.
+         # probably going to be dropped.
          # https://github.com/pystatgen/vcf-zarr-spec/issues/15
          # May be useful to keep lying around still though?
          logger.info(f"Writing VCF header")
@@ -953,20 +956,30 @@ class IntermediateColumnarFormatWriter:
              json.dump(self.metadata.asdict(), f, indent=4)
          return self.num_partitions
 
-     def mkdirs(self):
-         # TODO add worker_processes here and do this with the ParallelWorkManager
-         logger.info(
-             f"Creating {len(self.metadata.fields) * self.num_partitions} directories"
-         )
+     def mkdirs(self, worker_processes=1, show_progress=False):
+         num_dirs = len(self.metadata.fields) * self.num_partitions
+         logger.info(f"Creating {num_dirs} directories")
          self.path.mkdir()
          self.wip_path.mkdir()
-         for field in self.metadata.fields:
-             col_path = get_vcf_field_path(self.path, field)
-             logger.debug(f"Make directories for {field.full_name} at {col_path}")
-             col_path.mkdir(parents=True)
-             for j in range(self.num_partitions):
-                 part_path = col_path / f"p{j}"
-                 part_path.mkdir()
+         # Due to high latency batch system filesystems, we create all the directories in
+         # parallel
+         progress_config = core.ProgressConfig(
+             total=num_dirs,
+             units="dirs",
+             title="Mkdirs",
+             show=show_progress,
+         )
+         with core.ParallelWorkManager(
+             worker_processes=worker_processes, progress_config=progress_config
+         ) as manager:
+             for field in self.metadata.fields:
+                 col_path = get_vcf_field_path(self.path, field)
+                 # Don't bother trying to count the intermediate directories towards
+                 # progress
+                 manager.submit(col_path.mkdir, parents=True)
+                 for j in range(self.num_partitions):
+                     part_path = col_path / f"p{j}"
+                     manager.submit(mkdir_with_progress, part_path)
 
      def load_partition_summaries(self):
          summaries = []
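core.ParallelWorkManager is bio2zarr's internal wrapper around a worker pool plus progress reporting, so the hunk above doesn't run standalone. The underlying idea — overlapping per-mkdir latency on slow network filesystems by issuing the calls concurrently — looks roughly like this with stdlib primitives (function and parameter names are illustrative):

    import concurrent.futures as cf
    import pathlib

    def make_dirs_in_parallel(col_paths, num_partitions, workers=8):
        with cf.ThreadPoolExecutor(max_workers=workers) as executor:
            futures = []
            for col_path in col_paths:
                for j in range(num_partitions):
                    part_path = pathlib.Path(col_path) / f"p{j}"
                    # exist_ok=True guards the parent-creation race that the
                    # NOTE in mkdir_with_progress worries about
                    futures.append(
                        executor.submit(part_path.mkdir, parents=True, exist_ok=True)
                    )
            for future in futures:
                future.result()  # re-raise any mkdir failure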
@@ -1133,12 +1146,13 @@
 
 
  def explode(
-     vcfs,
      icf_path,
+     vcfs,
      *,
      column_chunk_size=16,
      worker_processes=1,
      show_progress=False,
+     compressor=None,
  ):
      writer = IntermediateColumnarFormatWriter(icf_path)
      num_partitions = writer.init(
@@ -1148,6 +1162,7 @@ def explode(
          worker_processes=worker_processes,
          show_progress=show_progress,
          column_chunk_size=column_chunk_size,
+         compressor=compressor,
      )
      writer.explode(worker_processes=worker_processes, show_progress=show_progress)
      writer.finalise()
@@ -1162,6 +1177,7 @@ def explode_init(
      target_num_partitions=1,
      worker_processes=1,
      show_progress=False,
+     compressor=None,
  ):
      writer = IntermediateColumnarFormatWriter(icf_path)
      return writer.init(
@@ -1170,6 +1186,7 @@ def explode_init(
          worker_processes=worker_processes,
          show_progress=show_progress,
          column_chunk_size=column_chunk_size,
+         compressor=compressor,
      )
 
 
@@ -1480,16 +1497,28 @@ class EncodingWork:
      memory: int = 0
 
 
+ def parse_max_memory(max_memory):
+     if max_memory is None:
+         # Effectively unbounded
+         return 2**63
+     if isinstance(max_memory, str):
+         max_memory = humanfriendly.parse_size(max_memory)
+     logger.info(f"Set memory budget to {display_size(max_memory)}")
+     return max_memory
+
+
  class VcfZarrWriter:
-     def __init__(self, path, icf, schema):
+     def __init__(self, path, icf, schema, dimension_separator=None):
          self.path = pathlib.Path(path)
          self.icf = icf
          self.schema = schema
+         # Default to using nested directories following the Zarr v3 default.
+         # This seems to require version 2.17+ to work properly
+         self.dimension_separator = "/" if dimension_separator is None else dimension_separator
          store = zarr.DirectoryStore(self.path)
          self.root = zarr.group(store=store)
 
      def init_array(self, variable):
-         # print("CREATE", variable)
          object_codec = None
          if variable.dtype == "O":
              object_codec = numcodecs.VLenUTF8()
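Where the old code interpreted --max-memory as an integer number of MiB, parse_max_memory now also accepts humanfriendly size strings, which is why the CLI help text changed to suggest values like 10G. For example:

    import humanfriendly

    humanfriendly.parse_size("10G")    # 10000000000 (decimal units)
    humanfriendly.parse_size("10GiB")  # 10737418240 (binary units)
    humanfriendly.parse_size("512MB")  # 512000000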
@@ -1501,7 +1530,9 @@ class VcfZarrWriter:
              compressor=numcodecs.get_codec(variable.compressor),
              filters=[numcodecs.get_codec(filt) for filt in variable.filters],
              object_codec=object_codec,
+             dimension_separator=self.dimension_separator,
          )
+         # Dimension names are part of the spec in Zarr v3
          a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
 
      def get_array(self, name):
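With dimension_separator="/", chunk keys become nested directory paths (x/0/0) instead of dot-separated flat names (x/0.0), which is friendlier to filesystems when arrays have many chunks and matches the Zarr v3 layout. A toy illustration with the zarr-python 2.x API used here (the array name and shapes are arbitrary):

    import numpy as np
    import zarr

    store = zarr.DirectoryStore("example.zarr")
    root = zarr.group(store=store)
    a = root.zeros(
        "x", shape=(100, 100), chunks=(10, 10), dtype=np.int32,
        dimension_separator="/",
    )
    a[:] = 1
    # chunks land at example.zarr/x/0/0, x/0/1, ... rather than x/0.0, x/0.1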
@@ -1639,6 +1670,7 @@ class VcfZarrWriter:
              "contig_length",
              self.schema.contig_length,
              dtype=np.int64,
+             compressor=DEFAULT_ZARR_COMPRESSOR,
          )
          array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
          return {v: j for j, v in enumerate(self.schema.contig_id)}
@@ -1661,8 +1693,6 @@ class VcfZarrWriter:
              self.init_array(column)
 
      def finalise(self):
-         # for column in self.schema.columns.values():
-         #     self.finalise_array(column)
          zarr.consolidate_metadata(self.path)
 
 
@@ -1672,12 +1702,7 @@ class VcfZarrWriter:
          show_progress=False,
          max_memory=None,
      ):
-         if max_memory is None:
-             # Unbounded
-             max_memory = 2**63
-         else:
-             # Value is specified in Mibibytes
-             max_memory *= 2**20  # NEEDS TEST
+         max_memory = parse_max_memory(max_memory)
 
          # TODO this will move into the setup logic later when we're making it possible
          # to split the work by slice
@@ -1764,8 +1789,8 @@
 
          # Fail early if we can't fit a particular column into memory
          for wp in work:
-             if wp.memory >= max_memory:
-                 raise ValueError(  # NEEDS TEST
+             if wp.memory > max_memory:
+                 raise ValueError(
                      f"Insufficient memory for {wp.columns}: "
                      f"{display_size(wp.memory)} > {display_size(max_memory)}"
                  )
@@ -1778,6 +1803,8 @@
          )
 
          used_memory = 0
+         # We need to keep some bounds on the queue size or the memory bounds algorithm
+         # below doesn't really work.
          max_queued = 4 * max(1, worker_processes)
          encoded_slices = collections.Counter()
 
@@ -1804,10 +1831,14 @@
                  self.finalise_array(column)
 
          for wp in work:
-             if (
+             while (
                  used_memory + wp.memory > max_memory
                  or len(future_to_work) > max_queued
              ):
+                 logger.debug(
+                     f"Wait: mem_required={used_memory + wp.memory} max_mem={max_memory} "
+                     f"queued={len(future_to_work)} max_queued={max_queued}"
+                 )
                  service_completed_futures()
              future = pwm.submit(wp.func, wp.start, wp.stop)
              used_memory += wp.memory
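Changing if to while here matters: a single call to service_completed_futures may not free enough memory or queue slots for the next work item, so the loop keeps draining completions until both bounds are satisfied. The scheme in isolation (run_bounded and the (func, memory) work format are illustrative, not the package's API):

    import concurrent.futures as cf

    def run_bounded(executor, work, max_memory, max_queued):
        used_memory = 0
        in_flight = {}  # future -> memory reserved for it
        for func, memory in work:
            # `while`, not `if`: one completion may not free enough of either bound
            while used_memory + memory > max_memory or len(in_flight) > max_queued:
                done, _ = cf.wait(in_flight, return_when=cf.FIRST_COMPLETED)
                for future in done:
                    used_memory -= in_flight.pop(future)
                    future.result()  # propagate worker errors
            in_flight[executor.submit(func)] = memory
            used_memory += memory
        for future in cf.as_completed(list(in_flight)):
            future.result()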
@@ -1832,6 +1863,7 @@ def encode(
      variants_chunk_size=None,
      samples_chunk_size=None,
      max_v_chunks=None,
+     dimension_separator=None,
      max_memory=None,
      worker_processes=1,
      show_progress=False,
@@ -1855,7 +1887,7 @@ def encode(
      if zarr_path.exists():
          logger.warning(f"Deleting existing {zarr_path}")
          shutil.rmtree(zarr_path)
-     vzw = VcfZarrWriter(zarr_path, icf, schema)
+     vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
      vzw.init()
      vzw.encode(
          max_v_chunks=max_v_chunks,
@@ -1876,10 +1908,11 @@ def convert(
      show_progress=False,
      # TODO add arguments to control location of tmpdir
  ):
-     with tempfile.TemporaryDirectory(prefix="vcf2zarr_if_") as if_dir:
+     with tempfile.TemporaryDirectory(prefix="vcf2zarr") as tmp:
+         if_dir = pathlib.Path(tmp) / "if"
          explode(
-             vcfs,
              if_dir,
+             vcfs,
              worker_processes=worker_processes,
              show_progress=show_progress,
          )
bio2zarr-0.0.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: bio2zarr
- Version: 0.0.2
+ Version: 0.0.4
  Summary: Convert bioinformatics data to Zarr
  Home-page: https://github.com/pystatgen/bio2zarr
  Author: sgkit Developers
@@ -20,7 +20,7 @@ Requires-Python: >=3.9
  Description-Content-Type: text/x-rst
  License-File: LICENSE
  Requires-Dist: numpy
- Requires-Dist: zarr !=2.11.0,!=2.11.1,!=2.11.2,>=2.10.0
+ Requires-Dist: zarr >=2.17
  Requires-Dist: click
  Requires-Dist: tabulate
  Requires-Dist: tqdm
bio2zarr-0.0.4.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+ bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
+ bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
+ bio2zarr/_version.py,sha256=yBVOKdXLEcTVc7YV7ZPqRXhRDRt-pKrfXxcgHkgPY5g,411
+ bio2zarr/cli.py,sha256=QE0DfoZHbBbxq9K_im9y4tJ49_Wss0zzavSjjz-85Xw,11484
+ bio2zarr/core.py,sha256=tZb9exfFmuzbA8tUpPY8avSm9YvfH31-vUCTM4fpj78,8128
+ bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
+ bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+ bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
+ bio2zarr/vcf.py,sha256=MEskVTDq4QntzoawPz0sfmInV0aPkIPLXXNv7GmVcmY,73870
+ bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
+ bio2zarr-0.0.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ bio2zarr-0.0.4.dist-info/METADATA,sha256=DISckjzZ0b6FpBTfBvpmJmEe00SIdTHyB3UTsTR8rws,1077
+ bio2zarr-0.0.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ bio2zarr-0.0.4.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
+ bio2zarr-0.0.4.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+ bio2zarr-0.0.4.dist-info/RECORD,,
bio2zarr-0.0.2.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
- bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
- bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
- bio2zarr/_version.py,sha256=NDHlyIcJZjLz8wKlmD1-pr6me5FHBAYwO_ynLG-37N8,411
- bio2zarr/cli.py,sha256=rNgxpjIwpltEHj1NOpJtwLvGOA0etuxcqMXyNlPbCts,9882
- bio2zarr/core.py,sha256=sBlWmHjcb7tAn_7WQRBdrbGcEd_lT_3HTQ_JbzomVMg,8111
- bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
- bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
- bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
- bio2zarr/vcf.py,sha256=g2TqH9Lbp4Ds8kjOnjvHvoMAgnG6Kx8pKPN1bqBKKIQ,72201
- bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
- bio2zarr-0.0.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- bio2zarr-0.0.2.dist-info/METADATA,sha256=Uqirw85BARPHIZmkPJJKfWRKQgjhtQDDfH9wLJDoxj8,1106
- bio2zarr-0.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- bio2zarr-0.0.2.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
- bio2zarr-0.0.2.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
- bio2zarr-0.0.2.dist-info/RECORD,,