bio2zarr 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +39 -7
- bio2zarr/core.py +2 -1
- bio2zarr/vcf.py +83 -50
- {bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/METADATA +2 -2
- bio2zarr-0.0.4.dist-info/RECORD +16 -0
- bio2zarr-0.0.3.dist-info/RECORD +0 -16
- {bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/top_level.txt +0 -0
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
|
@@ -6,6 +6,7 @@ import shutil
|
|
|
6
6
|
import click
|
|
7
7
|
import tabulate
|
|
8
8
|
import coloredlogs
|
|
9
|
+
import numcodecs
|
|
9
10
|
|
|
10
11
|
from . import vcf
|
|
11
12
|
from . import vcf_utils
|
|
@@ -66,6 +67,17 @@ column_chunk_size = click.option(
|
|
|
66
67
|
help="Approximate uncompressed size of exploded column chunks in MiB",
|
|
67
68
|
)
|
|
68
69
|
|
|
70
|
+
# We could provide the full flexiblity of numcodecs/Blosc here, but there
|
|
71
|
+
# doesn't seem much point. Can always add more arguments here to control
|
|
72
|
+
# compression level, etc.
|
|
73
|
+
compressor = click.option(
|
|
74
|
+
"-C",
|
|
75
|
+
"--compressor",
|
|
76
|
+
type=click.Choice(["lz4", "zstd"]),
|
|
77
|
+
default=None,
|
|
78
|
+
help="Codec to use for compressing column chunks (Default=zstd)."
|
|
79
|
+
)
|
|
80
|
+
|
|
69
81
|
# Note: -l and -w were chosen when these were called "width" and "length".
|
|
70
82
|
# possibly there are better letters now.
|
|
71
83
|
variants_chunk_size = click.option(
|
|
@@ -113,24 +125,36 @@ def check_overwrite_dir(path, force):
|
|
|
113
125
|
shutil.rmtree(tmp_delete_path)
|
|
114
126
|
|
|
115
127
|
|
|
128
|
+
def get_compressor(cname):
|
|
129
|
+
if cname is None:
|
|
130
|
+
return None
|
|
131
|
+
config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
|
|
132
|
+
config["cname"] = cname
|
|
133
|
+
return numcodecs.get_codec(config)
|
|
134
|
+
|
|
135
|
+
|
|
116
136
|
@click.command
|
|
117
137
|
@vcfs
|
|
118
138
|
@new_icf_path
|
|
119
139
|
@force
|
|
120
140
|
@verbose
|
|
121
|
-
@worker_processes
|
|
122
141
|
@column_chunk_size
|
|
123
|
-
|
|
142
|
+
@compressor
|
|
143
|
+
@worker_processes
|
|
144
|
+
def explode(
|
|
145
|
+
vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
|
|
146
|
+
):
|
|
124
147
|
"""
|
|
125
148
|
Convert VCF(s) to intermediate columnar format
|
|
126
149
|
"""
|
|
127
150
|
setup_logging(verbose)
|
|
128
151
|
check_overwrite_dir(icf_path, force)
|
|
129
152
|
vcf.explode(
|
|
130
|
-
vcfs,
|
|
131
153
|
icf_path,
|
|
154
|
+
vcfs,
|
|
132
155
|
worker_processes=worker_processes,
|
|
133
156
|
column_chunk_size=column_chunk_size,
|
|
157
|
+
compressor=get_compressor(compressor),
|
|
134
158
|
show_progress=True,
|
|
135
159
|
)
|
|
136
160
|
|
|
@@ -141,10 +165,18 @@ def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size)
|
|
|
141
165
|
@click.argument("num_partitions", type=click.IntRange(min=1))
|
|
142
166
|
@force
|
|
143
167
|
@column_chunk_size
|
|
168
|
+
@compressor
|
|
144
169
|
@verbose
|
|
145
170
|
@worker_processes
|
|
146
171
|
def dexplode_init(
|
|
147
|
-
vcfs,
|
|
172
|
+
vcfs,
|
|
173
|
+
icf_path,
|
|
174
|
+
num_partitions,
|
|
175
|
+
force,
|
|
176
|
+
column_chunk_size,
|
|
177
|
+
compressor,
|
|
178
|
+
verbose,
|
|
179
|
+
worker_processes,
|
|
148
180
|
):
|
|
149
181
|
"""
|
|
150
182
|
Initial step for distributed conversion of VCF(s) to intermediate columnar format
|
|
@@ -158,6 +190,7 @@ def dexplode_init(
|
|
|
158
190
|
target_num_partitions=num_partitions,
|
|
159
191
|
column_chunk_size=column_chunk_size,
|
|
160
192
|
worker_processes=worker_processes,
|
|
193
|
+
compressor=get_compressor(compressor),
|
|
161
194
|
show_progress=True,
|
|
162
195
|
)
|
|
163
196
|
click.echo(num_partitions)
|
|
@@ -232,9 +265,8 @@ def mkschema(icf_path):
|
|
|
232
265
|
@click.option(
|
|
233
266
|
"-M",
|
|
234
267
|
"--max-memory",
|
|
235
|
-
type=int,
|
|
236
268
|
default=None,
|
|
237
|
-
help="An approximate bound on overall memory usage
|
|
269
|
+
help="An approximate bound on overall memory usage (e.g. 10G),",
|
|
238
270
|
)
|
|
239
271
|
@worker_processes
|
|
240
272
|
def encode(
|
|
@@ -250,7 +282,7 @@ def encode(
|
|
|
250
282
|
worker_processes,
|
|
251
283
|
):
|
|
252
284
|
"""
|
|
253
|
-
|
|
285
|
+
Convert intermediate columnar format to vcfzarr.
|
|
254
286
|
"""
|
|
255
287
|
setup_logging(verbose)
|
|
256
288
|
check_overwrite_dir(zarr_path, force)
|
bio2zarr/core.py
CHANGED
|
@@ -50,7 +50,8 @@ def wait_on_futures(futures):
|
|
|
50
50
|
cancel_futures(futures)
|
|
51
51
|
if isinstance(exception, cf.process.BrokenProcessPool):
|
|
52
52
|
raise RuntimeError(
|
|
53
|
-
"Worker process died: you may have run out of memory"
|
|
53
|
+
"Worker process died: you may have run out of memory"
|
|
54
|
+
) from exception
|
|
54
55
|
else:
|
|
55
56
|
raise exception
|
|
56
57
|
|
bio2zarr/vcf.py
CHANGED
|
@@ -151,8 +151,8 @@ class VcfPartition:
|
|
|
151
151
|
|
|
152
152
|
ICF_METADATA_FORMAT_VERSION = "0.2"
|
|
153
153
|
ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
|
|
154
|
-
cname="
|
|
155
|
-
)
|
|
154
|
+
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
|
|
155
|
+
)
|
|
156
156
|
|
|
157
157
|
|
|
158
158
|
@dataclasses.dataclass
|
|
@@ -284,9 +284,7 @@ def scan_vcf(path, target_num_partitions):
|
|
|
284
284
|
return metadata, vcf.raw_header
|
|
285
285
|
|
|
286
286
|
|
|
287
|
-
def scan_vcfs(
|
|
288
|
-
paths, show_progress, target_num_partitions, column_chunk_size, worker_processes=1
|
|
289
|
-
):
|
|
287
|
+
def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
|
|
290
288
|
logger.info(
|
|
291
289
|
f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
|
|
292
290
|
)
|
|
@@ -334,12 +332,6 @@ def scan_vcfs(
|
|
|
334
332
|
key=lambda x: (contig_index_map[x.region.contig], x.region.start)
|
|
335
333
|
)
|
|
336
334
|
icf_metadata.partitions = all_partitions
|
|
337
|
-
icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
|
|
338
|
-
icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
|
|
339
|
-
icf_metadata.column_chunk_size = column_chunk_size
|
|
340
|
-
# Bare minimum here for provenance - would be nice to include versions of key
|
|
341
|
-
# dependencies as well.
|
|
342
|
-
icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
|
|
343
335
|
logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
|
|
344
336
|
return icf_metadata, header
|
|
345
337
|
|
|
@@ -824,13 +816,7 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
|
|
|
824
816
|
return False
|
|
825
817
|
|
|
826
818
|
|
|
827
|
-
# TODO rename to IntermediateColumnarFormat and move to icf.py
|
|
828
|
-
|
|
829
|
-
|
|
830
819
|
class IntermediateColumnarFormat(collections.abc.Mapping):
|
|
831
|
-
# TODO Check if other compressors would give reasonable compression
|
|
832
|
-
# with significantly faster times
|
|
833
|
-
|
|
834
820
|
def __init__(self, path):
|
|
835
821
|
self.path = pathlib.Path(path)
|
|
836
822
|
# TODO raise a more informative error here telling people this
|
|
@@ -904,6 +890,15 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
|
|
|
904
890
|
return len(self.columns)
|
|
905
891
|
|
|
906
892
|
|
|
893
|
+
|
|
894
|
+
def mkdir_with_progress(path):
|
|
895
|
+
logger.debug(f"mkdir f{path}")
|
|
896
|
+
# NOTE we may have race-conditions here, I'm not sure. Hopefully allowing
|
|
897
|
+
# parents=True will take care of it.
|
|
898
|
+
path.mkdir(parents=True)
|
|
899
|
+
core.update_progress(1)
|
|
900
|
+
|
|
901
|
+
|
|
907
902
|
class IntermediateColumnarFormatWriter:
|
|
908
903
|
def __init__(self, path):
|
|
909
904
|
self.path = pathlib.Path(path)
|
|
@@ -922,9 +917,12 @@ class IntermediateColumnarFormatWriter:
|
|
|
922
917
|
worker_processes=1,
|
|
923
918
|
target_num_partitions=None,
|
|
924
919
|
show_progress=False,
|
|
920
|
+
compressor=None,
|
|
925
921
|
):
|
|
926
922
|
if self.path.exists():
|
|
927
|
-
|
|
923
|
+
raise ValueError("ICF path already exists")
|
|
924
|
+
if compressor is None:
|
|
925
|
+
compressor = ICF_DEFAULT_COMPRESSOR
|
|
928
926
|
vcfs = [pathlib.Path(vcf) for vcf in vcfs]
|
|
929
927
|
target_num_partitions = max(target_num_partitions, len(vcfs))
|
|
930
928
|
|
|
@@ -934,14 +932,19 @@ class IntermediateColumnarFormatWriter:
|
|
|
934
932
|
worker_processes=worker_processes,
|
|
935
933
|
show_progress=show_progress,
|
|
936
934
|
target_num_partitions=target_num_partitions,
|
|
937
|
-
column_chunk_size=column_chunk_size,
|
|
938
935
|
)
|
|
939
936
|
self.metadata = icf_metadata
|
|
937
|
+
self.metadata.format_version = ICF_METADATA_FORMAT_VERSION
|
|
938
|
+
self.metadata.compressor = compressor.get_config()
|
|
939
|
+
self.metadata.column_chunk_size = column_chunk_size
|
|
940
|
+
# Bare minimum here for provenance - would be nice to include versions of key
|
|
941
|
+
# dependencies as well.
|
|
942
|
+
self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
|
|
940
943
|
|
|
941
|
-
self.mkdirs()
|
|
944
|
+
self.mkdirs(worker_processes, show_progress=show_progress)
|
|
942
945
|
|
|
943
946
|
# Note: this is needed for the current version of the vcfzarr spec, but it's
|
|
944
|
-
# probably
|
|
947
|
+
# probably going to be dropped.
|
|
945
948
|
# https://github.com/pystatgen/vcf-zarr-spec/issues/15
|
|
946
949
|
# May be useful to keep lying around still though?
|
|
947
950
|
logger.info(f"Writing VCF header")
|
|
@@ -953,20 +956,30 @@ class IntermediateColumnarFormatWriter:
|
|
|
953
956
|
json.dump(self.metadata.asdict(), f, indent=4)
|
|
954
957
|
return self.num_partitions
|
|
955
958
|
|
|
956
|
-
def mkdirs(self):
|
|
957
|
-
|
|
958
|
-
logger.info(
|
|
959
|
-
f"Creating {len(self.metadata.fields) * self.num_partitions} directories"
|
|
960
|
-
)
|
|
959
|
+
def mkdirs(self, worker_processes=1, show_progress=False):
|
|
960
|
+
num_dirs = len(self.metadata.fields) * self.num_partitions
|
|
961
|
+
logger.info(f"Creating {num_dirs} directories")
|
|
961
962
|
self.path.mkdir()
|
|
962
963
|
self.wip_path.mkdir()
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
964
|
+
# Due to high latency batch system filesystems, we create all the directories in
|
|
965
|
+
# parallel
|
|
966
|
+
progress_config = core.ProgressConfig(
|
|
967
|
+
total=num_dirs,
|
|
968
|
+
units="dirs",
|
|
969
|
+
title="Mkdirs",
|
|
970
|
+
show=show_progress,
|
|
971
|
+
)
|
|
972
|
+
with core.ParallelWorkManager(
|
|
973
|
+
worker_processes=worker_processes, progress_config=progress_config
|
|
974
|
+
) as manager:
|
|
975
|
+
for field in self.metadata.fields:
|
|
976
|
+
col_path = get_vcf_field_path(self.path, field)
|
|
977
|
+
# Don't bother trying to count the intermediate directories towards
|
|
978
|
+
# progress
|
|
979
|
+
manager.submit(col_path.mkdir, parents=True)
|
|
980
|
+
for j in range(self.num_partitions):
|
|
981
|
+
part_path = col_path / f"p{j}"
|
|
982
|
+
manager.submit(mkdir_with_progress, part_path)
|
|
970
983
|
|
|
971
984
|
def load_partition_summaries(self):
|
|
972
985
|
summaries = []
|
|
@@ -1133,12 +1146,13 @@ class IntermediateColumnarFormatWriter:
|
|
|
1133
1146
|
|
|
1134
1147
|
|
|
1135
1148
|
def explode(
|
|
1136
|
-
vcfs,
|
|
1137
1149
|
icf_path,
|
|
1150
|
+
vcfs,
|
|
1138
1151
|
*,
|
|
1139
1152
|
column_chunk_size=16,
|
|
1140
1153
|
worker_processes=1,
|
|
1141
1154
|
show_progress=False,
|
|
1155
|
+
compressor=None,
|
|
1142
1156
|
):
|
|
1143
1157
|
writer = IntermediateColumnarFormatWriter(icf_path)
|
|
1144
1158
|
num_partitions = writer.init(
|
|
@@ -1148,6 +1162,7 @@ def explode(
|
|
|
1148
1162
|
worker_processes=worker_processes,
|
|
1149
1163
|
show_progress=show_progress,
|
|
1150
1164
|
column_chunk_size=column_chunk_size,
|
|
1165
|
+
compressor=compressor,
|
|
1151
1166
|
)
|
|
1152
1167
|
writer.explode(worker_processes=worker_processes, show_progress=show_progress)
|
|
1153
1168
|
writer.finalise()
|
|
@@ -1162,6 +1177,7 @@ def explode_init(
|
|
|
1162
1177
|
target_num_partitions=1,
|
|
1163
1178
|
worker_processes=1,
|
|
1164
1179
|
show_progress=False,
|
|
1180
|
+
compressor=None,
|
|
1165
1181
|
):
|
|
1166
1182
|
writer = IntermediateColumnarFormatWriter(icf_path)
|
|
1167
1183
|
return writer.init(
|
|
@@ -1170,6 +1186,7 @@ def explode_init(
|
|
|
1170
1186
|
worker_processes=worker_processes,
|
|
1171
1187
|
show_progress=show_progress,
|
|
1172
1188
|
column_chunk_size=column_chunk_size,
|
|
1189
|
+
compressor=compressor,
|
|
1173
1190
|
)
|
|
1174
1191
|
|
|
1175
1192
|
|
|
@@ -1480,16 +1497,28 @@ class EncodingWork:
|
|
|
1480
1497
|
memory: int = 0
|
|
1481
1498
|
|
|
1482
1499
|
|
|
1500
|
+
def parse_max_memory(max_memory):
|
|
1501
|
+
if max_memory is None:
|
|
1502
|
+
# Effectively unbounded
|
|
1503
|
+
return 2**63
|
|
1504
|
+
if isinstance(max_memory, str):
|
|
1505
|
+
max_memory = humanfriendly.parse_size(max_memory)
|
|
1506
|
+
logger.info(f"Set memory budget to {display_size(max_memory)}")
|
|
1507
|
+
return max_memory
|
|
1508
|
+
|
|
1509
|
+
|
|
1483
1510
|
class VcfZarrWriter:
|
|
1484
|
-
def __init__(self, path, icf, schema):
|
|
1511
|
+
def __init__(self, path, icf, schema, dimension_separator=None):
|
|
1485
1512
|
self.path = pathlib.Path(path)
|
|
1486
1513
|
self.icf = icf
|
|
1487
1514
|
self.schema = schema
|
|
1515
|
+
# Default to using nested directories following the Zarr v3 default.
|
|
1516
|
+
# This seems to require version 2.17+ to work properly
|
|
1517
|
+
self.dimension_separator = "/" if dimension_separator is None else dimension_separator
|
|
1488
1518
|
store = zarr.DirectoryStore(self.path)
|
|
1489
1519
|
self.root = zarr.group(store=store)
|
|
1490
1520
|
|
|
1491
1521
|
def init_array(self, variable):
|
|
1492
|
-
# print("CREATE", variable)
|
|
1493
1522
|
object_codec = None
|
|
1494
1523
|
if variable.dtype == "O":
|
|
1495
1524
|
object_codec = numcodecs.VLenUTF8()
|
|
@@ -1501,7 +1530,9 @@ class VcfZarrWriter:
|
|
|
1501
1530
|
compressor=numcodecs.get_codec(variable.compressor),
|
|
1502
1531
|
filters=[numcodecs.get_codec(filt) for filt in variable.filters],
|
|
1503
1532
|
object_codec=object_codec,
|
|
1533
|
+
dimension_separator=self.dimension_separator,
|
|
1504
1534
|
)
|
|
1535
|
+
# Dimension names are part of the spec in Zarr v3
|
|
1505
1536
|
a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
|
|
1506
1537
|
|
|
1507
1538
|
def get_array(self, name):
|
|
@@ -1639,6 +1670,7 @@ class VcfZarrWriter:
|
|
|
1639
1670
|
"contig_length",
|
|
1640
1671
|
self.schema.contig_length,
|
|
1641
1672
|
dtype=np.int64,
|
|
1673
|
+
compressor=DEFAULT_ZARR_COMPRESSOR,
|
|
1642
1674
|
)
|
|
1643
1675
|
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
|
|
1644
1676
|
return {v: j for j, v in enumerate(self.schema.contig_id)}
|
|
@@ -1661,8 +1693,6 @@ class VcfZarrWriter:
|
|
|
1661
1693
|
self.init_array(column)
|
|
1662
1694
|
|
|
1663
1695
|
def finalise(self):
|
|
1664
|
-
# for column in self.schema.columns.values():
|
|
1665
|
-
# self.finalise_array(column)
|
|
1666
1696
|
zarr.consolidate_metadata(self.path)
|
|
1667
1697
|
|
|
1668
1698
|
def encode(
|
|
@@ -1672,12 +1702,7 @@ class VcfZarrWriter:
|
|
|
1672
1702
|
show_progress=False,
|
|
1673
1703
|
max_memory=None,
|
|
1674
1704
|
):
|
|
1675
|
-
|
|
1676
|
-
# Unbounded
|
|
1677
|
-
max_memory = 2**63
|
|
1678
|
-
else:
|
|
1679
|
-
# Value is specified in Mibibytes
|
|
1680
|
-
max_memory *= 2**20 # NEEDS TEST
|
|
1705
|
+
max_memory = parse_max_memory(max_memory)
|
|
1681
1706
|
|
|
1682
1707
|
# TODO this will move into the setup logic later when we're making it possible
|
|
1683
1708
|
# to split the work by slice
|
|
@@ -1764,8 +1789,8 @@ class VcfZarrWriter:
|
|
|
1764
1789
|
|
|
1765
1790
|
# Fail early if we can't fit a particular column into memory
|
|
1766
1791
|
for wp in work:
|
|
1767
|
-
if wp.memory
|
|
1768
|
-
raise ValueError(
|
|
1792
|
+
if wp.memory > max_memory:
|
|
1793
|
+
raise ValueError(
|
|
1769
1794
|
f"Insufficient memory for {wp.columns}: "
|
|
1770
1795
|
f"{display_size(wp.memory)} > {display_size(max_memory)}"
|
|
1771
1796
|
)
|
|
@@ -1778,6 +1803,8 @@ class VcfZarrWriter:
|
|
|
1778
1803
|
)
|
|
1779
1804
|
|
|
1780
1805
|
used_memory = 0
|
|
1806
|
+
# We need to keep some bounds on the queue size or the memory bounds algorithm
|
|
1807
|
+
# below doesn't really work.
|
|
1781
1808
|
max_queued = 4 * max(1, worker_processes)
|
|
1782
1809
|
encoded_slices = collections.Counter()
|
|
1783
1810
|
|
|
@@ -1804,10 +1831,14 @@ class VcfZarrWriter:
|
|
|
1804
1831
|
self.finalise_array(column)
|
|
1805
1832
|
|
|
1806
1833
|
for wp in work:
|
|
1807
|
-
|
|
1834
|
+
while (
|
|
1808
1835
|
used_memory + wp.memory > max_memory
|
|
1809
1836
|
or len(future_to_work) > max_queued
|
|
1810
1837
|
):
|
|
1838
|
+
logger.debug(
|
|
1839
|
+
f"Wait: mem_required={used_memory + wp.memory} max_mem={max_memory} "
|
|
1840
|
+
f"queued={len(future_to_work)} max_queued={max_queued}"
|
|
1841
|
+
)
|
|
1811
1842
|
service_completed_futures()
|
|
1812
1843
|
future = pwm.submit(wp.func, wp.start, wp.stop)
|
|
1813
1844
|
used_memory += wp.memory
|
|
@@ -1832,6 +1863,7 @@ def encode(
|
|
|
1832
1863
|
variants_chunk_size=None,
|
|
1833
1864
|
samples_chunk_size=None,
|
|
1834
1865
|
max_v_chunks=None,
|
|
1866
|
+
dimension_separator=None,
|
|
1835
1867
|
max_memory=None,
|
|
1836
1868
|
worker_processes=1,
|
|
1837
1869
|
show_progress=False,
|
|
@@ -1855,7 +1887,7 @@ def encode(
|
|
|
1855
1887
|
if zarr_path.exists():
|
|
1856
1888
|
logger.warning(f"Deleting existing {zarr_path}")
|
|
1857
1889
|
shutil.rmtree(zarr_path)
|
|
1858
|
-
vzw = VcfZarrWriter(zarr_path, icf, schema)
|
|
1890
|
+
vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
|
|
1859
1891
|
vzw.init()
|
|
1860
1892
|
vzw.encode(
|
|
1861
1893
|
max_v_chunks=max_v_chunks,
|
|
@@ -1876,10 +1908,11 @@ def convert(
|
|
|
1876
1908
|
show_progress=False,
|
|
1877
1909
|
# TODO add arguments to control location of tmpdir
|
|
1878
1910
|
):
|
|
1879
|
-
with tempfile.TemporaryDirectory(prefix="
|
|
1911
|
+
with tempfile.TemporaryDirectory(prefix="vcf2zarr") as tmp:
|
|
1912
|
+
if_dir = pathlib.Path(tmp) / "if"
|
|
1880
1913
|
explode(
|
|
1881
|
-
vcfs,
|
|
1882
1914
|
if_dir,
|
|
1915
|
+
vcfs,
|
|
1883
1916
|
worker_processes=worker_processes,
|
|
1884
1917
|
show_progress=show_progress,
|
|
1885
1918
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: bio2zarr
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: Convert bioinformatics data to Zarr
|
|
5
5
|
Home-page: https://github.com/pystatgen/bio2zarr
|
|
6
6
|
Author: sgkit Developers
|
|
@@ -20,7 +20,7 @@ Requires-Python: >=3.9
|
|
|
20
20
|
Description-Content-Type: text/x-rst
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: numpy
|
|
23
|
-
Requires-Dist: zarr
|
|
23
|
+
Requires-Dist: zarr >=2.17
|
|
24
24
|
Requires-Dist: click
|
|
25
25
|
Requires-Dist: tabulate
|
|
26
26
|
Requires-Dist: tqdm
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
|
|
2
|
+
bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
|
|
3
|
+
bio2zarr/_version.py,sha256=yBVOKdXLEcTVc7YV7ZPqRXhRDRt-pKrfXxcgHkgPY5g,411
|
|
4
|
+
bio2zarr/cli.py,sha256=QE0DfoZHbBbxq9K_im9y4tJ49_Wss0zzavSjjz-85Xw,11484
|
|
5
|
+
bio2zarr/core.py,sha256=tZb9exfFmuzbA8tUpPY8avSm9YvfH31-vUCTM4fpj78,8128
|
|
6
|
+
bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
|
|
7
|
+
bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
|
|
8
|
+
bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
|
|
9
|
+
bio2zarr/vcf.py,sha256=MEskVTDq4QntzoawPz0sfmInV0aPkIPLXXNv7GmVcmY,73870
|
|
10
|
+
bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
|
|
11
|
+
bio2zarr-0.0.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
12
|
+
bio2zarr-0.0.4.dist-info/METADATA,sha256=DISckjzZ0b6FpBTfBvpmJmEe00SIdTHyB3UTsTR8rws,1077
|
|
13
|
+
bio2zarr-0.0.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
14
|
+
bio2zarr-0.0.4.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
|
|
15
|
+
bio2zarr-0.0.4.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
|
|
16
|
+
bio2zarr-0.0.4.dist-info/RECORD,,
|
bio2zarr-0.0.3.dist-info/RECORD
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
|
|
2
|
-
bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
|
|
3
|
-
bio2zarr/_version.py,sha256=hB095avW4HuDZxn8qPHRG1UMzSSonb8ZDAsLxt9hmk8,411
|
|
4
|
-
bio2zarr/cli.py,sha256=N_vEFj730p_TL7Dk9m9T3ceAhVV58BMYRDmBmoeKH7A,10766
|
|
5
|
-
bio2zarr/core.py,sha256=sBlWmHjcb7tAn_7WQRBdrbGcEd_lT_3HTQ_JbzomVMg,8111
|
|
6
|
-
bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
|
|
7
|
-
bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
|
|
8
|
-
bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
|
|
9
|
-
bio2zarr/vcf.py,sha256=g2TqH9Lbp4Ds8kjOnjvHvoMAgnG6Kx8pKPN1bqBKKIQ,72201
|
|
10
|
-
bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
|
|
11
|
-
bio2zarr-0.0.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
12
|
-
bio2zarr-0.0.3.dist-info/METADATA,sha256=dc2y5xrnkcvD1qmKGFL5GrsbM1_tiIlAYB2GrAlLunM,1106
|
|
13
|
-
bio2zarr-0.0.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
14
|
-
bio2zarr-0.0.3.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
|
|
15
|
-
bio2zarr-0.0.3.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
|
|
16
|
-
bio2zarr-0.0.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|