bio2zarr 0.1.1__tar.gz → 0.1.3__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.


Files changed (56)
  1. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/.github/workflows/ci.yml +50 -1
  2. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/CHANGELOG.md +25 -1
  3. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/PKG-INFO +5 -3
  4. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/_version.py +9 -4
  5. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/cli.py +46 -12
  6. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/core.py +32 -2
  7. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/plink.py +19 -14
  8. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/vcf2zarr/icf.py +41 -18
  9. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/vcf2zarr/vcz.py +460 -138
  10. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/vcf2zarr/verification.py +19 -16
  11. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/vcf_utils.py +30 -14
  12. bio2zarr-0.1.3/bio2zarr/zarr_utils.py +18 -0
  13. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/PKG-INFO +5 -3
  14. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/SOURCES.txt +1 -0
  15. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/requires.txt +2 -1
  16. bio2zarr-0.1.3/docs/vcf2zarr/overview.md +152 -0
  17. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/vcf2zarr/tutorial.md +2 -0
  18. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/vcfpartition/overview.md +1 -1
  19. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/pyproject.toml +5 -1
  20. bio2zarr-0.1.1/docs/vcf2zarr/overview.md +0 -92
  21. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/.github/workflows/cd.yml +0 -0
  22. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/.github/workflows/docs.yml +0 -0
  23. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/.gitignore +0 -0
  24. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/.pre-commit-config.yaml +0 -0
  25. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/LICENSE +0 -0
  26. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/MANIFEST.in +0 -0
  27. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/README.md +0 -0
  28. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/__init__.py +0 -0
  29. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/__main__.py +0 -0
  30. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/constants.py +0 -0
  31. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/provenance.py +0 -0
  32. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/typing.py +0 -0
  33. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/vcf2zarr/__init__.py +0 -0
  34. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/dependency_links.txt +0 -0
  35. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/entry_points.txt +0 -0
  36. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/top_level.txt +0 -0
  37. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/Makefile +0 -0
  38. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/_config.yml +0 -0
  39. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/_static/asciinema-player.css +0 -0
  40. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/_static/asciinema-player.min.js +0 -0
  41. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/_static/custom.css +0 -0
  42. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/_toc.yml +0 -0
  43. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/build.sh +0 -0
  44. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/cast_scripts/vcf2zarr_convert.sh +0 -0
  45. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/cast_scripts/vcf2zarr_explode.sh +0 -0
  46. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/installation.md +0 -0
  47. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/intro.md +0 -0
  48. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/logo.png +0 -0
  49. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/requirements.txt +0 -0
  50. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/vcf2zarr/cli_ref.md +0 -0
  51. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/vcfpartition/cli_ref.md +0 -0
  52. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/setup.cfg +0 -0
  53. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/validation-data/Makefile +0 -0
  54. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/validation-data/split.sh +0 -0
  55. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/validation.py +0 -0
  56. {bio2zarr-0.1.1 → bio2zarr-0.1.3}/vcf_generator.py +0 -0
.github/workflows/ci.yml

@@ -24,17 +24,21 @@ jobs:
         # Use macos-13 because pip binary packages for ARM aren't
         # available for many dependencies
         os: [macos-13, macos-14, ubuntu-latest]
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
         exclude:
           # Just run macos tests on one Python version
           - os: macos-13
             python-version: "3.10"
           - os: macos-13
             python-version: "3.11"
+          - os: macos-13
+            python-version: "3.12"
           - os: macos-14
             python-version: "3.9"
           - os: macos-14
             python-version: "3.10"
+          - os: macos-14
+            python-version: "3.12"
     steps:
     - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
@@ -105,3 +109,48 @@ jobs:
           vcfpartition --help
           python -m bio2zarr vcfpartition --help
 
+  test-numpy-version:
+    name: Test numpy versions
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        numpy: ["==1.26", ">=2"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install '.[dev]'
+      - name: Install numpy${{ matrix.numpy }}
+        run: |
+          python -m pip install 'numpy${{ matrix.numpy }}'
+      - name: Run tests
+        run: |
+          # We just run the CLI tests here because it doesn't require other upstream
+          # packages like sgkit (which are tangled up with the numpy 2 dependency)
+          python -m pytest tests/test_cli.py
+
+  test-zarr-version:
+    name: Test Zarr versions
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        zarr: ["==2.18.3", ">=3.0.3"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install '.[dev]'
+      - name: Install zarr${{ matrix.zarr }}
+        run: |
+          python -m pip install 'zarr${{ matrix.zarr }}'
+      - name: Run tests
+        run: |
+          python -m pytest
CHANGELOG.md

@@ -1,8 +1,32 @@
+# 0.1.3 2025-03-04
+
+- Fix missing dependency issue for packaging
+
+- Support out-of-order field definitions in the VCF header (#322, @ACEnglish)
+
+# 0.1.2 2025-02-04
+
+- Reduce memory requirement for encoding genotypes with large sample sizes
+
+- Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300)
+
+- Add chunksize options to mkschema (issue:294)
+
+- Add experimental support for local alleles.
+
+- Add experimental support for ``region_index``
+
+Breaking changes
+
+- ICF metadata format version bumped to ensure long-term compatility between numpy 1.26.x
+  and numpy >= 2. Existing ICFs will need to be recreated.
+
+
 # 0.1.1 2024-06-19
 
 Maintenance release:
 
-- Pin numpy to < 2
+- Pin numpy to < 2
 - Pin Zarr to < 3
 
 # 0.1.0 2024-06-10
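The 0.1.2 note about transposed chunk defaults is easiest to see as chunk geometry. A toy sketch (made-up array sizes, zarr-python 2 style creation, not code from this package):

    import zarr

    # (5000 variants, 20,000 samples, ploidy 2) under the new default
    # chunking of 1000 variants x 10,000 samples per chunk
    z = zarr.zeros((5000, 20_000, 2), chunks=(1000, 10_000, 2), dtype="i1")
    print(z.nchunks)  # 5 variant chunks * 2 sample chunks = 10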
PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: bio2zarr
-Version: 0.1.1
+Version: 0.1.3
 Summary: Convert bioinformatics data to Zarr
 Author-email: sgkit Developers <project@sgkit.dev>
 License: Apache License
@@ -219,11 +219,12 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy<2
+Requires-Dist: numpy>=1.26
 Requires-Dist: zarr<3,>=2.17
 Requires-Dist: click
 Requires-Dist: tabulate
@@ -232,6 +233,7 @@ Requires-Dist: humanfriendly
 Requires-Dist: cyvcf2
 Requires-Dist: bed_reader
 Provides-Extra: dev
+Requires-Dist: hypothesis-vcf; extra == "dev"
 Requires-Dist: msprime; extra == "dev"
 Requires-Dist: pysam; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
bio2zarr/_version.py

@@ -1,8 +1,13 @@
-# file generated by setuptools_scm
+# file generated by setuptools-scm
 # don't change, don't track in version control
+
+__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+
 TYPE_CHECKING = False
 if TYPE_CHECKING:
-    from typing import Tuple, Union
+    from typing import Tuple
+    from typing import Union
+
     VERSION_TUPLE = Tuple[Union[int, str], ...]
 else:
     VERSION_TUPLE = object
@@ -12,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.1.1'
-__version_tuple__ = version_tuple = (0, 1, 1)
+__version__ = version = '0.1.3'
+__version_tuple__ = version_tuple = (0, 1, 3)
bio2zarr/cli.py

@@ -149,6 +149,13 @@ max_memory = click.option(
     help="An approximate bound on overall memory usage (e.g. 10G),",
 )
 
+local_alleles = click.option(
+    "--local-alleles/--no-local-alleles",
+    show_default=True,
+    default=False,
+    help="Use local allele fields to reduce the storage requirements of the output.",
+)
+
 
 def setup_logging(verbosity):
     level = "WARNING"
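The "--local-alleles/--no-local-alleles" string above is click's paired boolean flag syntax. A minimal, self-contained sketch of the pattern (a toy command, not part of bio2zarr):

    import click

    @click.command()
    @click.option(
        "--local-alleles/--no-local-alleles",
        show_default=True,
        default=False,
        help="Use local allele fields to reduce storage requirements.",
    )
    def demo(local_alleles):
        # click passes one boolean: True for --local-alleles,
        # False for --no-local-alleles (the default here).
        click.echo(f"local_alleles={local_alleles}")

    if __name__ == "__main__":
        demo()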
@@ -312,7 +319,7 @@ def dexplode_finalise(icf_path, verbose):
 
 
 @click.command
-@click.argument("path", type=click.Path())
+@click.argument("path", type=click.Path(exists=True))
 @verbose
 def inspect(path, verbose):
     """
@@ -325,12 +332,26 @@ def inspect(path, verbose):
 
 @click.command
 @icf_path
-def mkschema(icf_path):
+@variants_chunk_size
+@samples_chunk_size
+@local_alleles
+def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
     """
     Generate a schema for zarr encoding
     """
+    if local_alleles:
+        click.echo(
+            "WARNING: Local alleles support is preliminary; please use with caution.",
+            err=True,
+        )
     stream = click.get_text_stream("stdout")
-    vcf2zarr.mkschema(icf_path, stream)
+    vcf2zarr.mkschema(
+        icf_path,
+        stream,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+    )
 
 
 @click.command
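Going by the call added in the mkschema hunk above, the same functionality can be driven from Python. A sketch assuming an exploded ICF already exists at the hypothetical path "sample.icf":

    import io

    from bio2zarr import vcf2zarr

    out = io.StringIO()
    vcf2zarr.mkschema(
        "sample.icf",              # hypothetical ICF path
        out,
        variants_chunk_size=1000,
        samples_chunk_size=10_000,
        local_alleles=False,
    )
    # The generated schema (JSON text) can be edited and passed to encode
    print(out.getvalue())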
@@ -469,6 +490,7 @@ def dencode_finalise(zarr_path, verbose, progress):
 @verbose
 @progress
 @worker_processes
+@local_alleles
 def convert_vcf(
     vcfs,
     zarr_path,
@@ -478,6 +500,7 @@ def convert_vcf(
     verbose,
     progress,
     worker_processes,
+    local_alleles,
 ):
     """
     Convert input VCF(s) directly to vcfzarr (not recommended for large files).
@@ -491,6 +514,7 @@ def convert_vcf(
         samples_chunk_size=samples_chunk_size,
         show_progress=progress,
         worker_processes=worker_processes,
+        local_alleles=local_alleles,
     )
 
 
@@ -560,7 +584,7 @@ plink2zarr.add_command(convert_plink)
 
 @click.command
 @version
-@click.argument("vcf_path", type=click.Path(exists=True, dir_okay=False))
+@vcfs
 @verbose
 @num_partitions
 @click.option(
@@ -570,12 +594,16 @@ plink2zarr.add_command(convert_plink)
     default=None,
     help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
 )
-def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
+def vcfpartition(vcfs, verbose, num_partitions, partition_size):
     """
-    Output bcftools region strings that partition an indexed VCF/BCF file
+    Output bcftools region strings that partition the indexed VCF/BCF files
     into either an approximate number of parts (-n), or parts of approximately
     a given size (-s). One of -n or -s must be supplied.
 
+    If multiple VCF/BCF files are provided, the number of parts (-n) is
+    interpreted as the total number of partitions across all the files,
+    and the partitions are distributed evenly among the files.
+
     Note that both the number of partitions and sizes are a target, and the
     returned number of partitions may not exactly correspond. In particular,
     there is a maximum level of granularity determined by the associated index
@@ -590,9 +618,15 @@ def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
             "Either --num-partitions or --partition-size must be specified"
         )
 
-    indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
-    regions = indexed_vcf.partition_into_regions(
-        num_parts=num_partitions, target_part_size=partition_size
-    )
-    for region in regions:
-        click.echo(f"{region}\t{vcf_path}")
+    if num_partitions is None:
+        num_parts_per_path = None
+    else:
+        num_parts_per_path = max(1, num_partitions // len(vcfs))
+
+    for vcf_path in vcfs:
+        indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
+        regions = indexed_vcf.partition_into_regions(
+            num_parts=num_parts_per_path, target_part_size=partition_size
+        )
+        for region in regions:
+            click.echo(f"{region}\t{vcf_path}")
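A worked example of the partition arithmetic above: with -n 10 and four input files, each file is asked for max(1, 10 // 4) = 2 partitions, so roughly 8 regions are emitted in total (subject to the index-granularity caveat in the docstring):

    num_partitions = 10
    vcfs = ["a.vcf.gz", "b.vcf.gz", "c.vcf.gz", "d.vcf.gz"]  # hypothetical inputs
    num_parts_per_path = max(1, num_partitions // len(vcfs))
    print(num_parts_per_path)  # 2 partitions requested per file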
bio2zarr/core.py

@@ -63,6 +63,27 @@ def chunk_aligned_slices(z, n, max_chunks=None):
     return slices
 
 
+def first_dim_slice_iter(z, start, stop):
+    """
+    Efficiently iterate over the specified slice of the first dimension of the zarr
+    array z.
+    """
+    chunk_size = z.chunks[0]
+    first_chunk = start // chunk_size
+    last_chunk = (stop // chunk_size) + (stop % chunk_size != 0)
+    for chunk in range(first_chunk, last_chunk):
+        Z = z.blocks[chunk]
+        chunk_start = chunk * chunk_size
+        chunk_stop = chunk_start + chunk_size
+        slice_start = None
+        if start > chunk_start:
+            slice_start = start - chunk_start
+        slice_stop = None
+        if stop < chunk_stop:
+            slice_stop = stop - chunk_start
+        yield from Z[slice_start:slice_stop]
+
+
 def du(path):
     """
     Return the total bytes stored at this path.
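A quick usage sketch for the new first_dim_slice_iter (toy array; assumes the function is importable from bio2zarr.core as defined above). Only the chunks overlapping [150, 250) are fetched, here chunks 1 and 2:

    import numpy as np
    import zarr

    from bio2zarr.core import first_dim_slice_iter

    z = zarr.array(np.arange(500), chunks=(100,))
    rows = list(first_dim_slice_iter(z, 150, 250))
    assert rows == list(range(150, 250))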
@@ -113,13 +134,16 @@ def cancel_futures(futures):
 class BufferedArray:
     array: zarr.Array
     array_offset: int
+    name: str
     buff: np.ndarray
     buffer_row: int
+    max_buff_size: int = 0
 
-    def __init__(self, array, offset):
+    def __init__(self, array, offset, name="Unknown"):
         self.array = array
         self.array_offset = offset
         assert offset % array.chunks[0] == 0
+        self.name = name
         dims = list(array.shape)
         dims[0] = min(array.chunks[0], array.shape[0])
         self.buff = np.empty(dims, dtype=array.dtype)
@@ -150,11 +174,17 @@ class BufferedArray:
             self.buff[: self.buffer_row], self.array, self.array_offset
         )
         logger.debug(
-            f"Flushed <{self.array.name} {self.array.shape} "
+            f"Flushed <{self.name} {self.array.shape} "
             f"{self.array.dtype}> "
             f"{self.array_offset}:{self.array_offset + self.buffer_row}"
             f"{self.buff.nbytes / 2**20: .2f}Mb"
         )
+        # Note this is inaccurate for string data as we're just reporting the
+        # size of the container. When we switch the numpy 2 StringDtype this
+        # should improve and we can get more visibility on how memory
+        # is being used.
+        # https://github.com/sgkit-dev/bio2zarr/issues/30
+        self.max_buff_size = max(self.max_buff_size, self.buff.nbytes)
         self.array_offset += self.variants_chunk_size
         self.buffer_row = 0
 
bio2zarr/plink.py

@@ -6,6 +6,8 @@ import numcodecs
 import numpy as np
 import zarr
 
+from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
+
 from . import core
 
 logger = logging.getLogger(__name__)
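The new bio2zarr/zarr_utils.py (+18 lines) is not expanded in this diff. Judging purely from how ZARR_FORMAT_KWARGS is used below, a plausible shape for the shim is a kwargs dict that pins the on-disk format to v2 only under zarr-python >= 3; this is an assumption, not the actual file contents:

    # Hypothetical reconstruction of bio2zarr/zarr_utils.py
    from packaging import version

    import zarr

    def zarr_v3() -> bool:
        return version.parse(zarr.__version__).major >= 3

    # zarr-python 3 needs an explicit zarr_format=2 to keep writing
    # v2-format stores; zarr-python 2 does not accept the argument.
    ZARR_FORMAT_KWARGS = {"zarr_format": 2} if zarr_v3() else {}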
@@ -17,8 +19,7 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     # the correct approach is, but it is important to note that the
     # 0th allele is *not* necessarily the REF for these datasets.
     bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
-    store = zarr.DirectoryStore(zarr_path)
-    root = zarr.group(store=store)
+    root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
     gt = core.BufferedArray(root["call_genotype"], start)
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
@@ -73,8 +74,7 @@ def convert(
     if variants_chunk_size is None:
         variants_chunk_size = 10_000
 
-    store = zarr.DirectoryStore(zarr_path)
-    root = zarr.group(store=store, overwrite=True)
+    root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
 
     ploidy = 2
     shape = [m, n]
@@ -88,7 +88,8 @@ def convert(
 
     a = root.array(
         "sample_id",
-        bed.iid,
+        data=bed.iid,
+        shape=bed.iid.shape,
         dtype="str",
         compressor=default_compressor,
         chunks=(samples_chunk_size,),
@@ -100,7 +101,8 @@ def convert(
     # fetching repeatedly from bim file
     a = root.array(
         "variant_position",
-        bed.bp_position,
+        data=bed.bp_position,
+        shape=bed.bp_position.shape,
         dtype=np.int32,
         compressor=default_compressor,
         chunks=(variants_chunk_size,),
@@ -111,41 +113,45 @@ def convert(
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
     a = root.array(
         "variant_allele",
-        alleles,
+        data=alleles,
+        shape=alleles.shape,
         dtype="str",
         compressor=default_compressor,
-        chunks=(variants_chunk_size,),
+        chunks=(variants_chunk_size, alleles.shape[1]),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
     logger.debug("encoded variant_allele")
 
     # TODO remove this?
     a = root.empty(
-        "call_genotype_phased",
+        name="call_genotype_phased",
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
         compressor=default_compressor,
+        **ZARR_FORMAT_KWARGS,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
     shape += [ploidy]
     dimensions += ["ploidy"]
     a = root.empty(
-        "call_genotype",
+        name="call_genotype",
         dtype="i1",
         shape=list(shape),
         chunks=list(chunks),
         compressor=default_compressor,
+        **ZARR_FORMAT_KWARGS,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
     a = root.empty(
-        "call_genotype_mask",
+        name="call_genotype_mask",
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
         compressor=default_compressor,
+        **ZARR_FORMAT_KWARGS,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -154,7 +160,7 @@ def convert(
     num_slices = max(1, worker_processes * 4)
     slices = core.chunk_aligned_slices(a, num_slices)
 
-    total_chunks = sum(a.nchunks for a in root.values())
+    total_chunks = sum(a.nchunks for _, a in root.arrays())
 
     progress_config = core.ProgressConfig(
         total=total_chunks, title="Convert", units="chunks", show=show_progress
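A small illustration of why the comprehension now unpacks a tuple: Group.arrays() yields (name, array) pairs. A sketch written against the zarr-python 2 API (toy store path):

    import zarr

    root = zarr.open_group("toy.zarr", mode="w")  # hypothetical path
    root.zeros("x", shape=(10,), chunks=(5,))
    print(sum(a.nchunks for _, a in root.arrays()))  # 2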
@@ -171,8 +177,7 @@ def convert(
 # FIXME do this more efficiently - currently reading the whole thing
 # in for convenience, and also comparing call-by-call
 def validate(bed_path, zarr_path):
-    store = zarr.DirectoryStore(zarr_path)
-    root = zarr.group(store=store)
+    root = zarr.open(store=zarr_path, mode="r")
     call_genotype = root["call_genotype"][:]
 
     bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
bio2zarr/vcf2zarr/icf.py

@@ -41,7 +41,7 @@ class VcfFieldSummary(core.JsonDataclass):
         return VcfFieldSummary(**d)
 
 
-@dataclasses.dataclass
+@dataclasses.dataclass(order=True)
 class VcfField:
     category: str
     name: str
@@ -110,7 +110,7 @@ class VcfPartition:
     num_records: int = -1
 
 
-ICF_METADATA_FORMAT_VERSION = "0.3"
+ICF_METADATA_FORMAT_VERSION = "0.4"
 ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
     cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
 )
@@ -192,6 +192,16 @@ class IcfMetadata(core.JsonDataclass):
         d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)
 
+    def __eq__(self, other):
+        if not isinstance(other, IcfMetadata):
+            return NotImplemented
+        return (
+            self.samples == other.samples
+            and self.contigs == other.contigs
+            and self.filters == other.filters
+            and sorted(self.fields) == sorted(other.fields)
+        )
+
 
 def fixed_vcf_field_definitions():
     def make_field_def(name, vcf_type, vcf_number):
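The sorted() comparison relies on the dataclass(order=True) change to VcfField earlier in this diff, and it is what makes the equality check insensitive to header declaration order (the out-of-order header fix, #322). A standalone illustration with a toy dataclass:

    import dataclasses

    @dataclasses.dataclass(order=True)
    class Field:  # stand-in for VcfField
        category: str
        name: str

    a = [Field("INFO", "DP"), Field("FORMAT", "GT")]
    b = [Field("FORMAT", "GT"), Field("INFO", "DP")]
    assert a != b                     # list equality is order-sensitive
    assert sorted(a) == sorted(b)     # sorted comparison ignores ordering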
@@ -212,6 +222,7 @@ def fixed_vcf_field_definitions():
         make_field_def("FILTERS", "String", "."),
         make_field_def("REF", "String", "1"),
         make_field_def("ALT", "String", "."),
+        make_field_def("rlen", "Integer", "1"),  # computed field
     ]
     return fields
 
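rlen is computed rather than parsed: the writer change later in this file appends variant.end - variant.start. Since cyvcf2 exposes 0-based, half-open coordinates, this is the record's length on the reference. A toy illustration with made-up coordinates:

    start, end = 100, 105  # hypothetical cyvcf2 Variant.start / Variant.end
    rlen = end - start
    print(rlen)  # 5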
@@ -240,7 +251,7 @@ def scan_vcf(path, target_num_partitions):
     for h in vcf.header_iter():
         if h["HeaderType"] in ["INFO", "FORMAT"]:
             field = VcfField.from_header(h)
-            if field.name == "GT":
+            if h["HeaderType"] == "FORMAT" and field.name == "GT":
                 field.vcf_type = "Integer"
                 field.vcf_number = "."
             fields.append(field)
@@ -300,7 +311,11 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     )
     with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
         for path in paths:
-            pwm.submit(scan_vcf, path, max(1, target_num_partitions // len(paths)))
+            pwm.submit(
+                scan_vcf,
+                path,
+                max(1, target_num_partitions // len(paths)),
+            )
         results = list(pwm.results_as_completed())
 
     # Sort to make the ordering deterministic
@@ -408,7 +423,7 @@ def sanitise_value_float_1d(buff, j, value):
     if value is None:
         buff[j] = constants.FLOAT32_MISSING
     else:
-        value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
+        value = np.array(value, ndmin=1, dtype=buff.dtype, copy=True)
         # numpy will map None values to Nan, but we need a
         # specific NaN
         value[np.isnan(value)] = constants.FLOAT32_MISSING
@@ -422,7 +437,7 @@ def sanitise_value_float_2d(buff, j, value):
         buff[j] = constants.FLOAT32_MISSING
     else:
         # print("value = ", value)
-        value = np.array(value, ndmin=2, dtype=buff.dtype, copy=False)
+        value = np.array(value, ndmin=2, dtype=buff.dtype, copy=True)
         buff[j] = constants.FLOAT32_FILL
         buff[j, :, : value.shape[1]] = value
 
@@ -432,7 +447,7 @@ def sanitise_int_array(value, ndmin, dtype):
         value = [
             constants.VCF_INT_MISSING if x is None else x for x in value
         ]  # NEEDS TEST
-    value = np.array(value, ndmin=ndmin, copy=False)
+    value = np.array(value, ndmin=ndmin, copy=True)
     value[value == constants.VCF_INT_MISSING] = -1
     value[value == constants.VCF_INT_FILL] = -2
     # TODO watch out for clipping here!
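The copy=False to copy=True changes throughout this file track a NumPy 2 semantic change: copy=False now means "never copy" and raises when a copy is unavoidable, where NumPy 1 treated it as "copy only if needed". A small sketch of the difference:

    import numpy as np

    a = np.arange(3)
    np.array(a, copy=False)            # fine in both: no copy is needed
    np.array([1, 2, 3], copy=True)     # always safe; the form now used here
    # np.array([1, 2, 3], copy=False)  # NumPy 2: raises ValueError (copy required)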
@@ -494,15 +509,15 @@ class VcfValueTransformer:
     def transform(self, vcf_value):
         if isinstance(vcf_value, tuple):
             vcf_value = [self.missing if v is None else v for v in vcf_value]
-        value = np.array(vcf_value, ndmin=self.dimension, copy=False)
+        value = np.array(vcf_value, ndmin=self.dimension, copy=True)
         return value
 
     def transform_and_update_bounds(self, vcf_value):
         if vcf_value is None:
             return None
+        # print(self, self.field.full_name, "T", vcf_value)
         value = self.transform(vcf_value)
         self.update_bounds(value)
-        # print(self.field.full_name, "T", vcf_value, "->", value)
         return value
 
 
@@ -531,13 +546,15 @@ class FloatValueTransformer(VcfValueTransformer):
 class StringValueTransformer(VcfValueTransformer):
     def update_bounds(self, value):
         summary = self.field.summary
-        number = value.shape[-1]
+        if self.field.category == "FORMAT":
+            number = max(len(v) for v in value)
+        else:
+            number = value.shape[-1]
         # TODO would be nice to report string lengths, but not
         # really necessary.
         summary.max_number = max(summary.max_number, number)
 
     def transform(self, vcf_value):
-        # print("transform", vcf_value)
         if self.dimension == 1:
             value = np.array(list(vcf_value.split(",")))
         else:
@@ -853,11 +870,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
     def summary_table(self):
         data = []
-        for name, col in self.fields.items():
-            summary = col.vcf_field.summary
+        for name, icf_field in self.fields.items():
+            summary = icf_field.vcf_field.summary
             d = {
                 "name": name,
-                "type": col.vcf_field.vcf_type,
+                "type": icf_field.vcf_field.vcf_type,
                 "chunks": summary.num_chunks,
                 "size": core.display_size(summary.uncompressed_size),
                 "compressed": core.display_size(summary.compressed_size),
@@ -962,7 +979,7 @@ class IntermediateColumnarFormatWriter:
         compressor=None,
     ):
         if self.path.exists():
-            raise ValueError("ICF path already exists")
+            raise ValueError(f"ICF path already exists: {self.path}")
         if compressor is None:
             compressor = ICF_DEFAULT_COMPRESSOR
         vcfs = [pathlib.Path(vcf) for vcf in vcfs]
@@ -1009,8 +1026,8 @@ class IntermediateColumnarFormatWriter:
         self.path.mkdir()
         self.wip_path.mkdir()
         for field in self.metadata.fields:
-            col_path = get_vcf_field_path(self.path, field)
-            col_path.mkdir(parents=True)
+            field_path = get_vcf_field_path(self.path, field)
+            field_path.mkdir(parents=True)
 
     def load_partition_summaries(self):
         summaries = []
@@ -1074,13 +1091,19 @@ class IntermediateColumnarFormatWriter:
                 tcw.append("FILTERS", variant.FILTERS)
                 tcw.append("REF", variant.REF)
                 tcw.append("ALT", variant.ALT)
+                tcw.append("rlen", variant.end - variant.start)
                 for field in info_fields:
                     tcw.append(field.full_name, variant.INFO.get(field.name, None))
                 if has_gt:
-                    tcw.append("FORMAT/GT", variant.genotype.array())
+                    if variant.genotype is None:
+                        val = None
+                    else:
+                        val = variant.genotype.array()
+                    tcw.append("FORMAT/GT", val)
                 for field in format_fields:
                     val = variant.format(field.name)
                     tcw.append(field.full_name, val)
+
                 # Note: an issue with updating the progress per variant here like
                 # this is that we get a significant pause at the end of the counter
                 # while all the "small" fields get flushed. Possibly not much to be