cpg-utils 5.3.1__tar.gz → 5.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/PKG-INFO +1 -1
  2. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/hail_batch.py +23 -2
  3. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils.egg-info/PKG-INFO +1 -1
  4. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/setup.py +1 -1
  5. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/LICENSE +0 -0
  6. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/README.md +0 -0
  7. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/__init__.py +0 -0
  8. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/cloud.py +0 -0
  9. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/cloudpath_hail_az.py +0 -0
  10. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/config.py +0 -0
  11. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/constants.py +0 -0
  12. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/cromwell.py +0 -0
  13. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/cromwell_model.py +0 -0
  14. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/dataproc.py +0 -0
  15. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/git.py +0 -0
  16. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/membership.py +0 -0
  17. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/py.typed +0 -0
  18. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils/slack.py +0 -0
  19. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils.egg-info/SOURCES.txt +0 -0
  20. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils.egg-info/dependency_links.txt +0 -0
  21. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils.egg-info/requires.txt +0 -0
  22. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/cpg_utils.egg-info/top_level.txt +0 -0
  23. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/pyproject.toml +0 -0
  24. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/setup.cfg +0 -0
  25. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/test/__init__.py +0 -0
  26. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/test/test_config.py +0 -0
  27. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/test/test_cromwell.py +0 -0
  28. {cpg_utils-5.3.1 → cpg_utils-5.4.0}/test/test_doctests.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cpg-utils
3
- Version: 5.3.1
3
+ Version: 5.4.0
4
4
  Summary: Library of convenience functions specific to the CPG
5
5
  Home-page: https://github.com/populationgenomics/cpg-utils
6
6
  License: MIT
@@ -1,6 +1,8 @@
1
1
  """Convenience functions related to Hail."""
2
2
 
3
3
  import asyncio
4
+ import base64
5
+ import gzip
4
6
  import inspect
5
7
  import logging
6
8
  import os
@@ -156,6 +158,23 @@ class Batch(hb.Batch):
156
158
  toml.dump(dict(get_config()), f)
157
159
  set_config_paths([str(config_path)])
158
160
 
161
+ def _pack_attribute(self, key: str, value: str) -> dict[str, str]:
162
+ """
163
+ Attributes are stored in a TEXT database field, which is limited to 64K.
164
+ If necessary, compress the value and annotate the key accordingly.
165
+ Eventually this may no longer suffice and we will need to split the value
166
+ across several attributes or similar.
167
+ """
168
+ if len(value) <= 10000: # noqa: PLR2004
169
+ return {key: value} # Store short values verbatim
170
+
171
+ raw = value.encode()
172
+ compressed_b64 = base64.standard_b64encode(gzip.compress(raw, compresslevel=9))
173
+ if len(compressed_b64) > 65535: # noqa: PLR2004
174
+ raise ValueError(f'Job attribute {key!r} value is too large')
175
+
176
+ return {f'{key}_gzip': compressed_b64.decode('ascii')}
177
+
159
178
  def _process_job_attributes(
160
179
  self,
161
180
  name: str | None = None,
@@ -175,7 +194,7 @@ class Batch(hb.Batch):
175
194
  dataset = attributes.get('dataset')
176
195
  sequencing_group = attributes.get('sequencing_group')
177
196
  participant_id = attributes.get('participant_id')
178
- sequencing_groups: set[str] = set(attributes.get('sequencing_groups') or [])
197
+ sequencing_groups: set[str] = set(attributes.pop('sequencing_groups', []) or [])
179
198
  if sequencing_group:
180
199
  sequencing_groups.add(sequencing_group)
181
200
  part = attributes.get('part')
@@ -215,7 +234,9 @@ class Batch(hb.Batch):
215
234
  self.job_by_tool[tool]['job_n'] += 1
216
235
  self.job_by_tool[tool]['sequencing_groups'] |= sequencing_groups
217
236
 
218
- attributes['sequencing_groups'] = sorted(sequencing_groups)
237
+ seqgroups_str = str(sorted(sequencing_groups))
238
+ attributes.update(self._pack_attribute('sequencing_groups', seqgroups_str))
239
+
219
240
  fixed_attrs = {k: str(v) for k, v in attributes.items()}
220
241
  return name, fixed_attrs
221
242
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cpg-utils
3
- Version: 5.3.1
3
+ Version: 5.4.0
4
4
  Summary: Library of convenience functions specific to the CPG
5
5
  Home-page: https://github.com/populationgenomics/cpg-utils
6
6
  License: MIT
@@ -8,7 +8,7 @@ with open('README.md') as f:
8
8
  setup(
9
9
  name='cpg-utils',
10
10
  # This tag is automatically updated by bumpversion
11
- version='5.3.1',
11
+ version='5.4.0',
12
12
  description='Library of convenience functions specific to the CPG',
13
13
  long_description=long_description,
14
14
  long_description_content_type='text/markdown',
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes