cpg-utils 5.3.0__tar.gz → 5.4.0__tar.gz

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (28):
  1. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/PKG-INFO +1 -1
  2. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/cromwell.py +5 -7
  3. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/hail_batch.py +23 -2
  4. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils.egg-info/PKG-INFO +1 -1
  5. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/setup.py +1 -1
  6. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/LICENSE +0 -0
  7. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/README.md +0 -0
  8. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/__init__.py +0 -0
  9. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/cloud.py +0 -0
  10. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/cloudpath_hail_az.py +0 -0
  11. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/config.py +0 -0
  12. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/constants.py +0 -0
  13. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/cromwell_model.py +0 -0
  14. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/dataproc.py +0 -0
  15. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/git.py +0 -0
  16. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/membership.py +0 -0
  17. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/py.typed +0 -0
  18. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/slack.py +0 -0
  19. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils.egg-info/SOURCES.txt +0 -0
  20. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils.egg-info/dependency_links.txt +0 -0
  21. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils.egg-info/requires.txt +0 -0
  22. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils.egg-info/top_level.txt +0 -0
  23. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/pyproject.toml +0 -0
  24. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/setup.cfg +0 -0
  25. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/test/__init__.py +0 -0
  26. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/test/test_config.py +0 -0
  27. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/test/test_cromwell.py +0 -0
  28. {cpg_utils-5.3.0 → cpg_utils-5.4.0}/test/test_doctests.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cpg-utils
3
- Version: 5.3.0
3
+ Version: 5.4.0
4
4
  Summary: Library of convenience functions specific to the CPG
5
5
  Home-page: https://github.com/populationgenomics/cpg-utils
6
6
  License: MIT
@@ -42,7 +42,7 @@ class CromwellBackend(Enum):
42
42
  pipelines_api = 'papi'
43
43
 
44
44
 
45
- DEFAULT_BACKEND = CromwellBackend.pipelines_api
45
+ DEFAULT_BACKEND = CromwellBackend.batch
46
46
 
47
47
 
48
48
  class CromwellOutputType:
@@ -643,7 +643,8 @@ def watch_workflow_and_get_output(
643
643
  array_length = output.array_length
644
644
  if array_length is None:
645
645
  # is single
646
- j = b.new_job(f'{job_prefix}_collect_{output_name}')
646
+ j = b.new_bash_job(f'{job_prefix}_collect_{output_name}')
647
+ j.image(driver_image)
647
648
  if output.resource_group:
648
649
  # is single resource group
649
650
  out_file_map[oname] = _copy_resource_group_into_batch(
@@ -660,13 +661,13 @@ def watch_workflow_and_get_output(
660
661
  output_name=output_name,
661
662
  idx=None,
662
663
  copy_file_into_batch=output.copy_file_into_batch,
663
- driver_image=driver_image,
664
664
  )
665
665
  else:
666
666
  # is array
667
667
  outs: list[Resource] = []
668
668
  for idx in range(array_length):
669
- j = b.new_job(f'{job_prefix}_collect_{output_name}[{idx}]')
669
+ j = b.new_bash_job(f'{job_prefix}_collect_{output_name}[{idx}]')
670
+ j.image(driver_image)
670
671
  if output.resource_group:
671
672
  # is array output group
672
673
  outs.append(
@@ -685,7 +686,6 @@ def watch_workflow_and_get_output(
685
686
  output_name=output_name,
686
687
  idx=idx,
687
688
  copy_file_into_batch=output.copy_file_into_batch,
688
- driver_image=driver_image,
689
689
  ),
690
690
  )
691
691
 
@@ -701,7 +701,6 @@ def _copy_basic_file_into_batch(
701
701
  output_name: str,
702
702
  idx: int | None,
703
703
  copy_file_into_batch: bool,
704
- driver_image: str,
705
704
  ) -> Resource:
706
705
  """
707
706
  1. Take the file-pointer to the dictionary `rdict`,
@@ -725,7 +724,6 @@ def _copy_basic_file_into_batch(
725
724
  jq_el = f'"{output_name}"[{idx}]'
726
725
 
727
726
  # activate to gcloud storage cp
728
- j.image(driver_image)
729
727
  j.env('GOOGLE_APPLICATION_CREDENTIALS', '/gsa-key/key.json')
730
728
  j.command(GCLOUD_ACTIVATE_AUTH)
731
729
 
@@ -1,6 +1,8 @@
1
1
  """Convenience functions related to Hail."""
2
2
 
3
3
  import asyncio
4
+ import base64
5
+ import gzip
4
6
  import inspect
5
7
  import logging
6
8
  import os
@@ -156,6 +158,23 @@ class Batch(hb.Batch):
156
158
  toml.dump(dict(get_config()), f)
157
159
  set_config_paths([str(config_path)])
158
160
 
161
+ def _pack_attribute(self, key: str, value: str) -> dict[str, str]:
162
+ """
163
+ Attributes are stored in a TEXT database field, which is limited to 64K.
164
+ If necessary, compress the value and annotate the key accordingly.
165
+ Eventually this may no longer suffice and we will need to split the value
166
+ across several attributes or similar.
167
+ """
168
+ if len(value) <= 10000: # noqa: PLR2004
169
+ return {key: value} # Store short values verbatim
170
+
171
+ raw = value.encode()
172
+ compressed_b64 = base64.standard_b64encode(gzip.compress(raw, compresslevel=9))
173
+ if len(compressed_b64) > 65535: # noqa: PLR2004
174
+ raise ValueError(f'Job attribute {key!r} value is too large')
175
+
176
+ return {f'{key}_gzip': compressed_b64.decode('ascii')}
177
+
159
178
  def _process_job_attributes(
160
179
  self,
161
180
  name: str | None = None,
@@ -175,7 +194,7 @@ class Batch(hb.Batch):
175
194
  dataset = attributes.get('dataset')
176
195
  sequencing_group = attributes.get('sequencing_group')
177
196
  participant_id = attributes.get('participant_id')
178
- sequencing_groups: set[str] = set(attributes.get('sequencing_groups') or [])
197
+ sequencing_groups: set[str] = set(attributes.pop('sequencing_groups', []) or [])
179
198
  if sequencing_group:
180
199
  sequencing_groups.add(sequencing_group)
181
200
  part = attributes.get('part')
@@ -215,7 +234,9 @@ class Batch(hb.Batch):
215
234
  self.job_by_tool[tool]['job_n'] += 1
216
235
  self.job_by_tool[tool]['sequencing_groups'] |= sequencing_groups
217
236
 
218
- attributes['sequencing_groups'] = sorted(sequencing_groups)
237
+ seqgroups_str = str(sorted(sequencing_groups))
238
+ attributes.update(self._pack_attribute('sequencing_groups', seqgroups_str))
239
+
219
240
  fixed_attrs = {k: str(v) for k, v in attributes.items()}
220
241
  return name, fixed_attrs
221
242
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cpg-utils
3
- Version: 5.3.0
3
+ Version: 5.4.0
4
4
  Summary: Library of convenience functions specific to the CPG
5
5
  Home-page: https://github.com/populationgenomics/cpg-utils
6
6
  License: MIT
@@ -8,7 +8,7 @@ with open('README.md') as f:
8
8
  setup(
9
9
  name='cpg-utils',
10
10
  # This tag is automatically updated by bumpversion
11
- version='5.3.0',
11
+ version='5.4.0',
12
12
  description='Library of convenience functions specific to the CPG',
13
13
  long_description=long_description,
14
14
  long_description_content_type='text/markdown',
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes