PyPI - cpg-utils - Versions diffs - 5.3.0__tar.gz → 5.4.0__tar.gz - Mend

cpg-utils 5.3.0 (tar.gz) → 5.4.0 (tar.gz)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

{cpg_utils-5.3.0 → cpg_utils-5.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cpg-utils
-Version: 5.3.0
+Version: 5.4.0
 Summary: Library of convenience functions specific to the CPG
 Home-page: https://github.com/populationgenomics/cpg-utils
 License: MIT

{cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/cromwell.py RENAMED Viewed

@@ -42,7 +42,7 @@ class CromwellBackend(Enum):
     pipelines_api = 'papi'
-DEFAULT_BACKEND = CromwellBackend.pipelines_api
+DEFAULT_BACKEND = CromwellBackend.batch
 class CromwellOutputType:
@@ -643,7 +643,8 @@ def watch_workflow_and_get_output(
         array_length = output.array_length
         if array_length is None:
             # is single
-            j = b.new_job(f'{job_prefix}_collect_{output_name}')
+            j = b.new_bash_job(f'{job_prefix}_collect_{output_name}')
+            j.image(driver_image)
             if output.resource_group:
                 # is single resource group
                 out_file_map[oname] = _copy_resource_group_into_batch(
@@ -660,13 +661,13 @@ def watch_workflow_and_get_output(
                     output_name=output_name,
                     idx=None,
                     copy_file_into_batch=output.copy_file_into_batch,
-                    driver_image=driver_image,
                 )
         else:
             # is array
             outs: list[Resource] = []
             for idx in range(array_length):
-                j = b.new_job(f'{job_prefix}_collect_{output_name}[{idx}]')
+                j = b.new_bash_job(f'{job_prefix}_collect_{output_name}[{idx}]')
+                j.image(driver_image)
                 if output.resource_group:
                     # is array output group
                     outs.append(
@@ -685,7 +686,6 @@ def watch_workflow_and_get_output(
                             output_name=output_name,
                             idx=idx,
                             copy_file_into_batch=output.copy_file_into_batch,
-                            driver_image=driver_image,
                         ),
                     )
@@ -701,7 +701,6 @@ def _copy_basic_file_into_batch(
     output_name: str,
     idx: int | None,
     copy_file_into_batch: bool,
-    driver_image: str,
 ) -> Resource:
     """
     1. Take the file-pointer to the dictionary `rdict`,
@@ -725,7 +724,6 @@ def _copy_basic_file_into_batch(
         jq_el = f'"{output_name}"[{idx}]'
     # activate to gcloud storage cp
-    j.image(driver_image)
     j.env('GOOGLE_APPLICATION_CREDENTIALS', '/gsa-key/key.json')
     j.command(GCLOUD_ACTIVATE_AUTH)

{cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils/hail_batch.py RENAMED Viewed

@@ -1,6 +1,8 @@
 """Convenience functions related to Hail."""
 import asyncio
+import base64
+import gzip
 import inspect
 import logging
 import os
@@ -156,6 +158,23 @@ class Batch(hb.Batch):
             toml.dump(dict(get_config()), f)
         set_config_paths([str(config_path)])
+    def _pack_attribute(self, key: str, value: str) -> dict[str, str]:
+        """
+        Attributes are stored in a TEXT database field, which is limited to 64K.
+        If necessary, compress the value and annotate the key accordingly.
+        Eventually this may no longer suffice and we will need to split the value
+        across several attributes or similar.
+        """
+        if len(value) <= 10000:  # noqa: PLR2004
+            return {key: value}  # Store short values verbatim
+        raw = value.encode()
+        compressed_b64 = base64.standard_b64encode(gzip.compress(raw, compresslevel=9))
+        if len(compressed_b64) > 65535:  # noqa: PLR2004
+            raise ValueError(f'Job attribute {key!r} value is too large')
+        return {f'{key}_gzip': compressed_b64.decode('ascii')}
     def _process_job_attributes(
         self,
         name: str | None = None,
@@ -175,7 +194,7 @@ class Batch(hb.Batch):
         dataset = attributes.get('dataset')
         sequencing_group = attributes.get('sequencing_group')
         participant_id = attributes.get('participant_id')
-        sequencing_groups: set[str] = set(attributes.get('sequencing_groups') or [])
+        sequencing_groups: set[str] = set(attributes.pop('sequencing_groups', []) or [])
         if sequencing_group:
             sequencing_groups.add(sequencing_group)
         part = attributes.get('part')
@@ -215,7 +234,9 @@ class Batch(hb.Batch):
         self.job_by_tool[tool]['job_n'] += 1
         self.job_by_tool[tool]['sequencing_groups'] |= sequencing_groups
-        attributes['sequencing_groups'] = sorted(sequencing_groups)
+        seqgroups_str = str(sorted(sequencing_groups))
+        attributes.update(self._pack_attribute('sequencing_groups', seqgroups_str))
         fixed_attrs = {k: str(v) for k, v in attributes.items()}
         return name, fixed_attrs

{cpg_utils-5.3.0 → cpg_utils-5.4.0}/cpg_utils.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cpg-utils
-Version: 5.3.0
+Version: 5.4.0
 Summary: Library of convenience functions specific to the CPG
 Home-page: https://github.com/populationgenomics/cpg-utils
 License: MIT

{cpg_utils-5.3.0 → cpg_utils-5.4.0}/setup.py RENAMED Viewed

@@ -8,7 +8,7 @@ with open('README.md') as f:
 setup(
     name='cpg-utils',
     # This tag is automatically updated by bumpversion
-    version='5.3.0',
+    version='5.4.0',
     description='Library of convenience functions specific to the CPG',
     long_description=long_description,
     long_description_content_type='text/markdown',