cpg-utils 4.19.2__tar.gz → 5.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/PKG-INFO +1 -2
  2. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils/__init__.py +2 -3
  3. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils/cloud.py +69 -72
  4. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils/cloudpath_hail_az.py +9 -8
  5. cpg-utils-5.0.0/cpg_utils/config.py +548 -0
  6. cpg-utils-5.0.0/cpg_utils/constants.py +37 -0
  7. cpg-utils-5.0.0/cpg_utils/cromwell.py +776 -0
  8. cpg-utils-5.0.0/cpg_utils/cromwell_model.py +438 -0
  9. cpg-utils-5.0.0/cpg_utils/dataproc.py +408 -0
  10. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils/git.py +89 -65
  11. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils/hail_batch.py +259 -307
  12. cpg-utils-5.0.0/cpg_utils/membership.py +41 -0
  13. cpg-utils-5.0.0/cpg_utils/py.typed +0 -0
  14. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils/slack.py +21 -10
  15. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils.egg-info/PKG-INFO +1 -2
  16. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils.egg-info/SOURCES.txt +9 -1
  17. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils.egg-info/requires.txt +5 -6
  18. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils.egg-info/top_level.txt +1 -0
  19. cpg-utils-5.0.0/pyproject.toml +55 -0
  20. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/setup.py +11 -16
  21. cpg-utils-5.0.0/test/__init__.py +0 -0
  22. cpg-utils-5.0.0/test/test_config.py +198 -0
  23. cpg-utils-5.0.0/test/test_cromwell.py +88 -0
  24. cpg-utils-5.0.0/test/test_doctests.py +22 -0
  25. cpg-utils-4.19.2/cpg_utils/config.py +0 -205
  26. cpg-utils-4.19.2/pyproject.toml +0 -3
  27. cpg-utils-4.19.2/test/test_config.py +0 -36
  28. cpg-utils-4.19.2/test/test_hail_batch.py +0 -163
  29. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/LICENSE +0 -0
  30. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/README.md +0 -0
  31. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/cpg_utils.egg-info/dependency_links.txt +0 -0
  32. {cpg-utils-4.19.2 → cpg-utils-5.0.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cpg-utils
3
- Version: 4.19.2
3
+ Version: 5.0.0
4
4
  Summary: Library of convenience functions specific to the CPG
5
5
  Home-page: https://github.com/populationgenomics/cpg-utils
6
6
  License: MIT
@@ -16,7 +16,6 @@ Classifier: Programming Language :: Python
16
16
  Classifier: Topic :: Scientific/Engineering
17
17
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
18
  Description-Content-Type: text/markdown
19
- Provides-Extra: test
20
19
  License-File: LICENSE
21
20
 
22
21
  # cpg-utils
@@ -1,13 +1,12 @@
1
1
  """
2
2
  CPG utils
3
3
  """
4
+
4
5
  import pathlib
5
- from typing import Union
6
6
 
7
7
  from cloudpathlib import CloudPath
8
8
  from cloudpathlib.anypath import to_anypath
9
9
 
10
-
11
10
  # The AnyPath class https://cloudpathlib.drivendata.org/stable/anypath-polymorphism/
12
11
  # is very handy to parse a string that can be either a cloud URL or a local posix path.
13
12
  # However, AnyPath can't be used for type hinting, because neither CloudPath nor
@@ -15,7 +14,7 @@ from cloudpathlib.anypath import to_anypath
15
14
  # an instance of AnyPath class, but rather Union[CloudPath, pathlib.Path], and it's
16
15
  # designed to dynamically pick a specific CloudPath or pathlib.Path subclass.
17
16
  # Here we create an alias for such union to allow using simple "Path" in type hints:
18
- Path = Union[CloudPath, pathlib.Path]
17
+ Path = CloudPath | pathlib.Path
19
18
 
20
19
  # We would still need to call AnyPath() to parse a string, which might be confusing.
21
20
  # Something like to_path() would look better, so we are aliasing a handy method
@@ -5,29 +5,30 @@ import os
5
5
  import re
6
6
  import subprocess
7
7
  import traceback
8
-
9
- from cloudpathlib import AnyPath
8
+ from typing import Any
10
9
 
11
10
  # pylint: disable=no-name-in-module
12
11
  import google.api_core.exceptions
13
12
  import google.auth.transport
14
13
  import google.oauth2
14
+ from deprecated import deprecated
15
15
  from google.auth import (
16
16
  credentials as google_auth_credentials,
17
+ )
18
+ from google.auth import (
17
19
  environment_vars,
18
20
  exceptions,
21
+ jwt,
19
22
  )
20
- from google.auth import jwt
21
23
  from google.auth._default import (
22
24
  _AUTHORIZED_USER_TYPE,
23
- _SERVICE_ACCOUNT_TYPE,
24
25
  _EXTERNAL_ACCOUNT_TYPE,
26
+ _SERVICE_ACCOUNT_TYPE,
25
27
  )
26
28
  from google.auth.transport import requests
27
29
  from google.cloud import secretmanager
28
- from google.oauth2 import credentials as oauth2_credentials, service_account
29
-
30
- from cpg_utils.config import get_config
30
+ from google.oauth2 import credentials as oauth2_credentials
31
+ from google.oauth2 import service_account
31
32
 
32
33
  _CLOUD_SDK_MISSING_CREDENTIALS = """\
33
34
  Your default credentials were not found. To set up Application Default Credentials, \
@@ -44,7 +45,8 @@ IMPLEMENTED_CREDENTIALS_TYPES = (
44
45
  def email_from_id_token(id_token_jwt: str) -> str:
45
46
  """Decodes the ID token (JWT) to get the email address of the caller.
46
47
 
47
- See http://bit.ly/2YAIkzy for details.
48
+ See for details
49
+ https://developers.google.com/identity/sign-in/web/backend-auth?authuser=0#verify-the-integrity-of-the-id-token
48
50
 
49
51
  This function assumes that the token has been verified beforehand."""
50
52
 
@@ -107,7 +109,7 @@ def write_secret(project_id: str, secret_name: str, secret_value: str) -> None:
107
109
  request={
108
110
  'parent': secret_path,
109
111
  'payload': {'data': secret_value.encode('UTF-8')},
110
- }
112
+ },
111
113
  )
112
114
 
113
115
  # Disable all previous versions.
@@ -122,7 +124,8 @@ def write_secret(project_id: str, secret_name: str, secret_value: str) -> None:
122
124
 
123
125
 
124
126
  def get_google_identity_token(
125
- target_audience: str | None, request: google.auth.transport.Request = None
127
+ target_audience: str | None,
128
+ request: google.auth.transport.Request | None = None,
126
129
  ) -> str:
127
130
  """Returns a Google identity token for the given audience."""
128
131
  if request is None:
@@ -133,7 +136,10 @@ def get_google_identity_token(
133
136
  # https://github.com/googleapis/google-auth-library-python/issues/590
134
137
  creds = _get_default_id_token_credentials(target_audience, request)
135
138
  creds.refresh(request)
136
- return creds.token
139
+ token = creds.token
140
+ if not token:
141
+ raise ValueError('Could not generate google identity token')
142
+ return token
137
143
 
138
144
 
139
145
  class IDTokenCredentialsAdapter(google_auth_credentials.Credentials):
@@ -149,7 +155,7 @@ class IDTokenCredentialsAdapter(google_auth_credentials.Credentials):
149
155
  """Returns the expired property."""
150
156
  return self.credentials.expired
151
157
 
152
- def refresh(self, request):
158
+ def refresh(self, request: google.auth.transport.Request):
153
159
  """Refreshes the token."""
154
160
  self.credentials.refresh(request)
155
161
  self.token = self.credentials.id_token
@@ -167,18 +173,18 @@ class ExternalCredentialsAdapter(google_auth_credentials.Credentials):
167
173
  impersonate_id: str | None = None,
168
174
  ):
169
175
  super().__init__()
170
- self.token = None
176
+ self.token: str | None = None
171
177
  self.audience = audience
172
178
  impersonate_id = impersonate_id or os.environ.get('GOOGLE_IMPERSONATE_IDENTITY')
173
179
  if not impersonate_id:
174
180
  raise exceptions.DefaultCredentialsError(
175
- f'GOOGLE_IMPERSONATE_IDENTITY environment variable is not set. '
176
- f'Cannot impersonate service account.'
181
+ 'GOOGLE_IMPERSONATE_IDENTITY environment variable is not set. '
182
+ 'Cannot impersonate service account.',
177
183
  )
178
184
 
179
185
  self.impersonate_id = impersonate_id
180
186
 
181
- def refresh(self, *args, **kwargs): # pylint: disable=unused-argument
187
+ def refresh(self, *args: Any, **kwargs: Any): # noqa: ARG002
182
188
  """Call gcloud to get a new token."""
183
189
  command = [
184
190
  'gcloud',
@@ -189,12 +195,14 @@ class ExternalCredentialsAdapter(google_auth_credentials.Credentials):
189
195
  ]
190
196
  if self.audience:
191
197
  command.append(f'--audiences={self.audience}')
192
-
193
- self.token = subprocess.check_output(command).decode('utf-8').strip()
198
+ self.token = (
199
+ subprocess.check_output(command).decode('utf-8').strip() # noqa: S603
200
+ )
194
201
 
195
202
 
196
203
  def _load_credentials_from_file(
197
- filename: str, target_audience: str | None
204
+ filename: str,
205
+ target_audience: str | None,
198
206
  ) -> google_auth_credentials.Credentials | None:
199
207
  """
200
208
  Loads credentials from a file.
@@ -212,7 +220,7 @@ def _load_credentials_from_file(
212
220
  info = json.load(file_obj)
213
221
  except json.JSONDecodeError as exc:
214
222
  raise exceptions.DefaultCredentialsError(
215
- f'File {filename} is not a valid json file.'
223
+ f'File {filename} is not a valid json file.',
216
224
  ) from exc
217
225
 
218
226
  # The type key should indicate that the file is either a service account
@@ -221,33 +229,29 @@ def _load_credentials_from_file(
221
229
 
222
230
  if credential_type == _AUTHORIZED_USER_TYPE:
223
231
  current_credentials = oauth2_credentials.Credentials.from_authorized_user_info(
224
- info, scopes=['openid', 'https://www.googleapis.com/auth/userinfo.email']
232
+ info,
233
+ scopes=['openid', 'https://www.googleapis.com/auth/userinfo.email'],
225
234
  )
226
- current_credentials = IDTokenCredentialsAdapter(credentials=current_credentials)
227
-
228
- return current_credentials
235
+ return IDTokenCredentialsAdapter(credentials=current_credentials)
229
236
 
230
237
  if credential_type == _SERVICE_ACCOUNT_TYPE:
231
238
  try:
232
239
  return service_account.IDTokenCredentials.from_service_account_info(
233
- info, target_audience=target_audience
240
+ info,
241
+ target_audience=target_audience,
234
242
  )
235
243
  except ValueError as exc:
236
244
  raise exceptions.DefaultCredentialsError(
237
- f'Failed to load service account credentials from {filename}'
245
+ f'Failed to load service account credentials from {filename}',
238
246
  ) from exc
239
247
 
240
248
  if credential_type == _EXTERNAL_ACCOUNT_TYPE:
241
- # this one's a bit unfortunate, I can't find the API way to do it
242
- # credentials, _ = _get_external_account_credentials(info, filename=filename)
243
- # credentials._audience = target_audience
244
- # return credentials
245
249
  return ExternalCredentialsAdapter(audience=target_audience)
246
250
 
247
251
  raise exceptions.DefaultCredentialsError(
248
252
  f'The file {filename} does not have a valid type of google-cloud credentials. '
249
253
  f'Type is {credential_type}, but cpg-utils only implements '
250
- f'{IMPLEMENTED_CREDENTIALS_TYPES}.'
254
+ f'{IMPLEMENTED_CREDENTIALS_TYPES}.',
251
255
  )
252
256
 
253
257
 
@@ -260,12 +264,11 @@ def _get_explicit_environ_credentials(
260
264
  if explicit_file is None:
261
265
  return None
262
266
 
263
- current_credentials = _load_credentials_from_file(
264
- os.environ[environment_vars.CREDENTIALS], target_audience=target_audience
267
+ return _load_credentials_from_file(
268
+ os.environ[environment_vars.CREDENTIALS],
269
+ target_audience=target_audience,
265
270
  )
266
271
 
267
- return current_credentials
268
-
269
272
 
270
273
  def _get_gcloud_sdk_credentials(
271
274
  target_audience: str | None,
@@ -279,15 +282,15 @@ def _get_gcloud_sdk_credentials(
279
282
  if not os.path.isfile(credentials_filename):
280
283
  return None
281
284
 
282
- current_credentials = _load_credentials_from_file(
283
- credentials_filename, target_audience
285
+ return _load_credentials_from_file(
286
+ credentials_filename,
287
+ target_audience,
284
288
  )
285
289
 
286
- return current_credentials
287
-
288
290
 
289
291
  def _get_gce_credentials(
290
- target_audience: str | None, request: google.auth.transport.Request | None = None
292
+ target_audience: str | None,
293
+ request: google.auth.transport.Request | None = None,
291
294
  ) -> google_auth_credentials.Credentials | None:
292
295
  """Gets credentials and project ID from the GCE Metadata Service."""
293
296
  # Ping requires a transport, but we want application default credentials
@@ -312,14 +315,17 @@ def _get_gce_credentials(
312
315
 
313
316
  if _metadata.ping(request=request):
314
317
  return compute_engine.IDTokenCredentials(
315
- request, target_audience, use_metadata_identity_endpoint=True
318
+ request,
319
+ target_audience,
320
+ use_metadata_identity_endpoint=True,
316
321
  )
317
322
 
318
323
  return None
319
324
 
320
325
 
321
326
  def _get_default_id_token_credentials(
322
- target_audience: str | None, request: google.auth.transport.Request = None
327
+ target_audience: str | None,
328
+ request: google.auth.transport.Request | None = None,
323
329
  ) -> google_auth_credentials.Credentials:
324
330
  """Gets the default ID Token credentials for the current environment.
325
331
  `Application Default Credentials`_ provides an easy way to obtain credentials to call Google APIs for
@@ -349,35 +355,6 @@ def _get_default_id_token_credentials(
349
355
  raise exceptions.DefaultCredentialsError(_CLOUD_SDK_MISSING_CREDENTIALS)
350
356
 
351
357
 
352
- def get_cached_group_members(
353
- group, members_cache_location: str | None = None
354
- ) -> set[str]:
355
- """
356
- Get cached members of a group, based on the members_cache_location
357
- """
358
- group_name = group.split('@')[0]
359
-
360
- if not members_cache_location:
361
- config = get_config()
362
- members_cache_location = config['infrastructure']['members_cache_location']
363
-
364
- pathname = os.path.join(members_cache_location, group_name + '-members.txt') # type: ignore
365
-
366
- with AnyPath(pathname).open() as f:
367
- return set(line.strip() for line in f.readlines() if line.strip())
368
-
369
-
370
- def is_member_in_cached_group(
371
- group, member, members_cache_location: str | None = None
372
- ) -> bool:
373
- """
374
- Check if a member is in a group, based on the infrastructure config
375
- """
376
- return member.lower() in get_cached_group_members(
377
- group, members_cache_location=members_cache_location
378
- )
379
-
380
-
381
358
  def get_path_components_from_gcp_path(path: str) -> dict[str, str]:
382
359
  """
383
360
  Return the {bucket_name}, {dataset}, {bucket_type}, {subdir}, and {file} for GS only paths
@@ -391,7 +368,7 @@ def get_path_components_from_gcp_path(path: str) -> dict[str, str]:
391
368
  gspath_pattern = re.compile(
392
369
  r'gs://(?P<bucket>cpg-(?P<dataset>[\w-]+)-(?P<bucket_type>['
393
370
  + '|'.join(s for s in bucket_types)
394
- + r']+[-\w]*))/(?P<suffix>.+/)?(?P<file>.*)$'
371
+ + r']+[-\w]*))/(?P<suffix>.+/)?(?P<file>.*)$',
395
372
  )
396
373
 
397
374
  # if a match succeeds, return the key: value dictionary
@@ -400,3 +377,23 @@ def get_path_components_from_gcp_path(path: str) -> dict[str, str]:
400
377
 
401
378
  # raise an error if the input String was not a valid CPG bucket path
402
379
  raise ValueError('The input String did not match a valid GCP path')
380
+
381
+
382
+ def get_project_id_from_service_account_email(service_account_email: str) -> str:
383
+ """
384
+ Get GCP project id from service_account_email
385
+
386
+ >>> get_project_id_from_service_account_email('cromwell-test@tob-wgs.iam.gserviceaccount.com')
387
+ 'tob-wgs'
388
+ """
389
+ # quick and dirty
390
+ return service_account_email.split('@')[-1].split('.')[0]
391
+
392
+
393
+ @deprecated(reason='Use cpg_utils.membership.is_member_in_cached_group instead')
394
+ def is_member_in_cached_group(*args: Any, **kwargs: Any):
395
+ from cpg_utils.membership import (
396
+ is_member_in_cached_group as _is_member_in_cached_group,
397
+ )
398
+
399
+ return _is_member_in_cached_group(*args, **kwargs)
@@ -4,12 +4,11 @@ Inspired by https://github.com/drivendataorg/cloudpathlib/issues/157
4
4
  """
5
5
 
6
6
  import re
7
- from typing import Union, Optional
8
7
  from urllib.parse import urlparse
9
8
 
10
9
  from cloudpathlib import AzureBlobClient, AzureBlobPath
11
10
  from cloudpathlib.client import register_client_class
12
- from cloudpathlib.cloudpath import register_path_class, CloudPath
11
+ from cloudpathlib.cloudpath import CloudPath, register_path_class
13
12
  from cloudpathlib.exceptions import InvalidPrefixError
14
13
 
15
14
 
@@ -35,9 +34,9 @@ class HailAzureBlobPath(AzureBlobPath):
35
34
 
36
35
  def __init__(
37
36
  self,
38
- cloud_path: Union[str, CloudPath],
39
- client: Optional[AzureBlobClient] = None,
40
- token: Optional[str] = None,
37
+ cloud_path: str | CloudPath,
38
+ client: AzureBlobClient | None = None,
39
+ token: str | None = None,
41
40
  ):
42
41
  if isinstance(cloud_path, str):
43
42
  parsed = urlparse(cloud_path)
@@ -72,7 +71,9 @@ class HailAzureBlobPath(AzureBlobPath):
72
71
 
73
72
  @classmethod
74
73
  def is_valid_cloudpath(
75
- cls, path: Union[str, CloudPath], raise_on_error=False
74
+ cls,
75
+ path: str | CloudPath,
76
+ raise_on_error: bool = False,
76
77
  ) -> bool:
77
78
  """
78
79
  Also allowing HTTP.
@@ -81,13 +82,13 @@ class HailAzureBlobPath(AzureBlobPath):
81
82
  re.match(
82
83
  fr'({HailAzureBlobPath.cloud_prefix}|https://[a-z0-9]+\.(blob|dfs)\.core\.windows\.net)',
83
84
  str(path).lower(),
84
- )
85
+ ),
85
86
  )
86
87
 
87
88
  if raise_on_error and not valid:
88
89
  raise InvalidPrefixError(
89
90
  f'{path} is not a valid path since it does not start with {cls.cloud_prefix} '
90
- f'or valid Azure https blob or dfs location.'
91
+ f'or valid Azure https blob or dfs location.',
91
92
  )
92
93
 
93
94
  return valid