metaflow 2.11.15__py2.py3-none-any.whl → 2.11.16__py2.py3-none-any.whl

This diff represents the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (36)
  1. metaflow/__init__.py +3 -0
  2. metaflow/clone_util.py +6 -0
  3. metaflow/extension_support/plugins.py +2 -0
  4. metaflow/metaflow_config.py +24 -0
  5. metaflow/metaflow_environment.py +2 -2
  6. metaflow/plugins/__init__.py +19 -0
  7. metaflow/plugins/airflow/airflow.py +7 -0
  8. metaflow/plugins/argo/argo_workflows.py +17 -0
  9. metaflow/plugins/azure/__init__.py +3 -0
  10. metaflow/plugins/azure/azure_credential.py +53 -0
  11. metaflow/plugins/azure/azure_exceptions.py +1 -1
  12. metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
  13. metaflow/plugins/azure/azure_utils.py +2 -35
  14. metaflow/plugins/azure/blob_service_client_factory.py +4 -2
  15. metaflow/plugins/datastores/azure_storage.py +6 -6
  16. metaflow/plugins/datatools/s3/s3.py +1 -1
  17. metaflow/plugins/gcp/__init__.py +1 -0
  18. metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +169 -0
  19. metaflow/plugins/gcp/gs_storage_client_factory.py +52 -1
  20. metaflow/plugins/kubernetes/kubernetes.py +85 -8
  21. metaflow/plugins/kubernetes/kubernetes_cli.py +24 -1
  22. metaflow/plugins/kubernetes/kubernetes_client.py +4 -1
  23. metaflow/plugins/kubernetes/kubernetes_decorator.py +49 -4
  24. metaflow/plugins/kubernetes/kubernetes_job.py +208 -201
  25. metaflow/plugins/kubernetes/kubernetes_jobsets.py +784 -0
  26. metaflow/plugins/timeout_decorator.py +2 -1
  27. metaflow/task.py +1 -12
  28. metaflow/tuple_util.py +27 -0
  29. metaflow/util.py +0 -15
  30. metaflow/version.py +1 -1
  31. {metaflow-2.11.15.dist-info → metaflow-2.11.16.dist-info}/METADATA +2 -2
  32. {metaflow-2.11.15.dist-info → metaflow-2.11.16.dist-info}/RECORD +36 -31
  33. {metaflow-2.11.15.dist-info → metaflow-2.11.16.dist-info}/LICENSE +0 -0
  34. {metaflow-2.11.15.dist-info → metaflow-2.11.16.dist-info}/WHEEL +0 -0
  35. {metaflow-2.11.15.dist-info → metaflow-2.11.16.dist-info}/entry_points.txt +0 -0
  36. {metaflow-2.11.15.dist-info → metaflow-2.11.16.dist-info}/top_level.txt +0 -0
metaflow/__init__.py CHANGED
@@ -143,6 +143,9 @@ from .client import (
     DataArtifact,
 )
 
+# Import data class within tuple_util but not introduce new symbols.
+from . import tuple_util
+
 __version_addl__ = []
 _ext_debug("Loading top-level modules")
 for m in _tl_modules:
metaflow/clone_util.py CHANGED
@@ -66,6 +66,12 @@ def clone_task_helper(
                 type="attempt",
                 tags=metadata_tags,
             ),
+            MetaDatum(
+                field="attempt_ok",
+                value="True",  # During clone, the task is always considered successful.
+                type="internal_attempt_status",
+                tags=metadata_tags,
+            ),
         ],
     )
     output.done()
metaflow/extension_support/plugins.py CHANGED
@@ -179,6 +179,8 @@ _plugin_categories = {
     "metadata_provider": lambda x: x.TYPE,
     "datastore": lambda x: x.TYPE,
     "secrets_provider": lambda x: x.TYPE,
+    "gcp_client_provider": lambda x: x.name,
+    "azure_client_provider": lambda x: x.name,
     "sidecar": None,
     "logging_sidecar": None,
     "monitor_sidecar": None,
metaflow/metaflow_config.py CHANGED
@@ -26,6 +26,7 @@ DEFAULT_METADATA = from_conf("DEFAULT_METADATA", "local")
 DEFAULT_MONITOR = from_conf("DEFAULT_MONITOR", "nullSidecarMonitor")
 DEFAULT_PACKAGE_SUFFIXES = from_conf("DEFAULT_PACKAGE_SUFFIXES", ".py,.R,.RDS")
 DEFAULT_AWS_CLIENT_PROVIDER = from_conf("DEFAULT_AWS_CLIENT_PROVIDER", "boto3")
+DEFAULT_GCP_CLIENT_PROVIDER = from_conf("DEFAULT_GCP_CLIENT_PROVIDER", "gcp-default")
 DEFAULT_SECRETS_BACKEND_TYPE = from_conf("DEFAULT_SECRETS_BACKEND_TYPE")
 DEFAULT_SECRETS_ROLE = from_conf("DEFAULT_SECRETS_ROLE")
 
@@ -144,6 +145,22 @@ DATATOOLS_LOCALROOT = from_conf(
 # Secrets Backend - AWS Secrets Manager configuration
 AWS_SECRETS_MANAGER_DEFAULT_REGION = from_conf("AWS_SECRETS_MANAGER_DEFAULT_REGION")
 
+# Secrets Backend - GCP Secrets name prefix. With this, users don't have
+# to specify the full secret name in the @secret decorator.
+#
+# Note that it makes a difference whether the prefix ends with a slash or not
+# E.g. if secret name passed to @secret decorator is mysecret:
+# - "projects/1234567890/secrets/" -> "projects/1234567890/secrets/mysecret"
+# - "projects/1234567890/secrets/foo-" -> "projects/1234567890/secrets/foo-mysecret"
+GCP_SECRET_MANAGER_PREFIX = from_conf("GCP_SECRET_MANAGER_PREFIX")
+
+# Secrets Backend - Azure Key Vault prefix. With this, users don't have to
+# specify the full https:// vault url in the @secret decorator.
+#
+# It does not make a difference if the prefix ends in a / or not. We will handle either
+# case correctly.
+AZURE_KEY_VAULT_PREFIX = from_conf("AZURE_KEY_VAULT_PREFIX")
+
 # The root directory to save artifact pulls in, when using S3 or Azure
 ARTIFACT_LOCALROOT = from_conf("ARTIFACT_LOCALROOT", os.getcwd())
 
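To make the prefix semantics above concrete, here is a small illustrative sketch (the project number and the helper function are made up; only the verbatim-prepend behavior comes from the comments in the hunk above):

    # Hypothetical prefix value, for illustration only.
    GCP_SECRET_MANAGER_PREFIX = "projects/1234567890/secrets/"

    def resolve_full_secret_name(short_name):
        # The prefix is prepended verbatim, so whether it ends in "/"
        # (or e.g. "foo-") changes the resulting fully qualified name.
        return GCP_SECRET_MANAGER_PREFIX + short_name

    assert resolve_full_secret_name("mysecret") == "projects/1234567890/secrets/mysecret"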
@@ -210,6 +227,8 @@ DEFAULT_CONTAINER_REGISTRY = from_conf("DEFAULT_CONTAINER_REGISTRY")
 INCLUDE_FOREACH_STACK = from_conf("INCLUDE_FOREACH_STACK", False)
 # Maximum length of the foreach value string to be stored in each ForeachFrame.
 MAXIMUM_FOREACH_VALUE_CHARS = from_conf("MAXIMUM_FOREACH_VALUE_CHARS", 30)
+# The default runtime limit (In seconds) of jobs launched by any compute provider. Default of 5 days.
+DEFAULT_RUNTIME_LIMIT = from_conf("DEFAULT_RUNTIME_LIMIT", 5 * 24 * 60 * 60)
 
 ###
 # Organization customizations
@@ -322,6 +341,9 @@ KUBERNETES_DISK = from_conf("KUBERNETES_DISK", None)
 ARGO_WORKFLOWS_KUBERNETES_SECRETS = from_conf("ARGO_WORKFLOWS_KUBERNETES_SECRETS", "")
 ARGO_WORKFLOWS_ENV_VARS_TO_SKIP = from_conf("ARGO_WORKFLOWS_ENV_VARS_TO_SKIP", "")
 
+KUBERNETES_JOBSET_GROUP = from_conf("KUBERNETES_JOBSET_GROUP", "jobset.x-k8s.io")
+KUBERNETES_JOBSET_VERSION = from_conf("KUBERNETES_JOBSET_VERSION", "v1alpha2")
+
 ##
 # Argo Events Configuration
 ##
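Like the other from_conf settings in this file, the new values should be overridable through the Metaflow config file or through METAFLOW_-prefixed environment variables (that prefix convention is an assumption here, not shown in this diff):

    import os

    # Hypothetical overrides, set before Metaflow is imported.
    os.environ["METAFLOW_KUBERNETES_JOBSET_VERSION"] = "v1alpha2"
    os.environ["METAFLOW_DEFAULT_RUNTIME_LIMIT"] = str(24 * 60 * 60)  # 1 day instead of the 5-day default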
@@ -456,9 +478,11 @@ def get_pinned_conda_libs(python_version, datastore_type):
     elif datastore_type == "azure":
         pins["azure-identity"] = ">=1.10.0"
         pins["azure-storage-blob"] = ">=12.12.0"
+        pins["azure-keyvault-secrets"] = ">=4.7.0"
     elif datastore_type == "gs":
         pins["google-cloud-storage"] = ">=2.5.0"
         pins["google-auth"] = ">=2.11.0"
+        pins["google-cloud-secret-manager"] = ">=2.10.0"
     elif datastore_type == "local":
         pass
     else:
metaflow/metaflow_environment.py CHANGED
@@ -124,12 +124,12 @@ class MetaflowEnvironment(object):
             cmds.append("%s -m pip install awscli boto3 -qqq" % self._python())
         elif datastore_type == "azure":
             cmds.append(
-                "%s -m pip install azure-identity azure-storage-blob simple-azure-blob-downloader -qqq"
+                "%s -m pip install azure-identity azure-storage-blob azure-keyvault-secrets simple-azure-blob-downloader -qqq"
                 % self._python()
             )
         elif datastore_type == "gs":
             cmds.append(
-                "%s -m pip install google-cloud-storage google-auth simple-gcp-object-downloader -qqq"
+                "%s -m pip install google-cloud-storage google-auth simple-gcp-object-downloader google-cloud-secret-manager -qqq"
                 % self._python()
             )
         else:
metaflow/plugins/__init__.py CHANGED
@@ -121,8 +121,25 @@ SECRETS_PROVIDERS_DESC = [
         "aws-secrets-manager",
         ".aws.secrets_manager.aws_secrets_manager_secrets_provider.AwsSecretsManagerSecretsProvider",
     ),
+    (
+        "gcp-secret-manager",
+        ".gcp.gcp_secret_manager_secrets_provider.GcpSecretManagerSecretsProvider",
+    ),
+    (
+        "az-key-vault",
+        ".azure.azure_secret_manager_secrets_provider.AzureKeyVaultSecretsProvider",
+    ),
 ]
 
+GCP_CLIENT_PROVIDERS_DESC = [
+    ("gcp-default", ".gcp.gs_storage_client_factory.GcpDefaultClientProvider")
+]
+
+AZURE_CLIENT_PROVIDERS_DESC = [
+    ("azure-default", ".azure.azure_credential.AzureDefaultClientProvider")
+]
+
+
 process_plugins(globals())
 
 
@@ -144,6 +161,8 @@ SIDECARS.update(MONITOR_SIDECARS)
 
 AWS_CLIENT_PROVIDERS = resolve_plugins("aws_client_provider")
 SECRETS_PROVIDERS = resolve_plugins("secrets_provider")
+AZURE_CLIENT_PROVIDERS = resolve_plugins("azure_client_provider")
+GCP_CLIENT_PROVIDERS = resolve_plugins("gcp_client_provider")
 
 from .cards.card_modules import MF_EXTERNAL_CARDS
 
metaflow/plugins/airflow/airflow.py CHANGED
@@ -17,6 +17,7 @@ from metaflow.metaflow_config import (
     AIRFLOW_KUBERNETES_KUBECONFIG_FILE,
     AIRFLOW_KUBERNETES_STARTUP_TIMEOUT_SECONDS,
     AWS_SECRETS_MANAGER_DEFAULT_REGION,
+    GCP_SECRET_MANAGER_PREFIX,
     AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
     CARD_AZUREROOT,
     CARD_GSROOT,
@@ -31,6 +32,7 @@ from metaflow.metaflow_config import (
     S3_ENDPOINT_URL,
     SERVICE_HEADERS,
     SERVICE_INTERNAL_URL,
+    AZURE_KEY_VAULT_PREFIX,
 )
 
 from metaflow.metaflow_config_funcs import config_values
@@ -408,6 +410,11 @@ class Airflow(object):
             env[
                 "METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION"
             ] = AWS_SECRETS_MANAGER_DEFAULT_REGION
+        if GCP_SECRET_MANAGER_PREFIX:
+            env["METAFLOW_GCP_SECRET_MANAGER_PREFIX"] = GCP_SECRET_MANAGER_PREFIX
+
+        if AZURE_KEY_VAULT_PREFIX:
+            env["METAFLOW_AZURE_KEY_VAULT_PREFIX"] = AZURE_KEY_VAULT_PREFIX
 
         env.update(additional_mf_variables)
 
metaflow/plugins/argo/argo_workflows.py CHANGED
@@ -32,6 +32,8 @@ from metaflow.metaflow_config import (
     DATATOOLS_S3ROOT,
     DEFAULT_METADATA,
     DEFAULT_SECRETS_BACKEND_TYPE,
+    GCP_SECRET_MANAGER_PREFIX,
+    AZURE_KEY_VAULT_PREFIX,
     KUBERNETES_FETCH_EC2_METADATA,
     KUBERNETES_LABELS,
     KUBERNETES_NAMESPACE,
@@ -627,6 +629,14 @@ class ArgoWorkflows(object):
             ),
         }
 
+        if self._schedule is not None:
+            # timezone is an optional field and json dumps on None will result in null
+            # hence configuring it to an empty string
+            if self._timezone is None:
+                self._timezone = ""
+            cron_info = {"schedule": self._schedule, "tz": self._timezone}
+            annotations.update({"metaflow/cron": json.dumps(cron_info)})
+
         if self.parameters:
             annotations.update({"metaflow/parameters": json.dumps(self.parameters)})
 
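For a flow scheduled with, say, a cron expression of "0 * * * *" and no timezone, the block above would emit an annotation like this (illustrative reconstruction of the values):

    import json

    cron_info = {"schedule": "0 * * * *", "tz": ""}  # tz coerced from None to ""
    annotations = {"metaflow/cron": json.dumps(cron_info)}
    # {'metaflow/cron': '{"schedule": "0 * * * *", "tz": ""}'}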
@@ -838,6 +848,11 @@ class ArgoWorkflows(object):
         def _visit(
             node, exit_node=None, templates=None, dag_tasks=None, parent_foreach=None
         ):
+            if node.parallel_foreach:
+                raise ArgoWorkflowsException(
+                    "Deploying flows with @parallel decorator(s) "
+                    "as Argo Workflows is not supported currently."
+                )
             # Every for-each node results in a separate subDAG and an equivalent
             # DAGTemplate rooted at the child of the for-each node. Each DAGTemplate
             # has a unique name - the top-level DAGTemplate is named as the name of
@@ -1413,6 +1428,8 @@ class ArgoWorkflows(object):
             env[
                 "METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION"
             ] = AWS_SECRETS_MANAGER_DEFAULT_REGION
+            env["METAFLOW_GCP_SECRET_MANAGER_PREFIX"] = GCP_SECRET_MANAGER_PREFIX
+            env["METAFLOW_AZURE_KEY_VAULT_PREFIX"] = AZURE_KEY_VAULT_PREFIX
 
             # support for Azure
             env[
metaflow/plugins/azure/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .azure_credential import (
+    create_cacheable_azure_credential as create_azure_credential,
+)
metaflow/plugins/azure/azure_credential.py ADDED
@@ -0,0 +1,53 @@
+class AzureDefaultClientProvider(object):
+    name = "azure-default"
+
+    @staticmethod
+    def create_cacheable_azure_credential(*args, **kwargs):
+        """azure.identity.DefaultAzureCredential is not readily cacheable in a dictionary
+        because it does not have a content based hash and equality implementations.
+
+        We implement a subclass CacheableDefaultAzureCredential to add them.
+
+        We need this because credentials will be part of the cache key in _ClientCache.
+        """
+        from azure.identity import DefaultAzureCredential
+
+        class CacheableDefaultAzureCredential(DefaultAzureCredential):
+            def __init__(self, *args, **kwargs):
+                super(CacheableDefaultAzureCredential, self).__init__(*args, **kwargs)
+                # Just hashing all the kwargs works because they are all individually
+                # hashable as of 7/15/2022.
+                #
+                # What if Azure adds unhashable things to kwargs?
+                # - We will have CI to catch this (it will always install the latest Azure SDKs)
+                # - In Metaflow usage today we never specify any kwargs anyway. (see last line
+                #   of the outer function.
+                self._hash_code = hash((args, tuple(sorted(kwargs.items()))))
+
+            def __hash__(self):
+                return self._hash_code
+
+            def __eq__(self, other):
+                return hash(self) == hash(other)
+
+        return CacheableDefaultAzureCredential(*args, **kwargs)
+
+
+cached_provider_class = None
+
+
+def create_cacheable_azure_credential():
+    global cached_provider_class
+    if cached_provider_class is None:
+        from metaflow.metaflow_config import DEFAULT_AZURE_CLIENT_PROVIDER
+        from metaflow.plugins import AZURE_CLIENT_PROVIDERS
+
+        for p in AZURE_CLIENT_PROVIDERS:
+            if p.name == DEFAULT_AZURE_CLIENT_PROVIDER:
+                cached_provider_class = p
+                break
+        else:
+            raise ValueError(
+                "Cannot find Azure Client provider %s" % DEFAULT_AZURE_CLIENT_PROVIDER
+            )
+    return cached_provider_class.create_cacheable_azure_credential()
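A short sketch of why the content-based hash matters: the blob service client cache keys on the credential, so two separately created default credentials must compare equal to share a cache entry (requires azure-identity to be installed; otherwise purely illustrative):

    from metaflow.plugins.azure.azure_credential import (
        create_cacheable_azure_credential,
    )

    cred_a = create_cacheable_azure_credential()
    cred_b = create_cacheable_azure_credential()
    # Both are built with no args/kwargs, so their content hashes match and
    # _ClientCache treats them as the same key.
    assert hash(cred_a) == hash(cred_b) and cred_a == cred_b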
metaflow/plugins/azure/azure_exceptions.py CHANGED
@@ -10,4 +10,4 @@ class MetaflowAzureResourceError(MetaflowException):
 
 
 class MetaflowAzurePackageError(MetaflowException):
-    headline = "Missing required packages 'azure-identity' and 'azure-storage-blob'"
+    headline = "Missing required packages 'azure-identity' and 'azure-storage-blob' and 'azure-keyvault-secrets'"
metaflow/plugins/azure/azure_secret_manager_secrets_provider.py ADDED
@@ -0,0 +1,240 @@
+from metaflow.plugins.secrets import SecretsProvider
+import re
+import base64
+import codecs
+from urllib.parse import urlparse
+from metaflow.exception import MetaflowException
+import sys
+from metaflow.metaflow_config import AZURE_KEY_VAULT_PREFIX
+from metaflow.plugins.azure.azure_credential import (
+    create_cacheable_azure_credential,
+)
+
+
+class MetaflowAzureKeyVaultBadVault(MetaflowException):
+    """Raised when the secretid is fully qualified but does not have the right key vault domain"""
+
+
+class MetaflowAzureKeyVaultBadSecretType(MetaflowException):
+    """Raised when the secret type is anything except secrets"""
+
+
+class MetaflowAzureKeyVaultBadSecretPath(MetaflowException):
+    """Raised when the secret path does not match to expected length"""
+
+
+class MetaflowAzureKeyVaultBadSecretName(MetaflowException):
+    """Raised when the secret name does not match expected pattern"""
+
+
+class MetaflowAzureKeyVaultBadSecretVersion(MetaflowException):
+    """Raised when the secret version does not match expected pattern"""
+
+
+class MetaflowAzureKeyVaultBadSecret(MetaflowException):
+    """Raised when the secret does not match supported patterns in Metaflow"""
+
+
+class AzureKeyVaultSecretsProvider(SecretsProvider):
+    TYPE = "az-key-vault"
+    key_vault_domains = [
+        ".vault.azure.net",
+        ".vault.azure.cn",
+        ".vault.usgovcloudapi.net",
+        ".vault.microsoftazure.de",
+    ]
+    supported_vault_object_types = ["secrets"]
+
+    # https://learn.microsoft.com/en-us/azure/key-vault/general/about-keys-secrets-certificates has details on vault name structure
+    # Vault name and Managed HSM pool name must be a 3-24 character string, containing only 0-9, a-z, A-Z, and not consecutive -.
+    def _is_valid_vault_name(self, vault_name):
+        vault_name_pattern = r"^(?!.*--)[a-zA-Z0-9-]{3,24}$"
+        return re.match(vault_name_pattern, vault_name) is not None
+
+    # The type of the object can be, "keys", "secrets", or "certificates".
+    # Currently only secrets will be supported
+    def _is_valid_object_type(self, secret_type):
+        for type in self.supported_vault_object_types:
+            if secret_type == type:
+                return True
+        return False
+
+    # The secret name must be a 1-127 character string, starting with a letter and containing only 0-9, a-z, A-Z, and -.
+    def _is_valid_secret_name(self, secret_name):
+        secret_name_pattern = r"^[a-zA-Z][a-zA-Z0-9-]{0,126}$"
+        return re.match(secret_name_pattern, secret_name) is not None
+
+    # An object-version is a system-generated, 32 character string identifier that is optionally used to address a unique version of an object.
+    def _is_valid_object_version(self, secret_version):
+        object_version_pattern = r"^[a-zA-Z0-9]{32}$"
+        return re.match(object_version_pattern, secret_version) is not None
+
+    # This function will check if the secret_id is fully qualified url. It will return True iff the secret_id is of the form:
+    # https://myvault.vault.azure.net/secrets/mysecret/ec96f02080254f109c51a1f14cdb1931 OR
+    # https://myvault.vault.azure.net/secrets/mysecret/
+    # validating the above as per recommendations in https://devblogs.microsoft.com/azure-sdk/guidance-for-applications-using-the-key-vault-libraries/
+    def _is_secret_id_fully_qualified_url(self, secret_id):
+        # if the secret_id is None/empty/does not start with https then return false
+        if secret_id is None or secret_id == "" or not secret_id.startswith("https://"):
+            return False
+        try:
+            parsed_vault_url = urlparse(secret_id)
+        except ValueError:
+            print("invalid vault url", file=sys.stderr)
+            return False
+        hostname = parsed_vault_url.netloc
+
+        k_v_domain_found = False
+        actual_k_v_domain = ""
+        for k_v_domain in self.key_vault_domains:
+            if k_v_domain in hostname:
+                k_v_domain_found = True
+                actual_k_v_domain = k_v_domain
+                break
+        if not k_v_domain_found:
+            # the secret_id started with https:// however the key_vault_domains
+            # were not present in the secret_id which means
+            raise MetaflowAzureKeyVaultBadVault("bad key vault domain %s" % secret_id)
+
+        # given the secret_id seems to have a valid key vault domain
+        # lets verify that the vault name corresponds to its regex.
+        vault_name = hostname[: -len(actual_k_v_domain)]
+        # verify the vault name pattern
+        if not self._is_valid_vault_name(vault_name):
+            raise MetaflowAzureKeyVaultBadVault("bad key vault name %s" % vault_name)
+
+        path_parts = parsed_vault_url.path.strip("/").split("/")
+        total_path_parts = len(path_parts)
+        if total_path_parts < 2 or total_path_parts > 3:
+            raise MetaflowAzureKeyVaultBadSecretPath(
+                "bad secret uri path %s" % path_parts
+            )
+
+        object_type = path_parts[0]
+        if not self._is_valid_object_type(object_type):
+            raise MetaflowAzureKeyVaultBadSecretType("bad secret type %s" % object_type)
+
+        secret_name = path_parts[1]
+        if not self._is_valid_secret_name(secret_name=secret_name):
+            raise MetaflowAzureKeyVaultBadSecretName("bad secret name %s" % secret_name)
+
+        if total_path_parts == 3:
+            if not self._is_valid_object_version(path_parts[2]):
+                raise MetaflowAzureKeyVaultBadSecretVersion(
+                    "bad secret version %s" % path_parts[2]
+                )
+
+        return True
+
+    # This function will validate the correctness of the partial secret id.
+    # It will attempt to construct the fully qualified secret URL internally and
+    # call the _is_secret_id_fully_qualified_url to check validity
+    def _is_partial_secret_valid(self, secret_id):
+        secret_parts = secret_id.strip("/").split("/")
+        total_secret_parts = len(secret_parts)
+        if total_secret_parts < 1 or total_secret_parts > 2:
+            return False
+
+        # since the secret_id is supposedly a partial id, the AZURE_KEY_VAULT_PREFIX
+        # must be set.
+        if not AZURE_KEY_VAULT_PREFIX:
+            raise ValueError(
+                "cannot use simple secret id without setting METAFLOW_AZURE_KEY_VAULT_PREFIX. %s"
+                % AZURE_KEY_VAULT_PREFIX
+            )
+        domain = AZURE_KEY_VAULT_PREFIX.rstrip("/")
+        full_secret = "%s/secrets/%s" % (domain, secret_id)
+        if not self._is_secret_id_fully_qualified_url(full_secret):
+            return False
+
+        return True
+
+    def _sanitize_key_as_env_var(self, key):
+        """
+        Sanitize a key as an environment variable name.
+        This is purely a convenience trade-off to cover common cases well, vs. introducing
+        ambiguities (e.g. did the final '_' come from '.', or '-' or is original?).
+
+        1/27/2023(jackie):
+
+        We start with few rules and should *sparingly* add more over time.
+        Also, it's TBD whether all possible providers will share the same sanitization logic.
+        Therefore we will keep this function private for now
+        """
+        return key.replace("-", "_").replace(".", "_").replace("/", "_")
+
+    def get_secret_as_dict(self, secret_id, options={}, role=None):
+        # https://learn.microsoft.com/en-us/azure/app-service/app-service-key-vault-references?tabs=azure-cli has a lot of details on
+        # the patterns used in key vault
+        # Vault names and Managed HSM pool names are selected by the user and are globally unique.
+        # Vault name and Managed HSM pool name must be a 3-24 character string, containing only 0-9, a-z, A-Z, and not consecutive -.
+        # object-type The type of the object. As of 05/08/24 only "secrets", are supported
+        # object-name An object-name is a user provided name for and must be unique within a key vault. The name must be a 1-127 character string, starting with a letter and containing only 0-9, a-z, A-Z, and -.
+        # object-version An object-version is a system-generated, 32 character string identifier that is optionally used to address a unique version of an object.
+
+        # We allow these forms of secret_id:
+        #
+        # 1. Full path like https://<key-vault-name><.vault-domain>/secrets/<secret-name>/<secret-version>. This is what you
+        #    see in Azure portal and is easy to copy paste.
+        #
+        # 2. Full path but without the version like https://<key-vault-name><.vault-domain>/secrets/<secret-name>
+        #
+        # 3. Simple string like mysecret. This corresponds to the SecretName.
+        #
+        # 4. Simple string with <secret-name>/<secret-version> suffix like mysecret/123
+
+        # The latter two forms require METAFLOW_AZURE_KEY_VAULT_PREFIX to be set.
+
+        # if the secret_id is None/empty/does not start with https then return false
+        if secret_id is None or secret_id == "":
+            raise MetaflowAzureKeyVaultBadSecret("empty secret id is not supported")
+
+        # check if the passed in secret is a short-form ( #3/#4 in the above comment)
+        if not secret_id.startswith("https://"):
+            # check if the secret_id is of form `secret_name` OR `secret_name/secret_version`
+            if not self._is_partial_secret_valid(secret_id=secret_id):
+                raise MetaflowAzureKeyVaultBadSecret(
+                    "unsupported partial secret %s" % secret_id
+                )
+
+            domain = AZURE_KEY_VAULT_PREFIX.rstrip("/")
+            full_secret = "%s/secrets/%s" % (domain, secret_id)
+
+        # if the secret id is passed as a URL - then check if the url is fully qualified
+        if secret_id.startswith("https://"):
+            if not self._is_secret_id_fully_qualified_url(secret_id=secret_id):
+                raise MetaflowException("unsupported secret %s" % secret_id)
+            full_secret = secret_id
+
+        # at this point I know that the secret URL is good so we can start creating the Secret Client
+        az_credentials = create_cacheable_azure_credential()
+        res = urlparse(full_secret)
+        az_vault_url = "%s://%s" % (
+            res.scheme,
+            res.netloc,
+        )  # https://myvault.vault.azure.net
+        secret_data = res.path.strip("/").split("/")[1:]
+        secret_name = secret_data[0]
+        secret_version = None
+        if len(secret_data) > 1:
+            secret_version = secret_data[1]
+
+        from azure.keyvault.secrets import SecretClient
+
+        client = SecretClient(vault_url=az_vault_url, credential=az_credentials)
+
+        key_vault_secret_val = client.get_secret(
+            name=secret_name, version=secret_version
+        )
+
+        result = {}
+
+        if options.get("env_var_name") is not None:
+            env_var_name = options["env_var_name"]
+            sanitized_key = self._sanitize_key_as_env_var(env_var_name)
+        else:
+            sanitized_key = self._sanitize_key_as_env_var(key_vault_secret_val.name)
+
+        response_payload = key_vault_secret_val.value
+        result[sanitized_key] = response_payload
+        return result
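The four accepted secret_id spellings from the comment block in get_secret_as_dict, written out (the vault name, secret name, and version below are made up):

    # Each entry would resolve to the same vault and secret.
    accepted_secret_ids = [
        "https://myvault.vault.azure.net/secrets/mysecret/ec96f02080254f109c51a1f14cdb1931",  # full, with version
        "https://myvault.vault.azure.net/secrets/mysecret",  # full, no version
        # The short forms additionally require METAFLOW_AZURE_KEY_VAULT_PREFIX,
        # e.g. "https://myvault.vault.azure.net":
        "mysecret",
        "mysecret/ec96f02080254f109c51a1f14cdb1931",
    ]

In each case the returned dict maps the sanitized secret name (here "mysecret") to the secret value, unless options["env_var_name"] supplies a different key.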
metaflow/plugins/azure/azure_utils.py CHANGED
@@ -7,6 +7,7 @@ from metaflow.plugins.azure.azure_exceptions import (
     MetaflowAzurePackageError,
 )
 from metaflow.exception import MetaflowInternalError, MetaflowException
+from metaflow.plugins.azure.azure_credential import create_cacheable_azure_credential
 
 
 def _check_and_init_azure_deps():
@@ -138,38 +139,6 @@ def handle_exceptions(func):
     return _inner_func
 
 
-@check_azure_deps
-def create_cacheable_default_azure_credentials(*args, **kwargs):
-    """azure.identity.DefaultAzureCredential is not readily cacheable in a dictionary
-    because it does not have a content based hash and equality implementations.
-
-    We implement a subclass CacheableDefaultAzureCredential to add them.
-
-    We need this because credentials will be part of the cache key in _ClientCache.
-    """
-    from azure.identity import DefaultAzureCredential
-
-    class CacheableDefaultAzureCredential(DefaultAzureCredential):
-        def __init__(self, *args, **kwargs):
-            super(CacheableDefaultAzureCredential, self).__init__(*args, **kwargs)
-            # Just hashing all the kwargs works because they are all individually
-            # hashable as of 7/15/2022.
-            #
-            # What if Azure adds unhashable things to kwargs?
-            # - We will have CI to catch this (it will always install the latest Azure SDKs)
-            # - In Metaflow usage today we never specify any kwargs anyway. (see last line
-            #   of the outer function.
-            self._hash_code = hash((args, tuple(sorted(kwargs.items()))))
-
-        def __hash__(self):
-            return self._hash_code
-
-        def __eq__(self, other):
-            return hash(self) == hash(other)
-
-    return CacheableDefaultAzureCredential(*args, **kwargs)
-
-
 @check_azure_deps
 def create_static_token_credential(token_):
     from azure.core.credentials import TokenCredential
@@ -200,9 +169,7 @@ def create_static_token_credential(token_):
         def get_token(self, *_scopes, **_kwargs):
 
             if (self._cached_token.expires_on - time.time()) < 300:
-                from azure.identity import DefaultAzureCredential
-
-                self._credential = DefaultAzureCredential()
+                self._credential = create_cacheable_azure_credential()
             if self._credential:
                 return self._credential.get_token(*_scopes, **_kwargs)
             return self._cached_token
metaflow/plugins/azure/blob_service_client_factory.py CHANGED
@@ -1,9 +1,11 @@
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import AZURE_STORAGE_BLOB_SERVICE_ENDPOINT
 from metaflow.plugins.azure.azure_utils import (
-    create_cacheable_default_azure_credentials,
     check_azure_deps,
 )
+from metaflow.plugins.azure.azure_credential import (
+    create_cacheable_azure_credential,
+)
 
 import os
 import threading
@@ -125,7 +127,7 @@ def get_azure_blob_service_client(
         blob_service_endpoint = AZURE_STORAGE_BLOB_SERVICE_ENDPOINT
 
     if not credential:
-        credential = create_cacheable_default_azure_credentials()
+        credential = create_cacheable_azure_credential()
         credential_is_cacheable = True
 
     if not credential_is_cacheable:
metaflow/plugins/datastores/azure_storage.py CHANGED
@@ -32,6 +32,8 @@ from metaflow.plugins.storage_executor import (
     handle_executor_exceptions,
 )
 
+from metaflow.plugins.azure.azure_credential import create_cacheable_azure_credential
+
 AZURE_STORAGE_DOWNLOAD_MAX_CONCURRENCY = 4
 AZURE_STORAGE_UPLOAD_MAX_CONCURRENCY = 16
 
@@ -272,12 +274,10 @@ class AzureStorage(DataStoreStorage):
         if not self._default_scope_token or (
            self._default_scope_token.expires_on - time.time() < 300
        ):
-            from azure.identity import DefaultAzureCredential
-
-            with DefaultAzureCredential() as credential:
-                self._default_scope_token = credential.get_token(
-                    AZURE_STORAGE_DEFAULT_SCOPE
-                )
+            credential = create_cacheable_azure_credential()
+            self._default_scope_token = credential.get_token(
+                AZURE_STORAGE_DEFAULT_SCOPE
+            )
         return self._default_scope_token
 
     @property
metaflow/plugins/datatools/s3/s3.py CHANGED
@@ -21,7 +21,6 @@ from metaflow.metaflow_config import (
     TEMPDIR,
 )
 from metaflow.util import (
-    namedtuple_with_defaults,
     is_stringish,
     to_bytes,
     to_unicode,
@@ -29,6 +28,7 @@ from metaflow.util import (
     url_quote,
     url_unquote,
 )
+from metaflow.tuple_util import namedtuple_with_defaults
 from metaflow.exception import MetaflowException
 from metaflow.debug import debug
 import metaflow.tracing as tracing
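namedtuple_with_defaults now lives in the new metaflow/tuple_util.py (made importable at the top level by the `from . import tuple_util` hunk in metaflow/__init__.py above). A usage sketch, assuming the function keeps the (typename, field_names, defaults) calling convention it had in metaflow.util:

    from metaflow.tuple_util import namedtuple_with_defaults

    # Hypothetical tuple type; defaults apply to the trailing fields.
    S3Info = namedtuple_with_defaults("S3Info", ["url", "size"], (None, None))
    info = S3Info(url="s3://bucket/key")
    print(info.size)  # None, from the defaults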
metaflow/plugins/gcp/__init__.py ADDED
@@ -0,0 +1 @@
+from .gs_storage_client_factory import get_credentials