ob-metaflow 2.15.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. metaflow/__init__.py +10 -3
  2. metaflow/_vendor/imghdr/__init__.py +186 -0
  3. metaflow/_vendor/yaml/__init__.py +427 -0
  4. metaflow/_vendor/yaml/composer.py +139 -0
  5. metaflow/_vendor/yaml/constructor.py +748 -0
  6. metaflow/_vendor/yaml/cyaml.py +101 -0
  7. metaflow/_vendor/yaml/dumper.py +62 -0
  8. metaflow/_vendor/yaml/emitter.py +1137 -0
  9. metaflow/_vendor/yaml/error.py +75 -0
  10. metaflow/_vendor/yaml/events.py +86 -0
  11. metaflow/_vendor/yaml/loader.py +63 -0
  12. metaflow/_vendor/yaml/nodes.py +49 -0
  13. metaflow/_vendor/yaml/parser.py +589 -0
  14. metaflow/_vendor/yaml/reader.py +185 -0
  15. metaflow/_vendor/yaml/representer.py +389 -0
  16. metaflow/_vendor/yaml/resolver.py +227 -0
  17. metaflow/_vendor/yaml/scanner.py +1435 -0
  18. metaflow/_vendor/yaml/serializer.py +111 -0
  19. metaflow/_vendor/yaml/tokens.py +104 -0
  20. metaflow/cards.py +4 -0
  21. metaflow/cli.py +125 -21
  22. metaflow/cli_components/init_cmd.py +1 -0
  23. metaflow/cli_components/run_cmds.py +204 -40
  24. metaflow/cli_components/step_cmd.py +160 -4
  25. metaflow/client/__init__.py +1 -0
  26. metaflow/client/core.py +198 -130
  27. metaflow/client/filecache.py +59 -32
  28. metaflow/cmd/code/__init__.py +2 -1
  29. metaflow/cmd/develop/stub_generator.py +49 -18
  30. metaflow/cmd/develop/stubs.py +9 -27
  31. metaflow/cmd/make_wrapper.py +30 -0
  32. metaflow/datastore/__init__.py +1 -0
  33. metaflow/datastore/content_addressed_store.py +40 -9
  34. metaflow/datastore/datastore_set.py +10 -1
  35. metaflow/datastore/flow_datastore.py +124 -4
  36. metaflow/datastore/spin_datastore.py +91 -0
  37. metaflow/datastore/task_datastore.py +92 -6
  38. metaflow/debug.py +5 -0
  39. metaflow/decorators.py +331 -82
  40. metaflow/extension_support/__init__.py +414 -356
  41. metaflow/extension_support/_empty_file.py +2 -2
  42. metaflow/flowspec.py +322 -82
  43. metaflow/graph.py +178 -15
  44. metaflow/includefile.py +25 -3
  45. metaflow/lint.py +94 -3
  46. metaflow/meta_files.py +13 -0
  47. metaflow/metadata_provider/metadata.py +13 -2
  48. metaflow/metaflow_config.py +66 -4
  49. metaflow/metaflow_environment.py +91 -25
  50. metaflow/metaflow_profile.py +18 -0
  51. metaflow/metaflow_version.py +16 -1
  52. metaflow/package/__init__.py +673 -0
  53. metaflow/packaging_sys/__init__.py +880 -0
  54. metaflow/packaging_sys/backend.py +128 -0
  55. metaflow/packaging_sys/distribution_support.py +153 -0
  56. metaflow/packaging_sys/tar_backend.py +99 -0
  57. metaflow/packaging_sys/utils.py +54 -0
  58. metaflow/packaging_sys/v1.py +527 -0
  59. metaflow/parameters.py +6 -2
  60. metaflow/plugins/__init__.py +6 -0
  61. metaflow/plugins/airflow/airflow.py +11 -1
  62. metaflow/plugins/airflow/airflow_cli.py +16 -5
  63. metaflow/plugins/argo/argo_client.py +42 -20
  64. metaflow/plugins/argo/argo_events.py +6 -6
  65. metaflow/plugins/argo/argo_workflows.py +1023 -344
  66. metaflow/plugins/argo/argo_workflows_cli.py +396 -94
  67. metaflow/plugins/argo/argo_workflows_decorator.py +9 -0
  68. metaflow/plugins/argo/argo_workflows_deployer_objects.py +75 -49
  69. metaflow/plugins/argo/capture_error.py +5 -2
  70. metaflow/plugins/argo/conditional_input_paths.py +35 -0
  71. metaflow/plugins/argo/exit_hooks.py +209 -0
  72. metaflow/plugins/argo/param_val.py +19 -0
  73. metaflow/plugins/aws/aws_client.py +6 -0
  74. metaflow/plugins/aws/aws_utils.py +33 -1
  75. metaflow/plugins/aws/batch/batch.py +72 -5
  76. metaflow/plugins/aws/batch/batch_cli.py +24 -3
  77. metaflow/plugins/aws/batch/batch_decorator.py +57 -6
  78. metaflow/plugins/aws/step_functions/step_functions.py +28 -3
  79. metaflow/plugins/aws/step_functions/step_functions_cli.py +49 -4
  80. metaflow/plugins/aws/step_functions/step_functions_deployer.py +3 -0
  81. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +30 -0
  82. metaflow/plugins/cards/card_cli.py +20 -1
  83. metaflow/plugins/cards/card_creator.py +24 -1
  84. metaflow/plugins/cards/card_datastore.py +21 -49
  85. metaflow/plugins/cards/card_decorator.py +58 -6
  86. metaflow/plugins/cards/card_modules/basic.py +38 -9
  87. metaflow/plugins/cards/card_modules/bundle.css +1 -1
  88. metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
  89. metaflow/plugins/cards/card_modules/components.py +592 -3
  90. metaflow/plugins/cards/card_modules/convert_to_native_type.py +34 -5
  91. metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
  92. metaflow/plugins/cards/card_modules/main.css +1 -0
  93. metaflow/plugins/cards/card_modules/main.js +56 -41
  94. metaflow/plugins/cards/card_modules/test_cards.py +22 -6
  95. metaflow/plugins/cards/component_serializer.py +1 -8
  96. metaflow/plugins/cards/metadata.py +22 -0
  97. metaflow/plugins/catch_decorator.py +9 -0
  98. metaflow/plugins/datastores/local_storage.py +12 -6
  99. metaflow/plugins/datastores/spin_storage.py +12 -0
  100. metaflow/plugins/datatools/s3/s3.py +49 -17
  101. metaflow/plugins/datatools/s3/s3op.py +113 -66
  102. metaflow/plugins/env_escape/client_modules.py +102 -72
  103. metaflow/plugins/events_decorator.py +127 -121
  104. metaflow/plugins/exit_hook/__init__.py +0 -0
  105. metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
  106. metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
  107. metaflow/plugins/kubernetes/kubernetes.py +12 -1
  108. metaflow/plugins/kubernetes/kubernetes_cli.py +11 -0
  109. metaflow/plugins/kubernetes/kubernetes_decorator.py +25 -6
  110. metaflow/plugins/kubernetes/kubernetes_job.py +12 -4
  111. metaflow/plugins/kubernetes/kubernetes_jobsets.py +31 -30
  112. metaflow/plugins/metadata_providers/local.py +76 -82
  113. metaflow/plugins/metadata_providers/service.py +13 -9
  114. metaflow/plugins/metadata_providers/spin.py +16 -0
  115. metaflow/plugins/package_cli.py +36 -24
  116. metaflow/plugins/parallel_decorator.py +11 -2
  117. metaflow/plugins/parsers.py +16 -0
  118. metaflow/plugins/pypi/bootstrap.py +7 -1
  119. metaflow/plugins/pypi/conda_decorator.py +41 -82
  120. metaflow/plugins/pypi/conda_environment.py +14 -6
  121. metaflow/plugins/pypi/micromamba.py +9 -1
  122. metaflow/plugins/pypi/pip.py +41 -5
  123. metaflow/plugins/pypi/pypi_decorator.py +4 -4
  124. metaflow/plugins/pypi/utils.py +22 -0
  125. metaflow/plugins/secrets/__init__.py +3 -0
  126. metaflow/plugins/secrets/secrets_decorator.py +14 -178
  127. metaflow/plugins/secrets/secrets_func.py +49 -0
  128. metaflow/plugins/secrets/secrets_spec.py +101 -0
  129. metaflow/plugins/secrets/utils.py +74 -0
  130. metaflow/plugins/test_unbounded_foreach_decorator.py +2 -2
  131. metaflow/plugins/timeout_decorator.py +0 -1
  132. metaflow/plugins/uv/bootstrap.py +29 -1
  133. metaflow/plugins/uv/uv_environment.py +5 -3
  134. metaflow/pylint_wrapper.py +5 -1
  135. metaflow/runner/click_api.py +79 -26
  136. metaflow/runner/deployer.py +208 -6
  137. metaflow/runner/deployer_impl.py +32 -12
  138. metaflow/runner/metaflow_runner.py +266 -33
  139. metaflow/runner/subprocess_manager.py +21 -1
  140. metaflow/runner/utils.py +27 -16
  141. metaflow/runtime.py +660 -66
  142. metaflow/task.py +255 -26
  143. metaflow/user_configs/config_options.py +33 -21
  144. metaflow/user_configs/config_parameters.py +220 -58
  145. metaflow/user_decorators/__init__.py +0 -0
  146. metaflow/user_decorators/common.py +144 -0
  147. metaflow/user_decorators/mutable_flow.py +512 -0
  148. metaflow/user_decorators/mutable_step.py +424 -0
  149. metaflow/user_decorators/user_flow_decorator.py +264 -0
  150. metaflow/user_decorators/user_step_decorator.py +749 -0
  151. metaflow/util.py +197 -7
  152. metaflow/vendor.py +23 -7
  153. metaflow/version.py +1 -1
  154. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Makefile +13 -2
  155. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Tiltfile +107 -7
  156. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/pick_services.sh +1 -0
  157. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/METADATA +2 -3
  158. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/RECORD +162 -121
  159. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
  160. metaflow/_vendor/v3_5/__init__.py +0 -1
  161. metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
  162. metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
  163. metaflow/_vendor/v3_5/zipp.py +0 -329
  164. metaflow/info_file.py +0 -25
  165. metaflow/package.py +0 -203
  166. metaflow/user_configs/config_decorators.py +0 -568
  167. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +0 -0
  168. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/licenses/LICENSE +0 -0
  169. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
@@ -90,6 +90,7 @@ class Kubernetes(object):
90
90
  step_name,
91
91
  task_id,
92
92
  attempt,
93
+ code_package_metadata,
93
94
  code_package_url,
94
95
  step_cmds,
95
96
  ):
@@ -104,7 +105,7 @@ class Kubernetes(object):
104
105
  stderr_path=STDERR_PATH,
105
106
  )
106
107
  init_cmds = self._environment.get_package_commands(
107
- code_package_url, self._datastore.TYPE
108
+ code_package_url, self._datastore.TYPE, code_package_metadata
108
109
  )
109
110
  init_expr = " && ".join(init_cmds)
110
111
  step_expr = bash_capture_logs(
@@ -165,11 +166,13 @@ class Kubernetes(object):
165
166
  task_id,
166
167
  attempt,
167
168
  user,
169
+ code_package_metadata,
168
170
  code_package_sha,
169
171
  code_package_url,
170
172
  code_package_ds,
171
173
  docker_image,
172
174
  docker_image_pull_policy,
175
+ image_pull_secrets=None,
173
176
  step_cli=None,
174
177
  service_account=None,
175
178
  secrets=None,
@@ -206,6 +209,7 @@ class Kubernetes(object):
206
209
  node_selector=node_selector,
207
210
  image=docker_image,
208
211
  image_pull_policy=docker_image_pull_policy,
212
+ image_pull_secrets=image_pull_secrets,
209
213
  cpu=cpu,
210
214
  memory=memory,
211
215
  disk=disk,
@@ -230,6 +234,7 @@ class Kubernetes(object):
230
234
  qos=qos,
231
235
  security_context=security_context,
232
236
  )
237
+ .environment_variable("METAFLOW_CODE_METADATA", code_package_metadata)
233
238
  .environment_variable("METAFLOW_CODE_SHA", code_package_sha)
234
239
  .environment_variable("METAFLOW_CODE_URL", code_package_url)
235
240
  .environment_variable("METAFLOW_CODE_DS", code_package_ds)
@@ -429,6 +434,7 @@ class Kubernetes(object):
429
434
  step_name=step_name,
430
435
  task_id=_tskid,
431
436
  attempt=attempt,
437
+ code_package_metadata=code_package_metadata,
432
438
  code_package_url=code_package_url,
433
439
  step_cmds=[
434
440
  step_cli.replace(
@@ -477,12 +483,14 @@ class Kubernetes(object):
477
483
  task_id,
478
484
  attempt,
479
485
  user,
486
+ code_package_metadata,
480
487
  code_package_sha,
481
488
  code_package_url,
482
489
  code_package_ds,
483
490
  step_cli,
484
491
  docker_image,
485
492
  docker_image_pull_policy,
493
+ image_pull_secrets=None,
486
494
  service_account=None,
487
495
  secrets=None,
488
496
  node_selector=None,
@@ -524,11 +532,13 @@ class Kubernetes(object):
524
532
  step_name=step_name,
525
533
  task_id=task_id,
526
534
  attempt=attempt,
535
+ code_package_metadata=code_package_metadata,
527
536
  code_package_url=code_package_url,
528
537
  step_cmds=[step_cli],
529
538
  ),
530
539
  image=docker_image,
531
540
  image_pull_policy=docker_image_pull_policy,
541
+ image_pull_secrets=image_pull_secrets,
532
542
  cpu=cpu,
533
543
  memory=memory,
534
544
  disk=disk,
@@ -551,6 +561,7 @@ class Kubernetes(object):
551
561
  qos=qos,
552
562
  security_context=security_context,
553
563
  )
564
+ .environment_variable("METAFLOW_CODE_METADATA", code_package_metadata)
554
565
  .environment_variable("METAFLOW_CODE_SHA", code_package_sha)
555
566
  .environment_variable("METAFLOW_CODE_URL", code_package_url)
556
567
  .environment_variable("METAFLOW_CODE_DS", code_package_ds)
@@ -41,6 +41,7 @@ def kubernetes():
41
41
  )
42
42
  @tracing.cli("kubernetes/step")
43
43
  @click.argument("step-name")
44
+ @click.argument("code-package-metadata")
44
45
  @click.argument("code-package-sha")
45
46
  @click.argument("code-package-url")
46
47
  @click.option(
@@ -53,6 +54,12 @@ def kubernetes():
53
54
  default=None,
54
55
  help="Optional Docker Image Pull Policy for Kubernetes pod.",
55
56
  )
57
+ @click.option(
58
+ "--image-pull-secrets",
59
+ default=None,
60
+ type=JSONTypeClass(),
61
+ multiple=False,
62
+ )
56
63
  @click.option(
57
64
  "--service-account",
58
65
  help="IRSA requirement for Kubernetes pod.",
@@ -155,11 +162,13 @@ def kubernetes():
155
162
  def step(
156
163
  ctx,
157
164
  step_name,
165
+ code_package_metadata,
158
166
  code_package_sha,
159
167
  code_package_url,
160
168
  executable=None,
161
169
  image=None,
162
170
  image_pull_policy=None,
171
+ image_pull_secrets=None,
163
172
  service_account=None,
164
173
  secrets=None,
165
174
  node_selector=None,
@@ -297,12 +306,14 @@ def step(
297
306
  task_id=task_id,
298
307
  attempt=str(retry_count),
299
308
  user=util.get_username(),
309
+ code_package_metadata=code_package_metadata,
300
310
  code_package_sha=code_package_sha,
301
311
  code_package_url=code_package_url,
302
312
  code_package_ds=ctx.obj.flow_datastore.TYPE,
303
313
  step_cli=step_cli,
304
314
  docker_image=image,
305
315
  docker_image_pull_policy=image_pull_policy,
316
+ image_pull_secrets=image_pull_secrets,
306
317
  service_account=service_account,
307
318
  secrets=secrets,
308
319
  node_selector=node_selector,
@@ -11,6 +11,7 @@ from metaflow.metadata_provider import MetaDatum
11
11
  from metaflow.metadata_provider.util import sync_local_metadata_to_datastore
12
12
  from metaflow.metaflow_config import (
13
13
  DATASTORE_LOCAL_DIR,
14
+ FEAT_ALWAYS_UPLOAD_CODE_PACKAGE,
14
15
  KUBERNETES_CONTAINER_IMAGE,
15
16
  KUBERNETES_CONTAINER_REGISTRY,
16
17
  KUBERNETES_CPU,
@@ -18,6 +19,7 @@ from metaflow.metaflow_config import (
18
19
  KUBERNETES_FETCH_EC2_METADATA,
19
20
  KUBERNETES_GPU_VENDOR,
20
21
  KUBERNETES_IMAGE_PULL_POLICY,
22
+ KUBERNETES_IMAGE_PULL_SECRETS,
21
23
  KUBERNETES_MEMORY,
22
24
  KUBERNETES_LABELS,
23
25
  KUBERNETES_ANNOTATIONS,
@@ -74,6 +76,10 @@ class KubernetesDecorator(StepDecorator):
74
76
  not, a default Docker image mapping to the current version of Python is used.
75
77
  image_pull_policy: str, default KUBERNETES_IMAGE_PULL_POLICY
76
78
  If given, the imagePullPolicy to be applied to the Docker image of the step.
79
+ image_pull_secrets: List[str], default []
80
+ The default is extracted from METAFLOW_KUBERNETES_IMAGE_PULL_SECRETS.
81
+ Kubernetes image pull secrets to use when pulling container images
82
+ in Kubernetes.
77
83
  service_account : str, default METAFLOW_KUBERNETES_SERVICE_ACCOUNT
78
84
  Kubernetes service account to use when launching pod in Kubernetes.
79
85
  secrets : List[str], optional, default None
@@ -92,7 +98,7 @@ class KubernetesDecorator(StepDecorator):
92
98
  the scheduled node should not have GPUs.
93
99
  gpu_vendor : str, default KUBERNETES_GPU_VENDOR
94
100
  The vendor of the GPUs to be used for this step.
95
- tolerations : List[str], default []
101
+ tolerations : List[Dict[str,str]], default []
96
102
  The default is extracted from METAFLOW_KUBERNETES_TOLERATIONS.
97
103
  Kubernetes tolerations to use when launching pod in Kubernetes.
98
104
  labels: Dict[str, str], default: METAFLOW_KUBERNETES_LABELS
@@ -141,6 +147,7 @@ class KubernetesDecorator(StepDecorator):
141
147
  "disk": "10240",
142
148
  "image": None,
143
149
  "image_pull_policy": None,
150
+ "image_pull_secrets": None, # e.g., ["regcred"]
144
151
  "service_account": None,
145
152
  "secrets": None, # e.g., mysecret
146
153
  "node_selector": None, # e.g., kubernetes.io/os=linux
@@ -164,6 +171,7 @@ class KubernetesDecorator(StepDecorator):
164
171
  "qos": KUBERNETES_QOS,
165
172
  "security_context": None,
166
173
  }
174
+ package_metadata = None
167
175
  package_url = None
168
176
  package_sha = None
169
177
  run_time_limit = None
@@ -173,8 +181,6 @@ class KubernetesDecorator(StepDecorator):
173
181
  target_platform = KUBERNETES_CONDA_ARCH or "linux-64"
174
182
 
175
183
  def init(self):
176
- super(KubernetesDecorator, self).init()
177
-
178
184
  if not self.attributes["namespace"]:
179
185
  self.attributes["namespace"] = KUBERNETES_NAMESPACE
180
186
  if not self.attributes["service_account"]:
@@ -194,6 +200,10 @@ class KubernetesDecorator(StepDecorator):
194
200
  )
195
201
  if not self.attributes["image_pull_policy"] and KUBERNETES_IMAGE_PULL_POLICY:
196
202
  self.attributes["image_pull_policy"] = KUBERNETES_IMAGE_PULL_POLICY
203
+ if not self.attributes["image_pull_secrets"] and KUBERNETES_IMAGE_PULL_SECRETS:
204
+ self.attributes["image_pull_secrets"] = json.loads(
205
+ KUBERNETES_IMAGE_PULL_SECRETS
206
+ )
197
207
 
198
208
  if isinstance(self.attributes["node_selector"], str):
199
209
  self.attributes["node_selector"] = parse_kube_keyvalue_list(
@@ -476,6 +486,7 @@ class KubernetesDecorator(StepDecorator):
476
486
  # to execute on Kubernetes anymore. We can execute possible fallback
477
487
  # code locally.
478
488
  cli_args.commands = ["kubernetes", "step"]
489
+ cli_args.command_args.append(self.package_metadata)
479
490
  cli_args.command_args.append(self.package_sha)
480
491
  cli_args.command_args.append(self.package_url)
481
492
 
@@ -494,6 +505,7 @@ class KubernetesDecorator(StepDecorator):
494
505
  for key, val in v.items()
495
506
  ]
496
507
  elif k in [
508
+ "image_pull_secrets",
497
509
  "tolerations",
498
510
  "persistent_volume_claims",
499
511
  "labels",
@@ -646,9 +658,16 @@ class KubernetesDecorator(StepDecorator):
646
658
  @classmethod
647
659
  def _save_package_once(cls, flow_datastore, package):
648
660
  if cls.package_url is None:
649
- cls.package_url, cls.package_sha = flow_datastore.save_data(
650
- [package.blob], len_hint=1
651
- )[0]
661
+ if not FEAT_ALWAYS_UPLOAD_CODE_PACKAGE:
662
+ cls.package_url, cls.package_sha = flow_datastore.save_data(
663
+ [package.blob], len_hint=1
664
+ )[0]
665
+ cls.package_metadata = package.package_metadata
666
+ else:
667
+ # Blocks until the package is uploaded
668
+ cls.package_url = package.package_url()
669
+ cls.package_sha = package.package_sha()
670
+ cls.package_metadata = package.package_metadata
652
671
 
653
672
 
654
673
  # TODO: Unify this method with the multi-node setup in @batch
@@ -235,8 +235,10 @@ class KubernetesJob(object):
235
235
  )
236
236
  ],
237
237
  node_selector=self._kwargs.get("node_selector"),
238
- # TODO (savin): Support image_pull_secrets
239
- # image_pull_secrets=?,
238
+ image_pull_secrets=[
239
+ client.V1LocalObjectReference(secret)
240
+ for secret in self._kwargs.get("image_pull_secrets") or []
241
+ ],
240
242
  # TODO (savin): Support preemption policies
241
243
  # preemption_policy=?,
242
244
  #
@@ -520,12 +522,10 @@ class RunningJob(object):
520
522
  # 3. If the pod object hasn't shown up yet, we set the parallelism to 0
521
523
  # to preempt it.
522
524
  client = self._client.get()
523
-
524
525
  if not self.is_done:
525
526
  if self.is_running:
526
527
  # Case 1.
527
528
  from kubernetes.stream import stream
528
-
529
529
  api_instance = client.CoreV1Api
530
530
  try:
531
531
  # TODO: stream opens a web-socket connection. It may
@@ -591,6 +591,10 @@ class RunningJob(object):
591
591
  return self.id
592
592
  return "job %s" % self._name
593
593
 
594
+ @property
595
+ def is_unschedulable(self):
596
+ return self._job["metadata"]["annotations"].get("metaflow/job_status", "") == "Unsatisfiable_Resource_Request"
597
+
594
598
  @property
595
599
  def is_done(self):
596
600
  # Check if the container is done. As a side effect, also refreshes self._job and
@@ -604,6 +608,7 @@ class RunningJob(object):
604
608
  or bool(self._job["status"].get("failed"))
605
609
  or self._are_pod_containers_done
606
610
  or (self._job["spec"]["parallelism"] == 0)
611
+ or self.is_unschedulable
607
612
  )
608
613
 
609
614
  if not done():
@@ -661,6 +666,7 @@ class RunningJob(object):
661
666
  bool(self._job["status"].get("failed"))
662
667
  or self._has_any_container_failed
663
668
  or (self._job["spec"]["parallelism"] == 0)
669
+ or self.is_unschedulable
664
670
  )
665
671
  return retval
666
672
 
@@ -758,6 +764,8 @@ class RunningJob(object):
758
764
  return 0, None
759
765
  # Best effort since Pod object can disappear on us at anytime
760
766
  else:
767
+ if self.is_unschedulable:
768
+ return 1, self._job["metadata"]["annotations"].get("metaflow/job_status_reason", "")
761
769
  if self._pod.get("status", {}).get("phase") not in (
762
770
  "Succeeded",
763
771
  "Failed",
@@ -6,6 +6,7 @@ from collections import namedtuple
6
6
  from metaflow.exception import MetaflowException
7
7
  from metaflow.metaflow_config import KUBERNETES_JOBSET_GROUP, KUBERNETES_JOBSET_VERSION
8
8
  from metaflow.tracing import inject_tracing_vars
9
+ from metaflow._vendor import yaml
9
10
 
10
11
  from .kube_utils import qos_requests_and_limits
11
12
 
@@ -718,8 +719,11 @@ class JobSetSpec(object):
718
719
  )
719
720
  ],
720
721
  node_selector=self._kwargs.get("node_selector"),
721
- # TODO (savin): Support image_pull_secrets
722
- # image_pull_secrets=?,
722
+ image_pull_secrets=[
723
+ client.V1LocalObjectReference(secret)
724
+ for secret in self._kwargs.get("image_pull_secrets")
725
+ or []
726
+ ],
723
727
  # TODO (savin): Support preemption policies
724
728
  # preemption_policy=?,
725
729
  #
@@ -1022,34 +1026,32 @@ class KubernetesArgoJobSet(object):
1022
1026
 
1023
1027
  def dump(self):
1024
1028
  client = self._kubernetes_sdk
1025
-
1026
- data = json.dumps(
1027
- client.ApiClient().sanitize_for_serialization(
1028
- dict(
1029
- apiVersion=self._group + "/" + self._version,
1030
- kind="JobSet",
1031
- metadata=client.api_client.ApiClient().sanitize_for_serialization(
1032
- client.V1ObjectMeta(
1033
- name=self.name,
1034
- labels=self._labels,
1035
- annotations=self._annotations,
1036
- )
1037
- ),
1038
- spec=dict(
1039
- replicatedJobs=[self.control.dump(), self.worker.dump()],
1040
- suspend=False,
1041
- startupPolicy=None,
1042
- successPolicy=None,
1043
- # The Failure Policy helps setting the number of retries for the jobset.
1044
- # but we don't rely on it and instead rely on either the local scheduler
1045
- # or the Argo Workflows to handle retries.
1046
- failurePolicy=None,
1047
- network=None,
1048
- ),
1049
- status=None,
1050
- )
1029
+ js_dict = client.ApiClient().sanitize_for_serialization(
1030
+ dict(
1031
+ apiVersion=self._group + "/" + self._version,
1032
+ kind="JobSet",
1033
+ metadata=client.api_client.ApiClient().sanitize_for_serialization(
1034
+ client.V1ObjectMeta(
1035
+ name=self.name,
1036
+ labels=self._labels,
1037
+ annotations=self._annotations,
1038
+ )
1039
+ ),
1040
+ spec=dict(
1041
+ replicatedJobs=[self.control.dump(), self.worker.dump()],
1042
+ suspend=False,
1043
+ startupPolicy=None,
1044
+ successPolicy=None,
1045
+ # The Failure Policy helps setting the number of retries for the jobset.
1046
+ # but we don't rely on it and instead rely on either the local scheduler
1047
+ # or the Argo Workflows to handle retries.
1048
+ failurePolicy=None,
1049
+ network=None,
1050
+ ),
1051
+ status=None,
1051
1052
  )
1052
1053
  )
1054
+ data = yaml.dump(js_dict, default_flow_style=False, indent=2)
1053
1055
  # The values we populate in the Jobset manifest (for Argo Workflows) piggybacks on the Argo Workflow's templating engine.
1054
1056
  # Even though Argo Workflows's templating helps us constructing all the necessary IDs and populating the fields
1055
1057
  # required by Metaflow, we run into one glitch. When we construct JSON/YAML serializable objects,
@@ -1064,7 +1066,6 @@ class KubernetesArgoJobSet(object):
1064
1066
  # Since the value of `num_parallel` can be dynamic and can change from run to run, we need to ensure that the
1065
1067
  # value can be passed-down dynamically and is **explicitly set as a integer** in the Jobset Manifest submitted as a
1066
1068
  # part of the Argo Workflow
1067
-
1068
- quoted_substring = '"{{=asInt(inputs.parameters.workerCount)}}"'
1069
+ quoted_substring = "'{{=asInt(inputs.parameters.workerCount)}}'"
1069
1070
  unquoted_substring = "{{=asInt(inputs.parameters.workerCount)}}"
1070
1071
  return data.replace(quoted_substring, unquoted_substring)