konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/task.py ADDED
@@ -0,0 +1,949 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: konduktor: https://github.com/konduktor-org/konduktor
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Task: a coarse-grained stage in an application."""
14
+
15
+ import inspect
16
+ import json
17
+ import os
18
+ import re
19
+ import typing
20
+ from typing import Any, Dict, List, Optional, Tuple, Union
21
+
22
+ import yaml # type: ignore
23
+
24
+ if typing.TYPE_CHECKING:
25
+ import konduktor.resource as resources_lib
26
+ import konduktor.serving as servings_lib
27
+
28
+ import konduktor
29
+ from konduktor import constants, logging
30
+ from konduktor.data import data_utils
31
+ from konduktor.data import storage as storage_lib
32
+ from konduktor.utils import common_utils, exceptions, schemas, ux_utils
33
+
34
# Module-level logger. Note: `logging` here is konduktor's logging wrapper
# (imported above), not the stdlib module.
logger = logging.get_logger(__name__)
36
+ _VALID_NAME_REGEX = '[a-zA-Z0-9]+(?:[._-]{1,2}[a-zA-Z0-9]+)*'
37
+ _VALID_NAME_DESCR = (
38
+ 'ASCII characters and may contain lowercase and'
39
+ ' uppercase letters, digits, underscores, periods,'
40
+ ' and dashes. Must start and end with alphanumeric'
41
+ ' characters. No triple dashes or underscores.'
42
+ )
43
+
44
+ _RUN_FN_CHECK_FAIL_MSG = (
45
+ 'run command generator must take exactly 2 arguments: node_rank (int) and'
46
+ 'a list of node ip addresses (List[str]). Got {run_sig}'
47
+ )
48
+
49
+
50
+ def _is_valid_name(name: Optional[str]) -> bool:
51
+ """Checks if the task name is valid.
52
+
53
+ Valid is defined as either NoneType or str with ASCII characters which may
54
+ contain lowercase and uppercase letters, digits, underscores, periods,
55
+ and dashes. Must start and end with alphanumeric characters.
56
+ No triple dashes or underscores.
57
+
58
+ Examples:
59
+ some_name_here
60
+ some-name-here
61
+ some__name__here
62
+ some--name--here
63
+ some__name--here
64
+ some.name.here
65
+ some-name_he.re
66
+ this---shouldnt--work
67
+ this___shouldnt_work
68
+ _thisshouldntwork
69
+ thisshouldntwork_
70
+ """
71
+ if name is None:
72
+ return False
73
+ return bool(re.fullmatch(_VALID_NAME_REGEX, name))
74
+
75
+
76
+ def _fill_in_env_vars(
77
+ yaml_field: Dict[str, Any],
78
+ task_envs: Dict[str, str],
79
+ ) -> Dict[str, Any]:
80
+ """Detects env vars in yaml field and fills them with task_envs.
81
+
82
+ Use cases of env vars in file_mounts:
83
+ - dst/src paths; e.g.,
84
+ /model_path/llama-${SIZE}b: s3://llama-weights/llama-${SIZE}b
85
+ - storage's name (bucket name)
86
+ - storage's source (local path)
87
+
88
+ Use cases of env vars in service:
89
+ - model type; e.g.,
90
+ service:
91
+ readiness_probe:
92
+ path: /v1/chat/completions
93
+ post_data:
94
+ model: $MODEL_NAME
95
+ messages:
96
+ - role: user
97
+ content: How to print hello world?
98
+ max_tokens: 1
99
+
100
+ We simply dump yaml_field into a json string, and replace env vars using
101
+ regex. This should be safe as yaml config has been schema-validated.
102
+
103
+ Env vars of the following forms are detected:
104
+ - ${ENV}
105
+ - $ENV
106
+ where <ENV> must appear in task.envs.
107
+ """
108
+ yaml_field_str = json.dumps(yaml_field)
109
+
110
+ def replace_var(match):
111
+ var_name = match.group(1)
112
+ # If the variable isn't in the dictionary, return it unchanged
113
+ return task_envs.get(var_name, match.group(0))
114
+
115
+ # Pattern for valid env var names in bash.
116
+ pattern = r'\$\{?\b([a-zA-Z_][a-zA-Z0-9_]*)\b\}?'
117
+ yaml_field_str = re.sub(pattern, replace_var, yaml_field_str)
118
+ return json.loads(yaml_field_str)
119
+
120
+
121
+ class Task:
122
+ """Task: a computation to be run on the cloud."""
123
+
124
+ def __init__(
125
+ self,
126
+ name: str,
127
+ *,
128
+ setup: Optional[str] = None,
129
+ run: Optional[str] = None,
130
+ envs: Optional[Dict[str, str]] = None,
131
+ workdir: Optional[str] = None,
132
+ num_nodes: Optional[int] = None,
133
+ ):
134
+ """Initializes a Task.
135
+
136
+ All fields are optional. ``Task.run`` is the actual program: either a
137
+ shell command to run (str) or a command generator for different nodes
138
+ (lambda; see below).
139
+
140
+ Optionally, call ``Task.set_resources()`` to set the resource
141
+ requirements for this task. If not set, a default CPU-only requirement
142
+ is assumed (the same as ``konduktor launch``).
143
+
144
+ All setters of this class, ``Task.set_*()``, return ``self``, i.e.,
145
+ they are fluent APIs and can be chained together.
146
+
147
+ Example:
148
+ .. code-block:: python
149
+
150
+ # A Task that will sync up local workdir '.', containing
151
+ # requirements.txt and train.py.
152
+ konduktor.Task(setup='pip install requirements.txt',
153
+ run='python train.py',
154
+ workdir='.')
155
+
156
+ # An empty Task for provisioning a cluster.
157
+ task = konduktor.Task(num_nodes=n).set_resources(...)
158
+
159
+ # Chaining setters.
160
+ konduktor.Task().set_resources(...).set_file_mounts(...)
161
+
162
+ Args:
163
+ name: A string name for the Task for display purposes.
164
+ run: The actual command for the task. If not None, either a shell
165
+ command (str) or a command generator (callable). If latter, it
166
+ must take a node rank and a list of node addresses as input and
167
+ return a shell command (str) (valid to return None for some nodes,
168
+ in which case no commands are run on them). Run commands will be
169
+ run under ``workdir``. Note the command generator should be a
170
+ self-contained lambda.
171
+ envs: A dictionary of environment variables to set before running the
172
+ setup and run commands.
173
+ workdir: The local working directory. This directory will be synced
174
+ to a location on the remote VM(s), and ``setup`` and ``run``
175
+ commands will be run under that location (thus, they can rely on
176
+ relative paths when invoking binaries).
177
+ num_nodes: The number of nodes to provision for this Task. If None,
178
+ treated as 1 node. If > 1, each node will execute its own
179
+ setup/run command, where ``run`` can either be a str, meaning all
180
+ nodes get the same command, or a lambda, with the semantics
181
+ documented above.
182
+ """
183
+ assert name is not None, 'Task name is required'
184
+ self.name = name
185
+ self.setup = setup
186
+ self.run = run
187
+ self.storage_mounts: Dict[str, storage_lib.Storage] = {}
188
+ self.storage_plans: Dict[storage_lib.Storage, storage_lib.StoreType] = {}
189
+ self._envs = envs or {}
190
+ self.workdir = workdir
191
+ # Ignore type error due to a mypy bug.
192
+ # https://github.com/python/mypy/issues/3004
193
+ self._num_nodes = 1
194
+ self.num_nodes = num_nodes # type: ignore
195
+ # Default to CPU VM
196
+ self.resources: Optional[konduktor.Resources] = None
197
+ self.serving: Optional[konduktor.Serving] = None
198
+
199
+ self.file_mounts: Optional[Dict[str, str]] = None
200
+ self.best_resources = None # (asaiacai): this is unused consider removing
201
+
202
+ # Check if the task is legal.
203
+ self._validate()
204
+
205
    def _validate(self):
        """Checks if the Task fields are valid.

        Raises:
            ValueError: if the name is invalid, ``run`` is missing or has a
                wrong type/signature, a callable ``run`` is not
                self-contained, or ``workdir`` is not an existing directory.
        """

        # TODO(asaiacai): add validations here to check that valid
        # kueue resources are specified as labels
        if not _is_valid_name(self.name):
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'Invalid task name {self.name}. Valid name: '
                    f'{_VALID_NAME_DESCR}'
                )

        # Check self.run: a callable must look like
        # (node_rank: int, addrs: List[str]) -> Optional[str].
        if callable(self.run):
            run_sig = inspect.signature(self.run)
            # Check that run is a function with 2 arguments.
            if len(run_sig.parameters) != 2:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(_RUN_FN_CHECK_FAIL_MSG.format(run_sig))

            type_list = [int, List[str]]
            # Check annotations, if exists; unannotated params are accepted.
            for i, param in enumerate(run_sig.parameters.values()):
                if param.annotation != inspect.Parameter.empty:
                    if param.annotation != type_list[i]:
                        with ux_utils.print_exception_no_traceback():
                            raise ValueError(_RUN_FN_CHECK_FAIL_MSG.format(run_sig))

            # Check self containedness: the generator must not capture
            # enclosing-scope or module-global state, since it may be
            # serialized/executed away from its definition site.
            run_closure = inspect.getclosurevars(self.run)
            if run_closure.nonlocals:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(
                        'run command generator must be self contained. '
                        f'Found nonlocals: {run_closure.nonlocals}'
                    )
            if run_closure.globals:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(
                        'run command generator must be self contained. '
                        f'Found globals: {run_closure.globals}'
                    )
            if run_closure.unbound:
                # Do not raise an error here. Import statements, which are
                # allowed, will be considered as unbounded.
                pass
        elif self.run is not None and not isinstance(self.run, str):
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    'run must be a shell script (str), ' f'Got {type(self.run)}'
                )
        elif self.run is None:
            # A run command is mandatory for every Task.
            with ux_utils.print_exception_no_traceback():
                raise ValueError('run commands are empty')

        # Workdir: if given, must resolve to an existing directory.
        if self.workdir is not None:
            full_workdir = os.path.abspath(os.path.expanduser(self.workdir))
            if not os.path.isdir(full_workdir):
                # Symlink to a dir is legal (isdir() follows symlinks).
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(
                        'Workdir must exist and must be a directory (or '
                        f'a symlink to a directory). {self.workdir} not found.'
                    )
270
+
271
+ @staticmethod
272
+ def from_yaml_config(
273
+ config: Dict[str, Any],
274
+ env_overrides: Optional[List[Tuple[str, str]]] = None,
275
+ ) -> 'Task':
276
+ # More robust handling for 'envs': explicitly convert keys and values to
277
+ # str, since users may pass '123' as keys/values which will get parsed
278
+ # as int causing validate_schema() to fail.
279
+ envs = config.get('envs')
280
+ if envs is not None and isinstance(envs, dict):
281
+ new_envs: Dict[str, Optional[str]] = {}
282
+ for k, v in envs.items():
283
+ if v is not None:
284
+ new_envs[str(k)] = str(v)
285
+ else:
286
+ new_envs[str(k)] = None
287
+ config['envs'] = new_envs
288
+ common_utils.validate_schema(
289
+ config, schemas.get_task_schema(), 'Invalid task YAML: '
290
+ )
291
+ if env_overrides is not None:
292
+ # We must override env vars before constructing the Task, because
293
+ # the Storage object creation is eager and it (its name/source
294
+ # fields) may depend on env vars.
295
+ new_envs = config.get('envs', {})
296
+ new_envs.update(env_overrides)
297
+ config['envs'] = new_envs
298
+
299
+ for k, v in config.get('envs', {}).items():
300
+ if v is None:
301
+ with ux_utils.print_exception_no_traceback():
302
+ raise ValueError(
303
+ f'Environment variable {k!r} is None. Please set a '
304
+ 'value for it in task YAML or with --env flag. '
305
+ f'To set it to be empty, use an empty string ({k}: "" '
306
+ f'in task YAML or --env {k}="" in CLI).'
307
+ )
308
+
309
+ # Fill in any Task.envs into file_mounts (src/dst paths, storage
310
+ # name/source).
311
+ if config.get('file_mounts') is not None:
312
+ config['file_mounts'] = _fill_in_env_vars(
313
+ config['file_mounts'], config.get('envs', {})
314
+ )
315
+
316
+ # Fill in any Task.envs into workdir
317
+ if config.get('workdir') is not None:
318
+ config['workdir'] = _fill_in_env_vars(
319
+ config['workdir'], config.get('envs', {})
320
+ )
321
+
322
+ task = Task(
323
+ config.pop('name', None),
324
+ setup=config.pop('setup', None),
325
+ run=config.pop('run', None),
326
+ workdir=config.pop('workdir', None),
327
+ num_nodes=config.pop('num_nodes', None),
328
+ envs=config.pop('envs', None),
329
+ )
330
+
331
+ # Create lists to store storage objects inlined in file_mounts.
332
+ # These are retained in dicts in the YAML schema and later parsed to
333
+ # storage objects with the storage/storage_mount objects.
334
+ fm_storages = []
335
+ file_mounts = config.pop('file_mounts', None)
336
+ if file_mounts is not None:
337
+ copy_mounts = {}
338
+ for dst_path, src in file_mounts.items():
339
+ # Check if it is str path
340
+ if isinstance(src, str):
341
+ copy_mounts[dst_path] = src
342
+ # If the src is not a str path, it is likely a dict. Try to
343
+ # parse storage object.
344
+ elif isinstance(src, dict):
345
+ fm_storages.append((dst_path, src))
346
+ else:
347
+ with ux_utils.print_exception_no_traceback():
348
+ raise ValueError(
349
+ f'Unable to parse file_mount ' f'{dst_path}:{src}'
350
+ )
351
+ task.set_file_mounts(copy_mounts)
352
+
353
+ task_storage_mounts: Dict[str, storage_lib.Storage] = {}
354
+ all_storages = fm_storages
355
+ for storage in all_storages:
356
+ mount_path = storage[0]
357
+ assert mount_path, 'Storage mount path cannot be empty.'
358
+ try:
359
+ storage_obj = storage_lib.Storage.from_yaml_config(storage[1])
360
+ except exceptions.StorageSourceError as e:
361
+ # Patch the error message to include the mount path, if included
362
+ e.args = (
363
+ e.args[0].replace('<destination_path>', mount_path),
364
+ ) + e.args[1:]
365
+ raise e
366
+ task_storage_mounts[mount_path] = storage_obj
367
+ task.set_storage_mounts(task_storage_mounts)
368
+
369
+ # Experimental configs.
370
+ experimnetal_configs = config.pop('experimental', None)
371
+ cluster_config_override = None
372
+ if experimnetal_configs is not None:
373
+ cluster_config_override = experimnetal_configs.pop('config_overrides', None)
374
+ logger.debug(
375
+ 'Overriding konduktor config with task-level config: '
376
+ f'{cluster_config_override}'
377
+ )
378
+ assert not experimnetal_configs, (
379
+ 'Invalid task args: ' f'{experimnetal_configs.keys()}'
380
+ )
381
+
382
+ # Parse resources field.
383
+ resources_config = config.pop('resources', {})
384
+ if cluster_config_override is not None:
385
+ assert resources_config.get('_cluster_config_overrides') is None, (
386
+ 'Cannot set _cluster_config_overrides in both resources and '
387
+ 'experimental.config_overrides'
388
+ )
389
+ resources_config['_cluster_config_overrides'] = cluster_config_override
390
+
391
+ task.set_resources(konduktor.Resources.from_yaml_config(resources_config))
392
+
393
+ # Parse serving field.
394
+ serving_config = config.pop('serving', None)
395
+ if serving_config is not None:
396
+ serving = konduktor.Serving.from_yaml_config(serving_config, task.run)
397
+ if serving is not None:
398
+ task.set_serving(serving)
399
+
400
+ assert not config, f'Invalid task args: {config.keys()}'
401
+ return task
402
+
403
+ @staticmethod
404
+ def from_yaml(yaml_path: str) -> 'Task':
405
+ """Initializes a task from a task YAML.
406
+
407
+ Example:
408
+ .. code-block:: python
409
+
410
+ task = konduktor.Task.from_yaml('/path/to/task.yaml')
411
+
412
+ Args:
413
+ yaml_path: file path to a valid task yaml file.
414
+
415
+ Raises:
416
+ ValueError: if the path gets loaded into a str instead of a dict; or
417
+ if there are any other parsing errors.
418
+ """
419
+ with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
420
+ # https://github.com/yaml/pyyaml/issues/165#issuecomment-430074049
421
+ # to raise errors on duplicate keys.
422
+ config = yaml.safe_load(f)
423
+
424
+ if isinstance(config, str):
425
+ with ux_utils.print_exception_no_traceback():
426
+ raise ValueError(
427
+ 'YAML loaded as str, not as dict. '
428
+ f'Is it correct? Path: {yaml_path}'
429
+ )
430
+
431
+ if config is None:
432
+ config = {}
433
+ return Task.from_yaml_config(config)
434
+
435
    @property
    def num_nodes(self) -> int:
        """The number of nodes provisioned for this task (always >= 1)."""
        return self._num_nodes
438
+
439
+ @num_nodes.setter
440
+ def num_nodes(self, num_nodes: Optional[int]) -> None:
441
+ if num_nodes is None:
442
+ num_nodes = 1
443
+ if not isinstance(num_nodes, int) or num_nodes <= 0:
444
+ with ux_utils.print_exception_no_traceback():
445
+ raise ValueError(
446
+ f'num_nodes should be a positive int. Got: {num_nodes}'
447
+ )
448
+ self._num_nodes = num_nodes
449
+
450
    @property
    def envs(self) -> Dict[str, str]:
        """Environment variables set before the setup/run commands."""
        return self._envs
453
+
454
+ def update_envs(
455
+ self, envs: Union[None, List[Tuple[str, str]], Dict[str, str]]
456
+ ) -> 'Task':
457
+ """Updates environment variables for use inside the setup/run commands.
458
+
459
+ Args:
460
+ envs: (optional) either a list of ``(env_name, value)`` or a dict
461
+ ``{env_name: value}``.
462
+
463
+ Returns:
464
+ self: The current task, with envs updated.
465
+
466
+ Raises:
467
+ ValueError: if various invalid inputs errors are detected.
468
+ """
469
+ if envs is None:
470
+ envs = {}
471
+ if isinstance(envs, (list, tuple)):
472
+ keys = set(env[0] for env in envs)
473
+ if len(keys) != len(envs):
474
+ with ux_utils.print_exception_no_traceback():
475
+ raise ValueError('Duplicate env keys provided.')
476
+ envs = dict(envs)
477
+ if isinstance(envs, dict):
478
+ for key in envs:
479
+ if not isinstance(key, str):
480
+ with ux_utils.print_exception_no_traceback():
481
+ raise ValueError('Env keys must be strings.')
482
+ if not common_utils.is_valid_env_var(key):
483
+ with ux_utils.print_exception_no_traceback():
484
+ raise ValueError(f'Invalid env key: {key}')
485
+ else:
486
+ with ux_utils.print_exception_no_traceback():
487
+ raise ValueError(
488
+ 'envs must be List[Tuple[str, str]] or Dict[str, str]: ' f'{envs}'
489
+ )
490
+ self._envs.update(envs)
491
+ return self
492
+
493
+ def set_resources(
494
+ self,
495
+ resources: 'resources_lib.Resources',
496
+ ) -> 'Task':
497
+ """Sets the required resources to execute this task.
498
+
499
+ If this function is not called for a Task, default resource
500
+ requirements will be used (8 vCPUs).
501
+
502
+ Args:
503
+ resources: either a konduktor.Resources, a set of them, or a list of them.
504
+ A set or a list of resources asks the optimizer to "pick the
505
+ best of these resources" to run this task.
506
+ Returns:
507
+ self: The current task, with resources set.
508
+ """
509
+ if isinstance(resources, konduktor.Resources):
510
+ resources = resources
511
+ self.resources = resources
512
+
513
+ # TODO(asaiacai): we're only going to support COPY for now.
514
+ # MOUNT is not supported. Evaluate if the task requires
515
+ # FUSE and set the requires_fuse flag
516
+ for _, storage_obj in self.storage_mounts.items():
517
+ if storage_obj.mode not in storage_lib.StorageMode:
518
+ with ux_utils.print_exception_no_traceback():
519
+ raise ValueError(
520
+ f'Storage Type {storage_obj.mode} '
521
+ 'not supported. '
522
+ f'Only {", ".join([mode.name for mode in storage_lib.StorageMode])} ' # noqa: E501
523
+ 'is supported!'
524
+ )
525
+ # if storage_obj.mode == storage_lib.StorageMode.MOUNT:
526
+ # for r in self.resources:
527
+ # r.requires_fuse = True
528
+ # break
529
+
530
+ return self
531
+
532
+ def set_resources_override(self, override_params: Dict[str, Any]) -> 'Task':
533
+ """Sets the override parameters for the resources."""
534
+ assert self.resources is not None, 'Resources are required'
535
+ new_resources = self.resources.copy(**override_params)
536
+
537
+ self.set_resources(new_resources)
538
+ return self
539
+
540
+ def set_serving(
541
+ self,
542
+ serving: 'servings_lib.Serving',
543
+ ) -> 'Task':
544
+ """Sets the serving configuration for this task.
545
+
546
+ Args:
547
+ serving: konduktor.Serving object
548
+
549
+ Returns:
550
+ self: The current task, with serving set.
551
+ """
552
+ if self._num_nodes and self._num_nodes != 1:
553
+ with ux_utils.print_exception_no_traceback():
554
+ raise ValueError(
555
+ f'Only single node serving is supported (num_nodes: 1). '
556
+ f'Got: {self.num_nodes}'
557
+ )
558
+
559
+ if serving.max_replicas < serving.min_replicas:
560
+ with ux_utils.print_exception_no_traceback():
561
+ raise ValueError(
562
+ f'max_replicas ({serving.max_replicas}) cannot be '
563
+ f'less than min_replicas ({serving.min_replicas})'
564
+ )
565
+
566
+ if serving.max_replicas == 0 and serving.min_replicas == 0:
567
+ with ux_utils.print_exception_no_traceback():
568
+ raise ValueError(
569
+ f'max_replicas ({serving.max_replicas}) and '
570
+ f'min_replicas ({serving.min_replicas}) cannot both be 0'
571
+ )
572
+
573
+ if isinstance(serving, konduktor.Serving):
574
+ serving = serving
575
+ self.serving = serving
576
+ return self
577
+
578
+ def set_serving_override(self, override_params: Dict[str, Any]) -> 'Task':
579
+ """Sets the override parameters for the serving config."""
580
+ assert self.serving is not None, 'Serving config is required'
581
+ new_serving = konduktor.Serving(
582
+ min_replicas=override_params.get('min_replicas', self.serving.min_replicas),
583
+ max_replicas=override_params.get('max_replicas', self.serving.max_replicas),
584
+ ports=override_params.get('ports', self.serving.ports),
585
+ probe=override_params.get('probe', self.serving.probe),
586
+ )
587
+
588
+ self.num_nodes = override_params.get('num_nodes', self.num_nodes)
589
+
590
+ self.set_serving(new_serving)
591
+ return self
592
+
593
    def set_file_mounts(self, file_mounts: Optional[Dict[str, str]]) -> 'Task':
        """Sets the file mounts for this task.

        Useful for syncing datasets, dotfiles, etc.

        File mounts are a dictionary: ``{remote_path: local_path/cloud URI}``.
        Local (or cloud) files/directories will be synced to the specified
        paths on the remote VM(s) where this Task will run.

        Neither source nor destination paths can end with a slash.

        Example:
            .. code-block:: python

                task.set_file_mounts({
                    '~/.dotfile': '/local/.dotfile',
                    # /remote/dir/ will contain the contents of /local/dir/.
                    '/remote/dir': '/local/dir',
                })

        Args:
            file_mounts: an optional dict of ``{remote_path: local_path/cloud
                URI}``, where remote means the VM(s) on which this Task will
                eventually run on, and local means the node from which the task is
                launched.

        Returns:
            self: the current task, with file mounts set.

        Raises:
            ValueError: if input paths are invalid.
        """
        if file_mounts is None:
            self.file_mounts = None
            return self
        for target, source in file_mounts.items():
            # Trailing slashes make "dir vs dir contents" semantics ambiguous.
            if target.endswith('/') or source.endswith('/'):
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(
                        'File mount paths cannot end with a slash '
                        '(try "/mydir: /mydir" or "/myfile: /myfile"). '
                        f'Found: target={target} source={source}'
                    )
            # Destinations must be paths on the VM, never cloud URIs.
            if data_utils.is_cloud_store_url(target):
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(
                        'File mount destination paths cannot be cloud storage'
                    )
            if not data_utils.is_cloud_store_url(source):
                # Local sources must exist; 'konduktor:'-prefixed internal
                # paths are exempt (resolved later).
                if not os.path.exists(
                    os.path.abspath(os.path.expanduser(source))
                ) and not source.startswith('konduktor:'):
                    with ux_utils.print_exception_no_traceback():
                        raise ValueError(
                            f'File mount source {source!r} does not exist '
                            'locally. To fix: check if it exists, and correct '
                            'the path.'
                        )
            # The remote workdir target is reserved for the workdir sync.
            if (
                target == constants.KONDUKTOR_REMOTE_WORKDIR
                and self.workdir is not None
            ):
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(
                        f'Cannot use {constants.KONDUKTOR_REMOTE_WORKDIR!r} as a '
                        'destination path of a file mount, as it will be used '
                        'by the workdir. If uploading a file/folder to the '
                        'workdir is needed, please specify the full path to '
                        'the file/folder.'
                    )

        self.file_mounts = file_mounts
        return self
666
+
667
+ def _get_preferred_store(self) -> Tuple[storage_lib.StoreType, Optional[str]]:
668
+ """Returns the preferred store type and region for this task."""
669
+ storage_cloud = None
670
+
671
+ enabled_storage_clouds = (
672
+ storage_lib.get_cached_enabled_storage_clouds_or_refresh(
673
+ raise_if_no_cloud_access=True
674
+ )
675
+ )
676
+ resources = self.resources
677
+ if resources is not None:
678
+ storage_cloud = resources.cloud
679
+ else:
680
+ storage_cloud = None
681
+
682
+ if storage_cloud is not None:
683
+ if str(storage_cloud) not in enabled_storage_clouds:
684
+ storage_cloud = None
685
+
686
+ storage_cloud_str = None
687
+ if storage_cloud is None:
688
+ storage_cloud_str = enabled_storage_clouds[0]
689
+ assert storage_cloud_str is not None, enabled_storage_clouds[0]
690
+ storage_region = None # Use default region in the Store class
691
+ else:
692
+ storage_cloud_str = str(storage_cloud)
693
+
694
+ store_type = storage_lib.StoreType.from_cloud(storage_cloud_str)
695
+ return store_type, storage_region
696
+
697
+ def sync_storage_mounts(self) -> None:
698
+ """(INTERNAL) Eagerly syncs storage mounts to cloud storage.
699
+
700
+ After syncing up, COPY-mode storage mounts are translated into regular
701
+ file_mounts of the form ``{ /remote/path: {s3,gs,..}://<bucket path>
702
+ }``. For local file mounts, we first sync all local paths from
703
+ `workdir` and `file_mounts` to the cloud storage.
704
+ """
705
+ for storage in self.storage_mounts.values():
706
+ if len(storage.stores) == 0:
707
+ store_type, store_region = self._get_preferred_store()
708
+ self.storage_plans[storage] = store_type
709
+ storage.add_store(store_type, store_region)
710
+ else:
711
+ # We will download the first store that is added to remote.
712
+ self.storage_plans[storage] = list(storage.stores.keys())[0]
713
+
714
+ storage_mounts = self.storage_mounts
715
+ storage_plans = self.storage_plans
716
+ for mnt_path, storage in storage_mounts.items():
717
+ if storage.mode == storage_lib.StorageMode.COPY:
718
+ store_type = storage_plans[storage]
719
+ # TODO(asaiacai): add back other stores here
720
+ elif store_type is storage_lib.StoreType.S3:
721
+ if isinstance(storage.source, str) and storage.source.startswith(
722
+ 's3://'
723
+ ):
724
+ blob_path = storage.source
725
+ else:
726
+ assert storage.name is not None, storage
727
+ blob_path = 's3://' + storage.name
728
+ self.update_file_mounts(
729
+ {
730
+ mnt_path: blob_path,
731
+ }
732
+ )
733
+ elif store_type is storage_lib.StoreType.GCS:
734
+ if isinstance(storage.source, str) and storage.source.startswith(
735
+ 'gs://'
736
+ ):
737
+ blob_path = storage.source
738
+ else:
739
+ assert storage.name is not None, storage
740
+ blob_path = 'gs://' + storage.name
741
+ self.update_file_mounts(
742
+ {
743
+ mnt_path: blob_path,
744
+ }
745
+ )
746
+ else:
747
+ with ux_utils.print_exception_no_traceback():
748
+ raise ValueError(f'Storage Type {store_type} ' 'does not exist!')
749
+
750
+ def update_storage_mounts(
751
+ self, storage_mounts: Dict[str, storage_lib.Storage]
752
+ ) -> 'Task':
753
+ """Updates the storage mounts for this task.
754
+
755
+ Different from set_storage_mounts(), this function updates into the
756
+ existing storage_mounts (calls ``dict.update()``), rather than
757
+ overwriting it.
758
+
759
+ This should be called before provisioning in order to take effect.
760
+
761
+ Args:
762
+ storage_mounts: an optional dict of ``{mount_path: konduktor.data.Storage
763
+ object}``, where mount_path is the path inside the remote VM(s)
764
+ where the Storage object will be mounted on.
765
+
766
+ Returns:
767
+ self: The current task, with storage mounts updated.
768
+
769
+ Raises:
770
+ ValueError: if input paths are invalid.
771
+ """
772
+ if not storage_mounts:
773
+ return self
774
+ task_storage_mounts = self.storage_mounts if self.storage_mounts else {}
775
+ task_storage_mounts.update(storage_mounts)
776
+ return self.set_storage_mounts(task_storage_mounts)
777
+
778
+ def update_file_mounts(self, file_mounts: Dict[str, str]) -> 'Task':
779
+ """Updates the file mounts for this task.
780
+
781
+ Different from set_file_mounts(), this function updates into the
782
+ existing file_mounts (calls ``dict.update()``), rather than
783
+ overwritting it.
784
+
785
+ This should be called before provisioning in order to take effect.
786
+
787
+ Example:
788
+ .. code-block:: python
789
+
790
+ task.update_file_mounts({
791
+ '~/.config': '~/Documents/config',
792
+ '/tmp/workdir': '/local/workdir/cnn-cifar10',
793
+ })
794
+
795
+ Args:
796
+ file_mounts: a dict of ``{remote_path: local_path/cloud URI}``, where
797
+ remote means the VM(s) on which this Task will eventually run on,
798
+ and local means the node from which the task is launched.
799
+
800
+ Returns:
801
+ self: the current task, with file mounts updated.
802
+
803
+ Raises:
804
+ ValueError: if input paths are invalid.
805
+ """
806
+ if self.file_mounts is None:
807
+ self.file_mounts = {}
808
+ assert self.file_mounts is not None
809
+ self.file_mounts.update(file_mounts)
810
+ # For validation logic:
811
+ return self.set_file_mounts(self.file_mounts)
812
+
813
+ def set_storage_mounts(
814
+ self,
815
+ storage_mounts: Optional[Dict[str, storage_lib.Storage]],
816
+ ) -> 'Task':
817
+ """Sets the storage mounts for this task.
818
+
819
+ Storage mounts are a dictionary: ``{mount_path: sky.Storage object}``,
820
+ each of which mounts a sky.Storage object (a cloud object store bucket)
821
+ to a path inside the remote cluster.
822
+
823
+ A sky.Storage object can be created by uploading from a local directory
824
+ (setting ``source``), or backed by an existing cloud bucket (setting
825
+ ``name`` to the bucket name; or setting ``source`` to the bucket URI).
826
+
827
+ Example:
828
+ .. code-block:: python
829
+
830
+ task.set_storage_mounts({
831
+ '/remote/imagenet/': sky.Storage(name='my-bucket',
832
+ source='/local/imagenet'),
833
+ })
834
+
835
+ Args:
836
+ storage_mounts: an optional dict of ``{mount_path: sky.Storage
837
+ object}``, where mount_path is the path inside the remote VM(s)
838
+ where the Storage object will be mounted on.
839
+
840
+ Returns:
841
+ self: The current task, with storage mounts set.
842
+
843
+ Raises:
844
+ ValueError: if input paths are invalid.
845
+ """
846
+ if storage_mounts is None:
847
+ self.storage_mounts = {}
848
+ # Clear the requires_fuse flag if no storage mounts are set.
849
+ assert self.resources is not None, 'Task resources are required'
850
+ return self
851
+ for target, storage_obj in storage_mounts.items():
852
+ # TODO(zhwu): /home/username/sky_workdir as the target path need
853
+ # to be filtered out as well.
854
+ if (
855
+ target == constants.KONDUKTOR_REMOTE_WORKDIR
856
+ and self.workdir is not None
857
+ ):
858
+ with ux_utils.print_exception_no_traceback():
859
+ raise ValueError(
860
+ f'Cannot use {constants.KONDUKTOR_REMOTE_WORKDIR!r} as a '
861
+ 'destination path of a file mount, as it will be used '
862
+ 'by the workdir. If uploading a file/folder to the '
863
+ 'workdir is needed, please specify the full path to '
864
+ 'the file/folder.'
865
+ )
866
+
867
+ if data_utils.is_cloud_store_url(target):
868
+ with ux_utils.print_exception_no_traceback():
869
+ raise ValueError(
870
+ 'Storage mount destination path cannot be cloud storage'
871
+ )
872
+
873
+ assert (
874
+ storage_obj.mode == storage_lib.StorageMode.COPY
875
+ ), 'Only COPY mode is supported for storage mounts'
876
+ # TODO(asaiacai): can decide if we want to just delete this
877
+ # if storage_obj.mode == storage_lib.StorageMode.MOUNT:
878
+ # # If any storage is using MOUNT mode, we need to enable FUSE in
879
+ # # the resources.
880
+ # for r in self.resources:
881
+ # r.requires_fuse = True
882
+
883
+ # Storage source validation is done in Storage object
884
+ self.storage_mounts = storage_mounts
885
+ return self
886
+
887
+ def get_local_to_remote_file_mounts(self) -> Optional[Dict[str, str]]:
888
+ """Returns file mounts of the form (dst=VM path, src=local path).
889
+
890
+ Any cloud object store URIs (gs://, s3://, etc.), either as source or
891
+ destination, are not included.
892
+
893
+ INTERNAL: this method is internal-facing.
894
+ """
895
+ if self.file_mounts is None:
896
+ return None
897
+ d = {}
898
+ for k, v in self.file_mounts.items():
899
+ if not data_utils.is_cloud_store_url(
900
+ k
901
+ ) and not data_utils.is_cloud_store_url(v):
902
+ d[k] = v
903
+ return d
904
+
905
+ def to_yaml_config(self) -> Dict[str, Any]:
906
+ """Returns a yaml-style dict representation of the task.
907
+
908
+ INTERNAL: this method is internal-facing.
909
+ """
910
+ config = {}
911
+
912
+ def add_if_not_none(key, value, no_empty: bool = False):
913
+ if no_empty and not value:
914
+ return
915
+ if value is not None:
916
+ config[key] = value
917
+
918
+ add_if_not_none('name', self.name)
919
+
920
+ tmp_resource_config: Dict[str, Any] = {}
921
+ assert self.resources is not None, 'Resources are not defined'
922
+ tmp_resource_config = self.resources.to_yaml_config()
923
+
924
+ add_if_not_none('resources', tmp_resource_config)
925
+
926
+ add_if_not_none('num_nodes', self.num_nodes)
927
+
928
+ add_if_not_none('run', self.run)
929
+ add_if_not_none('workdir', self.workdir)
930
+ add_if_not_none('envs', self.envs, no_empty=True)
931
+
932
+ add_if_not_none('file_mounts', {})
933
+
934
+ if self.file_mounts is not None:
935
+ config['file_mounts'].update(self.file_mounts)
936
+
937
+ add_if_not_none('serving', {})
938
+
939
+ if self.serving is not None:
940
+ add_if_not_none('serving', self.serving.to_yaml_config())
941
+
942
+ if self.storage_mounts is not None:
943
+ config['file_mounts'].update(
944
+ {
945
+ mount_path: storage.to_yaml_config()
946
+ for mount_path, storage in self.storage_mounts.items()
947
+ }
948
+ )
949
+ return config