torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  2. torchx/cli/cmd_delete.py +30 -0
  3. torchx/cli/cmd_list.py +1 -2
  4. torchx/cli/cmd_run.py +202 -28
  5. torchx/cli/cmd_tracker.py +1 -1
  6. torchx/cli/main.py +2 -0
  7. torchx/components/__init__.py +1 -8
  8. torchx/components/dist.py +9 -3
  9. torchx/components/integration_tests/component_provider.py +2 -2
  10. torchx/components/utils.py +1 -1
  11. torchx/distributed/__init__.py +1 -1
  12. torchx/runner/api.py +102 -81
  13. torchx/runner/config.py +3 -1
  14. torchx/runner/events/__init__.py +20 -10
  15. torchx/runner/events/api.py +1 -1
  16. torchx/schedulers/__init__.py +7 -10
  17. torchx/schedulers/api.py +66 -25
  18. torchx/schedulers/aws_batch_scheduler.py +47 -6
  19. torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
  20. torchx/schedulers/docker_scheduler.py +4 -3
  21. torchx/schedulers/ids.py +27 -23
  22. torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
  23. torchx/schedulers/kubernetes_scheduler.py +355 -36
  24. torchx/schedulers/local_scheduler.py +2 -1
  25. torchx/schedulers/lsf_scheduler.py +1 -1
  26. torchx/schedulers/slurm_scheduler.py +102 -27
  27. torchx/specs/__init__.py +40 -9
  28. torchx/specs/api.py +222 -12
  29. torchx/specs/builders.py +109 -28
  30. torchx/specs/file_linter.py +117 -53
  31. torchx/specs/finder.py +25 -37
  32. torchx/specs/named_resources_aws.py +13 -2
  33. torchx/specs/overlays.py +106 -0
  34. torchx/tracker/__init__.py +2 -2
  35. torchx/tracker/api.py +1 -1
  36. torchx/util/entrypoints.py +1 -6
  37. torchx/util/strings.py +1 -1
  38. torchx/util/types.py +12 -1
  39. torchx/version.py +2 -2
  40. torchx/workspace/api.py +102 -5
  41. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
  42. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
  43. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
  44. torchx/examples/pipelines/__init__.py +0 -0
  45. torchx/examples/pipelines/kfp/__init__.py +0 -0
  46. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
  47. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
  48. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
  49. torchx/pipelines/kfp/__init__.py +0 -30
  50. torchx/pipelines/kfp/adapter.py +0 -274
  51. torchx/pipelines/kfp/version.py +0 -19
  52. torchx/schedulers/gcp_batch_scheduler.py +0 -497
  53. torchx/schedulers/ray/ray_common.py +0 -22
  54. torchx/schedulers/ray/ray_driver.py +0 -307
  55. torchx/schedulers/ray_scheduler.py +0 -454
  56. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
  57. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
  58. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
torchx/schedulers/kubernetes_scheduler.py

@@ -27,10 +27,81 @@ Install Volcano:
 See the
 `Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
 for more information.
+
+Pod Overlay
+===========
+
+You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+the ``kubernetes`` metadata on your role. The value can be:
+
+- A dict with the overlay structure
+- A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+Merge semantics:
+- **dict**: recursive merge (upsert)
+- **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+- **primitives**: replace
+
+.. code:: python
+
+    from torchx.specs import Role
+
+    # Dict overlay - lists append, tuples replace
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": {
+                "spec": {
+                    "nodeSelector": {"gpu": "true"},
+                    "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}],  # appends
+                    "volumes": ({"name": "my-volume", "emptyDir": {}},)  # replaces
+                }
+            }
+        }
+    )
+
+    # File URI overlay
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": "file:///path/to/pod_overlay.yaml"
+        }
+    )
+
+CLI usage with builtin components:
+
+.. code:: bash
+
+    $ torchx run --scheduler kubernetes dist.ddp \\
+        --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+        --script train.py
+
+Example ``pod_overlay.yaml``:
+
+.. code:: yaml
+
+    spec:
+      nodeSelector:
+        node.kubernetes.io/instance-type: p4d.24xlarge
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      volumes: !!python/tuple
+        - name: my-volume
+          emptyDir: {}
+
+The overlay is deep-merged with the generated pod, preserving existing fields
+and adding or overriding specified ones.
 """

 import json
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
@@ -45,6 +116,7 @@ from typing import (
     Tuple,
     TYPE_CHECKING,
     TypedDict,
+    Union,
 )

 import torchx
@@ -77,7 +149,6 @@ from torchx.specs.api import (
 from torchx.util.strings import normalize_str
 from torchx.workspace.docker_workspace import DockerWorkspaceMixin

-
 if TYPE_CHECKING:
     from docker import DockerClient
     from kubernetes.client import ApiClient, CustomObjectsApi
@@ -87,6 +158,7 @@ if TYPE_CHECKING:
     )
     from kubernetes.client.rest import ApiException

+
 logger: logging.Logger = logging.getLogger(__name__)

 # Kubernetes reserves a small amount of resources per host for the system. For
@@ -97,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
 RESERVED_MILLICPU = 100
 RESERVED_MEMMB = 1024

+
+def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+    """Apply overlay dict to V1Pod object, merging nested fields.
+
+    Merge semantics:
+    - dict: upsert (recursive merge)
+    - list: append by default, replace if tuple
+    - primitives: replace
+    """
+    from kubernetes import client
+
+    api = client.ApiClient()
+    pod_dict = api.sanitize_for_serialization(pod)
+
+    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+        for key, value in overlay.items():
+            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+                deep_merge(base[key], value)
+            elif isinstance(value, tuple):
+                base[key] = list(value)
+            elif (
+                isinstance(value, list) and key in base and isinstance(base[key], list)
+            ):
+                base[key].extend(value)
+            else:
+                base[key] = value
+
+    deep_merge(pod_dict, overlay)
+
+    merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+    pod.spec = merged_pod.spec
+    pod.metadata = merged_pod.metadata
+
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
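To make the merge semantics of ``_apply_pod_overlay`` concrete, here is a minimal standalone sketch (illustrative values only, not part of the package diff) that applies the same rules: dicts upsert recursively, lists append, tuples replace, and other values overwrite:

.. code:: python

    from typing import Any, Dict

    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
        # Same merge rules as the nested deep_merge in _apply_pod_overlay above.
        for key, value in overlay.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_merge(base[key], value)          # dict: recursive upsert
            elif isinstance(value, tuple):
                base[key] = list(value)               # tuple: replace
            elif isinstance(value, list) and isinstance(base.get(key), list):
                base[key].extend(value)               # list: append
            else:
                base[key] = value                     # primitive / new key: overwrite

    pod = {"spec": {"tolerations": [{"key": "a"}], "volumes": [{"name": "old"}]}}
    overlay = {
        "spec": {
            "nodeSelector": {"gpu": "true"},          # added via dict upsert
            "tolerations": [{"key": "b"}],            # list -> appended
            "volumes": ({"name": "new"},),            # tuple -> replaces
        }
    }
    deep_merge(pod, overlay)
    assert pod["spec"]["tolerations"] == [{"key": "a"}, {"key": "b"}]
    assert pod["spec"]["volumes"] == [{"name": "new"}]
    assert pod["spec"]["nodeSelector"] == {"gpu": "true"}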
@@ -188,7 +294,14 @@ def sanitize_for_serialization(obj: object) -> object:
     return api.sanitize_for_serialization(obj)


-def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
+def role_to_pod(
+    name: str,
+    role: Role,
+    service_account: Optional[str],
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> "V1Pod":
     from kubernetes.client.models import (  # noqa: F811 redefinition of unused
         V1Container,
         V1ContainerPort,
@@ -218,18 +331,29 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
     if resource.cpu > 0:
         mcpu = int(resource.cpu * 1000)
         limits["cpu"] = f"{mcpu}m"
-        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        request_mcpu = max(mcpu - reserved_millicpu, 0)
         requests["cpu"] = f"{request_mcpu}m"
     if resource.memMB > 0:
         limits["memory"] = f"{int(resource.memMB)}M"
-        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        request_memMB = max(int(resource.memMB) - reserved_memmb, 0)
         requests["memory"] = f"{request_memMB}M"
     if resource.gpu > 0:
         requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)

+    EFA_DEVICE = "vpc.amazonaws.com/efa"
     for device_name, device_limit in resource.devices.items():
         limits[device_name] = str(device_limit)

+    # Handle EFA device count override:
+    # - None (default): use whatever count is in the resource spec (already added above)
+    # - 0: remove EFA devices entirely
+    # - N > 0: set EFA device count to N (override or add)
+    if efa_device_count is not None:
+        if efa_device_count == 0:
+            limits.pop(EFA_DEVICE, None)
+        else:
+            limits[EFA_DEVICE] = str(efa_device_count)
+
     resources = V1ResourceRequirements(
         limits=limits,
         requests=requests,
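For reference, a small sketch of how the reserved-overhead and EFA options added above shape the container requests and limits; the numbers are hypothetical, and the real values come from the role's resource spec and the scheduler run options:

.. code:: python

    # Hypothetical resource values, purely for illustration.
    resource_cpu, resource_memmb = 8, 16384
    reserved_millicpu, reserved_memmb = 100, 1024

    limits = {"cpu": f"{resource_cpu * 1000}m", "memory": f"{resource_memmb}M"}
    requests = {
        # Requests are the limits minus the reserved system overhead, floored at 0.
        "cpu": f"{max(resource_cpu * 1000 - reserved_millicpu, 0)}m",   # "7900m"
        "memory": f"{max(resource_memmb - reserved_memmb, 0)}M",        # "15360M"
    }

    # EFA override semantics mirrored from role_to_pod:
    # None -> keep whatever the resource spec requested, 0 -> drop EFA, N -> force N.
    EFA_DEVICE = "vpc.amazonaws.com/efa"
    efa_device_count = 0
    if efa_device_count is not None:
        if efa_device_count == 0:
            limits.pop(EFA_DEVICE, None)
        else:
            limits[EFA_DEVICE] = str(efa_device_count)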
@@ -369,7 +493,10 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
-) -> Dict[str, object]:
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
     the provided AppDef. The resource definition can be used to launch the
@@ -399,8 +526,27 @@ def app_to_resource(
             replica_role = values.apply(role)
             if role_idx == 0 and replica_id == 0:
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
-
-            pod = role_to_pod(name, replica_role, service_account)
+                replica_role.env["TORCHX_IMAGE"] = replica_role.image
+
+            pod = role_to_pod(
+                name,
+                replica_role,
+                service_account,
+                reserved_millicpu,
+                reserved_memmb,
+                efa_device_count,
+            )
+            if k8s_metadata := role.metadata.get("kubernetes"):
+                if isinstance(k8s_metadata, str):
+                    import fsspec
+
+                    with fsspec.open(k8s_metadata, "r") as f:
+                        k8s_metadata = yaml.unsafe_load(f)
+                elif not isinstance(k8s_metadata, dict):
+                    raise ValueError(
+                        f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+                    )
+                _apply_pod_overlay(pod, k8s_metadata)
             pod.metadata.labels.update(
                 pod_labels(
                     app=app,
@@ -443,7 +589,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
     if priority_class is not None:
         job_spec["priorityClassName"] = priority_class

-    resource: Dict[str, object] = {
+    resource: Dict[str, Any] = {
         "apiVersion": "batch.volcano.sh/v1alpha1",
         "kind": "Job",
         "metadata": {"name": f"{unique_app_id}"},
@@ -455,7 +601,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
 @dataclass
 class KubernetesJob:
     images_to_push: Dict[str, Tuple[str, str]]
-    resource: Dict[str, object]
+    resource: Dict[str, Any]

     def __str__(self) -> str:
         return yaml.dump(sanitize_for_serialization(self.resource))
@@ -470,12 +616,13 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    validate_spec: Optional[bool]
+    reserved_millicpu: Optional[int]
+    reserved_memmb: Optional[int]
+    efa_device_count: Optional[int]


-class KubernetesScheduler(
-    DockerWorkspaceMixin,
-    Scheduler[KubernetesOpts, AppDef, AppDryRunInfo[KubernetesJob]],
-):
+class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
     """
     KubernetesScheduler is a TorchX scheduling interface to Kubernetes.

@@ -485,7 +632,7 @@ class KubernetesScheduler(
    For installation instructions see: https://github.com/volcano-sh/volcano

    This has been confirmed to work with Volcano v1.3.0 and Kubernetes versions
-   v1.18-1.21. See https://github.com/pytorch/torchx/issues/120 which is
+   v1.18-1.21. See https://github.com/meta-pytorch/torchx/issues/120 which is
    tracking Volcano support for Kubernetes v1.22.

    .. note::
@@ -503,6 +650,16 @@ class KubernetesScheduler(
        $ torchx status kubernetes://torchx_user/1234
        ...

+   **Cancellation**
+
+   Canceling a job aborts it while preserving the job spec for inspection
+   and cloning via kubectl apply. Use the delete command to remove the job entirely:
+
+   .. code-block:: bash
+
+       $ torchx cancel kubernetes://namespace/jobname  # abort, preserves spec
+       $ torchx delete kubernetes://namespace/jobname  # delete completely
+
    **Config Options**

    .. runopts::
@@ -581,9 +738,14 @@ class KubernetesScheduler(
         if c is None:
             configuration = client.Configuration()
             try:
-                config.load_kube_config(client_configuration=configuration)
-            except config.ConfigException as e:
-                warnings.warn(f"failed to load kube config: {e}")
+                # Try in-cluster config first (for pods with ServiceAccount)
+                config.load_incluster_config(client_configuration=configuration)
+            except config.ConfigException:
+                # Fall back to kubeconfig (for local development)
+                try:
+                    config.load_kube_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load kube config: {e}", stacklevel=2)

             c = self._client = client.ApiClient(configuration)

@@ -635,7 +797,7 @@ class KubernetesScheduler(
            else:
                raise

-        return f'{namespace}:{resp["metadata"]["name"]}'
+        return f"{namespace}:{resp['metadata']['name']}"

     def _submit_dryrun(
         self, app: AppDef, cfg: KubernetesOpts
@@ -657,7 +819,56 @@ class KubernetesScheduler(
            priority_class, str
        ), "priority_class must be a str"

-        resource = app_to_resource(app, queue, service_account, priority_class)
+        reserved_millicpu = cfg.get("reserved_millicpu", RESERVED_MILLICPU)
+        assert isinstance(reserved_millicpu, int), "reserved_millicpu must be an int"
+
+        reserved_memmb = cfg.get("reserved_memmb", RESERVED_MEMMB)
+        assert isinstance(reserved_memmb, int), "reserved_memmb must be an int"
+
+        efa_device_count = cfg.get("efa_device_count")
+        assert efa_device_count is None or isinstance(
+            efa_device_count, int
+        ), "efa_device_count must be an int or None"
+
+        resource = app_to_resource(
+            app,
+            queue,
+            service_account,
+            priority_class,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
+
+        if cfg.get("validate_spec"):
+            try:
+                self._custom_objects_api().create_namespaced_custom_object(
+                    group="batch.volcano.sh",
+                    version="v1alpha1",
+                    namespace=cfg.get("namespace") or "default",
+                    plural="jobs",
+                    body=resource,
+                    dry_run="All",
+                )
+            except Exception as e:
+                from kubernetes.client.rest import ApiException
+
+                if isinstance(e, ApiException):
+                    raise ValueError(f"Invalid job spec: {e.reason}") from e
+                raise
+
+        job_name = resource["metadata"]["name"]
+        for task in resource["spec"]["tasks"]:
+            task_name = task["name"]
+            replicas = task.get("replicas", 1)
+            max_index = replicas - 1
+            pod_name = f"{job_name}-{task_name}-{max_index}"
+            if len(pod_name) > 63:
+                raise ValueError(
+                    f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+                    f"Shorten app.name or role names"
+                )
+
         req = KubernetesJob(
             resource=resource,
             images_to_push=images_to_push,
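A brief sketch of the pod-name length validation introduced above, using hypothetical names; Volcano derives pod names as ``{job_name}-{task_name}-{replica_index}``, and Kubernetes object names used as DNS labels are capped at 63 characters:

.. code:: python

    # Hypothetical job and task names, purely illustrative.
    job_name = "trainapp-x1y2z3"
    task_name = "trainer"
    replicas = 4

    # Only the highest replica index needs checking; it yields the longest name.
    pod_name = f"{job_name}-{task_name}-{replicas - 1}"  # "trainapp-x1y2z3-trainer-3"
    if len(pod_name) > 63:
        raise ValueError(f"Pod name '{pod_name}' exceeds the 63 character limit")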
@@ -669,6 +880,31 @@ class KubernetesScheduler(
         pass

     def _cancel_existing(self, app_id: str) -> None:
+        """
+        Abort a Volcano job while preserving the spec for inspection.
+        """
+        namespace, name = app_id.split(":")
+        vcjob = self._custom_objects_api().get_namespaced_custom_object(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+        )
+        vcjob["status"]["state"]["phase"] = "Aborted"
+        self._custom_objects_api().replace_namespaced_custom_object_status(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+            body=vcjob,
+        )
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Delete a Volcano job completely from the cluster.
+        """
         namespace, name = app_id.split(":")
         self._custom_objects_api().delete_namespaced_custom_object(
             group="batch.volcano.sh",
@@ -702,19 +938,52 @@ class KubernetesScheduler(
            type_=str,
            help="The name of the PriorityClass to set on the job specs",
        )
+        opts.add(
+            "validate_spec",
+            type_=bool,
+            help="Validate job spec using Kubernetes API dry-run before submission",
+            default=True,
+        )
+        opts.add(
+            "reserved_millicpu",
+            type_=int,
+            help="Amount of CPU in millicores to reserve for Kubernetes system overhead (default: 100)",
+            default=RESERVED_MILLICPU,
+        )
+        opts.add(
+            "reserved_memmb",
+            type_=int,
+            help="Amount of memory in MB to reserve for Kubernetes system overhead (default: 1024)",
+            default=RESERVED_MEMMB,
+        )
+        opts.add(
+            "efa_device_count",
+            type_=int,
+            help="EFA device count override: None/unset=use resource spec, "
+            "0=remove EFA, N>0=set EFA count to N",
+            default=None,
+        )
         return opts

     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes import client
+        from kubernetes.client.rest import ApiException
+
         namespace, name = app_id.split(":")
         roles = {}
         roles_statuses = {}
-        resp = self._custom_objects_api().get_namespaced_custom_object_status(
-            group="batch.volcano.sh",
-            version="v1alpha1",
-            namespace=namespace,
-            plural="jobs",
-            name=name,
-        )
+        try:
+            resp = self._custom_objects_api().get_namespaced_custom_object_status(
+                group="batch.volcano.sh",
+                version="v1alpha1",
+                namespace=namespace,
+                plural="jobs",
+                name=name,
+            )
+        except ApiException as e:
+            if e.status == 404:
+                return None
+            raise
         status = resp.get("status")
         if status:
             state_str = status["state"]["phase"]
@@ -723,18 +992,44 @@ class KubernetesScheduler(
            TASK_STATUS_COUNT = "taskStatusCount"

            if TASK_STATUS_COUNT in status:
-                for name, status in status[TASK_STATUS_COUNT].items():
-                    role, _, idx = name.rpartition("-")
+                for task_name, task_status in status[TASK_STATUS_COUNT].items():
+                    role, _, idx = task_name.rpartition("-")

-                    state_str = next(iter(status["phase"].keys()))
+                    state_str = next(iter(task_status["phase"].keys()))
                     state = TASK_STATE[state_str]

                     if role not in roles:
                         roles[role] = Role(name=role, num_replicas=0, image="")
                         roles_statuses[role] = RoleStatus(role, [])
                     roles[role].num_replicas += 1
+
+                    # Pod name follows the pattern: {job_name}-{task_name}-0
+                    # Get the pod to retrieve its IP address
+                    pod_name_k8s = f"{name}-{task_name}-0"
+                    hostname = ""
+                    try:
+                        core_api = client.CoreV1Api(self._api_client())
+                        pod = core_api.read_namespaced_pod(
+                            name=pod_name_k8s, namespace=namespace
+                        )
+                        pod_ip = pod.status.pod_ip
+
+                        if pod_ip is not None:
+                            # Convert IP to dashed format (e.g., 10.244.1.5 -> 10-244-1-5)
+                            pod_ip_dashed = pod_ip.replace(".", "-")
+
+                            # Kubernetes DNS = <pod-ip-dashed>.<namespace>.pod.cluster.local
+                            # Note: This will only be useful if the client using the IPs is in the cluster.
+                            hostname = f"{pod_ip_dashed}.{namespace}.pod.cluster.local"
+
+                    except ApiException:
+                        # Pod not found - hostname remains empty
+                        pass
+
                     roles_statuses[role].replicas.append(
-                        ReplicaStatus(id=int(idx), role=role, state=state, hostname="")
+                        ReplicaStatus(
+                            id=int(idx), role=role, state=state, hostname=hostname
+                        )
                     )
             else:
                 app_state = AppState.UNKNOWN
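A one-line sketch (with a hypothetical pod IP) of the pod-IP-to-DNS conversion used above for replica hostnames:

.. code:: python

    # Hypothetical values; the real IP comes from the pod's status.
    namespace, pod_ip = "default", "10.244.1.5"
    hostname = f"{pod_ip.replace('.', '-')}.{namespace}.pod.cluster.local"
    # -> "10-244-1-5.default.pod.cluster.local", resolvable only from inside the cluster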
@@ -778,7 +1073,10 @@ class KubernetesScheduler(
         core_api = client.CoreV1Api(self._api_client())
         if should_tail:
             w = watch.Watch()
-            iterator = w.stream(core_api.read_namespaced_pod_log, **args)
+            iterator = (
+                f"{line}\n"
+                for line in w.stream(core_api.read_namespaced_pod_log, **args)
+            )
         else:
             resp = core_api.read_namespaced_pod_log(**args)
             iterator = split_lines(resp)
@@ -823,13 +1121,34 @@ def create_scheduler(
 def pod_labels(
     app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
 ) -> Dict[str, str]:
+
+    def clean(label_value: str) -> str:
+        # cleans the provided `label_value` to make it compliant
+        # to pod label specs as described in
+        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+        #
+        # Valid label value:
+        # must be 63 characters or less (can be empty),
+        # unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+        # could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+        # Replace invalid characters (allow: alphanum, -, _, .) with "."
+        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+        # Replace leading non-alphanumeric with "."
+        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+        # Replace trailing non-alphanumeric with "."
+        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+        # Trim to 63 characters
+        return label_value[:63]
+
     return {
-        LABEL_VERSION: torchx.__version__,
-        LABEL_APP_NAME: app.name,
+        LABEL_VERSION: clean(torchx.__version__),
+        LABEL_APP_NAME: clean(app.name),
         LABEL_ROLE_INDEX: str(role_idx),
-        LABEL_ROLE_NAME: role.name,
+        LABEL_ROLE_NAME: clean(role.name),
         LABEL_REPLICA_ID: str(replica_id),
-        LABEL_KUBE_APP_NAME: app.name,
+        LABEL_KUBE_APP_NAME: clean(app.name),
         LABEL_ORGANIZATION: "torchx.pytorch.org",
-        LABEL_UNIQUE_NAME: app_id,
+        LABEL_UNIQUE_NAME: clean(app_id),
     }
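A standalone sketch of the label sanitization performed by ``clean()`` above, showing its effect on a couple of hypothetical input values:

.. code:: python

    import re

    def clean(label_value: str) -> str:
        # Same three substitutions as the helper above, followed by a 63-char trim.
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)   # invalid chars -> "."
        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)     # leading junk -> "."
        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)     # trailing junk -> "."
        return label_value[:63]

    print(clean("0.1.0.dev0+local/build"))  # "0.1.0.dev0.local.build"
    print(clean("my app name"))             # "my.app.name"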
torchx/schedulers/local_scheduler.py

@@ -529,7 +529,7 @@ def _register_termination_signals() -> None:
     signal.signal(signal.SIGINT, _terminate_process_handler)


-class LocalScheduler(Scheduler[LocalOpts, AppDef, AppDryRunInfo[PopenRequest]]):
+class LocalScheduler(Scheduler[LocalOpts]):
     """
     Schedules on localhost. Containers are modeled as processes and
     certain properties of the container that are either not relevant
@@ -1159,6 +1159,7 @@ class LogIterator:
        self._check_finished()  # check to see if app has finished running

        if os.path.isfile(self._log_file):
+            time.sleep(0.1)  # fix timing issue
            self._log_fp = open(
                self._log_file,
                mode="rt",
torchx/schedulers/lsf_scheduler.py

@@ -394,7 +394,7 @@ class LsfBsub:
        {self.materialize()}"""


-class LsfScheduler(Scheduler[LsfOpts, AppDef, AppDryRunInfo]):
+class LsfScheduler(Scheduler[LsfOpts]):
    """
    **Example: hello_world**