polyaxon 2.0.0rc49__py3-none-any.whl → 2.4.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. polyaxon/_auxiliaries/cleaner.py +8 -3
  2. polyaxon/_auxiliaries/init.py +7 -2
  3. polyaxon/_auxiliaries/notifier.py +8 -2
  4. polyaxon/_auxiliaries/sidecar.py +30 -2
  5. polyaxon/_cli/artifacts.py +96 -11
  6. polyaxon/_cli/components.py +96 -11
  7. polyaxon/_cli/config.py +118 -22
  8. polyaxon/_cli/dashboard.py +15 -2
  9. polyaxon/_cli/init.py +1 -1
  10. polyaxon/_cli/models.py +96 -11
  11. polyaxon/_cli/operations.py +267 -90
  12. polyaxon/_cli/project_versions.py +139 -6
  13. polyaxon/_cli/projects.py +23 -9
  14. polyaxon/_cli/run.py +37 -9
  15. polyaxon/_cli/services/agent.py +2 -2
  16. polyaxon/_cli/services/clean_artifacts.py +1 -1
  17. polyaxon/_cli/services/sidecar.py +8 -1
  18. polyaxon/_client/client.py +17 -0
  19. polyaxon/_client/mixin.py +39 -0
  20. polyaxon/_client/project.py +218 -23
  21. polyaxon/_client/run.py +131 -33
  22. polyaxon/_compiler/contexts/contexts.py +2 -2
  23. polyaxon/_compiler/contexts/ray_job.py +4 -2
  24. polyaxon/_compiler/resolver/agent.py +12 -2
  25. polyaxon/_compiler/resolver/runtime.py +2 -2
  26. polyaxon/_contexts/paths.py +4 -7
  27. polyaxon/_deploy/operators/compose.py +1 -27
  28. polyaxon/_deploy/schemas/deployment.py +4 -1
  29. polyaxon/_deploy/schemas/intervals.py +0 -7
  30. polyaxon/_deploy/schemas/proxy.py +1 -0
  31. polyaxon/_deploy/schemas/service.py +11 -1
  32. polyaxon/_docker/converter/base/base.py +8 -0
  33. polyaxon/_docker/executor.py +10 -4
  34. polyaxon/_env_vars/getters/owner_entity.py +4 -2
  35. polyaxon/_env_vars/getters/project.py +4 -2
  36. polyaxon/_env_vars/getters/run.py +5 -2
  37. polyaxon/_env_vars/keys.py +7 -1
  38. polyaxon/_flow/__init__.py +2 -0
  39. polyaxon/_flow/builds/__init__.py +19 -6
  40. polyaxon/_flow/component/base.py +1 -0
  41. polyaxon/_flow/component/component.py +14 -0
  42. polyaxon/_flow/environment/__init__.py +8 -8
  43. polyaxon/_flow/hooks/__init__.py +19 -6
  44. polyaxon/_flow/init/__init__.py +6 -6
  45. polyaxon/_flow/matrix/iterative.py +0 -1
  46. polyaxon/_flow/matrix/tuner.py +18 -6
  47. polyaxon/_flow/operations/operation.py +44 -17
  48. polyaxon/_flow/plugins/__init__.py +6 -0
  49. polyaxon/_flow/run/__init__.py +2 -2
  50. polyaxon/_flow/run/dag.py +2 -2
  51. polyaxon/_flow/run/dask/dask.py +0 -1
  52. polyaxon/_flow/run/dask/replica.py +3 -3
  53. polyaxon/_flow/run/enums.py +5 -0
  54. polyaxon/_flow/run/job.py +4 -4
  55. polyaxon/_flow/run/kubeflow/mpi_job.py +1 -2
  56. polyaxon/_flow/run/kubeflow/mx_job.py +1 -2
  57. polyaxon/_flow/run/kubeflow/paddle_job.py +35 -4
  58. polyaxon/_flow/run/kubeflow/pytorch_job.py +51 -5
  59. polyaxon/_flow/run/kubeflow/replica.py +4 -4
  60. polyaxon/_flow/run/kubeflow/scheduling_policy.py +12 -0
  61. polyaxon/_flow/run/kubeflow/tf_job.py +3 -3
  62. polyaxon/_flow/run/kubeflow/xgboost_job.py +1 -2
  63. polyaxon/_flow/run/ray/ray.py +2 -3
  64. polyaxon/_flow/run/ray/replica.py +3 -3
  65. polyaxon/_flow/run/service.py +4 -4
  66. polyaxon/_fs/fs.py +7 -2
  67. polyaxon/_fs/utils.py +3 -2
  68. polyaxon/_k8s/converter/base/base.py +2 -1
  69. polyaxon/_k8s/converter/base/main.py +1 -0
  70. polyaxon/_k8s/converter/base/sidecar.py +16 -1
  71. polyaxon/_k8s/converter/common/accelerators.py +7 -4
  72. polyaxon/_k8s/converter/converters/job.py +1 -1
  73. polyaxon/_k8s/converter/converters/kubeflow/paddle_job.py +1 -0
  74. polyaxon/_k8s/converter/converters/kubeflow/pytroch_job.py +2 -0
  75. polyaxon/_k8s/converter/converters/kubeflow/tf_job.py +1 -0
  76. polyaxon/_k8s/converter/converters/ray_job.py +4 -2
  77. polyaxon/_k8s/custom_resources/dask_job.py +3 -0
  78. polyaxon/_k8s/custom_resources/kubeflow/common.py +4 -1
  79. polyaxon/_k8s/custom_resources/kubeflow/paddle_job.py +10 -1
  80. polyaxon/_k8s/custom_resources/kubeflow/pytorch_job.py +14 -1
  81. polyaxon/_k8s/custom_resources/kubeflow/tf_job.py +4 -0
  82. polyaxon/_k8s/custom_resources/ray_job.py +3 -0
  83. polyaxon/_k8s/custom_resources/setter.py +1 -1
  84. polyaxon/_k8s/executor/async_executor.py +2 -0
  85. polyaxon/_k8s/executor/base.py +23 -6
  86. polyaxon/_k8s/logging/async_monitor.py +150 -5
  87. polyaxon/_k8s/manager/async_manager.py +96 -23
  88. polyaxon/_k8s/manager/base.py +4 -0
  89. polyaxon/_k8s/manager/manager.py +282 -134
  90. polyaxon/_local_process/__init__.py +0 -0
  91. polyaxon/_local_process/agent.py +6 -0
  92. polyaxon/_local_process/converter/__init__.py +1 -0
  93. polyaxon/_local_process/converter/base/__init__.py +1 -0
  94. polyaxon/_local_process/converter/base/base.py +140 -0
  95. polyaxon/_local_process/converter/base/containers.py +69 -0
  96. polyaxon/_local_process/converter/base/env_vars.py +253 -0
  97. polyaxon/_local_process/converter/base/init.py +414 -0
  98. polyaxon/_local_process/converter/base/main.py +74 -0
  99. polyaxon/_local_process/converter/base/mounts.py +82 -0
  100. polyaxon/_local_process/converter/converters/__init__.py +8 -0
  101. polyaxon/_local_process/converter/converters/job.py +40 -0
  102. polyaxon/_local_process/converter/converters/service.py +41 -0
  103. polyaxon/_local_process/converter/mixins.py +38 -0
  104. polyaxon/_local_process/executor.py +132 -0
  105. polyaxon/_local_process/process_types.py +39 -0
  106. polyaxon/_managers/agent.py +2 -0
  107. polyaxon/_managers/home.py +2 -1
  108. polyaxon/_operations/tuner.py +1 -0
  109. polyaxon/_polyaxonfile/check.py +2 -0
  110. polyaxon/_polyaxonfile/manager/operations.py +3 -0
  111. polyaxon/_polyaxonfile/manager/workflows.py +2 -0
  112. polyaxon/_polyaxonfile/specs/compiled_operation.py +1 -0
  113. polyaxon/_polyaxonfile/specs/operation.py +1 -0
  114. polyaxon/_polyaxonfile/specs/sections.py +3 -0
  115. polyaxon/_pql/manager.py +1 -1
  116. polyaxon/_runner/agent/async_agent.py +97 -21
  117. polyaxon/_runner/agent/base_agent.py +27 -9
  118. polyaxon/_runner/agent/client.py +15 -1
  119. polyaxon/_runner/agent/sync_agent.py +85 -20
  120. polyaxon/_runner/converter/converter.py +6 -2
  121. polyaxon/_runner/executor.py +13 -7
  122. polyaxon/_schemas/agent.py +27 -1
  123. polyaxon/_schemas/client.py +30 -3
  124. polyaxon/_schemas/installation.py +4 -3
  125. polyaxon/_schemas/lifecycle.py +10 -5
  126. polyaxon/_schemas/log_handler.py +2 -3
  127. polyaxon/_schemas/types/artifacts.py +3 -3
  128. polyaxon/_schemas/types/dockerfile.py +3 -3
  129. polyaxon/_schemas/types/file.py +3 -3
  130. polyaxon/_schemas/types/git.py +3 -3
  131. polyaxon/_schemas/types/tensorboard.py +3 -3
  132. polyaxon/_sdk/api/agents_v1_api.py +1076 -73
  133. polyaxon/_sdk/api/organizations_v1_api.py +371 -10
  134. polyaxon/_sdk/api/project_dashboards_v1_api.py +12 -12
  135. polyaxon/_sdk/api/project_searches_v1_api.py +12 -12
  136. polyaxon/_sdk/api/projects_v1_api.py +221 -44
  137. polyaxon/_sdk/api/runs_v1_api.py +917 -445
  138. polyaxon/_sdk/api/service_accounts_v1_api.py +16 -16
  139. polyaxon/_sdk/api/teams_v1_api.py +2827 -375
  140. polyaxon/_sdk/api/users_v1_api.py +231 -55
  141. polyaxon/_sdk/async_client/api_client.py +4 -0
  142. polyaxon/_sdk/schemas/__init__.py +10 -2
  143. polyaxon/_sdk/schemas/v1_agent.py +2 -1
  144. polyaxon/_sdk/schemas/v1_agent_reconcile_body_request.py +14 -0
  145. polyaxon/_sdk/schemas/v1_artifact_tree.py +1 -1
  146. polyaxon/_sdk/schemas/v1_dashboard_spec.py +4 -0
  147. polyaxon/_sdk/schemas/v1_events_response.py +4 -0
  148. polyaxon/_sdk/schemas/v1_organization.py +1 -0
  149. polyaxon/_sdk/schemas/v1_preset.py +8 -0
  150. polyaxon/_sdk/schemas/v1_project.py +1 -0
  151. polyaxon/_sdk/schemas/v1_project_settings.py +4 -2
  152. polyaxon/_sdk/schemas/v1_run.py +2 -2
  153. polyaxon/_sdk/schemas/v1_run_edge_lineage.py +14 -0
  154. polyaxon/_sdk/schemas/v1_run_edges_graph.py +9 -0
  155. polyaxon/_sdk/schemas/v1_section_spec.py +7 -2
  156. polyaxon/_sdk/schemas/v1_settings_catalog.py +1 -0
  157. polyaxon/_sdk/schemas/v1_team.py +3 -0
  158. polyaxon/_sdk/schemas/v1_user.py +1 -2
  159. polyaxon/_sdk/schemas/v1_user_access.py +17 -0
  160. polyaxon/_services/values.py +1 -0
  161. polyaxon/_sidecar/container/__init__.py +39 -18
  162. polyaxon/_sidecar/container/monitors/__init__.py +1 -0
  163. polyaxon/_sidecar/container/monitors/logs.py +10 -13
  164. polyaxon/_sidecar/container/monitors/spec.py +24 -0
  165. polyaxon/_sidecar/ignore.py +0 -1
  166. polyaxon/_utils/fqn_utils.py +25 -2
  167. polyaxon/client.py +1 -1
  168. polyaxon/pkg.py +1 -1
  169. polyaxon/schemas.py +8 -1
  170. polyaxon/settings.py +6 -0
  171. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/METADATA +43 -43
  172. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/RECORD +176 -155
  173. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/WHEEL +1 -1
  174. polyaxon/_sdk/schemas/v1_project_user_access.py +0 -10
  175. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/LICENSE +0 -0
  176. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/entry_points.txt +0 -0
  177. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/top_level.txt +0 -0
@@ -33,15 +33,26 @@ class SidecarConverter(_BaseConverter):
33
33
 
34
34
  @staticmethod
35
35
  def _get_sidecar_args(
36
- container_id: str, sleep_interval: int, sync_interval: int, monitor_logs: bool
36
+ container_id: str,
37
+ sleep_interval: int,
38
+ sync_interval: int,
39
+ monitor_logs: bool,
40
+ monitor_spec: bool,
37
41
  ) -> List[str]:
38
42
  args = [
39
43
  "--container-id={}".format(container_id),
40
44
  "--sleep-interval={}".format(sleep_interval),
41
45
  "--sync-interval={}".format(sync_interval),
42
46
  ]
47
+ # enable monitor logs and spec by default
48
+ if monitor_logs is None:
49
+ monitor_logs = True
50
+ if monitor_spec is None:
51
+ monitor_spec = True
43
52
  if monitor_logs:
44
53
  args.append("--monitor-logs")
54
+ if monitor_spec:
55
+ args.append("--monitor-spec")
45
56
  return args
46
57
 
47
58
  @classmethod
@@ -87,6 +98,7 @@ class SidecarConverter(_BaseConverter):
87
98
  sleep_interval = polyaxon_sidecar.sleep_interval
88
99
  sync_interval = polyaxon_sidecar.sync_interval
89
100
  monitor_logs = polyaxon_sidecar.monitor_logs
101
+ monitor_spec = polyaxon_sidecar.monitor_spec
90
102
  if plugins and plugins.sidecar:
91
103
  if plugins.sidecar.sleep_interval:
92
104
  sleep_interval = plugins.sidecar.sleep_interval
@@ -94,11 +106,14 @@ class SidecarConverter(_BaseConverter):
94
106
  sync_interval = plugins.sidecar.sync_interval
95
107
  if plugins.sidecar.monitor_logs:
96
108
  monitor_logs = plugins.sidecar.monitor_logs
109
+ if plugins.sidecar.monitor_spec:
110
+ monitor_spec = plugins.sidecar.monitor_spec
97
111
  sidecar_args = cls._get_sidecar_args(
98
112
  container_id=container_id,
99
113
  sleep_interval=sleep_interval,
100
114
  sync_interval=sync_interval,
101
115
  monitor_logs=monitor_logs,
116
+ monitor_spec=monitor_spec,
102
117
  )
103
118
 
104
119
  env_from = []
@@ -37,14 +37,17 @@ def requests_gpu(resources: k8s_schemas.V1ResourceRequirements) -> bool:
37
37
  if not resources:
38
38
  return False
39
39
 
40
+ if not isinstance(resources, k8s_schemas.V1ResourceRequirements):
41
+ resources = k8s_schemas.V1ResourceRequirements(**resources)
42
+
40
43
  if resources.requests:
41
- for key in resources.requests.keys():
42
- if "gpu" in key:
44
+ for key, val in resources.requests.items():
45
+ if "gpu" in key and val is not None and val > 0:
43
46
  return True
44
47
 
45
48
  if resources.limits:
46
- for key in resources.limits.keys():
47
- if "gpu" in key:
49
+ for key, val in resources.limits.items():
50
+ if "gpu" in key and val is not None and val > 0:
48
51
  return True
49
52
 
50
53
  return False
@@ -39,7 +39,7 @@ class JobConverter(JobMixin, BaseConverter):
39
39
  default_sa=default_sa,
40
40
  )
41
41
  return get_job_custom_resource(
42
- namespace=self.namespace,
42
+ namespace=compiled_operation.namespace or self.namespace,
43
43
  main_container=replica_spec.main_container,
44
44
  sidecar_containers=replica_spec.sidecar_containers,
45
45
  init_containers=replica_spec.init_containers,
@@ -58,6 +58,7 @@ class PaddleJobConverter(PaddleJobMixin, BaseConverter):
58
58
  termination=compiled_operation.termination,
59
59
  clean_pod_policy=job.clean_pod_policy,
60
60
  scheduling_policy=job.scheduling_policy,
61
+ elastic_policy=job.elastic_policy,
61
62
  collect_logs=plugins.collect_logs,
62
63
  sync_statuses=plugins.sync_statuses,
63
64
  notifications=plugins.notifications,
@@ -58,6 +58,8 @@ class PytorchJobConverter(PytorchJobMixin, BaseConverter):
58
58
  termination=compiled_operation.termination,
59
59
  clean_pod_policy=job.clean_pod_policy,
60
60
  scheduling_policy=job.scheduling_policy,
61
+ elastic_policy=job.elastic_policy,
62
+ n_proc_per_node=job.n_proc_per_node,
61
63
  collect_logs=plugins.collect_logs,
62
64
  sync_statuses=plugins.sync_statuses,
63
65
  notifications=plugins.notifications,
@@ -63,6 +63,7 @@ class TfJobConverter(TFJobMixin, BaseConverter):
63
63
  collect_logs=plugins.collect_logs,
64
64
  clean_pod_policy=job.clean_pod_policy,
65
65
  scheduling_policy=job.scheduling_policy,
66
+ success_policy=job.success_policy,
66
67
  enable_dynamic_worker=job.enable_dynamic_worker,
67
68
  sync_statuses=plugins.sync_statuses,
68
69
  notifications=plugins.notifications,
@@ -58,7 +58,9 @@ class RayJobConverter(RayJobMixin, BaseConverter):
58
58
  config=compiled_operation.plugins, auth=default_auth
59
59
  )
60
60
  head = _get_replica(job.head)
61
- workers = {n: _get_replica(w) for n, w in job.workers.items()}
61
+ workers = None
62
+ if job.workers:
63
+ workers = {n: _get_replica(w) for n, w in job.workers.items()}
62
64
  labels = self.get_labels(version=pkg.VERSION, labels={})
63
65
 
64
66
  return get_ray_job_custom_resource(
@@ -68,7 +70,7 @@ class RayJobConverter(RayJobMixin, BaseConverter):
68
70
  workers=workers,
69
71
  entrypoint=job.entrypoint,
70
72
  metadata=job.metadata,
71
- runtime_env=encode(orjson_dumps(job.runtime_env)),
73
+ runtime_env=orjson_dumps(job.runtime_env),
72
74
  ray_version=job.ray_version,
73
75
  termination=compiled_operation.termination,
74
76
  collect_logs=plugins.collect_logs,
@@ -46,6 +46,9 @@ def get_dask_replicas_template(
46
46
  if liveness_probe and replica.main_container.liveness_probe is None:
47
47
  replica.main_container.liveness_probe = liveness_probe
48
48
 
49
+ labels = {**labels, **replica.labels}
50
+ annotations = {**annotations, **replica.annotations}
51
+
49
52
  metadata, pod_spec = get_pod_spec(
50
53
  namespace=namespace,
51
54
  main_container=replica.main_container,
@@ -16,6 +16,9 @@ def get_kf_replicas_template(
16
16
  if not replica:
17
17
  return
18
18
 
19
+ labels = {**labels, **replica.labels}
20
+ annotations = {**annotations, **replica.annotations}
21
+
19
22
  metadata, pod_spec = get_pod_spec(
20
23
  namespace=namespace,
21
24
  main_container=replica.main_container,
@@ -30,6 +33,6 @@ def get_kf_replicas_template(
30
33
 
31
34
  template_spec[replica_name] = {
32
35
  "replicas": replica.num_replicas,
33
- "restartPolicy": pod_spec.restart_policy or "Never",
36
+ "restartPolicy": pod_spec.restart_policy or "OnFailure",
34
37
  "template": get_pod_template_spec(metadata=metadata, pod_spec=pod_spec),
35
38
  }
@@ -1,6 +1,11 @@
1
1
  from typing import Dict, List, Optional
2
2
 
3
- from polyaxon._flow import V1Notification, V1SchedulingPolicy, V1Termination
3
+ from polyaxon._flow import (
4
+ V1Notification,
5
+ V1PaddleElasticPolicy,
6
+ V1SchedulingPolicy,
7
+ V1Termination,
8
+ )
4
9
  from polyaxon._k8s.custom_resources.kubeflow.common import get_kf_replicas_template
5
10
  from polyaxon._k8s.custom_resources.operation import get_operation_custom_object
6
11
  from polyaxon._k8s.custom_resources.setter import (
@@ -25,6 +30,7 @@ def get_paddle_job_custom_resource(
25
30
  notifications: List[V1Notification],
26
31
  clean_pod_policy: Optional[str],
27
32
  scheduling_policy: Optional[V1SchedulingPolicy],
33
+ elastic_policy: Optional[V1PaddleElasticPolicy],
28
34
  labels: Dict[str, str],
29
35
  annotations: Dict[str, str],
30
36
  ) -> Dict:
@@ -58,6 +64,9 @@ def get_paddle_job_custom_resource(
58
64
  template_spec=template_spec, scheduling_policy=scheduling_policy
59
65
  )
60
66
 
67
+ if elastic_policy:
68
+ template_spec["elasticPolicy"] = elastic_policy.to_light_dict()
69
+
61
70
  custom_object = {"paddleJobSpec": template_spec}
62
71
  custom_object = set_termination(
63
72
  custom_object=custom_object, termination=termination
@@ -1,6 +1,11 @@
1
1
  from typing import Dict, List, Optional
2
2
 
3
- from polyaxon._flow import V1Notification, V1SchedulingPolicy, V1Termination
3
+ from polyaxon._flow import (
4
+ V1Notification,
5
+ V1PytorchElasticPolicy,
6
+ V1SchedulingPolicy,
7
+ V1Termination,
8
+ )
4
9
  from polyaxon._k8s.custom_resources.kubeflow.common import get_kf_replicas_template
5
10
  from polyaxon._k8s.custom_resources.operation import get_operation_custom_object
6
11
  from polyaxon._k8s.custom_resources.setter import (
@@ -25,6 +30,8 @@ def get_pytorch_job_custom_resource(
25
30
  notifications: List[V1Notification],
26
31
  clean_pod_policy: Optional[str],
27
32
  scheduling_policy: Optional[V1SchedulingPolicy],
33
+ elastic_policy: Optional[V1PytorchElasticPolicy],
34
+ n_proc_per_node: Optional[int],
28
35
  labels: Dict[str, str],
29
36
  annotations: Dict[str, str],
30
37
  ) -> Dict:
@@ -58,6 +65,12 @@ def get_pytorch_job_custom_resource(
58
65
  template_spec=template_spec, scheduling_policy=scheduling_policy
59
66
  )
60
67
 
68
+ if elastic_policy:
69
+ template_spec["elasticPolicy"] = elastic_policy.to_light_dict()
70
+
71
+ if n_proc_per_node is not None:
72
+ template_spec["nProcPerNode"] = str(n_proc_per_node)
73
+
61
74
  custom_object = {"pytorchJobSpec": template_spec}
62
75
  custom_object = set_termination(
63
76
  custom_object=custom_object, termination=termination
@@ -28,6 +28,7 @@ def get_tf_job_custom_resource(
28
28
  clean_pod_policy: Optional[str],
29
29
  scheduling_policy: Optional[V1SchedulingPolicy],
30
30
  enable_dynamic_worker: bool,
31
+ success_policy: Optional[str],
31
32
  labels: Dict[str, str],
32
33
  annotations: Dict[str, str],
33
34
  ) -> Dict:
@@ -73,6 +74,9 @@ def get_tf_job_custom_resource(
73
74
  if enable_dynamic_worker:
74
75
  template_spec["enableDynamicWorker"] = enable_dynamic_worker
75
76
 
77
+ if success_policy:
78
+ template_spec["successPolicy"] = success_policy
79
+
76
80
  template_spec = {"replicaSpecs": template_spec}
77
81
 
78
82
  template_spec = set_clean_pod_policy(
@@ -26,6 +26,9 @@ def _get_ray_replicas_template(
26
26
  if not replica:
27
27
  return
28
28
 
29
+ labels = {**labels, **replica.labels}
30
+ annotations = {**annotations, **replica.annotations}
31
+
29
32
  metadata, pod_spec = get_pod_spec(
30
33
  namespace=namespace,
31
34
  main_container=replica.main_container,
@@ -46,7 +46,7 @@ def set_notify(custom_object: Dict, notifications: List[V1Notification]) -> Dict
46
46
  def set_clean_pod_policy(template_spec: Dict, clean_pod_policy: str) -> Dict:
47
47
  if not clean_pod_policy:
48
48
  # Sets default clean pod policy
49
- clean_pod_policy = "All"
49
+ clean_pod_policy = "None"
50
50
 
51
51
  template_spec["cleanPodPolicy"] = clean_pod_policy.capitalize()
52
52
  return template_spec
@@ -14,6 +14,8 @@ class AsyncExecutor(BaseExecutor):
14
14
  )
15
15
 
16
16
  async def refresh(self):
17
+ if self._manager:
18
+ await self._manager.close()
17
19
  manager = super().refresh()
18
20
  await manager.setup()
19
21
  return manager
@@ -5,7 +5,7 @@ from kubernetes.client import Configuration
5
5
 
6
6
  from polyaxon import settings
7
7
  from polyaxon._k8s.converter.converters import CONVERTERS
8
- from polyaxon._k8s.converter.mixins import MIXIN_MAPPING
8
+ from polyaxon._k8s.converter.mixins import MIXIN_MAPPING, BaseMixin
9
9
  from polyaxon._runner.executor import BaseExecutor as _BaseExecutor
10
10
  from polyaxon._runner.kinds import RunnerKind
11
11
  from polyaxon._utils.fqn_utils import get_resource_name
@@ -56,7 +56,9 @@ class BaseExecutor(_BaseExecutor):
56
56
  api = k8s_client.ApiClient()
57
57
  return api.sanitize_for_serialization(resource)
58
58
 
59
- def create(self, run_uuid: str, run_kind: str, resource: Dict) -> Dict:
59
+ def create(
60
+ self, run_uuid: str, run_kind: str, resource: Dict, namespace: str = None
61
+ ) -> Dict:
60
62
  mixin = self._get_mixin_for_kind(kind=run_kind)
61
63
  resource_name = get_resource_name(run_uuid)
62
64
  return self.manager.create_custom_object(
@@ -65,9 +67,12 @@ class BaseExecutor(_BaseExecutor):
65
67
  version=mixin.API_VERSION,
66
68
  plural=mixin.PLURAL,
67
69
  body=resource,
70
+ namespace=namespace,
68
71
  )
69
72
 
70
- def apply(self, run_uuid: str, run_kind: str, resource: Dict) -> Dict:
73
+ def apply(
74
+ self, run_uuid: str, run_kind: str, resource: Dict, namespace: str = None
75
+ ) -> Dict:
71
76
  mixin = self._get_mixin_for_kind(kind=run_kind)
72
77
  resource_name = get_resource_name(run_uuid)
73
78
  return self.manager.update_custom_object(
@@ -76,9 +81,10 @@ class BaseExecutor(_BaseExecutor):
76
81
  version=mixin.API_VERSION,
77
82
  plural=mixin.PLURAL,
78
83
  body=resource,
84
+ namespace=namespace,
79
85
  )
80
86
 
81
- def stop(self, run_uuid: str, run_kind: str):
87
+ def stop(self, run_uuid: str, run_kind: str, namespace: str = None):
82
88
  mixin = self._get_mixin_for_kind(kind=run_kind)
83
89
  resource_name = get_resource_name(run_uuid)
84
90
  return self.manager.delete_custom_object(
@@ -86,16 +92,18 @@ class BaseExecutor(_BaseExecutor):
86
92
  group=mixin.GROUP,
87
93
  version=mixin.API_VERSION,
88
94
  plural=mixin.PLURAL,
95
+ namespace=namespace,
89
96
  )
90
97
 
91
- def clean(self, run_uuid: str, run_kind: str):
98
+ def clean(self, run_uuid: str, run_kind: str, namespace: str = None):
92
99
  return self.apply(
93
100
  run_uuid=run_uuid,
94
101
  run_kind=run_kind,
95
102
  resource={"metadata": {"finalizers": None}},
103
+ namespace=namespace,
96
104
  )
97
105
 
98
- def get(self, run_uuid: str, run_kind: str):
106
+ def get(self, run_uuid: str, run_kind: str, namespace: str = None):
99
107
  mixin = self._get_mixin_for_kind(kind=run_kind)
100
108
  resource_name = get_resource_name(run_uuid)
101
109
  return self.manager.get_custom_object(
@@ -103,4 +111,13 @@ class BaseExecutor(_BaseExecutor):
103
111
  group=mixin.GROUP,
104
112
  version=mixin.API_VERSION,
105
113
  plural=mixin.PLURAL,
114
+ namespace=namespace,
115
+ )
116
+
117
+ def list_ops(self, namespace: str = None):
118
+ return self.manager.list_custom_objects(
119
+ group=BaseMixin.GROUP,
120
+ version=BaseMixin.API_VERSION,
121
+ plural=BaseMixin.PLURAL,
122
+ namespace=namespace,
106
123
  )
@@ -6,6 +6,7 @@ from clipped.utils.tz import now
6
6
  from kubernetes_asyncio.client.models import V1Pod
7
7
  from kubernetes_asyncio.client.rest import ApiException
8
8
 
9
+ from polyaxon._flow import V1RunKind
9
10
  from polyaxon._k8s.manager.async_manager import AsyncK8sManager
10
11
  from traceml.logging import V1Log, V1Logs
11
12
 
@@ -65,7 +66,7 @@ async def query_k8s_operation_logs(
65
66
  new_time = now()
66
67
  params = {}
67
68
  if last_time:
68
- since_seconds = (new_time - last_time).total_seconds() - 1
69
+ since_seconds = (new_time - last_time).total_seconds()
69
70
  params["since_seconds"] = int(since_seconds)
70
71
  if stream:
71
72
  params["tail_lines"] = V1Logs._CHUNK_SIZE
@@ -82,9 +83,28 @@ async def query_k8s_operation_logs(
82
83
  **params,
83
84
  )
84
85
 
86
+ if logs and last_time:
87
+ # make sure to filter logs larger than last_time
88
+ logs = [log for log in logs if log.timestamp > last_time]
89
+ if logs and logs[-1].timestamp:
90
+ new_time = logs[-1].timestamp
85
91
  return logs, new_time
86
92
 
87
93
 
94
+ async def collect_agent_service_logs(
95
+ k8s_manager: AsyncK8sManager, pod: V1Pod
96
+ ) -> List[V1Log]:
97
+ if not pod or not pod.spec.containers:
98
+ return []
99
+ container = pod.spec.containers[0]
100
+ return await handle_container_logs(
101
+ k8s_manager=k8s_manager,
102
+ pod=pod,
103
+ container_name=container.name,
104
+ tail_lines=V1Logs._CHUNK_SIZE,
105
+ )
106
+
107
+
88
108
  async def query_k8s_pod_logs(
89
109
  k8s_manager: AsyncK8sManager,
90
110
  pod: V1Pod,
@@ -94,13 +114,138 @@ async def query_k8s_pod_logs(
94
114
  new_time = now()
95
115
  params = {}
96
116
  if last_time:
97
- since_seconds = (new_time - last_time).total_seconds() - 1
117
+ since_seconds = (new_time - last_time).total_seconds()
98
118
  params["since_seconds"] = int(since_seconds)
99
119
  if stream:
100
120
  params["tail_lines"] = V1Logs._CHUNK_SIZE
101
121
 
102
122
  logs = await handle_pod_logs(k8s_manager=k8s_manager, pod=pod, **params)
103
123
 
104
- if logs:
105
- last_time = logs[-1].timestamp
106
- return logs, last_time
124
+ if logs and last_time:
125
+ # make sure to filter logs larger than last_time
126
+ logs = [log for log in logs if log.timestamp > last_time]
127
+ if logs and logs[-1].timestamp:
128
+ new_time = logs[-1].timestamp
129
+ return logs, new_time
130
+
131
+
132
+ async def get_op_pods_and_services(
133
+ k8s_manager: AsyncK8sManager,
134
+ run_uuid: str,
135
+ run_kind: str,
136
+ ):
137
+ pods = await k8s_manager.list_pods(
138
+ label_selector=k8s_manager.get_managed_by_polyaxon(run_uuid)
139
+ )
140
+ services = []
141
+ if V1RunKind.has_service(run_kind):
142
+ services = await k8s_manager.list_services(
143
+ label_selector=k8s_manager.get_managed_by_polyaxon(run_uuid)
144
+ )
145
+
146
+ return pods, services
147
+
148
+
149
+ async def get_resource_events(
150
+ k8s_manager: AsyncK8sManager, resource_type: str, resource_name: str
151
+ ):
152
+ field_selector = (
153
+ f"involvedObject.kind={resource_type},involvedObject.name={resource_name}"
154
+ )
155
+ try:
156
+ events = await k8s_manager.list_namespaced_events(field_selector=field_selector)
157
+
158
+ all_events = []
159
+ for event in events:
160
+ event_data = {
161
+ "reason": event.reason,
162
+ "message": event.message,
163
+ "first_timestamp": event.first_timestamp,
164
+ "last_timestamp": event.last_timestamp,
165
+ "count": event.count,
166
+ "type": event.type,
167
+ }
168
+ all_events.append(event_data)
169
+
170
+ return all_events
171
+
172
+ except ApiException as e:
173
+ print(f"Exception when calling CoreV1Api->list_namespaced_event: {e}")
174
+ return []
175
+
176
+
177
+ async def get_op_spec(
178
+ k8s_manager: AsyncK8sManager,
179
+ run_uuid: str,
180
+ run_kind: str,
181
+ ):
182
+ pods, services = await get_op_pods_and_services(
183
+ k8s_manager=k8s_manager,
184
+ run_uuid=run_uuid,
185
+ run_kind=run_kind,
186
+ )
187
+ pods_list = {}
188
+ for pod in pods or []:
189
+ pods_list[
190
+ pod.metadata.name
191
+ ] = k8s_manager.api_client.sanitize_for_serialization(pod)
192
+ pods_list[pod.metadata.name]["events"] = await get_resource_events(
193
+ k8s_manager=k8s_manager,
194
+ resource_type="Pod",
195
+ resource_name=pod.metadata.name,
196
+ )
197
+ services_list = {}
198
+ for service in services or []:
199
+ services_list[
200
+ service.metadata.name
201
+ ] = k8s_manager.api_client.sanitize_for_serialization(service)
202
+ services_list[service.metadata.name]["events"] = await get_resource_events(
203
+ k8s_manager=k8s_manager,
204
+ resource_type="Service",
205
+ resource_name=service.metadata.name,
206
+ )
207
+ data = {"pods": pods_list, "services": services_list}
208
+ return data, pods, services
209
+
210
+
211
+ async def get_agent_pods_and_services(
212
+ k8s_manager: AsyncK8sManager,
213
+ ):
214
+ pods = await k8s_manager.list_pods(
215
+ label_selector=k8s_manager.get_core_polyaxon(),
216
+ )
217
+ services = await k8s_manager.list_services(
218
+ label_selector=k8s_manager.get_core_polyaxon(),
219
+ )
220
+ return pods, services
221
+
222
+
223
+ async def get_agent_spec(
224
+ k8s_manager: AsyncK8sManager,
225
+ ):
226
+ pods, services = await get_agent_pods_and_services(
227
+ k8s_manager=k8s_manager,
228
+ )
229
+ pods_list = {}
230
+ for pod in pods or []:
231
+ pods_list[
232
+ pod.metadata.name
233
+ ] = k8s_manager.api_client.sanitize_for_serialization(pod)
234
+ pods_list[pod.metadata.name]["events"] = await get_resource_events(
235
+ k8s_manager=k8s_manager,
236
+ resource_type="Pod",
237
+ resource_name=pod.metadata.name,
238
+ )
239
+ data = {"pods": pods_list}
240
+ services_list = {}
241
+ for service in services or []:
242
+ services_list[
243
+ service.metadata.name
244
+ ] = k8s_manager.api_client.sanitize_for_serialization(service)
245
+ services_list[service.metadata.name]["events"] = await get_resource_events(
246
+ k8s_manager=k8s_manager,
247
+ resource_type="Service",
248
+ resource_name=service.metadata.name,
249
+ )
250
+ data["services"] = services_list
251
+ return data, pods, services