PyPI - konduktor-nightly - Versions diffs - 0.1.0.dev20250811105223__py3-none-any.whl → 0.1.0.dev20250813105033__py3-none-any.whl - Mend

konduktor-nightly 0.1.0.dev20250811105223__py3-none-any.whl → 0.1.0.dev20250813105033__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of konduktor-nightly might be problematic. Click here for more details.

Files changed (12) hide show

konduktor/__init__.py CHANGED Viewed

@@ -11,7 +11,7 @@ from konduktor.task import Task
 __all__ = ['launch', 'Resources', 'Task', 'Serving']
 # Replaced with the current commit when building the wheels.
-_KONDUKTOR_COMMIT_SHA = '92fe69bd3f29e7b191de663c598dfcf10738f87a'
+_KONDUKTOR_COMMIT_SHA = 'f4ba2084fac1c1030245b475323f4f3a57fd3fa3'
 os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
@@ -45,5 +45,5 @@ def _get_git_commit():
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev0.1.0.dev20250811105223'
+__version__ = '1.0.0.dev0.1.0.dev20250813105033'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

konduktor/backends/constants.py CHANGED Viewed

@@ -8,6 +8,7 @@ USERID_LABEL = 'trainy.ai/user-id'
 USER_LABEL = 'trainy.ai/username'
 ACCELERATOR_LABEL = 'trainy.ai/accelerator'
 NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
+MAX_EXECUTION_TIME_LABEL = 'kueue.x-k8s.io/max-exec-time-seconds'
 # Start/stop/status related labels
 STOP_USERID_LABEL = 'trainy.ai/stop-userid'

konduktor/backends/jobset.py CHANGED Viewed

@@ -176,7 +176,7 @@ class JobsetBackend(backend.Backend):
         context = kubernetes_utils.get_current_kube_config_context_name()
         namespace = kubernetes_utils.get_kube_config_context_namespace(context)
         # TODO(asaiacai): need to set env variables in pod
-        jobset_utils.create_jobset(
+        jobset_response = jobset_utils.create_jobset(
             namespace,
             task,
             pod_spec['kubernetes']['pod_config'],
@@ -192,9 +192,10 @@ class JobsetBackend(backend.Backend):
                 ):
                     _wait_for_jobset_start(namespace, task.name)
                 try:
+                    assert jobset_response is not None
                     log_thread = threading.Thread(
                         target=log_utils.tail_logs,
-                        args=(task.name,),
+                        args=(jobset_response,),
                         daemon=True,
                     )
                     logger.info('streaming logs...')

konduktor/backends/jobset_utils.py CHANGED Viewed

@@ -39,6 +39,7 @@ JOBSET_USERID_LABEL = backend_constants.USERID_LABEL
 JOBSET_USER_LABEL = backend_constants.USER_LABEL
 JOBSET_ACCELERATOR_LABEL = backend_constants.ACCELERATOR_LABEL
 JOBSET_NUM_ACCELERATORS_LABEL = backend_constants.NUM_ACCELERATORS_LABEL
+JOBSET_MAX_EXECUTION_TIME_LABEL = backend_constants.MAX_EXECUTION_TIME_LABEL
 SECRET_BASENAME_LABEL = backend_constants.SECRET_BASENAME_LABEL
@@ -48,6 +49,7 @@ _JOBSET_METADATA_LABELS = {
     'jobset_user_label': JOBSET_USER_LABEL,
     'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
     'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
+    'jobset_max_execution_time_label': JOBSET_MAX_EXECUTION_TIME_LABEL,
 }
@@ -79,6 +81,7 @@ def create_jobset(
     assert task.resources is not None, 'Task resources are undefined'
     accelerator_type = task.resources.get_accelerator_type() or 'None'
     num_accelerators = task.resources.get_accelerator_count() or 0
+    labels = task.resources.labels if task.resources.labels else {}
     with tempfile.NamedTemporaryFile() as temp:
         common_utils.fill_template(
             'jobset.yaml.j2',
@@ -91,6 +94,7 @@ def create_jobset(
                 'num_accelerators': num_accelerators,
                 'completions': task.resources.get_completions(),
                 'max_restarts': task.resources.get_max_restarts(),
+                'max_execution_time': labels.get('maxRunDurationSeconds', None),
                 **_JOBSET_METADATA_LABELS,
             },
             temp.name,
@@ -430,6 +434,36 @@ def _parse_timestamp_filter(timestamp_str: str) -> datetime:
     )
+def _format_timestamp(timestamp: str) -> str:
+    """Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
+    # Parse UTC timestamp and convert to local time
+    dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
+        tzinfo=timezone.utc
+    )
+    dt_local = dt_utc.astimezone()  # Convert to local timezone
+    return dt_local.strftime('%m/%d/%y %I:%M%p')
+def _get_job_start_time(job: Dict[str, Any]) -> str:
+    for condition in job['status']['conditions']:
+        if condition['reason'] == 'ResumeJobs':
+            return condition.get('lastTransitionTime', '')
+    return '-'
+def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
+    """Extract end time from JobSet conditions (Completed or Failed)"""
+    conditions = job.get('status', {}).get('conditions', [])
+    for condition in conditions:
+        # Look for terminal conditions with status=True
+        if (
+            condition.get('type') in ['Completed', 'Failed']
+            and condition.get('status') == 'True'
+        ):
+            return condition.get('lastTransitionTime', '')
+    return '-'
 def show_status_table(
     namespace: str,
     all_users: bool,
@@ -523,15 +557,6 @@ def show_status_table(
         result = f'{days_str}{hours_str}{minutes_str}{seconds_str}'
         return result if result else '<1 minute', delta
-    def _format_timestamp(timestamp: str) -> str:
-        """Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
-        # Parse UTC timestamp and convert to local time
-        dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
-            tzinfo=timezone.utc
-        )
-        dt_local = dt_utc.astimezone()  # Convert to local timezone
-        return dt_local.strftime('%m/%d/%y %I:%M%p')
     def _get_resources(job: Dict[str, Any]) -> str:
         num_pods = int(
             job['spec']['replicatedJobs'][0]['template']['spec']['parallelism']
@@ -591,25 +616,17 @@ def show_status_table(
             if before_dt and job_creation_time >= before_dt:
                 continue
         # Get start time
-        start_time = _format_timestamp(job['metadata']['creationTimestamp'])
+        start_time = _get_job_start_time(job)
+        if start_time != '-':
+            start_time = _format_timestamp(start_time)
         # Get submitted time (how long ago)
         submitted_time, _ = _get_time_delta(job['metadata']['creationTimestamp'])
         # Get end time (from JobSet conditions)
-        def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
-            """Extract end time from JobSet conditions (Completed or Failed)"""
-            conditions = job.get('status', {}).get('conditions', [])
-            for condition in conditions:
-                # Look for terminal conditions with status=True
-                if (
-                    condition.get('type') in ['Completed', 'Failed']
-                    and condition.get('status') == 'True'
-                ):
-                    return _format_timestamp(condition.get('lastTransitionTime', ''))
-            return '-'
         end_time = _get_end_time_from_conditions(job)
+        if end_time != '-':
+            end_time = _format_timestamp(end_time)
         if all_users:
             rows.append(

konduktor/cli.py CHANGED Viewed

@@ -732,7 +732,7 @@ def logs(
     # Verify the job exists before attempting to tail logs
     # TODO(asaiacai): unify the 404 logic under jobset_utils
     try:
-        jobset_utils.get_jobset(namespace, job_id)
+        jobset_response = jobset_utils.get_jobset(namespace, job_id)
     except jobset_utils.JobNotFoundError:
         raise click.UsageError(
             f"Job '{job_id}' not found in namespace "
@@ -741,12 +741,9 @@ def logs(
             f'{colorama.Style.RESET_ALL}.'
         )
-    click.secho(
-        'Logs are tailed from 1 hour ago, ' 'to see more logs, check Grafana.',
-        fg='yellow',
-    )
+    assert isinstance(jobset_response, dict), f'jobset_response: {jobset_response}'
     log_utils.tail_logs(
-        job_id,
+        jobset_response,
         worker_id=node_rank,
         follow=follow,
         num_logs=num_lines,

konduktor/templates/jobset.yaml.j2 CHANGED Viewed

@@ -11,6 +11,9 @@ jobset:
       {{ jobset_accelerator_label }}: "{{ accelerator_type }}"
       {{ jobset_num_accelerators_label }}: "{{ num_accelerators }}"
       {% endif %}
+      {% if max_execution_time %}
+      {{ jobset_max_execution_time_label }}: "{{ max_execution_time }}"
+      {% endif %}
       trainy.ai/konduktor-managed: "true"
       parent: "trainy"
     annotations: {}

konduktor/utils/log_utils.py CHANGED Viewed

@@ -337,45 +337,51 @@ def tail_loki_logs_ws(
 def tail_vicky_logs(
-    job_name: str,
+    jobset_response: Dict[str, Any],
     worker_id: int = 0,
-    num_logs: int = 1000,
+    num_logs: int = -1,
     follow: bool = True,
 ):
+    job_name = jobset_response['metadata']['name']
     context = kubernetes_utils.get_current_kube_config_context_name()
     namespace = kubernetes_utils.get_kube_config_context_namespace(context)
     query: Dict[str, Any] = {}
-    if num_logs > 5000:
-        # TODO(asaiacai): we should not have a limit on the number of logs, but rather
-        # let the user specify any number of lines, and we can print the last N lines.
-        # this can be done in chunks. Potentially, we can query range
-        # until we reach the end of the log and then invoke tail again.
-        # Also include checks that the job is running/ever ran.
-        raise ValueError('num_logs must be less than or equal to 5000')
-    logger.info('ignoring num_logs argument for VictoriaLogs')
     vicky_svc = kr8s.objects.Service.get(
         'vls-victoria-logs-single-server', namespace='victoria-logs'
     )
+    if num_logs == -1:
+        query = {}
+    else:
+        assert num_logs > 0, f'num_logs must be greater than 0, got {num_logs}'
+        query = {'limit': num_logs}
+    if follow:
+        logger.info(
+            'No end time found, tailing logs from 1 hour ago. '
+            'If logs come up empty, there might be logs just earlier '
+            'than the past hour, check Grafana or use:\n'
+            f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
+            f'`konduktor tail --no-follow {job_name}`'
+            f'{colorama.Style.RESET_ALL}'
+        )
+        query['start_offset'] = '1h'
+    query['query'] = (
+        f'k8s.namespace.name: "{namespace}" AND '
+        f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
+        f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
+    )
     with kr8s.portforward.PortForward(
         vicky_svc, VICKY_REMOTE_PORT, local_port='auto'
     ) as port:
         if follow:
             timeout = INFINITY
             vicky_url = f'http://localhost:{port}/select/logsql/tail'
-            query = {}
         else:
             vicky_url = f'http://localhost:{port}/select/logsql/query'
-            query = {'limit': num_logs}
             timeout = 1
         logger.debug(f'Vicky URL: {vicky_url}')
-        query['query'] = (
-            f'k8s.namespace.name: "{namespace}" AND '
-            f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
-            f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
-        )
-        query['start_offset'] = '1h'
         try:
             logger.debug(f'Making request to {vicky_url} with query: {query}')
             with requests.post(
@@ -412,16 +418,17 @@ def tail_vicky_logs(
 def tail_logs(
-    job_name: str,
+    jobset_response: Dict[str, Any],
     worker_id: int = 0,
     num_logs: int = 1000,
     follow: bool = True,
 ):
+    job_name = jobset_response['metadata']['name']
     logs_backend = config.get_nested(('logs', 'backend'), None)
     if logs_backend == LogBackend.VICTORIA:
-        tail_vicky_logs(job_name, worker_id, num_logs, follow)
+        tail_vicky_logs(jobset_response, worker_id, num_logs, follow)
     elif logs_backend == LogBackend.LOKI:
         tail_loki_logs_ws(job_name, worker_id, num_logs, follow)
     else:
         logger.info('Defaulting to VictoriaLogs')
-        tail_vicky_logs(job_name, worker_id, num_logs, follow)
+        tail_vicky_logs(jobset_response, worker_id, num_logs, follow)

{konduktor_nightly-0.1.0.dev20250811105223.dist-info → konduktor_nightly-0.1.0.dev20250813105033.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: konduktor-nightly
-Version: 0.1.0.dev20250811105223
+Version: 0.1.0.dev20250813105033
 Summary: GPU Cluster Health Management
 Author: Andrew Aikawa
 Author-email: asai@berkeley.edu

{konduktor_nightly-0.1.0.dev20250811105223.dist-info → konduktor_nightly-0.1.0.dev20250813105033.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-konduktor/__init__.py,sha256=DYiQ-TfOdYUqIzT8psdjnWcjtjMI4sbldAghArAY5e0,1574
+konduktor/__init__.py,sha256=_hY0EbT0p1RyIpToDmvNWpK2QTJNNplI92T0AlW7tck,1574
 konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
 konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -6,14 +6,14 @@ konduktor/adaptors/gcp.py,sha256=ierTF4z7vwpJ9BsC7LSiwv4uLcjGXscwZOwQrddr2vM,410
 konduktor/authentication.py,sha256=_mVy3eqoKohicHostFiGwG1-2ybxP-l7ouofQ0LRlCY,4570
 konduktor/backends/__init__.py,sha256=usWJ8HdZJEyg7MIsN8Zcz9rk9e2Lq5dWJ8dv6hCN3ys,199
 konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
-konduktor/backends/constants.py,sha256=NfdhY1PQnewvDCjgRKXj6EZDcVH8k_0GGxnMo7w6HDU,666
+konduktor/backends/constants.py,sha256=nt9G9AmFCOMwO4GuKgRQSzJJuKapOmaROp4_Y0tMF5A,732
 konduktor/backends/deployment.py,sha256=EHfB2uLeKFQ3maek9tx6XL4_sjQ-ax59DZA79Q3EkVs,5519
 konduktor/backends/deployment_utils.py,sha256=VGuL01rKe7p7PoVRI_cP4tiZRxHZ13nnTMG-bmDf7P0,28975
-konduktor/backends/jobset.py,sha256=OwgDog9nH-FoUmNU_H--C3U5jx70reTKL1l849M1k5A,8430
-konduktor/backends/jobset_utils.py,sha256=O983a78D411go_F0K2mijZAE1dXAFF7i6aQ7rOrfH7A,24663
+konduktor/backends/jobset.py,sha256=E9THHmcpxTohsx6Goi9mKF4dy_mYpR2DHloSwGVr9jA,8509
+konduktor/backends/jobset_utils.py,sha256=7fB8X4b2Q5BKFCIGME72dyeCfi-EemoMeJVnwtzcjq4,25184
 konduktor/backends/pod_utils.py,sha256=Jfv_CY8suF0e7QEaeQiNRRxRnOueLgPR8SfLEO7lnwc,15260
 konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
-konduktor/cli.py,sha256=9S6DEsK_qlD34UM6CwFah0FmJgQ4lVaV-LViKp9fJ6o,56687
+konduktor/cli.py,sha256=YD9gMH2ZJykdfrHvzY-DkPQgD-cltahEW141wdI8eiI,56674
 konduktor/config.py,sha256=9upqgCCYvcu6fKw7tovEYC1MWTkAAir0_WHPdayylbI,15536
 konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
 konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -76,7 +76,7 @@ konduktor/resource.py,sha256=qQhMlI6gvTaoGfYb9NNgSrUavgNqfcYVfb9V_oC5pLE,20411
 konduktor/serving.py,sha256=sh8TPAUXg23Bkt0ByatIMdxFFqzRm18HJTEkt3wHzdo,5147
 konduktor/task.py,sha256=97iLCo62qpN9wLGNPeFw64E8k1nch7AyySY3BUXHPWY,37496
 konduktor/templates/deployment.yaml.j2,sha256=uXFjDQaimbpFdAn2RJGaIvS_PzDY136cw_L3QMjz3ZA,3452
-konduktor/templates/jobset.yaml.j2,sha256=67yGuY4XdE4KBWN3DKvMJjlypQ0VpdiioRUAhpa3zA4,1072
+konduktor/templates/jobset.yaml.j2,sha256=gURWl6uQv_OLni-LFy2E7ttjGOtuRDt5Vfs4ALH7fpI,1196
 konduktor/templates/pod.yaml.j2,sha256=3uXx0ls2v8x-NL_Ypze5u9RoJS8F5bzoyOJcYwzf8Z0,18240
 konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
@@ -90,15 +90,15 @@ konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4x
 konduktor/utils/exceptions.py,sha256=5IFnN5bIUSBJv4KRRrCepk5jyY9EG5vWWQqbjCmP3NU,6682
 konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
 konduktor/utils/kubernetes_utils.py,sha256=7RThCOiyaALRqbwHZ40qMnBsbAgt669k0NHkxtfx7Bs,26205
-konduktor/utils/log_utils.py,sha256=k4Qo0OlUZYQmLcbSD9tDWe6_Q5XcsLO_K8uVWjlTEU0,16938
+konduktor/utils/log_utils.py,sha256=xg5-NM1l3oodRTkiKihuzwe82g7XnfTzprFPndSF1A8,17032
 konduktor/utils/loki_utils.py,sha256=h2ZvZQr1nE_wXXsKsGMjhG2s2MXknNd4icydTR_ruKU,3539
 konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo,3583
 konduktor/utils/schemas.py,sha256=tBrKhnkfn9uKDYdlb4L2KgooW-muuhww7U8fu9zX-ms,18336
 konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
 konduktor/utils/ux_utils.py,sha256=7-Lt3QbDVvBQUli5_U9lOdXKeC-ip8rZBpO9gQ6vPJw,7955
 konduktor/utils/validator.py,sha256=5C1kE57Eyj1OPnAbvojqMNHHtf5fnl47FK_vEttd8aw,4331
-konduktor_nightly-0.1.0.dev20250811105223.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
-konduktor_nightly-0.1.0.dev20250811105223.dist-info/METADATA,sha256=Gp9W_UVyGtpg-hU24Hm1DU-RKr-Hkzx10JQmBsrJMdQ,4247
-konduktor_nightly-0.1.0.dev20250811105223.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-konduktor_nightly-0.1.0.dev20250811105223.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
-konduktor_nightly-0.1.0.dev20250811105223.dist-info/RECORD,,
+konduktor_nightly-0.1.0.dev20250813105033.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
+konduktor_nightly-0.1.0.dev20250813105033.dist-info/METADATA,sha256=SG5D0d2YiNKbLh9yKpMif5NucNJgwSYp-lU55QXL00c,4247
+konduktor_nightly-0.1.0.dev20250813105033.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+konduktor_nightly-0.1.0.dev20250813105033.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
+konduktor_nightly-0.1.0.dev20250813105033.dist-info/RECORD,,

{konduktor_nightly-0.1.0.dev20250811105223.dist-info → konduktor_nightly-0.1.0.dev20250813105033.dist-info}/LICENSE RENAMED Viewed

File without changes

{konduktor_nightly-0.1.0.dev20250811105223.dist-info → konduktor_nightly-0.1.0.dev20250813105033.dist-info}/WHEEL RENAMED Viewed

File without changes

{konduktor_nightly-0.1.0.dev20250811105223.dist-info → konduktor_nightly-0.1.0.dev20250813105033.dist-info}/entry_points.txt RENAMED Viewed

File without changes

konduktor-nightly 0.1.0.dev20250811105223__py3-none-any.whl → 0.1.0.dev20250813105033__py3-none-any.whl

Potentially problematic release.

konduktor-nightly 0.1.0.dev20250811105223__py3-none-any.whl → 0.1.0.dev20250813105033__py3-none-any.whl