skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/file_content_utils.py ADDED
@@ -0,0 +1,80 @@
+"""Utilities for managing managed job file content.
+
+The helpers in this module fetch job file content (DAG YAML/env files) from the
+database-first storage added for managed jobs, transparently falling back to
+legacy on-disk paths when needed. Consumers should prefer the string-based
+helpers so controllers never have to rely on local disk state.
+"""
+
+import os
+from typing import Optional
+
+from sky import sky_logging
+from sky.jobs import state as managed_job_state
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_job_dag_content(job_id: int) -> Optional[str]:
+    """Get DAG YAML content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        DAG YAML content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['dag_yaml_content'] is not None:
+        return file_info['dag_yaml_content']
+
+    # Fallback to disk path for backward compatibility
+    dag_yaml_path = file_info.get('dag_yaml_path')
+    if dag_yaml_path and os.path.exists(dag_yaml_path):
+        try:
+            with open(dag_yaml_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded DAG YAML from disk for job %s: %s', job_id,
+                         dag_yaml_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read DAG YAML from disk {dag_yaml_path}: {e}')
+
+    logger.warning(f'DAG YAML content not found for job {job_id}')
+    return None
+
+
+def get_job_env_content(job_id: int) -> Optional[str]:
+    """Get environment file content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        Environment file content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['env_file_content'] is not None:
+        return file_info['env_file_content']
+
+    # Fallback to disk path for backward compatibility
+    env_file_path = file_info.get('env_file_path')
+    if env_file_path and os.path.exists(env_file_path):
+        try:
+            with open(env_file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded environment file from disk for job %s: %s',
+                         job_id, env_file_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read environment file from disk {env_file_path}: '
+                f'{e}')
+
+    # Environment file is optional, so don't warn if not found
+    return None
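For orientation, a minimal consumer sketch of the two helpers above; the wrapper functions load_dag_yaml_text and materialize_env_file are illustrative assumptions, not part of the package:

# Hypothetical consumer sketch (not part of this wheel): resolve a managed
# job's DAG and optional env file purely from strings, without assuming the
# original YAML still exists on the controller's disk.
import tempfile

from sky.jobs import file_content_utils


def load_dag_yaml_text(job_id: int) -> str:
    """Return the DAG YAML text for a job, raising if it is unrecoverable."""
    content = file_content_utils.get_job_dag_content(job_id)
    if content is None:
        raise FileNotFoundError(f'No DAG YAML stored for job {job_id}')
    return content


def materialize_env_file(job_id: int) -> str:
    """Write the optional env file to a temp path; return '' if absent."""
    env_content = file_content_utils.get_job_env_content(job_id)
    if env_content is None:
        return ''
    with tempfile.NamedTemporaryFile('w', suffix='.env', delete=False) as f:
        f.write(env_content)
        return f.name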
sky/jobs/log_gc.py ADDED
@@ -0,0 +1,201 @@
+"""Log garbage collection for managed jobs."""
+
+import asyncio
+from datetime import datetime
+import os
+import pathlib
+import shutil
+import time
+
+import anyio
+import filelock
+
+from sky import sky_logging
+from sky import skypilot_config
+from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
+from sky.jobs import utils as managed_job_utils
+from sky.utils import context
+from sky.utils import context_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# Filelock for garbage collector leader election.
+_JOB_CONTROLLER_GC_LOCK_PATH = os.path.expanduser(
+    '~/.sky/locks/job_controller_gc.lock')
+
+_DEFAULT_TASK_LOGS_GC_RETENTION_HOURS = 24 * 7
+_DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS = 24 * 7
+
+_LEAST_FREQUENT_GC_INTERVAL_SECONDS = 3600
+_MOST_FREQUENT_GC_INTERVAL_SECONDS = 30
+
+
+def _next_gc_interval(retention_seconds: int) -> int:
+    """Get the next GC interval."""
+    # Run the GC at least once per hour to ensure hourly accuracy and
+    # at most once per 30 seconds (when retention_seconds is small) to
+    # avoid too frequent cleanup.
+    return max(min(retention_seconds, _LEAST_FREQUENT_GC_INTERVAL_SECONDS),
+               _MOST_FREQUENT_GC_INTERVAL_SECONDS)
+
+
+async def gc_controller_logs_for_job():
+    """Garbage collect job and controller logs."""
+    while True:
+        skypilot_config.reload_config()
+        controller_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
+            _DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if controller_logs_retention >= 0:
+            logger.info(f'GC controller logs for job: retention '
+                        f'{controller_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_controller_logs_with_retention(
+                        controller_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Managed jobs logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC controller logs for job: {e}',
+                             exc_info=True)
+        else:
+            logger.info('Controller logs GC is disabled')
+
+        interval = _next_gc_interval(controller_logs_retention)
+        logger.info('Next controller logs GC is scheduled after '
+                    f'{interval} seconds')
+        await asyncio.sleep(interval)
+
+
+async def gc_task_logs_for_job():
+    """Garbage collect task logs for job."""
+    while True:
+        skypilot_config.reload_config()
+        task_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'task_logs_gc_retention_hours'),
+            _DEFAULT_TASK_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if task_logs_retention >= 0:
+            logger.info('GC task logs for job: '
+                        f'retention {task_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_task_logs_with_retention(
+                        task_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Task logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC task logs for job: {e}', exc_info=True)
+        else:
+            logger.info('Task logs GC is disabled')
+
+        interval = _next_gc_interval(task_logs_retention)
+        logger.info(f'Next task logs GC is scheduled after {interval} seconds')
+        await asyncio.sleep(_next_gc_interval(task_logs_retention))
+
+
+async def _clean_controller_logs_with_retention(retention_seconds: int,
+                                                batch_size: int = 100):
+    """Clean controller logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished. False means there might
+        still be more controller logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    jobs = await managed_job_state.get_controller_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    job_ids_to_update = []
+    for job in jobs:
+        job_ids_to_update.append(job['job_id'])
+        log_file = managed_job_utils.controller_log_file_for_job(job['job_id'])
+        cleaned_at = time.time()
+        if await anyio.Path(log_file).exists():
+            ts_str = datetime.fromtimestamp(cleaned_at).strftime(
+                '%Y-%m-%d %H:%M:%S')
+            msg = f'Controller log has been cleaned at {ts_str}.'
+            # Sync down logs will reference this file directly, so we
+            # keep the file and delete the content.
+            # TODO(aylei): refactor sync down logs if the inode usage
+            # becomes an issue.
+            async with await anyio.open_file(log_file, 'w',
+                                             encoding='utf-8') as f:
+                await f.write(msg + '\n')
+    # Batch the update; the timestamp will not be accurate but it's okay.
+    await managed_job_state.set_controller_logs_cleaned_async(
+        job_ids=job_ids_to_update, logs_cleaned_at=time.time())
+    complete = len(jobs) < batch_size
+    logger.info(f'Cleaned {len(jobs)} controller logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+async def _clean_task_logs_with_retention(retention_seconds: int,
+                                          batch_size: int = 100):
+    """Clean task logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished. False means there might
+        still be more task logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    tasks = await managed_job_state.get_task_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    tasks_to_update = []
+    for task in tasks:
+        local_log_file = anyio.Path(task['local_log_file'])
+        # We assume the log directory has the following layout:
+        # task-id/
+        #   - run.log
+        #   - tasks/
+        #     - run.log
+        # and also remove the tasks directory on cleanup.
+        task_log_dir = local_log_file.parent.joinpath('tasks')
+        await local_log_file.unlink(missing_ok=True)
+        await context_utils.to_thread(shutil.rmtree,
+                                      str(task_log_dir),
+                                      ignore_errors=True)
+        # We have an at-least-once semantic guarantee for the cleanup here.
+        tasks_to_update.append((task['job_id'], task['task_id']))
+    await managed_job_state.set_task_logs_cleaned_async(
+        tasks=list(tasks_to_update), logs_cleaned_at=time.time())
+    complete = len(tasks) < batch_size
+    logger.info(f'Cleaned {len(tasks)} task logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+@context.contextual_async
+async def run_log_gc():
+    """Run the log garbage collector."""
+    log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    os.makedirs(log_dir, exist_ok=True)
+    log_path = os.path.join(log_dir, 'garbage_collector.log')
+    # Remove previous log file
+    await anyio.Path(log_path).unlink(missing_ok=True)
+    ctx = context.get()
+    assert ctx is not None, 'Context is not initialized'
+    ctx.redirect_log(pathlib.Path(log_path))
+    gc_controller_logs_for_job_task = asyncio.create_task(
+        gc_controller_logs_for_job())
+    gc_task_logs_for_job_task = asyncio.create_task(gc_task_logs_for_job())
+    await asyncio.gather(gc_controller_logs_for_job_task,
+                         gc_task_logs_for_job_task)
+
+
+def elect_for_log_gc():
+    """Use filelock to elect the log garbage collector.
+
+    The log garbage collector runs in the controller process to avoid the
+    overhead of launching and managing a new process; the threads that are
+    not elected as the log garbage collector simply wait on the filelock,
+    adding trivial overhead.
+    """
+    with filelock.FileLock(_JOB_CONTROLLER_GC_LOCK_PATH):
+        asyncio.run(run_log_gc())
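The GC interval above is simply the configured retention (jobs.controller.task_logs_gc_retention_hours / controller_logs_gc_retention_hours, converted to seconds) clamped into [30 s, 1 h]: the collector never sleeps longer than an hour, and never wakes more often than every 30 seconds. A standalone re-implementation of that clamping, for illustration only:

# Standalone re-implementation of the clamping logic from log_gc.py, for
# illustration only; it mirrors _next_gc_interval above.
LEAST_FREQUENT = 3600  # seconds: never sleep longer than an hour
MOST_FREQUENT = 30     # seconds: never wake more often than every 30 s


def next_gc_interval(retention_seconds: int) -> int:
    return max(min(retention_seconds, LEAST_FREQUENT), MOST_FREQUENT)


assert next_gc_interval(5) == 30                 # tiny retention -> 30 s floor
assert next_gc_interval(600) == 600              # mid-range passes through
assert next_gc_interval(7 * 24 * 3600) == 3600   # week-long retention -> hourly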
sky/jobs/recovery_strategy.py CHANGED
@@ -70,7 +70,6 @@ class StrategyExecutor:
         max_restarts_on_errors: int,
         job_id: int,
         task_id: int,
-        job_logger: logging.Logger,
         pool: Optional[str],
         starting: Set[int],
         starting_lock: asyncio.Lock,
@@ -85,7 +84,6 @@ class StrategyExecutor:
             max_restarts_on_errors: Maximum number of restarts on errors.
             job_id: The ID of the job.
             task_id: The ID of the task.
-            job_logger: Logger instance for this specific job.
             starting: Set of job IDs that are currently starting.
             starting_lock: Lock to synchronize starting jobs.
             starting_signal: Condition to signal when a job can start.
@@ -105,7 +103,6 @@
         self.task_id = task_id
         self.pool = pool
         self.restart_cnt_on_failure = 0
-        self._logger = job_logger
         self.job_id_on_pool_cluster: Optional[int] = None
         self.starting = starting
         self.starting_lock = starting_lock
@@ -119,7 +116,6 @@ class StrategyExecutor:
         task: 'task_lib.Task',
         job_id: int,
         task_id: int,
-        job_logger: logging.Logger,
         pool: Optional[str],
         starting: Set[int],
         starting_lock: asyncio.Lock,
@@ -156,7 +152,7 @@ class StrategyExecutor:
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
                                      max_restarts_on_errors, job_id, task_id,
-                                     job_logger, pool, starting, starting_lock,
+                                     pool, starting, starting_lock,
                                      starting_signal)
 
     async def launch(self) -> float:
@@ -224,7 +220,7 @@ class StrategyExecutor:
                 **kwargs,
                 _try_cancel_if_cluster_is_init=True,
             )
-            self._logger.debug(f'sdk.cancel request ID: {request_id}')
+            logger.debug(f'sdk.cancel request ID: {request_id}')
             await context_utils.to_thread(
                 sdk.get,
                 request_id,
@@ -261,16 +257,15 @@
                     # loop.
                     # TODO(zhwu): log the unexpected error to usage collection
                     # for future debugging.
-                    self._logger.info(
-                        f'Unexpected exception: {e}\nFailed to get the '
-                        'refresh the cluster status. Retrying.')
+                    logger.info(f'Unexpected exception: {e}\nFailed to get the '
+                                'refresh the cluster status. Retrying.')
                     continue
                 if cluster_status != status_lib.ClusterStatus.UP:
                     # The cluster can be preempted before the job is
                     # launched.
                     # Break to let the retry launch kick in.
-                    self._logger.info('The cluster is preempted before the job '
-                                      'is submitted.')
+                    logger.info('The cluster is preempted before the job '
+                                'is submitted.')
                     # TODO(zhwu): we should recover the preemption with the
                     # recovery strategy instead of the current while loop.
                     break
@@ -279,7 +274,6 @@
                     status = await managed_job_utils.get_job_status(
                         self.backend,
                         self.cluster_name,
-                        job_logger=self._logger,
                         job_id=self.job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # If any unexpected error happens, retry the job checking
@@ -288,9 +282,8 @@
                     # get_job_status, so it should not happen here.
                     # TODO(zhwu): log the unexpected error to usage collection
                     # for future debugging.
-                    self._logger.info(
-                        f'Unexpected exception: {e}\nFailed to get the '
-                        'job status. Retrying.')
+                    logger.info(f'Unexpected exception: {e}\nFailed to get the '
+                                'job status. Retrying.')
                     continue
 
                 # Check the job status until it is not in initialized status
@@ -306,9 +299,8 @@
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
                     # job checking loop.
-                    self._logger.info(
-                        f'Unexpected Exception: {e}\nFailed to get '
-                        'the job start timestamp. Retrying.')
+                    logger.info(f'Unexpected Exception: {e}\nFailed to get '
+                                'the job start timestamp. Retrying.')
                     continue
                 # Wait for the job to be started
                 await asyncio.sleep(
@@ -370,7 +362,6 @@
                     self.starting,
                     self.starting_lock,
                     self.starting_signal,
-                    self._logger,
             ):
                 # The job state may have been PENDING during backoff -
                 # update to STARTING or RECOVERING.
@@ -394,21 +385,19 @@
                         for env_var in ENV_VARS_TO_CLEAR:
                             vars_to_restore[env_var] = os.environ.pop(
                                 env_var, None)
-                            self._logger.debug('Cleared env var: '
-                                               f'{env_var}')
-                        self._logger.debug('Env vars for api_start: '
-                                           f'{os.environ}')
+                            logger.debug('Cleared env var: '
+                                         f'{env_var}')
+                        logger.debug('Env vars for api_start: '
+                                     f'{os.environ}')
                         await context_utils.to_thread(sdk.api_start)
-                        self._logger.info('API server started.')
+                        logger.info('API server started.')
                     finally:
                         for env_var, value in vars_to_restore.items():
                             if value is not None:
-                                self._logger.debug(
-                                    'Restored env var: '
-                                    f'{env_var}: {value}')
+                                logger.debug('Restored env var: '
+                                             f'{env_var}: {value}')
                                 os.environ[env_var] = value
 
-                log_file = _get_logger_file(self._logger)
                 request_id = None
                 try:
                     request_id = await context_utils.to_thread(
@@ -429,31 +418,27 @@
                         # down=True,
                         _is_launched_by_jobs_controller=True,
                     )
-                    self._logger.debug('sdk.launch request ID: '
-                                       f'{request_id}')
-                    if log_file is None:
-                        raise OSError('Log file is None')
-                    with open(log_file, 'a', encoding='utf-8') as f:
-                        await context_utils.to_thread(
-                            sdk.stream_and_get,
-                            request_id,
-                            output_stream=f,
-                        )
+                    logger.debug('sdk.launch request ID: '
+                                 f'{request_id}')
+                    await context_utils.to_thread(
+                        sdk.stream_and_get,
+                        request_id,
+                    )
                 except asyncio.CancelledError:
                     if request_id:
                         req = await context_utils.to_thread(
                             sdk.api_cancel, request_id)
-                        self._logger.debug('sdk.api_cancel request '
-                                           f'ID: {req}')
+                        logger.debug('sdk.api_cancel request '
+                                     f'ID: {req}')
                         try:
                             await context_utils.to_thread(
                                 sdk.get, req)
                         except Exception as e:  # pylint: disable=broad-except
                             # we must still return a CancelledError
-                            self._logger.error(
+                            logger.error(
                                 f'Failed to cancel the job: {e}')
                             raise
-                self._logger.info('Managed job cluster launched.')
+                logger.info('Managed job cluster launched.')
             else:
                 self.cluster_name = await (context_utils.to_thread(
                     serve_utils.get_next_cluster_name, self.pool,
@@ -468,8 +453,8 @@
                         self.dag,
                         cluster_name=self.cluster_name,
                     )
-                    self._logger.debug('sdk.exec request ID: '
-                                       f'{request_id}')
+                    logger.debug('sdk.exec request ID: '
+                                 f'{request_id}')
                     job_id_on_pool_cluster, _ = (
                         await context_utils.to_thread(
                             sdk.get, request_id))
@@ -477,14 +462,14 @@
                     if request_id:
                         req = await context_utils.to_thread(
                             sdk.api_cancel, request_id)
-                        self._logger.debug('sdk.api_cancel request '
-                                           f'ID: {req}')
+                        logger.debug('sdk.api_cancel request '
+                                     f'ID: {req}')
                         try:
                             await context_utils.to_thread(
                                 sdk.get, req)
                         except Exception as e:  # pylint: disable=broad-except
                             # we must still return a CancelledError
-                            self._logger.error(
+                            logger.error(
                                 f'Failed to cancel the job: {e}')
                             raise
                 assert job_id_on_pool_cluster is not None, (
@@ -492,15 +477,14 @@
                 self.job_id_on_pool_cluster = job_id_on_pool_cluster
                 await state.set_job_id_on_pool_cluster_async(
                     self.job_id, job_id_on_pool_cluster)
-                self._logger.info('Managed job cluster launched.')
+                logger.info('Managed job cluster launched.')
             except (exceptions.InvalidClusterNameError,
                     exceptions.NoCloudAccessError,
                     exceptions.ResourcesMismatchError,
                     exceptions.StorageSpecError,
                     exceptions.StorageError) as e:
-                self._logger.error(
-                    'Failure happened before provisioning. '
-                    f'{common_utils.format_exception(e)}')
+                logger.error('Failure happened before provisioning. '
+                             f'{common_utils.format_exception(e)}')
                 if raise_on_failure:
                     raise exceptions.ProvisionPrechecksError(
                         reasons=[e])
@@ -528,24 +512,22 @@
                     reasons_str = '; '.join(
                         common_utils.format_exception(err)
                         for err in reasons)
-                    self._logger.error(
+                    logger.error(
                         'Failure happened before provisioning. '
                         f'Failover reasons: {reasons_str}')
                     if raise_on_failure:
                         raise exceptions.ProvisionPrechecksError(
                             reasons)
                     return None
-                self._logger.info(
-                    'Failed to launch a cluster with error: '
-                    f'{common_utils.format_exception(e)})')
+                logger.info('Failed to launch a cluster with error: '
+                            f'{common_utils.format_exception(e)})')
             except Exception as e:  # pylint: disable=broad-except
                 # If the launch fails, it will be recovered by the
                 # following code.
-                self._logger.info(
-                    'Failed to launch a cluster with error: '
-                    f'{common_utils.format_exception(e)})')
+                logger.info('Failed to launch a cluster with error: '
+                            f'{common_utils.format_exception(e)})')
                 with ux_utils.enable_traceback():
-                    self._logger.info(
+                    logger.info(
                         f' Traceback: {traceback.format_exc()}')
             else:  # No exception, the launch succeeds.
                 # At this point, a sky.launch() has succeeded. Cluster
@@ -559,7 +541,7 @@
                 # launch.
                 # TODO(zhwu): log the unexpected error to usage
                 # collection for future debugging.
-                self._logger.info(
+                logger.info(
                     'Failed to successfully submit the job to the '
                     'launched cluster, due to unexpected submission '
                     'errors or the cluster being preempted during '
@@ -594,8 +576,8 @@
                 # Calculate the backoff time and sleep.
                 gap_seconds = (backoff.current_backoff()
                                if self.pool is None else 1)
-                self._logger.info('Retrying to launch the cluster in '
-                                  f'{gap_seconds:.1f} seconds.')
+                logger.info('Retrying to launch the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
                 await asyncio.sleep(gap_seconds)
                 continue
             else:
@@ -630,15 +612,14 @@ class FailoverStrategyExecutor(StrategyExecutor):
         max_restarts_on_errors: int,
         job_id: int,
         task_id: int,
-        job_logger: logging.Logger,
         pool: Optional[str],
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
     ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id, job_logger, pool, starting,
-                         starting_lock, starting_signal)
+                         job_id, task_id, pool, starting, starting_lock,
+                         starting_signal)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -694,14 +675,13 @@ class FailoverStrategyExecutor(StrategyExecutor):
                return job_submitted_at
 
            # Step 2
-            self._logger.debug('Terminating unhealthy cluster and reset cloud '
-                               'region.')
+            logger.debug('Terminating unhealthy cluster and reset cloud '
+                         'region.')
            await context_utils.to_thread(self._cleanup_cluster)
 
            # Step 3
-            self._logger.debug(
-                'Relaunch the cluster without constraining to prior '
-                'cloud/region.')
+            logger.debug('Relaunch the cluster without constraining to prior '
+                         'cloud/region.')
            # Not using self.launch to avoid the retry until up logic.
            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
                                                  raise_on_failure=False,
@@ -709,8 +689,8 @@
            if job_submitted_at is None:
                # Failed to launch the cluster.
                gap_seconds = self.RETRY_INIT_GAP_SECONDS
-                self._logger.info('Retrying to recover the cluster in '
-                                  f'{gap_seconds:.1f} seconds.')
+                logger.info('Retrying to recover the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
                await asyncio.sleep(gap_seconds)
                continue
 
@@ -755,14 +735,12 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
        # task.resources.
 
        # Step 1
-        self._logger.debug(
-            'Terminating unhealthy cluster and reset cloud region.')
+        logger.debug('Terminating unhealthy cluster and reset cloud region.')
        await context_utils.to_thread(self._cleanup_cluster)
 
        # Step 2
-        self._logger.debug(
-            'Relaunch the cluster skipping the previously launched '
-            'cloud/region.')
+        logger.debug('Relaunch the cluster skipping the previously launched '
+                     'cloud/region.')
        if self._launched_resources is not None:
            task = self.dag.tasks[0]
            requested_resources = self._launched_resources
@@ -787,9 +765,8 @@
 
        while True:
            # Step 3
-            self._logger.debug(
-                'Relaunch the cluster without constraining to prior '
-                'cloud/region.')
+            logger.debug('Relaunch the cluster without constraining to prior '
+                         'cloud/region.')
            # Not using self.launch to avoid the retry until up logic.
            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
                                                  raise_on_failure=False,
@@ -797,8 +774,8 @@
            if job_submitted_at is None:
                # Failed to launch the cluster.
                gap_seconds = self.RETRY_INIT_GAP_SECONDS
-                self._logger.info('Retrying to recover the cluster in '
-                                  f'{gap_seconds:.1f} seconds.')
+                logger.info('Retrying to recover the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
                await asyncio.sleep(gap_seconds)
                continue
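The recovery_strategy.py hunks above remove the per-job job_logger that was threaded through every constructor and call site, switching to the module-level logger instead. A hedged sketch of the resulting pattern, using the same sky_logging.init_logger idiom seen in the new modules; the class below is illustrative, not the real StrategyExecutor:

# Illustrative before/after of the logging change; not the actual class.
from sky import sky_logging

logger = sky_logging.init_logger(__name__)  # module-level, shared by all jobs


class ExampleExecutor:

    def __init__(self, job_id: int) -> None:
        # No job_logger parameter anymore: every method logs through the
        # module-level logger, so callers no longer construct and pass
        # per-job Logger objects down the call chain.
        self.job_id = job_id

    def launch(self) -> None:
        logger.info(f'Launching cluster for managed job {self.job_id}')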