skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -4
- sky/backends/backend_utils.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +91 -96
- sky/cli.py +5 -6311
- sky/client/cli.py +66 -639
- sky/client/sdk.py +22 -2
- sky/clouds/kubernetes.py +8 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +50 -11
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/logs/__init__.py +17 -0
- sky/logs/agent.py +73 -0
- sky/logs/gcp.py +91 -0
- sky/models.py +1 -0
- sky/provision/__init__.py +1 -0
- sky/provision/instance_setup.py +35 -0
- sky/provision/provisioner.py +11 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/server/common.py +21 -9
- sky/server/requests/payloads.py +19 -1
- sky/server/server.py +121 -29
- sky/setup_files/dependencies.py +11 -1
- sky/skylet/constants.py +48 -1
- sky/skylet/job_lib.py +83 -19
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +60 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +47 -34
- sky/users/rbac.py +10 -1
- sky/users/server.py +274 -9
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +83 -5
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-db3c97c2bfbceb65.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-1493ac755eadeb35.js" defer=""></script><script src="/dashboard/_next/static/chunks/843-b3040e493f6e7947.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-c4ff1ec05e2f3daf.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py
CHANGED
@@ -64,6 +64,7 @@ user_table = sqlalchemy.Table(
     Base.metadata,
     sqlalchemy.Column('id', sqlalchemy.Text, primary_key=True),
     sqlalchemy.Column('name', sqlalchemy.Text),
+    sqlalchemy.Column('password', sqlalchemy.Text),
 )
 
 cluster_table = sqlalchemy.Table(
@@ -301,6 +302,12 @@ def create_table():
             'last_creation_command',
             sqlalchemy.Text(),
             default_statement='DEFAULT NULL')
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'users',
+            'password',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
         session.commit()
 
 
@@ -358,7 +365,9 @@ def add_or_update_user(user: models.User) -> bool:
 
             # First try INSERT OR IGNORE - this won't fail if user exists
             insert_stmnt = insert_func(user_table).prefix_with(
-                'OR IGNORE').values(id=user.id, name=user.name)
+                'OR IGNORE').values(id=user.id,
+                                    name=user.name,
+                                    password=user.password)
             result = session.execute(insert_stmnt)
 
             # Check if the INSERT actually inserted a row
@@ -366,8 +375,14 @@ def add_or_update_user(user: models.User) -> bool:
 
             if not was_inserted:
                 # User existed, so update it
-                session.query(user_table).filter_by(id=user.id).update(
-                    {user_table.c.name: user.name})
+                if user.password:
+                    session.query(user_table).filter_by(id=user.id).update({
+                        user_table.c.name: user.name,
+                        user_table.c.password: user.password
+                    })
+                else:
+                    session.query(user_table).filter_by(id=user.id).update(
+                        {user_table.c.name: user.name})
 
             session.commit()
             return was_inserted
@@ -377,15 +392,19 @@ def add_or_update_user(user: models.User) -> bool:
             # For PostgreSQL, use INSERT ... ON CONFLICT with RETURNING to
             # detect insert vs update
             insert_func = postgresql.insert
-            insert_stmnt = insert_func(user_table).values(
-                id=user.id, name=user.name)
+            insert_stmnt = insert_func(user_table).values(
+                id=user.id, name=user.name, password=user.password)
 
             # Use a sentinel in the RETURNING clause to detect insert vs update
+            if user.password:
+                set_ = {
+                    user_table.c.name: user.name,
+                    user_table.c.password: user.password
+                }
+            else:
+                set_ = {user_table.c.name: user.name}
             upsert_stmnt = insert_stmnt.on_conflict_do_update(
-                index_elements=[user_table.c.id],
-                set_={
-                    user_table.c.name: user.name
-                }).returning(
+                index_elements=[user_table.c.id], set_=set_).returning(
                     user_table.c.id,
                     # This will be True for INSERT, False for UPDATE
                     sqlalchemy.literal_column('(xmax = 0)').label('was_inserted'
@@ -407,7 +426,24 @@ def get_user(user_id: str) -> Optional[models.User]:
         row = session.query(user_table).filter_by(id=user_id).first()
         if row is None:
             return None
-        return models.User(id=row.id, name=row.name)
+        return models.User(id=row.id, name=row.name, password=row.password)
+
+
+def get_user_by_name(username: str) -> List[models.User]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).filter_by(name=username).all()
+        if len(rows) == 0:
+            return []
+        return [
+            models.User(id=row.id, name=row.name, password=row.password)
+            for row in rows
+        ]
+
+
+def delete_user(user_id: str) -> None:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(user_table).filter_by(id=user_id).delete()
+        session.commit()
 
 
 @_init_db
@@ -415,7 +451,10 @@ def get_all_users() -> List[models.User]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         rows = session.query(user_table).all()
-        return [models.User(id=row.id, name=row.name) for row in rows]
+        return [
+            models.User(id=row.id, name=row.name, password=row.password)
+            for row in rows
+        ]
 
 
 @_init_db
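Taken together, the global_user_state changes above add an optional password column and two new helpers (get_user_by_name, delete_user). A minimal, hedged usage sketch of the updated API follows; the id, name, and hash values are hypothetical, and it assumes the state database has already been initialized by the server process:

    from sky import global_user_state, models

    # Upsert: the password column is only overwritten when a non-empty value
    # is supplied, matching the SQLite and PostgreSQL branches shown above.
    user = models.User(id='abc12345', name='alice', password='<pbkdf2 hash>')
    was_inserted = global_user_state.add_or_update_user(user)

    # New lookups introduced in this release.
    matches = global_user_state.get_user_by_name('alice')  # -> List[models.User]
    global_user_state.delete_user('abc12345')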
sky/jobs/controller.py
CHANGED
@@ -152,6 +152,20 @@ class JobsController:
         Other exceptions may be raised depending on the backend.
         """
 
+        latest_task_id, last_task_prev_status = (
+            managed_job_state.get_latest_task_id_status(self._job_id))
+        is_resume = False
+        if (latest_task_id is not None and last_task_prev_status !=
+                managed_job_state.ManagedJobStatus.PENDING):
+            assert latest_task_id >= task_id, (latest_task_id, task_id)
+            if latest_task_id > task_id:
+                logger.info(f'Task {task_id} ({task.name}) has already '
+                            'been executed. Skipping...')
+                return True
+            if latest_task_id == task_id:
+                # Start recovery.
+                is_resume = True
+
         callback_func = managed_job_utils.event_callback_func(
             job_id=self._job_id, task_id=task_id, task=task)
         if task.run is None:
@@ -171,42 +185,72 @@ class JobsController:
             return True
         usage_lib.messages.usage.update_task_id(task_id)
         task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
-        submitted_at = time.time()
-        if task_id == 0:
-            submitted_at = backend_utils.get_timestamp_from_run_timestamp(
-                self._backend.run_timestamp)
         assert task.name is not None, task
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
             task.name, self._job_id)
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
             cluster_name, self._backend, task, self._job_id, task_id)
-
-
-            task_id
-
-
-
-
-
-
-
-
-
-
-
+        if not is_resume:
+            submitted_at = time.time()
+            if task_id == 0:
+                submitted_at = backend_utils.get_timestamp_from_run_timestamp(
+                    self._backend.run_timestamp)
+            managed_job_state.set_starting(
+                self._job_id,
+                task_id,
+                self._backend.run_timestamp,
+                submitted_at,
+                resources_str=backend_utils.get_task_resources_str(
+                    task, is_managed_job=True),
+                specs={
+                    'max_restarts_on_errors':
+                        self._strategy_executor.max_restarts_on_errors
+                },
+                callback_func=callback_func)
+            logger.info(f'Submitted managed job {self._job_id} '
+                        f'(task: {task_id}, name: {task.name!r}); '
+                        f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
 
         logger.info('Started monitoring.')
 
-
-
+        # Only do the initial cluster launch if not resuming from a controller
+        # failure. Otherwise, we will transit to recovering immediately.
+        remote_job_submitted_at = time.time()
+        if not is_resume:
+            remote_job_submitted_at = self._strategy_executor.launch()
+        assert remote_job_submitted_at is not None, remote_job_submitted_at
 
-
-
-
-
+        if not is_resume:
+            managed_job_state.set_started(job_id=self._job_id,
+                                          task_id=task_id,
+                                          start_time=remote_job_submitted_at,
+                                          callback_func=callback_func)
 
         while True:
+            # NOTE: if we are resuming from a controller failure, we only keep
+            # monitoring if the job is in RUNNING state. For all other cases,
+            # we will directly transit to recovering since we have no idea what
+            # the cluster status is.
+            force_transit_to_recovering = False
+            if is_resume:
+                prev_status = managed_job_state.get_job_status_with_task_id(
+                    job_id=self._job_id, task_id=task_id)
+                if prev_status is not None:
+                    if prev_status.is_terminal():
+                        return (prev_status ==
+                                managed_job_state.ManagedJobStatus.SUCCEEDED)
+                    if (prev_status ==
+                            managed_job_state.ManagedJobStatus.CANCELLING):
+                        # If the controller is down when cancelling the job,
+                        # we re-raise the error to run the `_cleanup` function
+                        # again to clean up any remaining resources.
+                        raise exceptions.ManagedJobUserCancelledError(
+                            'Recovering cancel signal.')
+                if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
+                    force_transit_to_recovering = True
+                # This resume logic should only be triggered once.
+                is_resume = False
+
             time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
 
             # Check the network connection to avoid false alarm for job failure.
@@ -221,8 +265,19 @@ class JobsController:
 
             # NOTE: we do not check cluster status first because race condition
             # can occur, i.e. cluster can be down during the job status check.
-            job_status = managed_job_utils.get_job_status(
-                self._backend, cluster_name)
+            # NOTE: If fetching the job status fails or we force to transit to
+            # recovering, we will set the job status to None, which will force
+            # enter the recovering logic.
+            job_status = None
+            if not force_transit_to_recovering:
+                try:
+                    job_status = managed_job_utils.get_job_status(
+                        self._backend, cluster_name)
+                except exceptions.FetchClusterInfoError as fetch_e:
+                    logger.info(
+                        'Failed to fetch the job status. Start recovery.\n'
+                        f'Exception: {common_utils.format_exception(fetch_e)}\n'
+                        f'Traceback: {traceback.format_exc()}')
 
             if job_status == job_lib.JobStatus.SUCCEEDED:
                 success_end_time = managed_job_utils.try_to_get_job_end_time(
@@ -379,7 +434,17 @@ class JobsController:
             if handle is not None:
                 resources = handle.launched_resources
                 assert resources is not None, handle
-                if resources.need_cleanup_after_preemption_or_failure():
+                # If we are forcing to transit to recovering, we need to clean
+                # up the cluster as it is possible that we already submitted the
+                # job to the worker cluster, but state is not updated yet. In
+                # this case, it is possible that we will double-submit the job
+                # to the worker cluster. So we always clean up the cluster here.
+                # TODO(tian,cooperc): We can check if there is a running job on
+                # the worker cluster, and if so, we can skip the cleanup.
+                # Challenge: race condition when the worker cluster thought it
+                # does not have a running job yet but later the job is launched.
+                if (resources.need_cleanup_after_preemption_or_failure() or
+                        force_transit_to_recovering):
                     # Some spot resource (e.g., Spot TPU VM) may need to be
                     # cleaned up after preemption, as running launch again on
                     # those clusters again may fail.
@@ -389,9 +454,11 @@ class JobsController:
 
             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
-            managed_job_state.set_recovering(
-
-
+            managed_job_state.set_recovering(
+                job_id=self._job_id,
+                task_id=task_id,
+                force_transit_to_recovering=force_transit_to_recovering,
+                callback_func=callback_func)
             recovered_time = self._strategy_executor.recover()
             managed_job_state.set_recovered(self._job_id,
                                             task_id,
sky/jobs/scheduler.py
CHANGED
@@ -84,6 +84,32 @@ def _get_lock_path() -> str:
     return path
 
 
+def _start_controller(job_id: int, dag_yaml_path: str,
+                      env_file_path: str) -> None:
+    activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
+    source_environment_cmd = (f'source {env_file_path};'
+                              if env_file_path else '')
+    run_controller_cmd = ('python -u -m sky.jobs.controller '
+                          f'{dag_yaml_path} --job-id {job_id};')
+
+    # If the command line here is changed, please also update
+    # utils._controller_process_alive. `--job-id X` should be at
+    # the end.
+    run_cmd = (f'{activate_python_env_cmd}'
+               f'{source_environment_cmd}'
+               f'{run_controller_cmd}')
+
+    logs_dir = os.path.expanduser(
+        managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    os.makedirs(logs_dir, exist_ok=True)
+    log_path = os.path.join(logs_dir, f'{job_id}.log')
+
+    pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
+    state.set_job_controller_pid(job_id, pid)
+
+    logger.debug(f'Job {job_id} started with pid {pid}')
+
+
 def maybe_schedule_next_jobs() -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
@@ -158,32 +184,9 @@ def maybe_schedule_next_jobs() -> None:
 
                 job_id = maybe_next_job['job_id']
                 dag_yaml_path = maybe_next_job['dag_yaml_path']
+                env_file_path = maybe_next_job['env_file_path']
 
-                activate_python_env_cmd = (
-                    f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
-                env_file = maybe_next_job['env_file_path']
-                source_environment_cmd = (f'source {env_file};'
-                                          if env_file else '')
-                run_controller_cmd = ('python -u -m sky.jobs.controller '
-                                      f'{dag_yaml_path} --job-id {job_id};')
-
-                # If the command line here is changed, please also update
-                # utils._controller_process_alive. `--job-id X` should be at
-                # the end.
-                run_cmd = (f'{activate_python_env_cmd}'
-                           f'{source_environment_cmd}'
-                           f'{run_controller_cmd}')
-
-                logs_dir = os.path.expanduser(
-                    managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
-                os.makedirs(logs_dir, exist_ok=True)
-                log_path = os.path.join(logs_dir, f'{job_id}.log')
-
-                pid = subprocess_utils.launch_new_process_tree(
-                    run_cmd, log_output=log_path)
-                state.set_job_controller_pid(job_id, pid)
-
-                logger.debug(f'Job {job_id} started with pid {pid}')
+                _start_controller(job_id, dag_yaml_path, env_file_path)
 
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
@@ -203,10 +206,15 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
     with filelock.FileLock(_get_lock_path()):
-        state.scheduler_set_waiting(job_id, dag_yaml_path,
-
-
-
+        is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
+                                                original_user_yaml_path,
+                                                env_file_path,
+                                                common_utils.get_user_hash(),
+                                                priority)
+        if is_resume:
+            _start_controller(job_id, dag_yaml_path, env_file_path)
+        else:
+            maybe_schedule_next_jobs()
 
 
 @contextlib.contextmanager
sky/jobs/server/core.py
CHANGED
@@ -102,14 +102,47 @@ def launch(
                     'name only and comment out the task names (so that they '
                     'will be auto-generated) .')
         task_names.add(task_.name)
-
-
+
+        # Check for priority in resources first, then fall back to job priority
+        task_priority = None
+        if task_.resources:
+            # Convert set to list to access elements by index
+            resources_list = list(task_.resources)
+            # Take first resource's priority as reference
+            task_priority = resources_list[0].priority
+
+            # Check all other resources have same priority
+            for resource in resources_list[1:]:
+                if resource.priority != task_priority:
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(
+                            f'Task {task_.name!r}: All resources must have the '
+                            'same priority. Found priority '
+                            f'{resource.priority} but expected {task_priority}.'
+                        )
+
+            # Check for conflict between resources priority and job
+            # priority
+            if task_.job_priority is not None:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Task {task_.name!r}: Cannot specify both '
+                        f'resources.priority ({task_priority}) and '
+                        f'job.priority ({task_.job_priority}). Please use only '
+                        'one priority specification method.')
+
+        # Fall back to job priority if no resources priority found
+        if task_priority is None:
+            task_priority = task_.job_priority
+
+        if task_priority is not None:
+            if (priority is not None and priority != task_priority):
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
                         'Multiple tasks in the DAG have different priorities. '
                         'Either specify a priority in only one task, or set '
                         'the same priority for each task.')
-            priority =
+            priority = task_priority
 
     if priority is None:
         priority = managed_job_constants.DEFAULT_PRIORITY
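The launch() hunk above establishes a precedence for managed-job priority: a priority set on the task's resources wins (all resources in a task must agree), otherwise the task's job priority is used, otherwise DEFAULT_PRIORITY, and giving both at once is rejected. A hedged, standalone paraphrase of that precedence (hypothetical helper, not code from the package):

    from typing import Optional, Sequence

    DEFAULT_PRIORITY = 500  # placeholder; the real constant lives in managed_job_constants

    def resolve_priority(resource_priorities: Sequence[Optional[int]],
                         job_priority: Optional[int]) -> int:
        # All resources must carry the same priority (or none at all).
        res_priority = resource_priorities[0] if resource_priorities else None
        if any(p != res_priority for p in resource_priorities):
            raise ValueError('All resources must have the same priority.')
        # The launch path rejects specifying both at once.
        if res_priority is not None and job_priority is not None:
            raise ValueError('Cannot specify both resources.priority and job priority.')
        if res_priority is not None:
            return res_priority
        if job_priority is not None:
            return job_priority
        return DEFAULT_PRIORITY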
sky/jobs/state.py
CHANGED
@@ -352,6 +352,16 @@ class ManagedJobStatus(enum.Enum):
             cls.FAILED_NO_RESOURCE, cls.FAILED_CONTROLLER
         ]
 
+    @classmethod
+    def processing_statuses(cls) -> List['ManagedJobStatus']:
+        # Any status that is not terminal and is not CANCELLING.
+        return [
+            cls.PENDING,
+            cls.STARTING,
+            cls.RUNNING,
+            cls.RECOVERING,
+        ]
+
 
 _SPOT_STATUS_TO_COLOR = {
     ManagedJobStatus.PENDING: colorama.Fore.BLUE,
@@ -607,21 +617,49 @@ def set_started(job_id: int, task_id: int, start_time: float,
 
 
 @_init_db
-def set_recovering(job_id: int, task_id: int, callback_func: CallbackType):
+def set_recovering(job_id: int, task_id: int, force_transit_to_recovering: bool,
+                   callback_func: CallbackType):
     """Set the task to recovering state, and update the job duration."""
     assert _DB_PATH is not None
     logger.info('=== Recovering... ===')
+    expected_status: List[str] = [ManagedJobStatus.RUNNING.value]
+    status_str = 'status=(?)'
+    if force_transit_to_recovering:
+        # For the HA job controller, it is possible that the jobs came from any
+        # processing status to recovering. But it should not be any terminal
+        # status as such jobs will not be recovered; and it should not be
+        # CANCELLING as we will directly trigger a cleanup.
+        expected_status = [
+            s.value for s in ManagedJobStatus.processing_statuses()
+        ]
+        question_mark_str = ', '.join(['?'] * len(expected_status))
+        status_str = f'status IN ({question_mark_str})'
+    # NOTE: if we are resuming from a controller failure and the previous status
+    # is STARTING, the initial value of `last_recovered_at` might not be set
+    # yet (default value -1). In this case, we should not add current timestamp.
+    # Otherwise, the job duration will be incorrect (~55 years from 1970).
+    current_time = time.time()
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
-            """\
+            f"""\
             UPDATE spot SET
-            status=(?),
+            status=(?),
+            job_duration=CASE
+                WHEN last_recovered_at >= 0
+                THEN job_duration+(?)-last_recovered_at
+                ELSE job_duration
+            END,
+            last_recovered_at=CASE
+                WHEN last_recovered_at < 0
+                THEN (?)
+                ELSE last_recovered_at
+            END
             WHERE spot_job_id=(?) AND
             task_id=(?) AND
-            status=(?) AND
+            {status_str} AND
             end_at IS null""",
-            (ManagedJobStatus.RECOVERING.value,
-
+            (ManagedJobStatus.RECOVERING.value, current_time, current_time,
+             job_id, task_id, *expected_status))
         if cursor.rowcount != 1:
             raise exceptions.ManagedJobStatusError(
                 f'Failed to set the task to recovering. '
@@ -996,6 +1034,19 @@ def _get_all_task_ids_statuses(
         return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]
 
 
+@_init_db
+def get_job_status_with_task_id(job_id: int,
+                                task_id: int) -> Optional[ManagedJobStatus]:
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        status = cursor.execute(
+            """\
+            SELECT status FROM spot
+            WHERE spot_job_id=(?) AND task_id=(?)""",
+            (job_id, task_id)).fetchone()
+        return ManagedJobStatus(status[0]) if status else None
+
+
 def get_num_tasks(job_id: int) -> int:
     return len(_get_all_task_ids_statuses(job_id))
 
@@ -1156,8 +1207,15 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
 @_init_db
 def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
                           original_user_yaml_path: str, env_file_path: str,
-                          user_hash: str, priority: int) ->
-    """Do not call without holding the scheduler lock.
+                          user_hash: str, priority: int) -> bool:
+    """Do not call without holding the scheduler lock.
+
+    Returns: Whether this is a recovery run or not.
+        If this is a recovery run, the job may already be in the WAITING
+        state and the update will not change the schedule_state (hence the
+        updated_count will be 0). In this case, we return True.
+        Otherwise, we return False.
+    """
     assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
@@ -1169,7 +1227,9 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
             (ManagedJobScheduleState.WAITING.value, dag_yaml_path,
              original_user_yaml_path, env_file_path, user_hash, priority,
              job_id, ManagedJobScheduleState.INACTIVE.value)).rowcount
-
+        # For a recovery run, the job may already be in the WAITING state.
+        assert updated_count <= 1, (job_id, updated_count)
+        return updated_count == 0
 
 
 @_init_db
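For readers of the set_recovering() hunk above, the two CASE expressions implement the following bookkeeping, shown here as a hedged Python paraphrase of the SQL (not code from the package):

    import time

    def recovering_bookkeeping(job_duration: float, last_recovered_at: float):
        now = time.time()
        if last_recovered_at >= 0:
            # Normal case: accumulate the time since the last recovery.
            job_duration += now - last_recovered_at
        else:
            # Resuming while still STARTING (sentinel -1): leave the duration
            # untouched and stamp last_recovered_at so later accounting is sane.
            last_recovered_at = now
        return job_duration, last_recovered_at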
sky/jobs/utils.py
CHANGED
@@ -176,6 +176,17 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     Note: we expect that job_id, if provided, refers to a nonterminal job or a
     job that has not completed its cleanup (schedule state not DONE).
     """
+    # This signal file suggests that the controller is recovering from a
+    # failure. See sky/templates/kubernetes-ray.yml.j2 for more details.
+    # When restarting the controller processes, we don't want this event to
+    # set the job status to FAILED_CONTROLLER.
+    # TODO(tian): Change this to restart the controller process. For now we
+    # disabled it when recovering because we want to avoid caveats of infinite
+    # restart of last controller process that fully occupied the controller VM.
+    if os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        return
 
     def _cleanup_job_clusters(job_id: int) -> Optional[str]:
         """Clean up clusters for a job. Returns error message if any.
sky/logs/__init__.py
ADDED
@@ -0,0 +1,17 @@
+"""Sky logging agents."""
+from typing import Optional
+
+from sky import exceptions
+from sky import skypilot_config
+from sky.logs.agent import LoggingAgent
+from sky.logs.gcp import GCPLoggingAgent
+
+
+def get_logging_agent() -> Optional[LoggingAgent]:
+    store = skypilot_config.get_nested(('logs', 'store'), None)
+    if store is None:
+        return None
+    if store == 'gcp':
+        return GCPLoggingAgent(skypilot_config.get_nested(('logs', 'gcp'), {}))
+    raise exceptions.InvalidSkyPilotConfigError(
+        f'Invalid logging store: {store}')
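The new get_logging_agent() entry point keys off the logs.store field of the SkyPilot config. A hedged usage sketch (assumes a config has already been loaded; the printed strings are illustrative only):

    from sky import logs

    agent = logs.get_logging_agent()
    if agent is None:
        print('logs.store is unset; no external log shipping is configured')
    else:
        # Currently only 'gcp' is recognized; any other value raises
        # exceptions.InvalidSkyPilotConfigError.
        print(f'shipping logs via {type(agent).__name__}')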
|