trainml 0.5.17__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. examples/local_storage.py +0 -2
  2. tests/integration/test_checkpoints_integration.py +4 -3
  3. tests/integration/test_datasets_integration.py +5 -3
  4. tests/integration/test_jobs_integration.py +33 -27
  5. tests/integration/test_models_integration.py +7 -3
  6. tests/integration/test_volumes_integration.py +2 -2
  7. tests/unit/cli/test_cli_checkpoint_unit.py +312 -1
  8. tests/unit/cloudbender/test_nodes_unit.py +112 -0
  9. tests/unit/cloudbender/test_providers_unit.py +96 -0
  10. tests/unit/cloudbender/test_regions_unit.py +106 -0
  11. tests/unit/cloudbender/test_services_unit.py +141 -0
  12. tests/unit/conftest.py +23 -10
  13. tests/unit/projects/test_project_data_connectors_unit.py +39 -0
  14. tests/unit/projects/test_project_datastores_unit.py +37 -0
  15. tests/unit/projects/test_project_members_unit.py +46 -0
  16. tests/unit/projects/test_project_services_unit.py +65 -0
  17. tests/unit/projects/test_projects_unit.py +16 -0
  18. tests/unit/test_auth_unit.py +17 -2
  19. tests/unit/test_checkpoints_unit.py +256 -71
  20. tests/unit/test_datasets_unit.py +218 -68
  21. tests/unit/test_exceptions.py +133 -0
  22. tests/unit/test_gpu_types_unit.py +11 -1
  23. tests/unit/test_jobs_unit.py +1014 -95
  24. tests/unit/test_main_unit.py +20 -0
  25. tests/unit/test_models_unit.py +218 -70
  26. tests/unit/test_trainml_unit.py +627 -3
  27. tests/unit/test_volumes_unit.py +211 -70
  28. tests/unit/utils/__init__.py +1 -0
  29. tests/unit/utils/test_transfer_unit.py +4260 -0
  30. trainml/__init__.py +1 -1
  31. trainml/checkpoints.py +56 -57
  32. trainml/cli/__init__.py +6 -3
  33. trainml/cli/checkpoint.py +18 -57
  34. trainml/cli/dataset.py +17 -57
  35. trainml/cli/job/__init__.py +89 -67
  36. trainml/cli/job/create.py +51 -24
  37. trainml/cli/model.py +14 -56
  38. trainml/cli/volume.py +18 -57
  39. trainml/datasets.py +50 -55
  40. trainml/jobs.py +269 -69
  41. trainml/models.py +51 -55
  42. trainml/trainml.py +159 -114
  43. trainml/utils/__init__.py +1 -0
  44. trainml/utils/auth.py +641 -0
  45. trainml/utils/transfer.py +647 -0
  46. trainml/volumes.py +48 -53
  47. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/METADATA +3 -3
  48. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/RECORD +52 -46
  49. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/LICENSE +0 -0
  50. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/WHEEL +0 -0
  51. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/entry_points.txt +0 -0
  52. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/top_level.txt +0 -0
trainml/jobs.py CHANGED
@@ -12,7 +12,7 @@ from trainml.exceptions import (
     SpecificationError,
     TrainMLException,
 )
-from trainml.connections import Connection
+from trainml.utils.transfer import upload, download
 
 
 class Jobs(object):
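
The replacement import swaps the Connection-based VPN client for direct transfer helpers from the new trainml/utils/transfer.py (added in this release). A minimal sketch of the call shape, inferred from the call sites later in this diff; the hostname, token, and uri values are placeholders (real ones come from the entity's API payload):

    import asyncio

    from trainml.utils.transfer import upload, download

    async def main():
        # Both helpers are awaited with (hostname, auth_token, uri),
        # matching the argument order used in Job.connect() below.
        await upload("transfer.example.com", "AUTH_TOKEN", "~/model-src")
        await download("transfer.example.com", "AUTH_TOKEN", "~/job-output")

    asyncio.run(main())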
@@ -77,7 +77,8 @@ class Jobs(object):
             model=model,
             endpoint=endpoint,
             source_job_uuid=kwargs.get("source_job_uuid"),
-            project_uuid=kwargs.get("project_uuid") or self.trainml.active_project,
+            project_uuid=kwargs.get("project_uuid")
+            or self.trainml.active_project,
         )
         payload = {
             k: v
@@ -102,7 +103,9 @@ class Jobs(object):
         return job
 
     async def remove(self, id, **kwargs):
-        await self.trainml._query(f"/job/{id}", "DELETE", dict(**kwargs, force=True))
+        await self.trainml._query(
+            f"/job/{id}", "DELETE", dict(**kwargs, force=True)
+        )
 
 
 class Job:
@@ -292,42 +295,6 @@ class Job:
         )
         return resp
 
-    async def get_connection_utility_url(self):
-        resp = await self.trainml._query(
-            f"/job/{self._id}/download",
-            "GET",
-            dict(project_uuid=self._project_uuid),
-        )
-        return resp
-
-    def get_connection_details(self):
-        details = dict(
-            entity_type="job",
-            project_uuid=self._job.get("project_uuid"),
-            cidr=self.dict.get("vpn").get("cidr"),
-            ssh_port=(
-                self._job.get("vpn").get("client").get("ssh_port")
-                if self._job.get("vpn").get("client")
-                else None
-            ),
-            model_path=(
-                self._job.get("model").get("source_uri")
-                if self._job.get("model").get("source_type") == "local"
-                else None
-            ),
-            input_path=(
-                self._job.get("data").get("input_uri")
-                if self._job.get("data").get("input_type") == "local"
-                else None
-            ),
-            output_path=(
-                self._job.get("data").get("output_uri")
-                if self._job.get("data").get("output_type") == "local"
-                else None
-            ),
-        )
-        return details
-
     async def open(self):
         if self.type != "notebook":
             raise SpecificationError(
@@ -337,6 +304,7 @@ class Job:
         webbrowser.open(self.notebook_url)
 
     async def connect(self):
+        # Handle notebook/endpoint special cases
         if self.type == "notebook" and self.status not in [
            "new",
            "waiting for data/model download",
@@ -352,9 +320,30 @@ class Job:
             "waiting for data/model download",
         ]:
             return self.url
+
+        # Refresh to get latest job data first, so we can check worker statuses
+        await self.refresh()
+
+        # Check worker statuses - if any worker is uploading, allow connection
+        # This handles the case where job status might be "finished" but workers are still uploading
+        workers = self._job.get("workers", [])
+        has_uploading_workers = any(
+            worker.get("status") == "uploading" for worker in workers
+        ) if workers else False
+
+        # Log worker statuses for debugging
+        if workers:
+            worker_statuses = [
+                f"Worker {i+1}: {worker.get('status')}"
+                for i, worker in enumerate(workers)
+            ]
+            logging.debug(
+                f"Job status: {self.status}, Worker statuses: {', '.join(worker_statuses)}, Has uploading workers: {has_uploading_workers}"
+            )
+
+        # Check for invalid statuses (but allow "finished" if workers are still uploading)
         if self.status in [
             "failed",
-            "finished",
             "canceled",
             "archived",
             "removed",
@@ -364,26 +353,231 @@ class Job:
                 "status",
                 f"You can only connect to active jobs.",
             )
-        if self._job.get("vpn").get("status") == "n/a":
-            logging.info("Local connection not enabled for this job.")
-            return
-        if self.status == "new":
-            await self.wait_for("waiting for data/model download")
-        connection = Connection(
-            self.trainml, entity_type="job", id=self.id, entity=self
-        )
-        await connection.start()
-        return connection.status
 
-    async def disconnect(self):
-        if self._job.get("vpn").get("status") == "n/a":
-            logging.info("Local connection not enabled for this job.")
-            return
-        connection = Connection(
-            self.trainml, entity_type="job", id=self.id, entity=self
-        )
-        await connection.stop()
-        return connection.status
+        # Allow "finished" status if there are workers still uploading
+        # This handles reconnection scenarios where some workers are done but others are still uploading
+        if self.status == "finished":
+            if not has_uploading_workers:
+                raise SpecificationError(
+                    "status",
+                    f"You can only connect to active jobs.",
+                )
+            logging.info(
+                f"Job status is 'finished' but has {sum(1 for w in workers if w.get('status') == 'uploading')} worker(s) still uploading. Allowing connection to download remaining workers."
+            )
+            # If we have uploading workers, fall through to download logic
+
+        # Only allow specific statuses for connect
+        if self.status not in [
+            "waiting for data/model download",
+            "uploading",
+            "running",
+            "finished",  # Allow finished if workers are still uploading
+        ]:
+            if self.status == "new":
+                await self.wait_for("waiting for data/model download")
+            else:
+                raise SpecificationError(
+                    "status",
+                    f"You can only connect to jobs in 'waiting for data/model download', 'uploading', 'running', or 'finished' (with uploading workers) status.",
+                )
+
+        if self.status == "waiting for data/model download":
+            # Upload model and/or data if local
+            model = self._job.get("model", {})
+            data = self._job.get("data", {})
+
+            model_local = model.get("source_type") == "local"
+            data_local = data.get("input_type") == "local"
+
+            if not model_local and not data_local:
+                raise SpecificationError(
+                    "status",
+                    f"Job has no local model or data to upload. Model source_type: {model.get('source_type')}, Data input_type: {data.get('input_type')}",
+                )
+
+            upload_tasks = []
+
+            if model_local:
+                model_auth_token = model.get("auth_token")
+                model_hostname = model.get("hostname")
+                model_source_uri = model.get("source_uri")
+
+                if (
+                    not model_auth_token
+                    or not model_hostname
+                    or not model_source_uri
+                ):
+                    raise SpecificationError(
+                        "status",
+                        f"Job model missing required connection properties (auth_token, hostname, source_uri).",
+                    )
+
+                upload_tasks.append(
+                    upload(model_hostname, model_auth_token, model_source_uri)
+                )
+
+            if data_local:
+                data_auth_token = data.get("input_auth_token")
+                data_hostname = data.get("input_hostname")
+                data_input_uri = data.get("input_uri")
+
+                if (
+                    not data_auth_token
+                    or not data_hostname
+                    or not data_input_uri
+                ):
+                    raise SpecificationError(
+                        "status",
+                        f"Job data missing required connection properties (input_auth_token, input_hostname, input_uri).",
+                    )
+
+                upload_tasks.append(
+                    upload(data_hostname, data_auth_token, data_input_uri)
+                )
+
+            # Upload both in parallel if both are local
+            if upload_tasks:
+                await asyncio.gather(*upload_tasks)
+
+        elif self.status in ["uploading", "running", "finished"]:
+            # Download output if local
+            data = self._job.get("data", {})
+
+            if data.get("output_type") != "local":
+                raise SpecificationError(
+                    "status",
+                    f"Job output_type is not 'local', cannot download output.",
+                )
+
+            output_uri = data.get("output_uri")
+            if not output_uri:
+                raise SpecificationError(
+                    "status",
+                    f"Job data missing output_uri for local output download.",
+                )
+
+            # Track which workers we've already started downloading
+            downloading_workers = set()
+            download_tasks = []
+
+            # Poll until all workers are finished
+            while True:
+                # Refresh job to get latest worker statuses
+                await self.refresh()
+
+                # Get fresh workers list
+                workers = self._job.get("workers", [])
+                if not workers:
+                    raise SpecificationError(
+                        "status",
+                        f"Job has no workers.",
+                    )
+
+                # Check if job is in a terminal state AND all workers are finished
+                # Allow "finished" status if workers are still uploading
+                all_workers_finished = all(
+                    worker.get("status") in ["finished", "removed"]
+                    for worker in workers
+                )
+                if self.status in ["canceled", "failed"]:
+                    break
+                if self.status == "finished" and all_workers_finished:
+                    break
+
+                # Check all workers for uploading status
+                for worker in workers:
+                    worker_id = worker.get("job_worker_uuid") or worker.get(
+                        "id"
+                    )
+                    worker_status = worker.get("status")
+
+                    # Start download for any worker that enters uploading status
+                    # This handles both new connections and reconnections where some workers are already uploading
+                    if (
+                        worker_status == "uploading"
+                        and worker_id not in downloading_workers
+                    ):
+                        output_auth_token = worker.get("output_auth_token")
+                        output_hostname = worker.get("output_hostname")
+
+                        if not output_auth_token or not output_hostname:
+                            logging.warning(
+                                f"Worker {worker_id} in uploading status missing output_auth_token or output_hostname, skipping."
+                            )
+                            # Mark as downloading to avoid retrying
+                            downloading_workers.add(worker_id)
+                            continue
+
+                        downloading_workers.add(worker_id)
+                        # Create and start download task (runs in parallel)
+                        logging.info(
+                            f"Starting download for worker {worker_id} from {output_hostname} to {output_uri}"
+                        )
+                        try:
+                            download_task = asyncio.create_task(
+                                download(
+                                    output_hostname,
+                                    output_auth_token,
+                                    output_uri,
+                                )
+                            )
+                            download_tasks.append(download_task)
+                            logging.debug(
+                                f"Download task created for worker {worker_id}, task: {download_task}"
+                            )
+                        except Exception as e:
+                            logging.error(
+                                f"Failed to create download task for worker {worker_id}: {e}",
+                                exc_info=True,
+                            )
+                            raise
+
+                # Check if any download tasks have completed or failed
+                if download_tasks:
+                    completed_tasks = [
+                        task for task in download_tasks if task.done()
+                    ]
+                    for task in completed_tasks:
+                        try:
+                            await task  # This will raise if the task failed
+                            logging.info(
+                                f"Download task completed successfully"
+                            )
+                        except Exception as e:
+                            logging.error(
+                                f"Download task failed: {e}", exc_info=True
+                            )
+                            raise
+                    # Remove completed tasks
+                    download_tasks = [
+                        task for task in download_tasks if not task.done()
+                    ]
+
+                # Check if all workers are finished
+                all_finished = all(
+                    worker.get("status") in ["finished", "removed"]
+                    for worker in workers
+                )
+
+                if all_finished:
+                    break
+
+                # If we have active download tasks, wait a bit for them to make progress
+                # but don't wait the full 30 seconds - check more frequently
+                if download_tasks:
+                    await asyncio.sleep(5)
+                else:
+                    # Wait 30 seconds before next poll if no downloads in progress
+                    await asyncio.sleep(30)
+
+            # Wait for all download tasks to complete
+            if download_tasks:
+                logging.info(
+                    f"Waiting for {len(download_tasks)} download task(s) to complete"
+                )
+                await asyncio.gather(*download_tasks)
+                logging.info("All downloads completed")
 
     async def remove(self, force=False):
         await self.trainml._query(
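
Taken together, Job.connect() now performs the transfers itself instead of standing up a VPN. A minimal driver sketch (the job id is a placeholder; the TrainML client and jobs.get() behave as in prior releases):

    import asyncio

    from trainml.trainml import TrainML

    async def main():
        client = TrainML()
        job = await client.jobs.get("JOB_ID")  # hypothetical job id
        # For a job with local model/data, connect() uploads the local
        # directories while status is "waiting for data/model download";
        # for local output, it polls the workers and downloads from each
        # one as it reaches "uploading".
        await job.connect()

    asyncio.run(main())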
@@ -401,7 +595,8 @@ class Job:
 
     def _get_msg_handler(self, msg_handler):
         worker_numbers = {
-            w.get("job_worker_uuid"): ind + 1 for ind, w in enumerate(self._workers)
+            w.get("job_worker_uuid"): ind + 1
+            for ind, w in enumerate(self._workers)
         }
         worker_numbers["data_worker"] = 0
 
@@ -411,7 +606,9 @@ class Job:
             if msg_handler:
                 msg_handler(data)
             else:
-                timestamp = datetime.fromtimestamp(int(data.get("time")) / 1000)
+                timestamp = datetime.fromtimestamp(
+                    int(data.get("time")) / 1000
+                )
                 if len(self._workers) > 1:
                     print(
                         f"{timestamp.strftime('%m/%d/%Y, %H:%M:%S')}: Worker {data.get('worker_number')} - {data.get('msg').rstrip()}"
@@ -424,7 +621,10 @@ class Job:
         return handler
 
     async def attach(self, msg_handler=None):
-        if self.type == "notebook" and self.status != "waiting for data/model download":
+        if (
+            self.type == "notebook"
+            and self.status != "waiting for data/model download"
+        ):
             raise SpecificationError(
                 "type",
                 "Notebooks cannot be attached to after model download is complete. Use open() instead.",
@@ -441,7 +641,9 @@ class Job:
     async def copy(self, name, **kwargs):
         logging.debug(f"copy request - name: {name} ; kwargs: {kwargs}")
         if self.type != "notebook":
-            raise SpecificationError("job", "Only notebook job types can be copied")
+            raise SpecificationError(
+                "job", "Only notebook job types can be copied"
+            )
 
         job = await self.trainml.jobs.create(
             name,
@@ -502,7 +704,9 @@ class Job:
 
         POLL_INTERVAL_MIN = 5
         POLL_INTERVAL_MAX = 60
-        POLL_INTERVAL = max(min(timeout / 60, POLL_INTERVAL_MAX), POLL_INTERVAL_MIN)
+        POLL_INTERVAL = max(
+            min(timeout / 60, POLL_INTERVAL_MAX), POLL_INTERVAL_MIN
+        )
         retry_count = math.ceil(timeout / POLL_INTERVAL)
         count = 0
         while count < retry_count:
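
Worked example of the clamp above: with the default timeout=300, POLL_INTERVAL = max(min(300 / 60, 60), 5) = 5 seconds and retry_count = ceil(300 / 5) = 60 polls; the interval rises above the 5-second floor only when timeout exceeds 300 seconds, and hits the 60-second cap once timeout reaches 3600 seconds.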
@@ -533,11 +737,7 @@ class Job:
                 or (
                     status
                     == "running" ## this status could be too short for polling could miss it
-                    and self.status
-                    in [
-                        "uploading",
-                        "finished"
-                    ]
+                    and self.status in ["uploading", "finished"]
                 )
             ):
                 return self
trainml/models.py CHANGED
@@ -10,7 +10,7 @@ from .exceptions import (
     SpecificationError,
     TrainMLException,
 )
-from .connections import Connection
+from trainml.utils.transfer import upload, download
 
 
 class Models(object):
@@ -54,7 +54,9 @@ class Models(object):
         return model
 
     async def remove(self, id, **kwargs):
-        await self.trainml._query(f"/model/{id}", "DELETE", dict(**kwargs, force=True))
+        await self.trainml._query(
+            f"/model/{id}", "DELETE", dict(**kwargs, force=True)
+        )
 
 
 class Model:
@@ -65,7 +67,9 @@ class Model:
         self._status = self._model.get("status")
         self._name = self._model.get("name")
         self._size = self._model.get("size") or self._model.get("used_size")
-        self._billed_size = self._model.get("billed_size") or self._model.get("size")
+        self._billed_size = self._model.get("billed_size") or self._model.get(
+            "size"
+        )
         self._project_uuid = self._model.get("project_uuid")
 
     @property
@@ -113,57 +117,45 @@ class Model:
         )
         return resp
 
-    async def get_connection_utility_url(self):
-        resp = await self.trainml._query(
-            f"/model/{self._id}/download",
-            "GET",
-            dict(project_uuid=self._project_uuid),
-        )
-        return resp
-
-    def get_connection_details(self):
-        if self._model.get("vpn"):
-            details = dict(
-                entity_type="model",
-                project_uuid=self._model.get("project_uuid"),
-                cidr=self._model.get("vpn").get("cidr"),
-                ssh_port=self._model.get("vpn").get("client").get("ssh_port"),
-                input_path=(
-                    self._model.get("source_uri")
-                    if self.status in ["new", "downloading"]
-                    else None
-                ),
-                output_path=(
-                    self._model.get("output_uri")
-                    if self.status == "exporting"
-                    else None
-                ),
-            )
-        else:
-            details = dict()
-        logging.debug(f"Connection Details: {details}")
-        return details
-
     async def connect(self):
-        if self.status in ["ready", "failed"]:
-            raise SpecificationError(
-                "status",
-                f"You can only connect to downloading or exporting models.",
-            )
-        if self.status == "new":
-            await self.wait_for("downloading")
-        connection = Connection(
-            self.trainml, entity_type="model", id=self.id, entity=self
-        )
-        await connection.start()
-        return connection.status
+        if self.status not in ["downloading", "exporting"]:
+            if self.status == "new":
+                await self.wait_for("downloading")
+            else:
+                raise SpecificationError(
+                    "status",
+                    f"You can only connect to downloading or exporting models.",
+                )
 
-    async def disconnect(self):
-        connection = Connection(
-            self.trainml, entity_type="model", id=self.id, entity=self
-        )
-        await connection.stop()
-        return connection.status
+        # Refresh to get latest entity data
+        await self.refresh()
+
+        if self.status == "downloading":
+            # Upload task - get auth_token, hostname, and source_uri from model
+            auth_token = self._model.get("auth_token")
+            hostname = self._model.get("hostname")
+            source_uri = self._model.get("source_uri")
+
+            if not auth_token or not hostname or not source_uri:
+                raise SpecificationError(
+                    "status",
+                    f"Model in downloading status missing required connection properties (auth_token, hostname, source_uri).",
+                )
+
+            await upload(hostname, auth_token, source_uri)
+        elif self.status == "exporting":
+            # Download task - get auth_token, hostname, and output_uri from model
+            auth_token = self._model.get("auth_token")
+            hostname = self._model.get("hostname")
+            output_uri = self._model.get("output_uri")
+
+            if not auth_token or not hostname or not output_uri:
+                raise SpecificationError(
+                    "status",
+                    f"Model in exporting status missing required connection properties (auth_token, hostname, output_uri).",
+                )
+
+            await download(hostname, auth_token, output_uri)
 
     async def remove(self, force=False):
         await self.trainml._query(
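
Model.connect() is now a single transfer per state: one upload while "downloading", one download while "exporting". A brief sketch (the model id is a placeholder):

    import asyncio

    from trainml.trainml import TrainML

    async def main():
        client = TrainML()
        model = await client.models.get("MODEL_ID")  # hypothetical id
        # Uploads source_uri if status is "downloading",
        # downloads output_uri if status is "exporting".
        await model.connect()

    asyncio.run(main())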
@@ -202,7 +194,9 @@ class Model:
             if msg_handler:
                 msg_handler(data)
             else:
-                timestamp = datetime.fromtimestamp(int(data.get("time")) / 1000)
+                timestamp = datetime.fromtimestamp(
+                    int(data.get("time")) / 1000
+                )
                 print(
                     f"{timestamp.strftime('%m/%d/%Y, %H:%M:%S')}: {data.get('msg').rstrip()}"
                 )
@@ -231,7 +225,7 @@ class Model:
     async def wait_for(self, status, timeout=300):
         if self.status == status:
             return
-        valid_statuses = ["downloading", "ready", "archived"]
+        valid_statuses = ["downloading", "ready","exporting", "archived"]
         if not status in valid_statuses:
             raise SpecificationError(
                 "status",
@@ -245,7 +239,9 @@ class Model:
         )
         POLL_INTERVAL_MIN = 5
         POLL_INTERVAL_MAX = 60
-        POLL_INTERVAL = max(min(timeout / 60, POLL_INTERVAL_MAX), POLL_INTERVAL_MIN)
+        POLL_INTERVAL = max(
+            min(timeout / 60, POLL_INTERVAL_MAX), POLL_INTERVAL_MIN
+        )
         retry_count = math.ceil(timeout / POLL_INTERVAL)
         count = 0
         while count < retry_count:
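
With "exporting" added to valid_statuses, an export-to-local flow can now be sequenced as await model.wait_for("exporting") followed by await model.connect(), which performs the download (assuming the model has already been placed in the exporting state through the API).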