PyPI - trainml - Versions diffs - 0.5.17__py3-none-any.whl → 1.0.1__py3-none-any.whl - Mend

trainml 0.5.17__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

examples/local_storage.py +0 -2
tests/integration/test_checkpoints_integration.py +4 -3
tests/integration/test_datasets_integration.py +5 -3
tests/integration/test_jobs_integration.py +33 -27
tests/integration/test_models_integration.py +7 -3
tests/integration/test_volumes_integration.py +2 -2
tests/unit/cli/test_cli_checkpoint_unit.py +312 -1
tests/unit/cloudbender/test_nodes_unit.py +112 -0
tests/unit/cloudbender/test_providers_unit.py +96 -0
tests/unit/cloudbender/test_regions_unit.py +106 -0
tests/unit/cloudbender/test_services_unit.py +141 -0
tests/unit/conftest.py +23 -10
tests/unit/projects/test_project_data_connectors_unit.py +39 -0
tests/unit/projects/test_project_datastores_unit.py +37 -0
tests/unit/projects/test_project_members_unit.py +46 -0
tests/unit/projects/test_project_services_unit.py +65 -0
tests/unit/projects/test_projects_unit.py +16 -0
tests/unit/test_auth_unit.py +17 -2
tests/unit/test_checkpoints_unit.py +256 -71
tests/unit/test_datasets_unit.py +218 -68
tests/unit/test_exceptions.py +133 -0
tests/unit/test_gpu_types_unit.py +11 -1
tests/unit/test_jobs_unit.py +1014 -95
tests/unit/test_main_unit.py +20 -0
tests/unit/test_models_unit.py +218 -70
tests/unit/test_trainml_unit.py +627 -3
tests/unit/test_volumes_unit.py +211 -70
tests/unit/utils/__init__.py +1 -0
tests/unit/utils/test_transfer_unit.py +4260 -0
trainml/__init__.py +1 -1
trainml/checkpoints.py +56 -57
trainml/cli/__init__.py +6 -3
trainml/cli/checkpoint.py +18 -57
trainml/cli/dataset.py +17 -57
trainml/cli/job/__init__.py +89 -67
trainml/cli/job/create.py +51 -24
trainml/cli/model.py +14 -56
trainml/cli/volume.py +18 -57
trainml/datasets.py +50 -55
trainml/jobs.py +269 -69
trainml/models.py +51 -55
trainml/trainml.py +159 -114
trainml/utils/__init__.py +1 -0
trainml/utils/auth.py +641 -0
trainml/utils/transfer.py +647 -0
trainml/volumes.py +48 -53
{trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/METADATA +3 -3
{trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/RECORD +52 -46
{trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/LICENSE +0 -0
{trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/WHEEL +0 -0
{trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/entry_points.txt +0 -0
{trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/top_level.txt +0 -0

trainml/__init__.py CHANGED Viewed

@@ -13,5 +13,5 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
-__version__ = "0.5.17"
+__version__ = "1.0.1"
 __all__ = "TrainML"

trainml/checkpoints.py CHANGED Viewed

@@ -10,7 +10,7 @@ from .exceptions import (
     SpecificationError,
     TrainMLException,
 )
-from .connections import Connection
+from trainml.utils.transfer import upload, download
 class Checkpoints(object):
@@ -23,7 +23,9 @@ class Checkpoints(object):
     async def list(self, **kwargs):
         resp = await self.trainml._query(f"/checkpoint", "GET", kwargs)
-        checkpoints = [Checkpoint(self.trainml, **checkpoint) for checkpoint in resp]
+        checkpoints = [
+            Checkpoint(self.trainml, **checkpoint) for checkpoint in resp
+        ]
         return checkpoints
     async def list_public(self, **kwargs):
@@ -68,13 +70,17 @@ class Checkpoint:
     def __init__(self, trainml, **kwargs):
         self.trainml = trainml
         self._checkpoint = kwargs
-        self._id = self._checkpoint.get("id", self._checkpoint.get("checkpoint_uuid"))
+        self._id = self._checkpoint.get(
+            "id", self._checkpoint.get("checkpoint_uuid")
+        )
         self._status = self._checkpoint.get("status")
         self._name = self._checkpoint.get("name")
-        self._size = self._checkpoint.get("size") or self._checkpoint.get("used_size")
-        self._billed_size = self._checkpoint.get("billed_size") or self._checkpoint.get(
-            "size"
+        self._size = self._checkpoint.get("size") or self._checkpoint.get(
+            "used_size"
         )
+        self._billed_size = self._checkpoint.get(
+            "billed_size"
+        ) or self._checkpoint.get("size")
         self._project_uuid = self._checkpoint.get("project_uuid")
     @property
@@ -122,56 +128,45 @@ class Checkpoint:
         )
         return resp
-    async def get_connection_utility_url(self):
-        resp = await self.trainml._query(
-            f"/checkpoint/{self._id}/download",
-            "GET",
-            dict(project_uuid=self._project_uuid),
-        )
-        return resp
-    def get_connection_details(self):
-        if self._checkpoint.get("vpn"):
-            details = dict(
-                entity_type="checkpoint",
-                project_uuid=self._checkpoint.get("project_uuid"),
-                cidr=self._checkpoint.get("vpn").get("cidr"),
-                ssh_port=self._checkpoint.get("vpn").get("client").get("ssh_port"),
-                input_path=(
-                    self._checkpoint.get("source_uri")
-                    if self.status in ["new", "downloading"]
-                    else None
-                ),
-                output_path=(
-                    self._checkpoint.get("output_uri")
-                    if self.status == "exporting"
-                    else None
-                ),
-            )
-        else:
-            details = dict()
-        return details
     async def connect(self):
-        if self.status in ["ready", "failed"]:
-            raise SpecificationError(
-                "status",
-                f"You can only connect to downloading or exporting checkpoints.",
-            )
-        if self.status == "new":
-            await self.wait_for("downloading")
-        connection = Connection(
-            self.trainml, entity_type="checkpoint", id=self.id, entity=self
-        )
-        await connection.start()
-        return connection.status
+        if self.status not in ["downloading", "exporting"]:
+            if self.status == "new":
+                await self.wait_for("downloading")
+            else:
+                raise SpecificationError(
+                    "status",
+                    f"You can only connect to downloading or exporting checkpoints.",
+                )
-    async def disconnect(self):
-        connection = Connection(
-            self.trainml, entity_type="checkpoint", id=self.id, entity=self
-        )
-        await connection.stop()
-        return connection.status
+        # Refresh to get latest entity data
+        await self.refresh()
+        if self.status == "downloading":
+            # Upload task - get auth_token, hostname, and source_uri from checkpoint
+            auth_token = self._checkpoint.get("auth_token")
+            hostname = self._checkpoint.get("hostname")
+            source_uri = self._checkpoint.get("source_uri")
+            if not auth_token or not hostname or not source_uri:
+                raise SpecificationError(
+                    "status",
+                    f"Checkpoint in downloading status missing required connection properties (auth_token, hostname, source_uri).",
+                )
+            await upload(hostname, auth_token, source_uri)
+        elif self.status == "exporting":
+            # Download task - get auth_token, hostname, and output_uri from checkpoint
+            auth_token = self._checkpoint.get("auth_token")
+            hostname = self._checkpoint.get("hostname")
+            output_uri = self._checkpoint.get("output_uri")
+            if not auth_token or not hostname or not output_uri:
+                raise SpecificationError(
+                    "status",
+                    f"Checkpoint in exporting status missing required connection properties (auth_token, hostname, output_uri).",
+                )
+            await download(hostname, auth_token, output_uri)
     async def remove(self, force=False):
         await self.trainml._query(
@@ -210,7 +205,9 @@ class Checkpoint:
                 if msg_handler:
                     msg_handler(data)
                 else:
-                    timestamp = datetime.fromtimestamp(int(data.get("time")) / 1000)
+                    timestamp = datetime.fromtimestamp(
+                        int(data.get("time")) / 1000
+                    )
                     print(
                         f"{timestamp.strftime('%m/%d/%Y, %H:%M:%S')}: {data.get('msg').rstrip()}"
                     )
@@ -239,7 +236,7 @@ class Checkpoint:
     async def wait_for(self, status, timeout=300):
         if self.status == status:
             return
-        valid_statuses = ["downloading", "ready", "archived"]
+        valid_statuses = ["downloading", "ready", "exporting", "archived"]
         if not status in valid_statuses:
             raise SpecificationError(
                 "status",
@@ -254,7 +251,9 @@ class Checkpoint:
             )
         POLL_INTERVAL_MIN = 5
         POLL_INTERVAL_MAX = 60
-        POLL_INTERVAL = max(min(timeout / 60, POLL_INTERVAL_MAX), POLL_INTERVAL_MIN)
+        POLL_INTERVAL = max(
+            min(timeout / 60, POLL_INTERVAL_MAX), POLL_INTERVAL_MIN
+        )
         retry_count = math.ceil(timeout / POLL_INTERVAL)
         count = 0
         while count < retry_count:

trainml/cli/__init__.py CHANGED Viewed

@@ -142,7 +142,9 @@ def configure(config):
         project for project in projects if project.id == active_project_id
     ]
-    active_project_name = active_project[0].name if len(active_project) else "UNSET"
+    active_project_name = (
+        active_project[0].name if len(active_project) else "UNSET"
+    )
     click.echo(f"Current Active Project: {active_project_name}")
@@ -152,11 +154,12 @@ def configure(config):
         show_choices=True,
         default=active_project_name,
     )
-    selected_project = [project for project in projects if project.name == name]
+    selected_project = [
+        project for project in projects if project.name == name
+    ]
     config.trainml.client.set_active_project(selected_project[0].id)
-from trainml.cli.connection import connection
 from trainml.cli.dataset import dataset
 from trainml.cli.model import model
 from trainml.cli.checkpoint import checkpoint

trainml/cli/checkpoint.py CHANGED Viewed

@@ -35,15 +35,7 @@ def attach(config, checkpoint):
     if None is found:
         raise click.UsageError("Cannot find specified checkpoint.")
-    try:
-        config.trainml.run(found.attach())
-        return config.trainml.run(found.disconnect())
-    except:
-        try:
-            config.trainml.run(found.disconnect())
-        except:
-            pass
-        raise
+    config.trainml.run(found.attach())
 @checkpoint.command()
@@ -67,18 +59,10 @@ def connect(config, checkpoint, attach):
     if None is found:
         raise click.UsageError("Cannot find specified checkpoint.")
-    try:
-        if attach:
-            config.trainml.run(found.connect(), found.attach())
-            return config.trainml.run(found.disconnect())
-        else:
-            return config.trainml.run(found.connect())
-    except:
-        try:
-            config.trainml.run(found.disconnect())
-        except:
-            pass
-        raise
+    if attach:
+        config.trainml.run(found.connect(), found.attach())
+    else:
+        config.trainml.run(found.connect())
 @checkpoint.command()
@@ -123,41 +107,15 @@ def create(config, attach, connect, source, name, path):
             )
         )
-        try:
-            if connect and attach:
-                config.trainml.run(checkpoint.attach(), checkpoint.connect())
-                return config.trainml.run(checkpoint.disconnect())
-            elif connect:
-                return config.trainml.run(checkpoint.connect())
-            else:
-                raise click.UsageError(
-                    "Abort!\n"
-                    "No logs to show for local sourced checkpoint without connect."
-                )
-        except:
-            try:
-                config.trainml.run(checkpoint.disconnect())
-            except:
-                pass
-            raise
-@checkpoint.command()
-@click.argument("checkpoint", type=click.STRING)
-@pass_config
-def disconnect(config, checkpoint):
-    """
-    Disconnect and clean-up checkpoint upload.
-    CHECKPOINT may be specified by name or ID, but ID is preferred.
-    """
-    checkpoints = config.trainml.run(config.trainml.client.checkpoints.list())
-    found = search_by_id_name(checkpoint, checkpoints)
-    if None is found:
-        raise click.UsageError("Cannot find specified checkpoint.")
-    return config.trainml.run(found.disconnect())
+        if connect and attach:
+            config.trainml.run(checkpoint.attach(), checkpoint.connect())
+        elif connect:
+            config.trainml.run(checkpoint.connect())
+        else:
+            raise click.UsageError(
+                "Abort!\n"
+                "No logs to show for local sourced checkpoint without connect."
+            )
 @checkpoint.command()
@@ -236,7 +194,10 @@ def remove(config, checkpoint, force):
     found = search_by_id_name(checkpoint, checkpoints)
     if None is found:
         if force:
-            config.trainml.run(found.client.checkpoints.remove(checkpoint))
+            config.trainml.run(
+                config.trainml.client.checkpoints.remove(checkpoint)
+            )
+            return
         else:
             raise click.UsageError("Cannot find specified checkpoint.")

trainml/cli/dataset.py CHANGED Viewed

@@ -35,15 +35,7 @@ def attach(config, dataset):
     if None is found:
         raise click.UsageError("Cannot find specified dataset.")
-    try:
-        config.trainml.run(found.attach())
-        return config.trainml.run(found.disconnect())
-    except:
-        try:
-            config.trainml.run(found.disconnect())
-        except:
-            pass
-        raise
+    config.trainml.run(found.attach())
 @dataset.command()
@@ -67,18 +59,10 @@ def connect(config, dataset, attach):
     if None is found:
         raise click.UsageError("Cannot find specified dataset.")
-    try:
-        if attach:
-            config.trainml.run(found.connect(), found.attach())
-            return config.trainml.run(found.disconnect())
-        else:
-            return config.trainml.run(found.connect())
-    except:
-        try:
-            config.trainml.run(found.disconnect())
-        except:
-            pass
-        raise
+    if attach:
+        config.trainml.run(found.connect(), found.attach())
+    else:
+        config.trainml.run(found.connect())
 @dataset.command()
@@ -123,41 +107,15 @@ def create(config, attach, connect, source, name, path):
             )
         )
-        try:
-            if connect and attach:
-                config.trainml.run(dataset.attach(), dataset.connect())
-                return config.trainml.run(dataset.disconnect())
-            elif connect:
-                return config.trainml.run(dataset.connect())
-            else:
-                raise click.UsageError(
-                    "Abort!\n"
-                    "No logs to show for local sourced dataset without connect."
-                )
-        except:
-            try:
-                config.trainml.run(dataset.disconnect())
-            except:
-                pass
-            raise
-@dataset.command()
-@click.argument("dataset", type=click.STRING)
-@pass_config
-def disconnect(config, dataset):
-    """
-    Disconnect and clean-up dataset upload.
-    DATASET may be specified by name or ID, but ID is preferred.
-    """
-    datasets = config.trainml.run(config.trainml.client.datasets.list())
-    found = search_by_id_name(dataset, datasets)
-    if None is found:
-        raise click.UsageError("Cannot find specified dataset.")
-    return config.trainml.run(found.disconnect())
+        if connect and attach:
+            config.trainml.run(dataset.attach(), dataset.connect())
+        elif connect:
+            config.trainml.run(dataset.connect())
+        else:
+            raise click.UsageError(
+                "Abort!\n"
+                "No logs to show for local sourced dataset without connect."
+            )
 @dataset.command()
@@ -252,7 +210,9 @@ def rename(config, dataset, name):
     DATASET may be specified by name or ID, but ID is preferred.
     """
     try:
-        dataset = config.trainml.run(config.trainml.client.datasets.get(dataset))
+        dataset = config.trainml.run(
+            config.trainml.client.datasets.get(dataset)
+        )
         if dataset is None:
             raise click.UsageError("Cannot find specified dataset.")
     except:

trainml/cli/job/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import asyncio
 import click
 from webbrowser import open as browse
 from trainml.cli import cli, pass_config, search_by_id_name
@@ -25,90 +26,111 @@ def attach(config, job):
     if None is found:
         raise click.UsageError("Cannot find specified job.")
-    try:
-        config.trainml.run(found.attach())
-        return config.trainml.run(found.disconnect())
-    except:
-        try:
-            config.trainml.run(found.disconnect())
-        except:
-            pass
-        raise
+    config.trainml.run(found.attach())
-@job.command()
-@click.option(
-    "--attach/--no-attach",
-    default=True,
-    show_default=True,
-    help="Auto attach to job.",
-)
-@click.argument("job", type=click.STRING)
-@pass_config
-def connect(config, job, attach):
+async def _connect_job(job, attach, config):
     """
-    Connect to job.
-    JOB may be specified by name or ID, but ID is preferred.
+    Async helper function to handle job connection with proper
+    handling of local input/output types and attach task management.
     """
-    jobs = config.trainml.run(config.trainml.client.jobs.list())
-    found = search_by_id_name(job, jobs)
-    if None is found:
-        raise click.UsageError("Cannot find specified job.")
-    if found.type != "notebook":
-        try:
-            if attach:
-                config.trainml.run(found.connect(), found.attach())
-                return config.trainml.run(found.disconnect())
-            else:
-                return config.trainml.run(found.connect())
-        except:
-            try:
-                config.trainml.run(found.disconnect())
-            except:
-                pass
-            raise
-    else:
-        if found.status in [
-            "new",
-            "waiting for data/model download",
-            "waiting for GPUs",
-        ]:
-            try:
-                if attach:
-                    config.trainml.run(found.connect(), found.attach())
-                    config.trainml.run(found.disconnect())
-                    click.echo("Launching...", file=config.stdout)
-                    browse(found.notebook_url)
-                else:
-                    return config.trainml.run(found.connect())
-            except:
-                try:
-                    config.trainml.run(found.disconnect())
-                except:
-                    pass
-                raise
-        elif found.status not in [
+    # Get job properties
+    model = job._job.get("model", {})
+    data = job._job.get("data", {})
+    model_local = model.get("source_type") == "local"
+    data_local = data.get("input_type") == "local"
+    output_local = data.get("output_type") == "local"
+    early_statuses = [
+        "new",
+        "waiting for data/model download",
+        "waiting for GPUs",
+        "waiting for resources",
+    ]
+    # Check if we need to wait for data/model download
+    # Only wait if status is early AND (data or model is local)
+    needs_upload_wait = job.status in early_statuses and (
+        model_local or data_local
+    )
+    if needs_upload_wait:
+        # Wait for job to reach data/model download status
+        await job.wait_for("waiting for data/model download", 3600)
+        await job.refresh()
+    # Start attach task early if requested
+    attach_task = None
+    if attach:
+        attach_task = asyncio.create_task(job.attach())
+    # Run first connect (upload if needed)
+    await job.connect()
+    # For notebook jobs, handle opening
+    if job.type == "notebook":
+        # Refresh to get latest status after connect
+        await job.refresh()
+        if job.status in early_statuses:
+            if attach_task:
+                await attach_task
+            click.echo("Launching...", file=config.stdout)
+            browse(job.notebook_url)
+            return
+        elif job.status not in [
             "starting",
             "running",
             "reinitializing",
             "copying",
         ]:
+            if attach_task:
+                attach_task.cancel()
             raise click.UsageError("Notebook job not running.")
         else:
-            config.trainml.run(found.wait_for("running"))
+            await job.wait_for("running")
+            if attach_task:
+                await attach_task
             click.echo("Launching...", file=config.stdout)
-            browse(found.notebook_url)
+            browse(job.notebook_url)
+            return
+    # For non-notebook jobs, check if we need second connect (download)
+    # Refresh to get latest status after first connect
+    await job.refresh()
+    # Run second connect if output_type is local
+    # (as per user's requirement: "if the output_type is 'local'")
+    if output_local:
+        # Always wait for running status before second connect
+        # (as shown in user's example)
+        await job.wait_for("running", 3600)
+        await job.refresh()
+        # Create second connect task (download)
+        connect_task = asyncio.create_task(job.connect())
+        # Gather both attach and second connect tasks
+        if attach_task:
+            await asyncio.gather(attach_task, connect_task)
+        else:
+            await connect_task
+    elif attach_task:
+        # Just wait for attach if no second connect needed
+        await attach_task
 @job.command()
+@click.option(
+    "--attach/--no-attach",
+    default=True,
+    show_default=True,
+    help="Auto attach to job.",
+)
 @click.argument("job", type=click.STRING)
 @pass_config
-def disconnect(config, job):
+def connect(config, job, attach):
     """
-    Disconnect and clean-up job.
+    Connect to job.
     JOB may be specified by name or ID, but ID is preferred.
     """
@@ -118,7 +140,7 @@ def disconnect(config, job):
     if None is found:
         raise click.UsageError("Cannot find specified job.")
-    return config.trainml.run(found.disconnect())
+    config.trainml.run(_connect_job(found, attach, config))
 @job.command()

trainml 0.5.17__py3-none-any.whl → 1.0.1__py3-none-any.whl

trainml 0.5.17__py3-none-any.whl → 1.0.1__py3-none-any.whl