PyPI - trainml - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

trainml 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

tests/integration/test_checkpoints_integration.py +7 -5
tests/integration/test_datasets_integration.py +4 -5
tests/integration/test_jobs_integration.py +40 -2
tests/integration/test_models_integration.py +8 -10
tests/integration/test_projects_integration.py +2 -6
tests/integration/test_volumes_integration.py +100 -0
tests/unit/cli/cloudbender/test_cli_reservation_unit.py +10 -14
tests/unit/cli/test_cli_project_unit.py +5 -9
tests/unit/cli/test_cli_volume_unit.py +20 -0
tests/unit/cloudbender/test_services_unit.py +161 -0
tests/unit/conftest.py +94 -21
tests/unit/test_projects_unit.py +34 -48
tests/unit/test_volumes_unit.py +447 -0
trainml/__init__.py +1 -1
trainml/cli/__init__.py +3 -6
trainml/cli/cloudbender/__init__.py +1 -1
trainml/cli/cloudbender/service.py +129 -0
trainml/cli/project.py +10 -15
trainml/cli/volume.py +235 -0
trainml/cloudbender/cloudbender.py +2 -2
trainml/cloudbender/services.py +115 -0
trainml/exceptions.py +21 -12
trainml/jobs.py +36 -39
trainml/projects.py +19 -30
trainml/trainml.py +7 -15
trainml/volumes.py +255 -0
{trainml-0.5.4.dist-info → trainml-0.5.6.dist-info}/METADATA +1 -1
{trainml-0.5.4.dist-info → trainml-0.5.6.dist-info}/RECORD +32 -29
tests/integration/test_providers_integration.py +0 -46
tests/unit/test_providers_unit.py +0 -125
trainml/cli/job.py +0 -173
trainml/cli/provider.py +0 -75
trainml/providers.py +0 -63
{trainml-0.5.4.dist-info → trainml-0.5.6.dist-info}/LICENSE +0 -0
{trainml-0.5.4.dist-info → trainml-0.5.6.dist-info}/WHEEL +0 -0
{trainml-0.5.4.dist-info → trainml-0.5.6.dist-info}/entry_points.txt +0 -0
{trainml-0.5.4.dist-info → trainml-0.5.6.dist-info}/top_level.txt +0 -0

trainml/cli/volume.py ADDED Viewed

@@ -0,0 +1,235 @@
+import click
+from trainml.cli import cli, pass_config, search_by_id_name
+def pretty_size(num):
+    """
+    Format a byte count as a human-readable string using 1024-based units.
+    A falsy value (None, 0) is rendered as zero bytes. The result has two
+    decimal places followed by a three-character unit, e.g. "1.50 KiB".
+    Values too large for the biggest unit are still expressed in YiB.
+    """
+    if not num:
+        num = 0.0
+    s = ("  B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")
+    n = 0
+    # Stop at the last unit so an extremely large value cannot index past
+    # the end of the unit tuple.
+    while num > 1023 and n < len(s) - 1:
+        num = num / 1024
+        n += 1
+    return f"{num:.2f} {s[n]}"
+@cli.group()
+@pass_config
+def volume(config):
+    """trainML volume commands."""
+    # Group entry point only: the subcommands below attach themselves with
+    # @volume.command(), so there is nothing to execute here.
+@volume.command()
+@click.argument("volume", type=click.STRING)
+@pass_config
+def attach(config, volume):
+    """
+    Attach to volume and show creation logs.
+    VOLUME may be specified by name or ID, but ID is preferred.
+    """
+    runner = config.trainml
+    # Resolve the user-supplied name/ID against the full volume listing.
+    match = search_by_id_name(volume, runner.run(runner.client.volumes.list()))
+    if match is None:
+        raise click.UsageError("Cannot find specified volume.")
+    try:
+        runner.run(match.attach())
+        return runner.run(match.disconnect())
+    except BaseException:
+        # Any failure or interruption: disconnect on a best-effort basis and
+        # re-raise the original error, ignoring errors from the cleanup itself.
+        try:
+            runner.run(match.disconnect())
+        except BaseException:
+            pass
+        raise
+@volume.command()
+@click.option(
+    "--attach/--no-attach",
+    default=True,
+    show_default=True,
+    help="Auto attach to volume and show creation logs.",
+)
+@click.argument("volume", type=click.STRING)
+@pass_config
+def connect(config, volume, attach):
+    """
+    Connect local source to volume and begin upload.
+    VOLUME may be specified by name or ID, but ID is preferred.
+    """
+    runner = config.trainml
+    # Resolve the user-supplied name/ID against the full volume listing.
+    match = search_by_id_name(volume, runner.run(runner.client.volumes.list()))
+    if match is None:
+        raise click.UsageError("Cannot find specified volume.")
+    try:
+        if not attach:
+            # Connect only; the caller is responsible for disconnecting later.
+            return runner.run(match.connect())
+        # Run the connection and the log attachment together, then clean up.
+        runner.run(match.connect(), match.attach())
+        return runner.run(match.disconnect())
+    except BaseException:
+        # Any failure or interruption: disconnect on a best-effort basis and
+        # re-raise the original error, ignoring errors from the cleanup itself.
+        try:
+            runner.run(match.disconnect())
+        except BaseException:
+            pass
+        raise
+@volume.command()
+@click.option(
+    "--attach/--no-attach",
+    default=True,
+    show_default=True,
+    help="Auto attach to volume and show creation logs.",
+)
+@click.option(
+    "--connect/--no-connect",
+    default=True,
+    show_default=True,
+    help="Auto connect source and start volume creation.",
+)
+@click.option(
+    "--source",
+    "-s",
+    type=click.Choice(["local"], case_sensitive=False),
+    default="local",
+    show_default=True,
+    help="Volume source type.",
+)
+@click.argument("name", type=click.STRING)
+@click.argument("capacity", type=click.INT)
+@click.argument(
+    "path", type=click.Path(exists=True, file_okay=False, resolve_path=True)
+)
+@pass_config
+def create(config, attach, connect, source, name, capacity, path):
+    """
+    Create a volume.
+    A volume with maximum size CAPACITY is created with the specified NAME using a local source at the PATH
+    specified. PATH should be a local directory containing the source data for
+    a local source or a URI for all other source types.
+    """
+    # "local" is the only value the --source option currently accepts.
+    if source == "local":
+        volume = config.trainml.run(
+            config.trainml.client.volumes.create(
+                name=name, source_type="local", source_uri=path, capacity=capacity
+            )
+        )
+        try:
+            if connect and attach:
+                # Upload and log streaming run together; disconnect afterwards.
+                config.trainml.run(volume.attach(), volume.connect())
+                return config.trainml.run(volume.disconnect())
+            elif connect:
+                return config.trainml.run(volume.connect())
+            else:
+                # The volume has already been created at this point; only the
+                # upload/log step is refused.
+                raise click.UsageError(
+                    "Abort!\n"
+                    "No logs to show for local sourced volume without connect."
+                )
+        except BaseException:
+            # Any failure or interruption: disconnect on a best-effort basis
+            # and re-raise the original error, ignoring cleanup errors.
+            try:
+                config.trainml.run(volume.disconnect())
+            except BaseException:
+                pass
+            raise
+@volume.command()
+@click.argument("volume", type=click.STRING)
+@pass_config
+def disconnect(config, volume):
+    """
+    Disconnect and clean-up volume upload.
+    VOLUME may be specified by name or ID, but ID is preferred.
+    """
+    runner = config.trainml
+    # Resolve the user-supplied name/ID against the full volume listing.
+    match = search_by_id_name(volume, runner.run(runner.client.volumes.list()))
+    if match is None:
+        raise click.UsageError("Cannot find specified volume.")
+    return runner.run(match.disconnect())
+@volume.command()
+@pass_config
+def list(config):
+    """List volumes."""
+    # Column layout: right-aligned, each value truncated to fit its column.
+    row_format = "{: >38.36} {: >13.11} {: >40.38} {: >14.12}"
+    data = [
+        ["ID", "STATUS", "NAME", "CAPACITY"],
+        ["-" * 80, "-" * 80, "-" * 80, "-" * 80],
+    ]
+    volumes = config.trainml.run(config.trainml.client.volumes.list())
+    for volume in volumes:
+        data.append(
+            [
+                volume.id,
+                volume.status,
+                volume.name,
+                volume.capacity,
+            ]
+        )
+    for row in data:
+        # A precision in the format spec is only valid for strings, so coerce
+        # every cell; an int capacity or a None field would otherwise raise.
+        click.echo(
+            row_format.format(*(str(cell) for cell in row)),
+            file=config.stdout,
+        )
+@volume.command()
+@click.option(
+    "--force/--no-force",
+    default=False,
+    show_default=True,
+    help="Force removal.",
+)
+@click.argument("volume", type=click.STRING)
+@pass_config
+def remove(config, volume, force):
+    """
+    Remove a volume.
+    VOLUME may be specified by name or ID, but ID is preferred.
+    """
+    volumes = config.trainml.run(config.trainml.client.volumes.list())
+    found = search_by_id_name(volume, volumes)
+    if found is None:
+        if not force:
+            raise click.UsageError("Cannot find specified volume.")
+        # The volume is not in the listing, so there is no object to call
+        # remove() on; a forced removal passes the raw identifier to the
+        # collection-level API instead.
+        # NOTE(review): assumes volumes.remove() accepts the identifier as its
+        # only required argument - confirm against trainml/volumes.py.
+        return config.trainml.run(config.trainml.client.volumes.remove(volume))
+    return config.trainml.run(found.remove(force=force))
+@volume.command()
+@click.argument("volume", type=click.STRING)
+@click.argument("name", type=click.STRING)
+@pass_config
+def rename(config, volume, name):
+    """
+    Renames a volume.
+    VOLUME may be specified by name or ID, but ID is preferred.
+    """
+    try:
+        found = config.trainml.run(config.trainml.client.volumes.get(volume))
+    except Exception as err:
+        # Any lookup failure is reported as "not found", with the underlying
+        # error kept as the cause. KeyboardInterrupt and SystemExit are left
+        # to propagate instead of being turned into a usage error.
+        raise click.UsageError("Cannot find specified volume.") from err
+    if found is None:
+        raise click.UsageError("Cannot find specified volume.")
+    return config.trainml.run(found.rename(name=name))

trainml/cloudbender/cloudbender.py CHANGED Viewed

@@ -3,7 +3,7 @@ from .regions import Regions
 from .nodes import Nodes
 from .devices import Devices
 from .datastores import Datastores
-from .reservations import Reservations
+from .services import Services
 from .device_configs import DeviceConfigs
@@ -15,5 +15,5 @@ class Cloudbender(object):
         self.nodes = Nodes(trainml)
         self.devices = Devices(trainml)
         self.datastores = Datastores(trainml)
-        self.reservations = Reservations(trainml)
+        self.services = Services(trainml)
         self.device_configs = DeviceConfigs(trainml)

trainml/cloudbender/services.py ADDED Viewed

@@ -0,0 +1,115 @@
+import json
+import logging
+class Services(object):
+    """
+    Collection-level access to the services of a provider region.
+    Every method issues one request through the trainml client's _query()
+    using a path of the form /provider/<uuid>/region/<uuid>/service[/<id>].
+    """
+    def __init__(self, trainml):
+        self.trainml = trainml
+    async def get(self, provider_uuid, region_uuid, id, **kwargs):
+        """Fetch one service by id and wrap the response in a Service."""
+        endpoint = f"/provider/{provider_uuid}/region/{region_uuid}/service/{id}"
+        resp = await self.trainml._query(endpoint, "GET", kwargs)
+        return Service(self.trainml, **resp)
+    async def list(self, provider_uuid, region_uuid, **kwargs):
+        """Return every service of the region as a list of Service objects."""
+        endpoint = f"/provider/{provider_uuid}/region/{region_uuid}/service"
+        resp = await self.trainml._query(endpoint, "GET", kwargs)
+        return [Service(self.trainml, **entry) for entry in resp]
+    async def create(
+        self,
+        provider_uuid,
+        region_uuid,
+        name,
+        public,
+        **kwargs,
+    ):
+        """
+        Create a service in the region and return it as a Service.
+        Extra keyword arguments are merged into the request body; entries
+        whose value is None are left out of the body.
+        """
+        logging.info(f"Creating Service {name}")
+        fields = dict(name=name, public=public, **kwargs)
+        payload = {key: value for key, value in fields.items() if value is not None}
+        endpoint = f"/provider/{provider_uuid}/region/{region_uuid}/service"
+        resp = await self.trainml._query(endpoint, "POST", None, payload)
+        service = Service(self.trainml, **resp)
+        logging.info(f"Created Service {name} with id {service.id}")
+        return service
+    async def remove(self, provider_uuid, region_uuid, id, **kwargs):
+        """Delete the service with the given id; nothing is returned."""
+        endpoint = f"/provider/{provider_uuid}/region/{region_uuid}/service/{id}"
+        await self.trainml._query(endpoint, "DELETE", kwargs)
+class Service:
+    """
+    Read-only view of a single service record.
+    The raw keyword arguments are kept in _service and selected fields are
+    exposed as properties. An instance is truthy only when it has an id.
+    """
+    def __init__(self, trainml, **kwargs):
+        self.trainml = trainml
+        self._service = kwargs
+        field = kwargs.get
+        self._id = field("service_id")
+        self._provider_uuid = field("provider_uuid")
+        self._region_uuid = field("region_uuid")
+        self._public = field("public")
+        self._name = field("name")
+        self._hostname = field("hostname")
+    @property
+    def id(self) -> str:
+        return self._id
+    @property
+    def provider_uuid(self) -> str:
+        return self._provider_uuid
+    @property
+    def region_uuid(self) -> str:
+        return self._region_uuid
+    @property
+    def public(self) -> bool:
+        return self._public
+    @property
+    def name(self) -> str:
+        return self._name
+    @property
+    def hostname(self) -> str:
+        return self._hostname
+    def __str__(self):
+        # JSON dump of the raw record exactly as it was received.
+        return json.dumps(dict(self._service))
+    def __repr__(self):
+        return f"Service( trainml , **{self._service!r})"
+    def __bool__(self):
+        return bool(self._id)
+    def _endpoint(self):
+        # Path of this service, built from the ids stored on the instance.
+        return f"/provider/{self._provider_uuid}/region/{self._region_uuid}/service/{self._id}"
+    async def remove(self):
+        """Delete this service; nothing is returned."""
+        await self.trainml._query(self._endpoint(), "DELETE")
+    async def refresh(self):
+        """Re-fetch this service, re-initialise in place, and return self."""
+        resp = await self.trainml._query(self._endpoint(), "GET")
+        self.__init__(self.trainml, **resp)
+        return self

trainml/exceptions.py CHANGED Viewed

@@ -97,14 +97,27 @@ class CheckpointError(TrainMLException):
         return self._status
     def __repr__(self):
-        return "CheckpointError({self.status}, {self.message})".format(
-            self=self
-        )
+        return "CheckpointError({self.status}, {self.message})".format(self=self)
     def __str__(self):
-        return "CheckpointError({self.status}, {self.message})".format(
-            self=self
-        )
+        return "CheckpointError({self.status}, {self.message})".format(self=self)
+class VolumeError(TrainMLException):
+    """
+    Error raised for volume operations, carrying a status and a message.
+    The message is stored in _message and read back through self.message,
+    which is presumably provided by TrainMLException (not visible here).
+    """
+    def __init__(self, status, data, *args):
+        super().__init__(data, *args)
+        self._message = data
+        self._status = status
+    @property
+    def status(self) -> str:
+        return self._status
+    def __repr__(self):
+        return f"VolumeError({self.status}, {self.message})"
+    def __str__(self):
+        return f"VolumeError({self.status}, {self.message})"
 class ConnectionError(TrainMLException):
@@ -130,11 +143,7 @@ class SpecificationError(TrainMLException):
         return self._attribute
     def __repr__(self):
-        return "SpecificationError({self.attribute}, {self.message})".format(
-            self=self
-        )
+        return "SpecificationError({self.attribute}, {self.message})".format(self=self)
     def __str__(self):
-        return "SpecificationError({self.attribute}, {self.message})".format(
-            self=self
-        )
+        return "SpecificationError({self.attribute}, {self.message})".format(self=self)

trainml/jobs.py CHANGED Viewed

@@ -77,8 +77,7 @@ class Jobs(object):
             model=model,
             endpoint=endpoint,
             source_job_uuid=kwargs.get("source_job_uuid"),
-            project_uuid=kwargs.get("project_uuid")
-            or self.trainml.active_project,
+            project_uuid=kwargs.get("project_uuid") or self.trainml.active_project,
         )
         payload = {
             k: v
@@ -103,9 +102,7 @@ class Jobs(object):
         return job
     async def remove(self, id, **kwargs):
-        await self.trainml._query(
-            f"/job/{id}", "DELETE", dict(**kwargs, force=True)
-        )
+        await self.trainml._query(f"/job/{id}", "DELETE", dict(**kwargs, force=True))
 class Job:
@@ -308,18 +305,26 @@ class Job:
             entity_type="job",
             project_uuid=self._job.get("project_uuid"),
             cidr=self.dict.get("vpn").get("cidr"),
-            ssh_port=self._job.get("vpn").get("client").get("ssh_port")
-            if self._job.get("vpn").get("client")
-            else None,
-            model_path=self._job.get("model").get("source_uri")
-            if self._job.get("model").get("source_type") == "local"
-            else None,
-            input_path=self._job.get("data").get("input_uri")
-            if self._job.get("data").get("input_type") == "local"
-            else None,
-            output_path=self._job.get("data").get("output_uri")
-            if self._job.get("data").get("output_type") == "local"
-            else None,
+            ssh_port=(
+                self._job.get("vpn").get("client").get("ssh_port")
+                if self._job.get("vpn").get("client")
+                else None
+            ),
+            model_path=(
+                self._job.get("model").get("source_uri")
+                if self._job.get("model").get("source_type") == "local"
+                else None
+            ),
+            input_path=(
+                self._job.get("data").get("input_uri")
+                if self._job.get("data").get("input_type") == "local"
+                else None
+            ),
+            output_path=(
+                self._job.get("data").get("output_uri")
+                if self._job.get("data").get("output_type") == "local"
+                else None
+            ),
         )
         return details
@@ -396,8 +401,7 @@ class Job:
     def _get_msg_handler(self, msg_handler):
         worker_numbers = {
-            w.get("job_worker_uuid"): ind + 1
-            for ind, w in enumerate(self._workers)
+            w.get("job_worker_uuid"): ind + 1 for ind, w in enumerate(self._workers)
         }
         worker_numbers["data_worker"] = 0
@@ -407,9 +411,7 @@ class Job:
                 if msg_handler:
                     msg_handler(data)
                 else:
-                    timestamp = datetime.fromtimestamp(
-                        int(data.get("time")) / 1000
-                    )
+                    timestamp = datetime.fromtimestamp(int(data.get("time")) / 1000)
                     if len(self._workers) > 1:
                         print(
                             f"{timestamp.strftime('%m/%d/%Y, %H:%M:%S')}: Worker {data.get('worker_number')} - {data.get('msg').rstrip()}"
@@ -422,10 +424,7 @@ class Job:
         return handler
     async def attach(self, msg_handler=None):
-        if (
-            self.type == "notebook"
-            and self.status != "waiting for data/model download"
-        ):
+        if self.type == "notebook" and self.status != "waiting for data/model download":
             raise SpecificationError(
                 "type",
                 "Notebooks cannot be attached to after model download is complete.  Use open() instead.",
@@ -442,9 +441,7 @@ class Job:
     async def copy(self, name, **kwargs):
         logging.debug(f"copy request - name: {name} ; kwargs: {kwargs}")
         if self.type != "notebook":
-            raise SpecificationError(
-                "job", "Only notebook job types can be copied"
-            )
+            raise SpecificationError("job", "Only notebook job types can be copied")
         job = await self.trainml.jobs.create(
             name,
@@ -504,9 +501,7 @@ class Job:
         POLL_INTERVAL_MIN = 5
         POLL_INTERVAL_MAX = 60
-        POLL_INTERVAL = max(
-            min(timeout / 60, POLL_INTERVAL_MAX), POLL_INTERVAL_MIN
-        )
+        POLL_INTERVAL = max(min(timeout / 60, POLL_INTERVAL_MAX), POLL_INTERVAL_MIN)
         retry_count = math.ceil(timeout / POLL_INTERVAL)
         count = 0
         while count < retry_count:
@@ -519,23 +514,25 @@ class Job:
                 raise e
             if (
                 self.status == status
-                or (
-                    self.type == "training"
-                    and status == "finished"
-                    and self.status == "stopped"
-                )
                 or (
                     status
                     in [
                         "waiting for GPUs",
                         "waiting for resources",
                     ]  ## this status could be very short and the polling could miss it
-                    and self.status in ["starting", "provisioning", "running"]
+                    and self.status
+                    not in ["new", "waiting for GPUs", "waiting for resources"]
                 )
                 or (
                     status
                     == "waiting for data/model download"  ## this status could be very short and the polling could miss it
-                    and self.status in ["starting", "provisioning", "running"]
+                    and self.status
+                    not in [
+                        "new",
+                        "waiting for GPUs",
+                        "waiting for resources",
+                        "waiting for data/model download",
+                    ]
                 )
             ):
                 return self

trainml/projects.py CHANGED Viewed

@@ -72,17 +72,17 @@ class ProjectDatastore:
         return bool(self._id)
-class ProjectReservation:
+class ProjectService:
     def __init__(self, trainml, **kwargs):
         self.trainml = trainml
-        self._reservation = kwargs
-        self._id = self._reservation.get("id")
-        self._project_uuid = self._reservation.get("project_uuid")
-        self._name = self._reservation.get("name")
-        self._type = self._reservation.get("type")
-        self._hostname = self._reservation.get("hostname")
-        self._resource = self._reservation.get("resource")
-        self._region_uuid = self._reservation.get("region_uuid")
+        self._service = kwargs
+        self._id = self._service.get("id")
+        self._project_uuid = self._service.get("project_uuid")
+        self._name = self._service.get("name")
+        self._type = self._service.get("type")
+        self._hostname = self._service.get("hostname")
+        self._resource = self._service.get("resource")
+        self._region_uuid = self._service.get("region_uuid")
     @property
     def id(self) -> str:
@@ -113,12 +113,10 @@ class ProjectReservation:
         return self._region_uuid
     def __str__(self):
-        return json.dumps({k: v for k, v in self._reservation.items()})
+        return json.dumps({k: v for k, v in self._service.items()})
     def __repr__(self):
-        return (
-            f"ProjectReservation( trainml , **{self._reservation.__repr__()})"
-        )
+        return f"ProjectService( trainml , **{self._service.__repr__()})"
     def __bool__(self):
         return bool(self._id)
@@ -162,26 +160,17 @@ class Project:
         await self.trainml._query(f"/project/{self._id}", "DELETE")
     async def list_datastores(self):
-        resp = await self.trainml._query(
-            f"/project/{self._id}/datastores", "GET"
-        )
-        datastores = [
-            ProjectDatastore(self.trainml, **datastore) for datastore in resp
-        ]
+        resp = await self.trainml._query(f"/project/{self._id}/datastores", "GET")
+        datastores = [ProjectDatastore(self.trainml, **datastore) for datastore in resp]
         return datastores
-    async def list_reservations(self):
-        resp = await self.trainml._query(
-            f"/project/{self._id}/reservations", "GET"
-        )
-        reservations = [
-            ProjectReservation(self.trainml, **reservation)
-            for reservation in resp
-        ]
-        return reservations
+    async def list_services(self):
+        """Return this project's services as a list of ProjectService objects."""
+        records = await self.trainml._query(f"/project/{self._id}/services", "GET")
+        return [ProjectService(self.trainml, **record) for record in records]
     async def refresh_datastores(self):
         await self.trainml._query(f"/project/{self._id}/datastores", "PATCH")
-    async def refresh_reservations(self):
-        await self.trainml._query(f"/project/{self._id}/reservations", "PATCH")
+    async def refresh_services(self):
+        """Send a PATCH to this project's services endpoint; returns nothing."""
+        endpoint = f"/project/{self._id}/services"
+        await self.trainml._query(endpoint, "PATCH")

trainml 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

trainml 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl