trainml 0.5.17__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. examples/local_storage.py +0 -2
  2. tests/integration/test_checkpoints_integration.py +4 -3
  3. tests/integration/test_datasets_integration.py +5 -3
  4. tests/integration/test_jobs_integration.py +33 -27
  5. tests/integration/test_models_integration.py +7 -3
  6. tests/integration/test_volumes_integration.py +2 -2
  7. tests/unit/cli/test_cli_checkpoint_unit.py +312 -1
  8. tests/unit/cloudbender/test_nodes_unit.py +112 -0
  9. tests/unit/cloudbender/test_providers_unit.py +96 -0
  10. tests/unit/cloudbender/test_regions_unit.py +106 -0
  11. tests/unit/cloudbender/test_services_unit.py +141 -0
  12. tests/unit/conftest.py +23 -10
  13. tests/unit/projects/test_project_data_connectors_unit.py +39 -0
  14. tests/unit/projects/test_project_datastores_unit.py +37 -0
  15. tests/unit/projects/test_project_members_unit.py +46 -0
  16. tests/unit/projects/test_project_services_unit.py +65 -0
  17. tests/unit/projects/test_projects_unit.py +16 -0
  18. tests/unit/test_auth_unit.py +17 -2
  19. tests/unit/test_checkpoints_unit.py +256 -71
  20. tests/unit/test_datasets_unit.py +218 -68
  21. tests/unit/test_exceptions.py +133 -0
  22. tests/unit/test_gpu_types_unit.py +11 -1
  23. tests/unit/test_jobs_unit.py +1014 -95
  24. tests/unit/test_main_unit.py +20 -0
  25. tests/unit/test_models_unit.py +218 -70
  26. tests/unit/test_trainml_unit.py +627 -3
  27. tests/unit/test_volumes_unit.py +211 -70
  28. tests/unit/utils/__init__.py +1 -0
  29. tests/unit/utils/test_transfer_unit.py +4260 -0
  30. trainml/__init__.py +1 -1
  31. trainml/checkpoints.py +56 -57
  32. trainml/cli/__init__.py +6 -3
  33. trainml/cli/checkpoint.py +18 -57
  34. trainml/cli/dataset.py +17 -57
  35. trainml/cli/job/__init__.py +89 -67
  36. trainml/cli/job/create.py +51 -24
  37. trainml/cli/model.py +14 -56
  38. trainml/cli/volume.py +18 -57
  39. trainml/datasets.py +50 -55
  40. trainml/jobs.py +269 -69
  41. trainml/models.py +51 -55
  42. trainml/trainml.py +159 -114
  43. trainml/utils/__init__.py +1 -0
  44. trainml/utils/auth.py +641 -0
  45. trainml/utils/transfer.py +647 -0
  46. trainml/volumes.py +48 -53
  47. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/METADATA +3 -3
  48. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/RECORD +52 -46
  49. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/LICENSE +0 -0
  50. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/WHEEL +0 -0
  51. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/entry_points.txt +0 -0
  52. {trainml-0.5.17.dist-info → trainml-1.0.1.dist-info}/top_level.txt +0 -0
trainml/__init__.py CHANGED
@@ -13,5 +13,5 @@ logging.basicConfig(
13
13
  logger = logging.getLogger(__name__)
14
14
 
15
15
 
16
- __version__ = "0.5.17"
16
+ __version__ = "1.0.1"
17
17
  __all__ = "TrainML"
trainml/checkpoints.py CHANGED
@@ -10,7 +10,7 @@ from .exceptions import (
10
10
  SpecificationError,
11
11
  TrainMLException,
12
12
  )
13
- from .connections import Connection
13
+ from trainml.utils.transfer import upload, download
14
14
 
15
15
 
16
16
  class Checkpoints(object):
@@ -23,7 +23,9 @@ class Checkpoints(object):
23
23
 
24
24
  async def list(self, **kwargs):
25
25
  resp = await self.trainml._query(f"/checkpoint", "GET", kwargs)
26
- checkpoints = [Checkpoint(self.trainml, **checkpoint) for checkpoint in resp]
26
+ checkpoints = [
27
+ Checkpoint(self.trainml, **checkpoint) for checkpoint in resp
28
+ ]
27
29
  return checkpoints
28
30
 
29
31
  async def list_public(self, **kwargs):
@@ -68,13 +70,17 @@ class Checkpoint:
68
70
  def __init__(self, trainml, **kwargs):
69
71
  self.trainml = trainml
70
72
  self._checkpoint = kwargs
71
- self._id = self._checkpoint.get("id", self._checkpoint.get("checkpoint_uuid"))
73
+ self._id = self._checkpoint.get(
74
+ "id", self._checkpoint.get("checkpoint_uuid")
75
+ )
72
76
  self._status = self._checkpoint.get("status")
73
77
  self._name = self._checkpoint.get("name")
74
- self._size = self._checkpoint.get("size") or self._checkpoint.get("used_size")
75
- self._billed_size = self._checkpoint.get("billed_size") or self._checkpoint.get(
76
- "size"
78
+ self._size = self._checkpoint.get("size") or self._checkpoint.get(
79
+ "used_size"
77
80
  )
81
+ self._billed_size = self._checkpoint.get(
82
+ "billed_size"
83
+ ) or self._checkpoint.get("size")
78
84
  self._project_uuid = self._checkpoint.get("project_uuid")
79
85
 
80
86
  @property
@@ -122,56 +128,45 @@ class Checkpoint:
122
128
  )
123
129
  return resp
124
130
 
125
- async def get_connection_utility_url(self):
126
- resp = await self.trainml._query(
127
- f"/checkpoint/{self._id}/download",
128
- "GET",
129
- dict(project_uuid=self._project_uuid),
130
- )
131
- return resp
132
-
133
- def get_connection_details(self):
134
- if self._checkpoint.get("vpn"):
135
- details = dict(
136
- entity_type="checkpoint",
137
- project_uuid=self._checkpoint.get("project_uuid"),
138
- cidr=self._checkpoint.get("vpn").get("cidr"),
139
- ssh_port=self._checkpoint.get("vpn").get("client").get("ssh_port"),
140
- input_path=(
141
- self._checkpoint.get("source_uri")
142
- if self.status in ["new", "downloading"]
143
- else None
144
- ),
145
- output_path=(
146
- self._checkpoint.get("output_uri")
147
- if self.status == "exporting"
148
- else None
149
- ),
150
- )
151
- else:
152
- details = dict()
153
- return details
154
-
155
131
  async def connect(self):
156
- if self.status in ["ready", "failed"]:
157
- raise SpecificationError(
158
- "status",
159
- f"You can only connect to downloading or exporting checkpoints.",
160
- )
161
- if self.status == "new":
162
- await self.wait_for("downloading")
163
- connection = Connection(
164
- self.trainml, entity_type="checkpoint", id=self.id, entity=self
165
- )
166
- await connection.start()
167
- return connection.status
132
+ if self.status not in ["downloading", "exporting"]:
133
+ if self.status == "new":
134
+ await self.wait_for("downloading")
135
+ else:
136
+ raise SpecificationError(
137
+ "status",
138
+ f"You can only connect to downloading or exporting checkpoints.",
139
+ )
168
140
 
169
- async def disconnect(self):
170
- connection = Connection(
171
- self.trainml, entity_type="checkpoint", id=self.id, entity=self
172
- )
173
- await connection.stop()
174
- return connection.status
141
+ # Refresh to get latest entity data
142
+ await self.refresh()
143
+
144
+ if self.status == "downloading":
145
+ # Upload task - get auth_token, hostname, and source_uri from checkpoint
146
+ auth_token = self._checkpoint.get("auth_token")
147
+ hostname = self._checkpoint.get("hostname")
148
+ source_uri = self._checkpoint.get("source_uri")
149
+
150
+ if not auth_token or not hostname or not source_uri:
151
+ raise SpecificationError(
152
+ "status",
153
+ f"Checkpoint in downloading status missing required connection properties (auth_token, hostname, source_uri).",
154
+ )
155
+
156
+ await upload(hostname, auth_token, source_uri)
157
+ elif self.status == "exporting":
158
+ # Download task - get auth_token, hostname, and output_uri from checkpoint
159
+ auth_token = self._checkpoint.get("auth_token")
160
+ hostname = self._checkpoint.get("hostname")
161
+ output_uri = self._checkpoint.get("output_uri")
162
+
163
+ if not auth_token or not hostname or not output_uri:
164
+ raise SpecificationError(
165
+ "status",
166
+ f"Checkpoint in exporting status missing required connection properties (auth_token, hostname, output_uri).",
167
+ )
168
+
169
+ await download(hostname, auth_token, output_uri)
175
170
 
176
171
  async def remove(self, force=False):
177
172
  await self.trainml._query(
@@ -210,7 +205,9 @@ class Checkpoint:
210
205
  if msg_handler:
211
206
  msg_handler(data)
212
207
  else:
213
- timestamp = datetime.fromtimestamp(int(data.get("time")) / 1000)
208
+ timestamp = datetime.fromtimestamp(
209
+ int(data.get("time")) / 1000
210
+ )
214
211
  print(
215
212
  f"{timestamp.strftime('%m/%d/%Y, %H:%M:%S')}: {data.get('msg').rstrip()}"
216
213
  )
@@ -239,7 +236,7 @@ class Checkpoint:
239
236
  async def wait_for(self, status, timeout=300):
240
237
  if self.status == status:
241
238
  return
242
- valid_statuses = ["downloading", "ready", "archived"]
239
+ valid_statuses = ["downloading", "ready", "exporting", "archived"]
243
240
  if not status in valid_statuses:
244
241
  raise SpecificationError(
245
242
  "status",
@@ -254,7 +251,9 @@ class Checkpoint:
254
251
  )
255
252
  POLL_INTERVAL_MIN = 5
256
253
  POLL_INTERVAL_MAX = 60
257
- POLL_INTERVAL = max(min(timeout / 60, POLL_INTERVAL_MAX), POLL_INTERVAL_MIN)
254
+ POLL_INTERVAL = max(
255
+ min(timeout / 60, POLL_INTERVAL_MAX), POLL_INTERVAL_MIN
256
+ )
258
257
  retry_count = math.ceil(timeout / POLL_INTERVAL)
259
258
  count = 0
260
259
  while count < retry_count:
trainml/cli/__init__.py CHANGED
@@ -142,7 +142,9 @@ def configure(config):
142
142
  project for project in projects if project.id == active_project_id
143
143
  ]
144
144
 
145
- active_project_name = active_project[0].name if len(active_project) else "UNSET"
145
+ active_project_name = (
146
+ active_project[0].name if len(active_project) else "UNSET"
147
+ )
146
148
 
147
149
  click.echo(f"Current Active Project: {active_project_name}")
148
150
 
@@ -152,11 +154,12 @@ def configure(config):
152
154
  show_choices=True,
153
155
  default=active_project_name,
154
156
  )
155
- selected_project = [project for project in projects if project.name == name]
157
+ selected_project = [
158
+ project for project in projects if project.name == name
159
+ ]
156
160
  config.trainml.client.set_active_project(selected_project[0].id)
157
161
 
158
162
 
159
- from trainml.cli.connection import connection
160
163
  from trainml.cli.dataset import dataset
161
164
  from trainml.cli.model import model
162
165
  from trainml.cli.checkpoint import checkpoint
trainml/cli/checkpoint.py CHANGED
@@ -35,15 +35,7 @@ def attach(config, checkpoint):
35
35
  if None is found:
36
36
  raise click.UsageError("Cannot find specified checkpoint.")
37
37
 
38
- try:
39
- config.trainml.run(found.attach())
40
- return config.trainml.run(found.disconnect())
41
- except:
42
- try:
43
- config.trainml.run(found.disconnect())
44
- except:
45
- pass
46
- raise
38
+ config.trainml.run(found.attach())
47
39
 
48
40
 
49
41
  @checkpoint.command()
@@ -67,18 +59,10 @@ def connect(config, checkpoint, attach):
67
59
  if None is found:
68
60
  raise click.UsageError("Cannot find specified checkpoint.")
69
61
 
70
- try:
71
- if attach:
72
- config.trainml.run(found.connect(), found.attach())
73
- return config.trainml.run(found.disconnect())
74
- else:
75
- return config.trainml.run(found.connect())
76
- except:
77
- try:
78
- config.trainml.run(found.disconnect())
79
- except:
80
- pass
81
- raise
62
+ if attach:
63
+ config.trainml.run(found.connect(), found.attach())
64
+ else:
65
+ config.trainml.run(found.connect())
82
66
 
83
67
 
84
68
  @checkpoint.command()
@@ -123,41 +107,15 @@ def create(config, attach, connect, source, name, path):
123
107
  )
124
108
  )
125
109
 
126
- try:
127
- if connect and attach:
128
- config.trainml.run(checkpoint.attach(), checkpoint.connect())
129
- return config.trainml.run(checkpoint.disconnect())
130
- elif connect:
131
- return config.trainml.run(checkpoint.connect())
132
- else:
133
- raise click.UsageError(
134
- "Abort!\n"
135
- "No logs to show for local sourced checkpoint without connect."
136
- )
137
- except:
138
- try:
139
- config.trainml.run(checkpoint.disconnect())
140
- except:
141
- pass
142
- raise
143
-
144
-
145
- @checkpoint.command()
146
- @click.argument("checkpoint", type=click.STRING)
147
- @pass_config
148
- def disconnect(config, checkpoint):
149
- """
150
- Disconnect and clean-up checkpoint upload.
151
-
152
- CHECKPOINT may be specified by name or ID, but ID is preferred.
153
- """
154
- checkpoints = config.trainml.run(config.trainml.client.checkpoints.list())
155
-
156
- found = search_by_id_name(checkpoint, checkpoints)
157
- if None is found:
158
- raise click.UsageError("Cannot find specified checkpoint.")
159
-
160
- return config.trainml.run(found.disconnect())
110
+ if connect and attach:
111
+ config.trainml.run(checkpoint.attach(), checkpoint.connect())
112
+ elif connect:
113
+ config.trainml.run(checkpoint.connect())
114
+ else:
115
+ raise click.UsageError(
116
+ "Abort!\n"
117
+ "No logs to show for local sourced checkpoint without connect."
118
+ )
161
119
 
162
120
 
163
121
  @checkpoint.command()
@@ -236,7 +194,10 @@ def remove(config, checkpoint, force):
236
194
  found = search_by_id_name(checkpoint, checkpoints)
237
195
  if None is found:
238
196
  if force:
239
- config.trainml.run(found.client.checkpoints.remove(checkpoint))
197
+ config.trainml.run(
198
+ config.trainml.client.checkpoints.remove(checkpoint)
199
+ )
200
+ return
240
201
  else:
241
202
  raise click.UsageError("Cannot find specified checkpoint.")
242
203
 
trainml/cli/dataset.py CHANGED
@@ -35,15 +35,7 @@ def attach(config, dataset):
35
35
  if None is found:
36
36
  raise click.UsageError("Cannot find specified dataset.")
37
37
 
38
- try:
39
- config.trainml.run(found.attach())
40
- return config.trainml.run(found.disconnect())
41
- except:
42
- try:
43
- config.trainml.run(found.disconnect())
44
- except:
45
- pass
46
- raise
38
+ config.trainml.run(found.attach())
47
39
 
48
40
 
49
41
  @dataset.command()
@@ -67,18 +59,10 @@ def connect(config, dataset, attach):
67
59
  if None is found:
68
60
  raise click.UsageError("Cannot find specified dataset.")
69
61
 
70
- try:
71
- if attach:
72
- config.trainml.run(found.connect(), found.attach())
73
- return config.trainml.run(found.disconnect())
74
- else:
75
- return config.trainml.run(found.connect())
76
- except:
77
- try:
78
- config.trainml.run(found.disconnect())
79
- except:
80
- pass
81
- raise
62
+ if attach:
63
+ config.trainml.run(found.connect(), found.attach())
64
+ else:
65
+ config.trainml.run(found.connect())
82
66
 
83
67
 
84
68
  @dataset.command()
@@ -123,41 +107,15 @@ def create(config, attach, connect, source, name, path):
123
107
  )
124
108
  )
125
109
 
126
- try:
127
- if connect and attach:
128
- config.trainml.run(dataset.attach(), dataset.connect())
129
- return config.trainml.run(dataset.disconnect())
130
- elif connect:
131
- return config.trainml.run(dataset.connect())
132
- else:
133
- raise click.UsageError(
134
- "Abort!\n"
135
- "No logs to show for local sourced dataset without connect."
136
- )
137
- except:
138
- try:
139
- config.trainml.run(dataset.disconnect())
140
- except:
141
- pass
142
- raise
143
-
144
-
145
- @dataset.command()
146
- @click.argument("dataset", type=click.STRING)
147
- @pass_config
148
- def disconnect(config, dataset):
149
- """
150
- Disconnect and clean-up dataset upload.
151
-
152
- DATASET may be specified by name or ID, but ID is preferred.
153
- """
154
- datasets = config.trainml.run(config.trainml.client.datasets.list())
155
-
156
- found = search_by_id_name(dataset, datasets)
157
- if None is found:
158
- raise click.UsageError("Cannot find specified dataset.")
159
-
160
- return config.trainml.run(found.disconnect())
110
+ if connect and attach:
111
+ config.trainml.run(dataset.attach(), dataset.connect())
112
+ elif connect:
113
+ config.trainml.run(dataset.connect())
114
+ else:
115
+ raise click.UsageError(
116
+ "Abort!\n"
117
+ "No logs to show for local sourced dataset without connect."
118
+ )
161
119
 
162
120
 
163
121
  @dataset.command()
@@ -252,7 +210,9 @@ def rename(config, dataset, name):
252
210
  DATASET may be specified by name or ID, but ID is preferred.
253
211
  """
254
212
  try:
255
- dataset = config.trainml.run(config.trainml.client.datasets.get(dataset))
213
+ dataset = config.trainml.run(
214
+ config.trainml.client.datasets.get(dataset)
215
+ )
256
216
  if dataset is None:
257
217
  raise click.UsageError("Cannot find specified dataset.")
258
218
  except:
trainml/cli/job/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
+ import asyncio
1
2
  import click
2
3
  from webbrowser import open as browse
3
4
  from trainml.cli import cli, pass_config, search_by_id_name
@@ -25,90 +26,111 @@ def attach(config, job):
25
26
  if None is found:
26
27
  raise click.UsageError("Cannot find specified job.")
27
28
 
28
- try:
29
- config.trainml.run(found.attach())
30
- return config.trainml.run(found.disconnect())
31
- except:
32
- try:
33
- config.trainml.run(found.disconnect())
34
- except:
35
- pass
36
- raise
29
+ config.trainml.run(found.attach())
37
30
 
38
31
 
39
- @job.command()
40
- @click.option(
41
- "--attach/--no-attach",
42
- default=True,
43
- show_default=True,
44
- help="Auto attach to job.",
45
- )
46
- @click.argument("job", type=click.STRING)
47
- @pass_config
48
- def connect(config, job, attach):
32
+ async def _connect_job(job, attach, config):
49
33
  """
50
- Connect to job.
51
-
52
- JOB may be specified by name or ID, but ID is preferred.
34
+ Async helper function to handle job connection with proper
35
+ handling of local input/output types and attach task management.
53
36
  """
54
- jobs = config.trainml.run(config.trainml.client.jobs.list())
55
-
56
- found = search_by_id_name(job, jobs)
57
- if None is found:
58
- raise click.UsageError("Cannot find specified job.")
59
-
60
- if found.type != "notebook":
61
- try:
62
- if attach:
63
- config.trainml.run(found.connect(), found.attach())
64
- return config.trainml.run(found.disconnect())
65
- else:
66
- return config.trainml.run(found.connect())
67
- except:
68
- try:
69
- config.trainml.run(found.disconnect())
70
- except:
71
- pass
72
- raise
73
- else:
74
- if found.status in [
75
- "new",
76
- "waiting for data/model download",
77
- "waiting for GPUs",
78
- ]:
79
- try:
80
- if attach:
81
- config.trainml.run(found.connect(), found.attach())
82
- config.trainml.run(found.disconnect())
83
- click.echo("Launching...", file=config.stdout)
84
- browse(found.notebook_url)
85
- else:
86
- return config.trainml.run(found.connect())
87
- except:
88
- try:
89
- config.trainml.run(found.disconnect())
90
- except:
91
- pass
92
- raise
93
- elif found.status not in [
37
+ # Get job properties
38
+ model = job._job.get("model", {})
39
+ data = job._job.get("data", {})
40
+ model_local = model.get("source_type") == "local"
41
+ data_local = data.get("input_type") == "local"
42
+ output_local = data.get("output_type") == "local"
43
+ early_statuses = [
44
+ "new",
45
+ "waiting for data/model download",
46
+ "waiting for GPUs",
47
+ "waiting for resources",
48
+ ]
49
+
50
+ # Check if we need to wait for data/model download
51
+ # Only wait if status is early AND (data or model is local)
52
+ needs_upload_wait = job.status in early_statuses and (
53
+ model_local or data_local
54
+ )
55
+
56
+ if needs_upload_wait:
57
+ # Wait for job to reach data/model download status
58
+ await job.wait_for("waiting for data/model download", 3600)
59
+ await job.refresh()
60
+
61
+ # Start attach task early if requested
62
+ attach_task = None
63
+ if attach:
64
+ attach_task = asyncio.create_task(job.attach())
65
+
66
+ # Run first connect (upload if needed)
67
+ await job.connect()
68
+
69
+ # For notebook jobs, handle opening
70
+ if job.type == "notebook":
71
+ # Refresh to get latest status after connect
72
+ await job.refresh()
73
+
74
+ if job.status in early_statuses:
75
+ if attach_task:
76
+ await attach_task
77
+ click.echo("Launching...", file=config.stdout)
78
+ browse(job.notebook_url)
79
+ return
80
+ elif job.status not in [
94
81
  "starting",
95
82
  "running",
96
83
  "reinitializing",
97
84
  "copying",
98
85
  ]:
86
+ if attach_task:
87
+ attach_task.cancel()
99
88
  raise click.UsageError("Notebook job not running.")
100
89
  else:
101
- config.trainml.run(found.wait_for("running"))
90
+ await job.wait_for("running")
91
+ if attach_task:
92
+ await attach_task
102
93
  click.echo("Launching...", file=config.stdout)
103
- browse(found.notebook_url)
94
+ browse(job.notebook_url)
95
+ return
96
+
97
+ # For non-notebook jobs, check if we need second connect (download)
98
+ # Refresh to get latest status after first connect
99
+ await job.refresh()
100
+
101
+ # Run second connect if output_type is local
102
+ # (as per user's requirement: "if the output_type is 'local'")
103
+ if output_local:
104
+ # Always wait for running status before second connect
105
+ # (as shown in user's example)
106
+ await job.wait_for("running", 3600)
107
+ await job.refresh()
108
+
109
+ # Create second connect task (download)
110
+ connect_task = asyncio.create_task(job.connect())
111
+
112
+ # Gather both attach and second connect tasks
113
+ if attach_task:
114
+ await asyncio.gather(attach_task, connect_task)
115
+ else:
116
+ await connect_task
117
+ elif attach_task:
118
+ # Just wait for attach if no second connect needed
119
+ await attach_task
104
120
 
105
121
 
106
122
  @job.command()
123
+ @click.option(
124
+ "--attach/--no-attach",
125
+ default=True,
126
+ show_default=True,
127
+ help="Auto attach to job.",
128
+ )
107
129
  @click.argument("job", type=click.STRING)
108
130
  @pass_config
109
- def disconnect(config, job):
131
+ def connect(config, job, attach):
110
132
  """
111
- Disconnect and clean-up job.
133
+ Connect to job.
112
134
 
113
135
  JOB may be specified by name or ID, but ID is preferred.
114
136
  """
@@ -118,7 +140,7 @@ def disconnect(config, job):
118
140
  if None is found:
119
141
  raise click.UsageError("Cannot find specified job.")
120
142
 
121
- return config.trainml.run(found.disconnect())
143
+ config.trainml.run(_connect_job(found, attach, config))
122
144
 
123
145
 
124
146
  @job.command()