trainml 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
trainml/__init__.py CHANGED
@@ -13,5 +13,5 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 __all__ = "TrainML"
trainml/cli/job/__init__.py CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 import click
 from webbrowser import open as browse
 from trainml.cli import cli, pass_config, search_by_id_name
@@ -28,6 +29,96 @@ def attach(config, job):
     config.trainml.run(found.attach())
 
 
+async def _connect_job(job, attach, config):
+    """
+    Async helper that handles the job connection, including local
+    input/output types and attach task management.
+    """
+    # Get job properties
+    model = job._job.get("model", {})
+    data = job._job.get("data", {})
+    model_local = model.get("source_type") == "local"
+    data_local = data.get("input_type") == "local"
+    output_local = data.get("output_type") == "local"
+    early_statuses = [
+        "new",
+        "waiting for data/model download",
+        "waiting for GPUs",
+        "waiting for resources",
+    ]
+
+    # Wait for the data/model download phase only if the job is still in
+    # an early status AND the data or model source is local
+    needs_upload_wait = job.status in early_statuses and (
+        model_local or data_local
+    )
+
+    if needs_upload_wait:
+        # Wait for the job to reach data/model download status
+        await job.wait_for("waiting for data/model download", 3600)
+        await job.refresh()
+
+    # Start the attach task early if requested
+    attach_task = None
+    if attach:
+        attach_task = asyncio.create_task(job.attach())
+
+    # Run the first connect (upload if needed)
+    await job.connect()
+
+    # For notebook jobs, handle opening the notebook
+    if job.type == "notebook":
+        # Refresh to get the latest status after connect
+        await job.refresh()
+
+        if job.status in early_statuses:
+            if attach_task:
+                await attach_task
+            click.echo("Launching...", file=config.stdout)
+            browse(job.notebook_url)
+            return
+        elif job.status not in [
+            "starting",
+            "running",
+            "reinitializing",
+            "copying",
+        ]:
+            if attach_task:
+                attach_task.cancel()
+            raise click.UsageError("Notebook job not running.")
+        else:
+            await job.wait_for("running")
+            if attach_task:
+                await attach_task
+            click.echo("Launching...", file=config.stdout)
+            browse(job.notebook_url)
+            return
+
+    # For non-notebook jobs, refresh to get the latest status after the
+    # first connect and check whether a second connect (download) is needed
+    await job.refresh()
+
+    # Run a second connect to download results when output_type is "local"
+    if output_local:
+        # Wait for running status before starting the second connect
+        await job.wait_for("running", 3600)
+        await job.refresh()
+
+        # Create the second connect task (download)
+        connect_task = asyncio.create_task(job.connect())
+
+        # Gather both the attach and second connect tasks
+        if attach_task:
+            await asyncio.gather(attach_task, connect_task)
+        else:
+            await connect_task
+    elif attach_task:
+        # Just wait for attach if no second connect is needed
+        await attach_task
+
+
 @job.command()
 @click.option(
     "--attach/--no-attach",
@@ -49,34 +140,7 @@ def connect(config, job, attach):
     if None is found:
         raise click.UsageError("Cannot find specified job.")
 
-    if found.type != "notebook":
-        if attach:
-            config.trainml.run(found.connect(), found.attach())
-        else:
-            config.trainml.run(found.connect())
-    else:
-        if found.status in [
-            "new",
-            "waiting for data/model download",
-            "waiting for GPUs",
-        ]:
-            if attach:
-                config.trainml.run(found.connect(), found.attach())
-                click.echo("Launching...", file=config.stdout)
-                browse(found.notebook_url)
-            else:
-                config.trainml.run(found.connect())
-        elif found.status not in [
-            "starting",
-            "running",
-            "reinitializing",
-            "copying",
-        ]:
-            raise click.UsageError("Notebook job not running.")
-        else:
-            config.trainml.run(found.wait_for("running"))
-            click.echo("Launching...", file=config.stdout)
-            browse(found.notebook_url)
+    config.trainml.run(_connect_job(found, attach, config))
 
 
 @job.command()
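
The old implementation handed `found.connect()` and `found.attach()` directly to `config.trainml.run(...)` and let it run them concurrently; the new `_connect_job` coroutine owns the ordering itself (upload, attach, then an optional download pass) and is passed to `run` as a single awaitable. A minimal sketch of the assumed shape of such a `run` helper (hypothetical, not the SDK's actual implementation):

```python
import asyncio

def run(*coroutines):
    """Drive one or more coroutines to completion on a single event loop."""
    async def _gather():
        return await asyncio.gather(*coroutines)
    return asyncio.run(_gather())

async def connect():
    print("connect")

async def attach():
    print("attach")

run(connect(), attach())  # old pattern: run() decides the concurrency
run(connect())            # new pattern: one coroutine owns the sequencing
```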
trainml/jobs.py CHANGED
@@ -321,10 +321,29 @@ class Job:
         ]:
             return self.url
 
-        # Check for invalid statuses
+        # Refresh to get the latest job data first, so worker statuses can be checked
+        await self.refresh()
+
+        # Check worker statuses - if any worker is uploading, allow the connection.
+        # This handles the case where the job status is already "finished" but
+        # workers are still uploading.
+        workers = self._job.get("workers", [])
+        has_uploading_workers = any(
+            worker.get("status") == "uploading" for worker in workers
+        ) if workers else False
+
+        # Log worker statuses for debugging
+        if workers:
+            worker_statuses = [
+                f"Worker {i+1}: {worker.get('status')}"
+                for i, worker in enumerate(workers)
+            ]
+            logging.debug(
+                f"Job status: {self.status}, Worker statuses: {', '.join(worker_statuses)}, Has uploading workers: {has_uploading_workers}"
+            )
+
+        # Check for invalid statuses (but allow "finished" if workers are still uploading)
         if self.status in [
             "failed",
-            "finished",
             "canceled",
             "archived",
             "removed",
@@ -335,34 +354,34 @@ class Job:
                 f"You can only connect to active jobs.",
             )
 
+        # Allow "finished" status if there are workers still uploading.
+        # This handles reconnection scenarios where some workers are done
+        # but others are still uploading.
+        if self.status == "finished":
+            if not has_uploading_workers:
+                raise SpecificationError(
+                    "status",
+                    f"You can only connect to active jobs.",
+                )
+            logging.info(
+                f"Job status is 'finished' but has {sum(1 for w in workers if w.get('status') == 'uploading')} worker(s) still uploading. Allowing connection to download remaining workers."
+            )
+            # If we have uploading workers, fall through to the download logic
+
         # Only allow specific statuses for connect
         if self.status not in [
             "waiting for data/model download",
             "uploading",
             "running",
+            "finished",  # Allow finished if workers are still uploading
         ]:
             if self.status == "new":
                 await self.wait_for("waiting for data/model download")
             else:
                 raise SpecificationError(
                     "status",
-                    f"You can only connect to jobs in 'waiting for data/model download', 'uploading', or 'running' status.",
+                    f"You can only connect to jobs in 'waiting for data/model download', 'uploading', 'running', or 'finished' (with uploading workers) status.",
                 )
 
-        # Refresh to get latest job data
-        await self.refresh()
-
-        # Re-check status after refresh (status may have changed if attach() is running in parallel)
-        if self.status not in [
-            "waiting for data/model download",
-            "uploading",
-            "running",
-        ]:
-            raise SpecificationError(
-                "status",
-                f"Job status changed to '{self.status}'. You can only connect to jobs in 'waiting for data/model download', 'uploading', or 'running' status.",
-            )
-
         if self.status == "waiting for data/model download":
             # Upload model and/or data if local
             model = self._job.get("model", {})
@@ -421,7 +440,7 @@ class Job:
             if upload_tasks:
                 await asyncio.gather(*upload_tasks)
 
-        elif self.status in ["uploading", "running"]:
+        elif self.status in ["uploading", "running", "finished"]:
             # Download output if local
             data = self._job.get("data", {})
 
@@ -455,8 +474,15 @@ class Job:
                     f"Job has no workers.",
                 )
 
-            # Check if job is finished
-            if self.status in ["finished", "canceled", "failed"]:
+            # Check if the job is in a terminal state AND all workers are
+            # finished; allow "finished" status while workers are still uploading
+            all_workers_finished = all(
+                worker.get("status") in ["finished", "removed"]
+                for worker in workers
+            )
+            if self.status in ["canceled", "failed"]:
+                break
+            if self.status == "finished" and all_workers_finished:
                 break
 
             # Check all workers for uploading status
@@ -467,6 +493,7 @@ class Job:
                 worker_status = worker.get("status")
 
                 # Start download for any worker that enters uploading status
+                # This handles both new connections and reconnections where some workers are already uploading
                 if (
                     worker_status == "uploading"
                     and worker_id not in downloading_workers
@@ -478,6 +505,8 @@ class Job:
                     logging.warning(
                         f"Worker {worker_id} in uploading status missing output_auth_token or output_hostname, skipping."
                     )
+                    # Mark as downloading to avoid retrying
+                    downloading_workers.add(worker_id)
                     continue
 
                 downloading_workers.add(worker_id)
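
Distilled to a pure function, the new gating rule in `Job.connect` is: active statuses always pass, and "finished" passes only while at least one worker still reports "uploading". A small, hypothetical sketch (not the SDK's code) exercising the rule's edge cases:

```python
ACTIVE_STATUSES = {"waiting for data/model download", "uploading", "running"}

def can_connect(job_status, worker_statuses):
    """Mirror of the connect() status gate introduced above."""
    if job_status in ACTIVE_STATUSES:
        return True
    if job_status == "finished":
        # Reconnection case: some workers finished, others still uploading
        return any(status == "uploading" for status in worker_statuses)
    return False

assert can_connect("finished", ["finished", "uploading"])
assert not can_connect("finished", ["finished", "finished"])
assert not can_connect("failed", ["uploading"])
```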
trainml/trainml.py CHANGED
@@ -214,118 +214,129 @@ class TrainML(object):
             "Content-Type": "application/json",
         }
         try:
-            tokens = self.auth.get_tokens()
-        except TrainMLException as e:
-            raise e
-        except Exception:
-            raise TrainMLException(
-                f"Error getting authorization tokens. Verify configured credentials. Error: {traceback.format_exc()}"
-            )
-        async with aiohttp.ClientSession() as session:
-            done = False
-            async with session.ws_connect(
-                f"wss://{self.ws_url}?Authorization={tokens.get('id_token')}",
-                headers=headers,
-                heartbeat=30,
-            ) as ws:
-                asyncio.create_task(
-                    ws.send_json(
-                        dict(
-                            action="getlogs",
-                            data=dict(
-                                type="init",
-                                entity=entity,
-                                id=id,
-                                project_uuid=project_uuid,
-                            ),
-                        )
-                    )
-                )
-                asyncio.create_task(
-                    ws.send_json(
-                        dict(
-                            action="subscribe",
-                            data=dict(
-                                type="logs",
-                                entity=entity,
-                                id=id,
-                                project_uuid=project_uuid,
-                            ),
-                        )
-                    )
-                )
-                async for msg in ws:
-                    if msg.type in (
-                        aiohttp.WSMsgType.CLOSED,
-                        aiohttp.WSMsgType.ERROR,
-                        aiohttp.WSMsgType.CLOSE,
-                    ):
-                        logging.debug(
-                            f"Websocket Received Closed Message. Done? {done}"
-                        )
-                        await ws.close()
-                        break
-                    data = json.loads(msg.data)
-                    if data.get("type") == "end":
-                        done = True
-                        asyncio.create_task(delayed_close(ws))
-                    else:
-                        msg_handler(data)
-                logging.debug(f"Websocket Disconnected. Done? {done}")
-
-            connection_tries = 0
-            while not done:
-                tokens = self.auth.get_tokens()
-                try:
-                    async with session.ws_connect(
-                        f"wss://{self.ws_url}?Authorization={tokens.get('id_token')}",
-                        headers=headers,
-                        heartbeat=30,
-                    ) as ws:
-                        asyncio.create_task(
-                            ws.send_json(
-                                dict(
-                                    action="subscribe",
-                                    data=dict(
-                                        type="logs",
-                                        entity=entity,
-                                        id=id,
-                                        project_uuid=project_uuid,
-                                    ),
-                                )
-                            )
-                        )
-                        async for msg in ws:
-                            if msg.type in (
-                                aiohttp.WSMsgType.CLOSED,
-                                aiohttp.WSMsgType.ERROR,
-                                aiohttp.WSMsgType.CLOSE,
-                            ):
-                                logging.debug(
-                                    f"Websocket Received Closed Message. Done? {done}"
-                                )
-                                await ws.close()
-                                break
-                            data = json.loads(msg.data)
-                            if data.get("type") == "end":
-                                done = True
-                                asyncio.create_task(delayed_close(ws))
-                            else:
-                                msg_handler(data)
-                            connection_tries = 0
-                        logging.debug(f"Websocket Disconnected. Done? {done}")
-                except Exception as e:
-                    connection_tries += 1
-                    logging.debug(
-                        f"Connection error: {traceback.format_exc()}"
-                    )
-                    if connection_tries == 5:
-                        raise ApiError(
-                            500,
-                            {
-                                "message": f"Connection error: {traceback.format_exc()}"
-                            },
-                        )
+            try:
+                tokens = self.auth.get_tokens()
+            except TrainMLException as e:
+                raise e
+            except Exception:
+                raise TrainMLException(
+                    f"Error getting authorization tokens. Verify configured credentials. Error: {traceback.format_exc()}"
+                )
+            async with aiohttp.ClientSession() as session:
+                done = False
+                async with session.ws_connect(
+                    f"wss://{self.ws_url}?Authorization={tokens.get('id_token')}",
+                    headers=headers,
+                    heartbeat=30,
+                ) as ws:
+                    asyncio.create_task(
+                        ws.send_json(
+                            dict(
+                                action="getlogs",
+                                data=dict(
+                                    type="init",
+                                    entity=entity,
+                                    id=id,
+                                    project_uuid=project_uuid,
+                                ),
+                            )
+                        )
+                    )
+                    asyncio.create_task(
+                        ws.send_json(
+                            dict(
+                                action="subscribe",
+                                data=dict(
+                                    type="logs",
+                                    entity=entity,
+                                    id=id,
+                                    project_uuid=project_uuid,
+                                ),
+                            )
+                        )
+                    )
+                    async for msg in ws:
+                        if msg.type in (
+                            aiohttp.WSMsgType.CLOSED,
+                            aiohttp.WSMsgType.ERROR,
+                            aiohttp.WSMsgType.CLOSE,
+                        ):
+                            logging.debug(
+                                f"Websocket Received Closed Message. Done? {done}"
+                            )
+                            await ws.close()
+                            break
+                        data = json.loads(msg.data)
+                        if data.get("type") == "end":
+                            done = True
+                            asyncio.create_task(delayed_close(ws))
+                        else:
+                            msg_handler(data)
+                    logging.debug(f"Websocket Disconnected. Done? {done}")
+
+                connection_tries = 0
+                while not done:
+                    tokens = self.auth.get_tokens()
+                    try:
+                        async with session.ws_connect(
+                            f"wss://{self.ws_url}?Authorization={tokens.get('id_token')}",
+                            headers=headers,
+                            heartbeat=30,
+                        ) as ws:
+                            asyncio.create_task(
+                                ws.send_json(
+                                    dict(
+                                        action="subscribe",
+                                        data=dict(
+                                            type="logs",
+                                            entity=entity,
+                                            id=id,
+                                            project_uuid=project_uuid,
+                                        ),
+                                    )
+                                )
+                            )
+                            async for msg in ws:
+                                if msg.type in (
+                                    aiohttp.WSMsgType.CLOSED,
+                                    aiohttp.WSMsgType.ERROR,
+                                    aiohttp.WSMsgType.CLOSE,
+                                ):
+                                    logging.debug(
+                                        f"Websocket Received Closed Message. Done? {done}"
+                                    )
+                                    await ws.close()
+                                    break
+                                data = json.loads(msg.data)
+                                if data.get("type") == "end":
+                                    done = True
+                                    asyncio.create_task(delayed_close(ws))
+                                else:
+                                    msg_handler(data)
+                                connection_tries = 0
+                            logging.debug(f"Websocket Disconnected. Done? {done}")
+                    except Exception as e:
+                        connection_tries += 1
+                        logging.debug(
+                            f"Connection error: {traceback.format_exc()}"
+                        )
+                        if connection_tries == 5:
+                            raise ApiError(
+                                500,
+                                {
+                                    "message": f"Connection error: {traceback.format_exc()}"
+                                },
+                            )
+        except GeneratorExit:
+            # Handle graceful shutdown - GeneratorExit is raised during
+            # event loop cleanup. Don't re-raise to avoid "coroutine ignored"
+            # warnings.
+            logging.debug("Websocket subscription cancelled during shutdown")
+            return
+        except asyncio.CancelledError:
+            # Re-raise CancelledError to properly propagate task cancellation
+            logging.debug("Websocket subscription task cancelled")
+            raise
 
     def set_active_project(self, project_uuid):
         CONFIG_DIR = os.path.expanduser(
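
The two new `except` clauses distinguish the two shutdown paths an asyncio coroutine can see: `task.cancel()` raises `asyncio.CancelledError`, which must propagate for the cancellation to complete, while the event loop raises `GeneratorExit` into still-suspended coroutines at teardown, where re-raising only produces "coroutine ignored GeneratorExit" noise. A self-contained sketch of the pattern (illustrative only, not the SDK's code):

```python
import asyncio

async def subscription():
    try:
        await asyncio.sleep(3600)  # stand-in for the websocket read loop
    except asyncio.CancelledError:
        print("subscription cancelled")
        raise  # propagate so the awaiter observes the cancellation
    except GeneratorExit:
        return  # swallow during event-loop cleanup; do not re-raise

async def main():
    task = asyncio.create_task(subscription())
    await asyncio.sleep(0.1)
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        print("cancellation propagated cleanly")

asyncio.run(main())
```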
trainml/utils/transfer.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import re
+import math
 import asyncio
 import aiohttp
 import aiofiles
@@ -28,6 +29,8 @@ RETRY_STATUSES = {
 # Additional retries for DNS/connection errors (ClientConnectorError)
 DNS_MAX_RETRIES = 7  # More retries for DNS resolution issues
 DNS_INITIAL_DELAY = 1  # Initial delay in seconds before first DNS retry
+# Ping warmup timeout: calculate retries so the last retry lands this many seconds after the first try
+PING_WARMUP_TIMEOUT = 8 * 60  # 8 minutes in seconds
 
 
 def normalize_endpoint(endpoint):
@@ -56,6 +59,38 @@ def normalize_endpoint(endpoint):
     return endpoint
 
 
+def calculate_ping_retries(timeout_seconds, backoff_base):
+    """
+    Calculate the number of retries needed for the ping warmup to reach the timeout.
+
+    With exponential backoff, n retries give a total wait time of:
+    backoff_base^1 + backoff_base^2 + ... + backoff_base^(n-1)
+        = (backoff_base^n - backoff_base) / (backoff_base - 1)
+
+    For backoff_base = 2, this simplifies to: 2^n - 2
+
+    Args:
+        timeout_seconds: Total timeout in seconds
+        backoff_base: Exponential backoff base
+
+    Returns:
+        Number of retries needed to reach or exceed the timeout
+    """
+    if backoff_base == 2:
+        # Simplified calculation for base 2:
+        # 2^n - 2 >= timeout_seconds
+        # 2^n >= timeout_seconds + 2
+        # n >= log2(timeout_seconds + 2)
+        n = math.ceil(math.log2(timeout_seconds + 2))
+    else:
+        # General case: (backoff_base^n - backoff_base) / (backoff_base - 1) >= timeout_seconds
+        # backoff_base^n >= timeout_seconds * (backoff_base - 1) + backoff_base
+        # n >= log_base(timeout_seconds * (backoff_base - 1) + backoff_base)
+        target = timeout_seconds * (backoff_base - 1) + backoff_base
+        n = math.ceil(math.log(target, backoff_base))
+
+    return max(1, int(n))
+
+
 async def ping_endpoint(
     endpoint, auth_token, max_retries=MAX_RETRIES, retry_backoff=RETRY_BACKOFF
 ):
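
Plugging in the constants above makes the arithmetic concrete: with `retry_backoff = 2` and `PING_WARMUP_TIMEOUT = 480` seconds, the cumulative backoff after n attempts is 2^n - 2, so the function needs the smallest n with 2^n - 2 >= 480, i.e. n >= log2(482) ≈ 8.91, giving n = 9 (510 seconds of total backoff). A quick check:

```python
import math

timeout_seconds = 8 * 60  # PING_WARMUP_TIMEOUT
n = math.ceil(math.log2(timeout_seconds + 2))
assert n == 9
assert 2**n - 2 == 510  # smallest retry count whose total backoff exceeds 480 s
```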
@@ -68,10 +103,13 @@ async def ping_endpoint(
     Creates a fresh TCPConnector for each attempt to force fresh DNS resolution
     and avoid stale DNS cache issues.
 
+    For the ping warmup, retries are calculated dynamically to ensure the last
+    retry occurs PING_WARMUP_TIMEOUT seconds after the first try.
+
     Args:
         endpoint: Server endpoint URL
         auth_token: Authentication token
-        max_retries: Maximum number of retry attempts
+        max_retries: Maximum number of retry attempts (ignored for ping warmup)
         retry_backoff: Exponential backoff base
 
     Raises:
@@ -80,7 +118,18 @@ async def ping_endpoint(
     """
    endpoint = normalize_endpoint(endpoint)
    attempt = 1
-    effective_max_retries = max_retries
+    # Calculate retries for the ping warmup to reach PING_WARMUP_TIMEOUT;
+    # allow max_retries to override when explicitly provided (for testing)
+    if max_retries == MAX_RETRIES:
+        # Use the default: calculate retries for the ping warmup
+        ping_max_retries = calculate_ping_retries(
+            PING_WARMUP_TIMEOUT, retry_backoff
+        )
+        effective_max_retries = ping_max_retries
+    else:
+        # Use the explicitly provided max_retries (for testing)
+        effective_max_retries = max_retries
+        ping_max_retries = max_retries  # For DNS error handling below
 
     while attempt <= effective_max_retries:
         # Create a fresh connector for each attempt to force DNS re-resolution
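
Note that the `max_retries == MAX_RETRIES` comparison detects "caller used the default" by value, so a caller that explicitly passes `max_retries=MAX_RETRIES` is indistinguishable from one that passed nothing and still gets the warmup calculation. If that distinction ever mattered, the usual alternative is a sentinel default; a sketch, not the module's code:

```python
import math

_UNSET = object()  # unique sentinel; no caller-supplied value can collide with it

def ping(endpoint, max_retries=_UNSET, retry_backoff=2):
    if max_retries is _UNSET:
        # No explicit value: derive the retry count from the warmup timeout
        max_retries = math.ceil(math.log2(8 * 60 + 2))
    return max_retries

assert ping("srv") == 9
assert ping("srv", max_retries=5) == 5
```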
@@ -122,8 +171,9 @@ async def ping_endpoint(
             )
         except ClientConnectorError as e:
             # DNS resolution errors need more retries and initial delay
-            if effective_max_retries == max_retries:
-                effective_max_retries = max(max_retries, DNS_MAX_RETRIES)
+            # Use the higher of DNS_MAX_RETRIES or the calculated ping retries
+            if effective_max_retries == ping_max_retries:
+                effective_max_retries = max(ping_max_retries, DNS_MAX_RETRIES)
 
             if attempt < effective_max_retries:
                 # Use initial delay for first retry, then exponential backoff
@@ -419,8 +469,11 @@ async def download(endpoint, auth_token, target_directory, file_name=None):
                 error_text = await response.text()
             except Exception:
                 error_text = f"Unable to read response body (status: {response.status})"
-            raise ConnectionError(
-                f"Failed to get server info (status {response.status}): {error_text}"
+            raise ClientResponseError(
+                request_info=response.request_info,
+                history=response.history,
+                status=response.status,
+                message=error_text,
             )
         return await response.json()
 
@@ -444,7 +497,14 @@ async def download(endpoint, auth_token, target_directory, file_name=None):
                 "Warning: /info endpoint not available, defaulting to TAR stream mode"
             )
         else:
-            # For other errors, re-raise
+            # For other errors, convert ClientResponseError to ConnectionError
+            # to maintain backward compatibility
+            if isinstance(e, ClientResponseError):
+                error_msg = getattr(e, "message", str(e))
+                raise ConnectionError(
+                    f"Failed to get server info (status {e.status}): {error_msg}"
+                )
+            # For ConnectionError, re-raise as-is
             raise
 
     # Download the archive
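
Taken together, these two changes mean the inner request raises `aiohttp.ClientResponseError`, so the caller can branch on `.status` (404 falls back to TAR stream mode), while every other HTTP error is converted back to the `ConnectionError` previous releases raised. A minimal sketch of that round trip (hypothetical handler, not the module's code):

```python
from aiohttp import ClientResponseError

def handle_info_error(e: ClientResponseError) -> str:
    if e.status == 404:
        return "defaulting to TAR stream mode"
    # Preserve the pre-1.0.1 exception type for all other statuses
    raise ConnectionError(
        f"Failed to get server info (status {e.status}): {e.message}"
    )

err = ClientResponseError(request_info=None, history=(), status=404, message="Not Found")
print(handle_info_error(err))  # defaulting to TAR stream mode
```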
{trainml-1.0.0.dist-info → trainml-1.0.1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: trainml
-Version: 1.0.0
+Version: 1.0.1
 Summary: trainML client SDK and command line utilities
 Home-page: https://github.com/trainML/trainml-cli
 Author: trainML
{trainml-1.0.0.dist-info → trainml-1.0.1.dist-info}/RECORD RENAMED
@@ -81,7 +81,7 @@ tests/unit/projects/test_project_services_unit.py,sha256=qGYVcs8CPF7DklPUQS3th8r
 tests/unit/projects/test_projects_unit.py,sha256=baI9-HRRSs4IPd82329zHWRGPYjNlJQ1qG4Hngcu6ds,4409
 tests/unit/utils/__init__.py,sha256=VDy3m4Lfazpey8NMzHWdgcs3T9aaXWuH7pglIjCnZW0,38
 tests/unit/utils/test_transfer_unit.py,sha256=seYUQwQK93kiZjfXEF_3ZN40za3xUTU61oXIyC4na0o,188309
-trainml/__init__.py,sha256=x88iUSASEoNJBZtUWRPzi7pu4ndKVwD073w6FrZztPE,432
+trainml/__init__.py,sha256=Jjeuhm0-YUNAgrdmG7fXrw6VgW19jN5yW0cz3JWwO98,432
 trainml/__main__.py,sha256=JgErYkiskih8Y6oRwowALtR-rwQhAAdqOYWjQraRIPI,59
 trainml/auth.py,sha256=clbx5S5prJ3u62aEESdBXIHF_HmreQ-L1ShxcyWQNDs,26565
 trainml/checkpoints.py,sha256=KEDS3xRZa1h5tsT6ZEsS5aanOfC-S_8yaeX52JubO4E,8924
@@ -90,9 +90,9 @@ trainml/datasets.py,sha256=T3z5iOMfFXYea002iPuVBURl0_Eia5nKLC4bOYmySXI,8690
 trainml/environments.py,sha256=OH4o08zXZ7IJ2CiA1rPnys2Fl45r8qvQHfM2mCBRAIc,1507
 trainml/exceptions.py,sha256=Qdof2fKRvbMiwarX1VSw1XJXXJjY71H4U3v05nE7-7g,5468
 trainml/gpu_types.py,sha256=mm-dwfYc02192bmYPIJmzesndyBcoOdkKYBaYZXOUwU,1901
-trainml/jobs.py,sha256=bpe8R1mm38TcV0goN0YN7ZlnGOzalfumDnJzNMfEFKs,24928
+trainml/jobs.py,sha256=1XzZrAQ13a35VDxG-x2gs2OKkgHdha_LbVrp-dlBQDE,26737
 trainml/models.py,sha256=PmbTNQKFlPqfRbq93UMpmo5za9qbNFmQdx6BKdejLXA,8339
-trainml/trainml.py,sha256=srQ_r_kxsn3mwDWoaIa-Sh5SA_yhjjysGB6-vWJrXJ4,12724
+trainml/trainml.py,sha256=ZQy87waJniunebTSYfrOAlzIDY4bGxw8InPj7BjowtU,13688
 trainml/volumes.py,sha256=4Y8wWMymGWKOopRU3yV3NMyTrO2FrDJzdH7dN3hg8yQ,8533
 trainml/cli/__init__.py,sha256=rNiCPZpAeQSbZwceyQv0zmejzILVz5Km2lCiLw4uqjE,4330
 trainml/cli/checkpoint.py,sha256=feiqjeBn4HedBEQgPfENL8AtDeP76t2n-Sdcx9-A30o,6020
@@ -110,7 +110,7 @@ trainml/cli/cloudbender/node.py,sha256=iN_WaPCxOhtgDtnSsIFAEMGADG4MKiLjWoez6YSYw
 trainml/cli/cloudbender/provider.py,sha256=oFjZWKfFQjNY7OtDu7nUdfv-RTmQc_Huuug963D3BdA,1726
 trainml/cli/cloudbender/region.py,sha256=X6-FYOb-pGpOEazn-NbsYSwa9ergB7FGATFkTe4a8Pk,2892
 trainml/cli/cloudbender/service.py,sha256=Wh6ycEuECiKL7qpFhc4IyO1rR5lvLtIHk3S475_R6pk,3147
-trainml/cli/job/__init__.py,sha256=_yfyKEkwH6756usqSbmoGOXMv945v0VjQcCAi9fwFzU,5379
+trainml/cli/job/__init__.py,sha256=LwnagzsL517w1dLZ9BqEd-ujAtlQaRfeZYMlXsI5qYw,7429
 trainml/cli/job/create.py,sha256=DegZYOggCNOSCUzeMTmpSrQRSMlYrUSV4XcElaocf5g,35157
 trainml/cli/project/__init__.py,sha256=HDcJUbKMHhz4Thrvpst5hnywFqzsv0XWmvfKNRi8zuo,1918
 trainml/cli/project/credential.py,sha256=gByXKiYf5sJeNRtuXWcercWv8P2IzO5TjT8Ypp4mCR8,3443
@@ -138,10 +138,10 @@ trainml/projects/secrets.py,sha256=TIvBd3rAvd4lF3pm5qR98UslHjldzlnzn_n9yvpmLgg,2
 trainml/projects/services.py,sha256=rI-uFmojqOTNLbqBeX6gaSwMkI6LKzRuJthQCH0A2h4,2771
 trainml/utils/__init__.py,sha256=YGZaaZGxeZ1Lz8ScoIx_qCDbNHsvmNORJTxC8Bxd_ro,39
 trainml/utils/auth.py,sha256=v28FItj3zj5nrfXBt2GM5Av1y-1Ug5G39ejS5wrWh0U,26573
-trainml/utils/transfer.py,sha256=GA31XbbR-9FOP4rYbm8EMhbEHISlmjSTgHohVP0dU-0,22395
-trainml-1.0.0.dist-info/LICENSE,sha256=s0lpBxhSSUEpMavwde-Vb6K_K7xDCTTvSpNznVqVGR0,1069
-trainml-1.0.0.dist-info/METADATA,sha256=OouGnOpI8T1jqDEHHVGPZjTQXO-SeHpxoQieldQMiWI,7288
-trainml-1.0.0.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
-trainml-1.0.0.dist-info/entry_points.txt,sha256=OzBDm2wXby1bSGF02jTVxzRFZLejnbFiLHXhKdW3Bds,63
-trainml-1.0.0.dist-info/top_level.txt,sha256=Y1kLFRWKUW7RG8BX7cvejHF_yW8wBOaRYF1JQHENY4w,23
-trainml-1.0.0.dist-info/RECORD,,
+trainml/utils/transfer.py,sha256=TcoHQdeRoDRdGj1B6VCjwW43tHHnBpP-zl__5t7pCz4,25118
+trainml-1.0.1.dist-info/LICENSE,sha256=s0lpBxhSSUEpMavwde-Vb6K_K7xDCTTvSpNznVqVGR0,1069
+trainml-1.0.1.dist-info/METADATA,sha256=fGhhOAR13XSyoKTyh3TQYJ6pzYEgg4EoYbM3D3EUr18,7288
+trainml-1.0.1.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+trainml-1.0.1.dist-info/entry_points.txt,sha256=OzBDm2wXby1bSGF02jTVxzRFZLejnbFiLHXhKdW3Bds,63
+trainml-1.0.1.dist-info/top_level.txt,sha256=Y1kLFRWKUW7RG8BX7cvejHF_yW8wBOaRYF1JQHENY4w,23
+trainml-1.0.1.dist-info/RECORD,,