ml-dash 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,20 +2,25 @@
2
2
 
3
3
  import argparse
4
4
  import json
5
- from pathlib import Path
6
- from typing import List, Dict, Any, Optional
7
- from dataclasses import dataclass, field
8
5
  import threading
9
6
  from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Optional
10
10
 
11
11
  from rich.console import Console
12
- from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
12
+ from rich.progress import (
13
+ BarColumn,
14
+ Progress,
15
+ SpinnerColumn,
16
+ TaskProgressColumn,
17
+ TextColumn,
18
+ )
13
19
  from rich.table import Table
14
- from rich.panel import Panel
15
20
 
16
- from ..storage import LocalStorage
17
21
  from ..client import RemoteClient
18
22
  from ..config import Config
23
+ from ..storage import LocalStorage
19
24
 
20
25
  # Initialize rich console
21
26
  console = Console()
@@ -23,1226 +28,1371 @@ console = Console()
23
28
 
24
29
  @dataclass
25
30
  class ExperimentInfo:
26
- """Information about an experiment to upload."""
27
- project: str
28
- experiment: str
29
- path: Path
30
- folder: Optional[str] = None
31
- has_logs: bool = False
32
- has_params: bool = False
33
- metric_names: List[str] = field(default_factory=list)
34
- file_count: int = 0
35
- estimated_size: int = 0 # in bytes
31
+ """Information about an experiment to upload."""
32
+
33
+ project: str
34
+ experiment: str
35
+ path: Path
36
+ prefix: Optional[str] = None
37
+ has_logs: bool = False
38
+ has_params: bool = False
39
+ metric_names: List[str] = field(default_factory=list)
40
+ file_count: int = 0
41
+ estimated_size: int = 0 # in bytes
36
42
 
37
43
 
38
44
  @dataclass
39
45
  class ValidationResult:
40
- """Result of experiment validation."""
41
- is_valid: bool = True
42
- warnings: List[str] = field(default_factory=list)
43
- errors: List[str] = field(default_factory=list)
44
- valid_data: Dict[str, Any] = field(default_factory=dict)
46
+ """Result of experiment validation."""
47
+
48
+ is_valid: bool = True
49
+ warnings: List[str] = field(default_factory=list)
50
+ errors: List[str] = field(default_factory=list)
51
+ valid_data: Dict[str, Any] = field(default_factory=dict)
45
52
 
46
53
 
47
54
  @dataclass
48
55
  class UploadResult:
49
- """Result of uploading an experiment."""
50
- experiment: str
51
- success: bool = False
52
- uploaded: Dict[str, int] = field(default_factory=dict) # {"logs": 100, "metrics": 3}
53
- failed: Dict[str, List[str]] = field(default_factory=dict) # {"files": ["error msg"]}
54
- errors: List[str] = field(default_factory=list)
55
- bytes_uploaded: int = 0 # Total bytes uploaded
56
+ """Result of uploading an experiment."""
57
+
58
+ experiment: str
59
+ success: bool = False
60
+ uploaded: Dict[str, int] = field(default_factory=dict) # {"logs": 100, "metrics": 3}
61
+ failed: Dict[str, List[str]] = field(default_factory=dict) # {"files": ["error msg"]}
62
+ errors: List[str] = field(default_factory=list)
63
+ bytes_uploaded: int = 0 # Total bytes uploaded
56
64
 
57
65
 
58
66
  @dataclass
59
67
  class UploadState:
60
- """Tracks upload state for resume functionality."""
61
- local_path: str
62
- remote_url: str
63
- completed_experiments: List[str] = field(default_factory=list) # ["project/experiment"]
64
- failed_experiments: List[str] = field(default_factory=list)
65
- in_progress_experiment: Optional[str] = None
66
- timestamp: Optional[str] = None
67
-
68
- def to_dict(self) -> Dict[str, Any]:
69
- """Convert to dictionary for JSON serialization."""
70
- return {
71
- "local_path": self.local_path,
72
- "remote_url": self.remote_url,
73
- "completed_experiments": self.completed_experiments,
74
- "failed_experiments": self.failed_experiments,
75
- "in_progress_experiment": self.in_progress_experiment,
76
- "timestamp": self.timestamp,
77
- }
78
-
79
- @classmethod
80
- def from_dict(cls, data: Dict[str, Any]) -> "UploadState":
81
- """Create from dictionary."""
82
- return cls(
83
- local_path=data["local_path"],
84
- remote_url=data["remote_url"],
85
- completed_experiments=data.get("completed_experiments", []),
86
- failed_experiments=data.get("failed_experiments", []),
87
- in_progress_experiment=data.get("in_progress_experiment"),
88
- timestamp=data.get("timestamp"),
89
- )
68
+ """Tracks upload state for resume functionality."""
69
+
70
+ local_path: str
71
+ remote_url: str
72
+ completed_experiments: List[str] = field(
73
+ default_factory=list
74
+ ) # ["project/experiment"]
75
+ failed_experiments: List[str] = field(default_factory=list)
76
+ in_progress_experiment: Optional[str] = None
77
+ timestamp: Optional[str] = None
78
+
79
+ def to_dict(self) -> Dict[str, Any]:
80
+ """Convert to dictionary for JSON serialization."""
81
+ return {
82
+ "local_path": self.local_path,
83
+ "remote_url": self.remote_url,
84
+ "completed_experiments": self.completed_experiments,
85
+ "failed_experiments": self.failed_experiments,
86
+ "in_progress_experiment": self.in_progress_experiment,
87
+ "timestamp": self.timestamp,
88
+ }
89
+
90
+ @classmethod
91
+ def from_dict(cls, data: Dict[str, Any]) -> "UploadState":
92
+ """Create from dictionary."""
93
+ return cls(
94
+ local_path=data["local_path"],
95
+ remote_url=data["remote_url"],
96
+ completed_experiments=data.get("completed_experiments", []),
97
+ failed_experiments=data.get("failed_experiments", []),
98
+ in_progress_experiment=data.get("in_progress_experiment"),
99
+ timestamp=data.get("timestamp"),
100
+ )
90
101
 
91
- def save(self, path: Path):
92
- """Save state to file."""
93
- import datetime
94
- self.timestamp = datetime.datetime.now().isoformat()
95
- with open(path, "w") as f:
96
- json.dump(self.to_dict(), f, indent=2)
97
-
98
- @classmethod
99
- def load(cls, path: Path) -> Optional["UploadState"]:
100
- """Load state from file."""
101
- if not path.exists():
102
- return None
103
- try:
104
- with open(path, "r") as f:
105
- data = json.load(f)
106
- return cls.from_dict(data)
107
- except (json.JSONDecodeError, IOError, KeyError):
108
- return None
102
+ def save(self, path: Path):
103
+ """Save state to file."""
104
+ import datetime
105
+
106
+ self.timestamp = datetime.datetime.now().isoformat()
107
+ with open(path, "w") as f:
108
+ json.dump(self.to_dict(), f, indent=2)
109
+
110
+ @classmethod
111
+ def load(cls, path: Path) -> Optional["UploadState"]:
112
+ """Load state from file."""
113
+ if not path.exists():
114
+ return None
115
+ try:
116
+ with open(path, "r") as f:
117
+ data = json.load(f)
118
+ return cls.from_dict(data)
119
+ except (json.JSONDecodeError, IOError, KeyError):
120
+ return None
109
121
 
110
122
 
111
123
  def add_parser(subparsers) -> argparse.ArgumentParser:
112
- """Add upload command parser."""
113
- parser = subparsers.add_parser(
114
- "upload",
115
- help="Upload local experiments to remote server",
116
- description="Upload locally-stored ML-Dash experiment data to a remote server.",
117
- )
124
+ """Add upload command parser."""
125
+ parser = subparsers.add_parser(
126
+ "upload",
127
+ help="Upload local experiments to remote server",
128
+ description="Upload locally-stored ML-Dash experiment data to a remote server.",
129
+ )
130
+
131
+ # Positional argument
132
+ parser.add_argument(
133
+ "path",
134
+ nargs="?",
135
+ default="./.dash",
136
+ help="Local storage directory to upload from (default: ./.dash)",
137
+ )
138
+
139
+ # Remote configuration
140
+ parser.add_argument(
141
+ "--dash-url",
142
+ type=str,
143
+ help="ML-Dash server URL (defaults to config or https://api.dash.ml)",
144
+ )
145
+ parser.add_argument(
146
+ "--api-key",
147
+ type=str,
148
+ help="JWT token for authentication (optional - auto-loads from 'ml-dash login' if not provided)",
149
+ )
150
+
151
+ """
152
+
153
+ cd .dash/geyang
154
+ cd iclr_2026
155
+
156
+ ml-dash upload -p geyang/new-run * # this uploads all of the folders to geyang/new-run.
157
+
158
+ or
159
+
160
+ ml-dash upload --prefix geyang/new-run/local-results ./* # uploads under the local-results prefix.
161
+
162
+ ml-dash download --prefix geyang/new-run/zehua-results --filter *.mp4 --dryrun --verbose
163
+
164
+ mo-dash list --prefix geyang/new-run/zehua-results --filter xxx-xxx --verbose
165
+
166
+ mo-dash list-exp --prefix geyang/new-run/zehua-results --filter xxx-xxx --verbose
167
+
168
+ """
169
+
170
+ # Scope control
171
+ # Ge: project should be {owner}/{proj_name}
172
+ parser.add_argument(
173
+ "-p",
174
+ "--pref",
175
+ "--prefix",
176
+ "--proj",
177
+ "--project",
178
+ type=str,
179
+ help="Filter experiments by prefix pattern (supports glob: 'tom/*/exp*', 'alice/project-?/baseline')",
180
+ )
181
+
182
+ # Target prefix for server (like scp destination)
183
+ parser.add_argument(
184
+ "-t",
185
+ "--target",
186
+ type=str,
187
+ help="Target prefix/directory on server where experiments will be uploaded (e.g., 'alice/shared-project'). Similar to 'scp local/ remote-path/'",
188
+ )
189
+ # parser.add_argument(
190
+ # "--experiment",
191
+ # type=str,
192
+ # help="Upload only this specific experiment (requires --project)",
193
+ # )
194
+
195
+ # Data filtering
196
+ parser.add_argument(
197
+ "--skip-logs",
198
+ action="store_true",
199
+ help="Don't upload logs",
200
+ )
201
+ parser.add_argument(
202
+ "--skip-metrics",
203
+ action="store_true",
204
+ help="Don't upload metrics",
205
+ )
206
+ parser.add_argument(
207
+ "--skip-files",
208
+ action="store_true",
209
+ help="Don't upload files",
210
+ )
211
+ parser.add_argument(
212
+ "--skip-params",
213
+ action="store_true",
214
+ help="Don't upload parameters",
215
+ )
216
+
217
+ # Behavior control
218
+ parser.add_argument(
219
+ "--dry-run",
220
+ action="store_true",
221
+ help="Show what would be uploaded without uploading",
222
+ )
223
+ parser.add_argument(
224
+ "--strict",
225
+ action="store_true",
226
+ help="Fail on any validation error (default: skip invalid data)",
227
+ )
228
+ parser.add_argument(
229
+ "-v",
230
+ "--verbose",
231
+ action="store_true",
232
+ help="Show detailed progress",
233
+ )
234
+ parser.add_argument(
235
+ "--batch-size",
236
+ type=int,
237
+ default=100,
238
+ help="Batch size for logs/metrics (default: 100)",
239
+ )
240
+ parser.add_argument(
241
+ "--resume",
242
+ action="store_true",
243
+ help="Resume previous interrupted upload",
244
+ )
245
+ parser.add_argument(
246
+ "--state-file",
247
+ type=str,
248
+ default=".dash-upload-state.json",
249
+ help="Path to state file for resume (default: .dash-upload-state.json)",
250
+ )
251
+
252
+ return parser
118
253
 
119
- # Positional argument
120
- parser.add_argument(
121
- "path",
122
- nargs="?",
123
- default="./.ml-dash",
124
- help="Local storage directory to upload from (default: ./.ml-dash)",
125
- )
126
254
 
127
- # Remote configuration
128
- parser.add_argument(
129
- "--remote",
130
- type=str,
131
- help="Remote server URL (required unless set in config)",
132
- )
133
- parser.add_argument(
134
- "--api-key",
135
- type=str,
136
- help="JWT token for authentication (optional - auto-loads from 'ml-dash login' if not provided)",
137
- )
255
+ def discover_experiments(
256
+ local_path: Path,
257
+ project_filter: Optional[str] = None,
258
+ experiment_filter: Optional[str] = None,
259
+ ) -> List[ExperimentInfo]:
260
+ """
261
+ Discover experiments in local storage directory.
262
+
263
+ Supports both flat (local_path/project/experiment) and folder-based
264
+ (local_path/folder/project/experiment) hierarchies.
265
+
266
+ Args:
267
+ local_path: Root path of local storage
268
+ project_filter: Glob pattern to filter experiments by prefix (e.g., "tom/*/exp*")
269
+ experiment_filter: Only discover this experiment (requires project_filter)
270
+
271
+ Returns:
272
+ List of ExperimentInfo objects
273
+ """
274
+ import fnmatch
275
+
276
+ local_path = Path(local_path)
277
+
278
+ if not local_path.exists():
279
+ return []
280
+
281
+ experiments = []
282
+
283
+ # Find all experiment.json files recursively
284
+ for exp_json in local_path.rglob("*/experiment.json"):
285
+ exp_dir = exp_json.parent
286
+
287
+ # Read prefix from experiment.json first
288
+ prefix = None
289
+ try:
290
+ with open(exp_json, "r") as f:
291
+ metadata = json.load(f)
292
+ prefix = metadata.get("prefix")
293
+ except:
294
+ pass
295
+
296
+ # Extract project and experiment names from PREFIX (not path)
297
+ # This handles nested folders correctly
298
+ # Prefix format: owner/project/folder.../experiment
299
+ try:
300
+ relative_path = exp_dir.relative_to(local_path)
301
+ full_relative_path = str(relative_path)
302
+
303
+ if prefix:
304
+ # Parse from prefix for accuracy
305
+ prefix_parts = prefix.strip("/").split("/")
306
+ if len(prefix_parts) < 3:
307
+ continue # Need at least owner/project/experiment
308
+
309
+ # owner = prefix_parts[0]
310
+ project_name = prefix_parts[1]
311
+ exp_name = prefix_parts[-1]
312
+ else:
313
+ # Fallback to path-based parsing (legacy support)
314
+ parts = relative_path.parts
315
+ if len(parts) < 2:
316
+ continue
317
+ exp_name = parts[-1]
318
+ project_name = parts[-2]
319
+
320
+ # Apply filters with glob pattern support
321
+ if project_filter:
322
+ # Support glob pattern matching on the full relative path
323
+ if not fnmatch.fnmatch(full_relative_path, project_filter):
324
+ continue
325
+ if experiment_filter and exp_name != experiment_filter:
326
+ continue
327
+
328
+ # Create experiment info
329
+ exp_info = ExperimentInfo(
330
+ project=project_name,
331
+ experiment=exp_name,
332
+ path=exp_dir,
333
+ prefix=prefix,
334
+ )
335
+ except (ValueError, IndexError):
336
+ continue
337
+
338
+ # Check for parameters
339
+ params_file = exp_dir / "parameters.json"
340
+ exp_info.has_params = params_file.exists()
341
+
342
+ # Check for logs
343
+ logs_file = exp_dir / "logs/logs.jsonl"
344
+ exp_info.has_logs = logs_file.exists()
345
+
346
+ # Check for metrics
347
+ metrics_dir = exp_dir / "metrics"
348
+ if metrics_dir.exists():
349
+ for metric_dir in metrics_dir.iterdir():
350
+ if metric_dir.is_dir():
351
+ data_file = metric_dir / "data.jsonl"
352
+ if data_file.exists():
353
+ exp_info.metric_names.append(metric_dir.name)
354
+
355
+ # Check for files
356
+ files_dir = exp_dir / "files"
357
+ if files_dir.exists():
358
+ try:
359
+ # Count files recursively
360
+ exp_info.file_count = sum(1 for _ in files_dir.rglob("*") if _.is_file())
361
+
362
+ # Estimate size
363
+ exp_info.estimated_size = sum(
364
+ f.stat().st_size for f in files_dir.rglob("*") if f.is_file()
365
+ )
366
+ except (OSError, PermissionError):
367
+ pass
138
368
 
139
- # Scope control
140
- parser.add_argument(
141
- "--project",
142
- type=str,
143
- help="Upload only experiments from this project",
144
- )
145
- parser.add_argument(
146
- "--experiment",
147
- type=str,
148
- help="Upload only this specific experiment (requires --project)",
149
- )
369
+ experiments.append(exp_info)
150
370
 
151
- # Data filtering
152
- parser.add_argument(
153
- "--skip-logs",
154
- action="store_true",
155
- help="Don't upload logs",
156
- )
157
- parser.add_argument(
158
- "--skip-metrics",
159
- action="store_true",
160
- help="Don't upload metrics",
161
- )
162
- parser.add_argument(
163
- "--skip-files",
164
- action="store_true",
165
- help="Don't upload files",
166
- )
167
- parser.add_argument(
168
- "--skip-params",
169
- action="store_true",
170
- help="Don't upload parameters",
171
- )
371
+ return experiments
172
372
 
173
- # Behavior control
174
- parser.add_argument(
175
- "--dry-run",
176
- action="store_true",
177
- help="Show what would be uploaded without uploading",
178
- )
179
- parser.add_argument(
180
- "--strict",
181
- action="store_true",
182
- help="Fail on any validation error (default: skip invalid data)",
183
- )
184
- parser.add_argument(
185
- "-v", "--verbose",
186
- action="store_true",
187
- help="Show detailed progress",
188
- )
189
- parser.add_argument(
190
- "--batch-size",
191
- type=int,
192
- default=100,
193
- help="Batch size for logs/metrics (default: 100)",
194
- )
195
- parser.add_argument(
196
- "--resume",
197
- action="store_true",
198
- help="Resume previous interrupted upload",
199
- )
200
- parser.add_argument(
201
- "--state-file",
202
- type=str,
203
- default=".ml-dash-upload-state.json",
204
- help="Path to state file for resume (default: .ml-dash-upload-state.json)",
205
- )
206
373
 
207
- return parser
374
+ class ExperimentValidator:
375
+ """Validates local experiment data before upload."""
208
376
 
377
+ def __init__(self, strict: bool = False):
378
+ """
379
+ Initialize validator.
209
380
 
210
- def discover_experiments(
211
- local_path: Path,
212
- project_filter: Optional[str] = None,
213
- experiment_filter: Optional[str] = None,
214
- ) -> List[ExperimentInfo]:
381
+ Args:
382
+ strict: If True, fail on any validation error
215
383
  """
216
- Discover experiments in local storage directory.
384
+ self.strict = strict
217
385
 
218
- Supports both flat (local_path/project/experiment) and folder-based
219
- (local_path/folder/project/experiment) hierarchies.
386
+ def validate_experiment(self, exp_info: ExperimentInfo) -> ValidationResult:
387
+ """
388
+ Validate experiment directory structure and data.
220
389
 
221
390
  Args:
222
- local_path: Root path of local storage
223
- project_filter: Only discover experiments in this project
224
- experiment_filter: Only discover this experiment (requires project_filter)
391
+ exp_info: Experiment information
225
392
 
226
393
  Returns:
227
- List of ExperimentInfo objects
394
+ ValidationResult with validation status and messages
228
395
  """
229
- local_path = Path(local_path)
230
-
231
- if not local_path.exists():
232
- return []
233
-
234
- experiments = []
235
-
236
- # Find all experiment.json files recursively
237
- for exp_json in local_path.rglob("*/experiment.json"):
238
- exp_dir = exp_json.parent
239
-
240
- # Extract project and experiment names from path
241
- # Path structure: local_path / [folder] / project / experiment
242
- try:
243
- relative_path = exp_dir.relative_to(local_path)
244
- parts = relative_path.parts
396
+ result = ValidationResult()
397
+ result.valid_data = {}
398
+
399
+ # 1. Validate experiment metadata (required)
400
+ if not self._validate_experiment_metadata(exp_info, result):
401
+ result.is_valid = False
402
+ return result
403
+
404
+ # 2. Validate parameters (optional)
405
+ self._validate_parameters(exp_info, result)
406
+
407
+ # 3. Validate logs (optional)
408
+ self._validate_logs(exp_info, result)
409
+
410
+ # 4. Validate metrics (optional)
411
+ self._validate_metrics(exp_info, result)
412
+
413
+ # 5. Validate files (optional)
414
+ self._validate_files(exp_info, result)
415
+
416
+ # In strict mode, any warning becomes an error
417
+ if self.strict and result.warnings:
418
+ result.errors.extend(result.warnings)
419
+ result.warnings = []
420
+ result.is_valid = False
421
+
422
+ return result
423
+
424
+ def _validate_experiment_metadata(
425
+ self, exp_info: ExperimentInfo, result: ValidationResult
426
+ ) -> bool:
427
+ """Validate experiment.json exists and is valid."""
428
+ exp_json = exp_info.path / "experiment.json"
429
+
430
+ if not exp_json.exists():
431
+ result.errors.append("Missing experiment.json")
432
+ return False
433
+
434
+ try:
435
+ with open(exp_json, "r") as f:
436
+ metadata = json.load(f)
437
+
438
+ # Check required fields
439
+ if "name" not in metadata or "project" not in metadata:
440
+ result.errors.append("experiment.json missing required fields (name, project)")
441
+ return False
442
+
443
+ result.valid_data["metadata"] = metadata
444
+ return True
445
+
446
+ except json.JSONDecodeError as e:
447
+ result.errors.append(f"Invalid JSON in experiment.json: {e}")
448
+ return False
449
+ except IOError as e:
450
+ result.errors.append(f"Cannot read experiment.json: {e}")
451
+ return False
452
+
453
+ def _validate_parameters(self, exp_info: ExperimentInfo, result: ValidationResult):
454
+ """Validate parameters.json format."""
455
+ if not exp_info.has_params:
456
+ return
457
+
458
+ params_file = exp_info.path / "parameters.json"
459
+ try:
460
+ with open(params_file, "r") as f:
461
+ params = json.load(f)
462
+
463
+ # Check if it's a dict
464
+ if not isinstance(params, dict):
465
+ result.warnings.append("parameters.json is not a dict (will skip)")
466
+ return
467
+
468
+ # Check for valid data key if using versioned format
469
+ if "data" in params:
470
+ if not isinstance(params["data"], dict):
471
+ result.warnings.append("parameters.json data is not a dict (will skip)")
472
+ return
473
+ result.valid_data["parameters"] = params["data"]
474
+ else:
475
+ result.valid_data["parameters"] = params
476
+
477
+ except json.JSONDecodeError as e:
478
+ result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
479
+ except IOError as e:
480
+ result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
481
+
482
+ def _validate_logs(self, exp_info: ExperimentInfo, result: ValidationResult):
483
+ """Validate logs.jsonl format."""
484
+ if not exp_info.has_logs:
485
+ return
486
+
487
+ logs_file = exp_info.path / "logs/logs.jsonl"
488
+ invalid_lines = []
489
+
490
+ try:
491
+ with open(logs_file, "r") as f:
492
+ for line_num, line in enumerate(f, start=1):
493
+ try:
494
+ log_entry = json.loads(line)
495
+ # Check required fields
496
+ if "message" not in log_entry:
497
+ invalid_lines.append(line_num)
498
+ except json.JSONDecodeError:
499
+ invalid_lines.append(line_num)
500
+
501
+ if invalid_lines:
502
+ count = len(invalid_lines)
503
+ preview = invalid_lines[:5]
504
+ result.warnings.append(
505
+ f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
506
+ )
245
507
 
246
- if len(parts) < 2:
247
- continue # Need at least project/experiment
508
+ except IOError as e:
509
+ result.warnings.append(f"Cannot read logs.jsonl: {e} (will skip logs)")
248
510
 
249
- # Last two parts are project/experiment
250
- exp_name = parts[-1]
251
- project_name = parts[-2]
511
+ def _validate_metrics(self, exp_info: ExperimentInfo, result: ValidationResult):
512
+ """Validate metrics data."""
513
+ if not exp_info.metric_names:
514
+ return
252
515
 
253
- # Apply filters
254
- if project_filter and project_name != project_filter:
255
- continue
256
- if experiment_filter and exp_name != experiment_filter:
257
- continue
516
+ for metric_name in exp_info.metric_names:
517
+ metric_dir = exp_info.path / "metrics" / metric_name
518
+ data_file = metric_dir / "data.jsonl"
258
519
 
259
- # Read folder from experiment.json
260
- folder = None
520
+ invalid_lines = []
521
+ try:
522
+ with open(data_file, "r") as f:
523
+ for line_num, line in enumerate(f, start=1):
261
524
  try:
262
- with open(exp_json, 'r') as f:
263
- metadata = json.load(f)
264
- folder = metadata.get('folder')
265
- except:
266
- pass
267
-
268
- # Create experiment info
269
- exp_info = ExperimentInfo(
270
- project=project_name,
271
- experiment=exp_name,
272
- path=exp_dir,
273
- folder=folder,
274
- )
275
- except (ValueError, IndexError):
276
- continue
277
-
278
- # Check for parameters
279
- params_file = exp_dir / "parameters.json"
280
- exp_info.has_params = params_file.exists()
281
-
282
- # Check for logs
283
- logs_file = exp_dir / "logs" / "logs.jsonl"
284
- exp_info.has_logs = logs_file.exists()
285
-
286
- # Check for metrics
287
- metrics_dir = exp_dir / "metrics"
288
- if metrics_dir.exists():
289
- for metric_dir in metrics_dir.iterdir():
290
- if metric_dir.is_dir():
291
- data_file = metric_dir / "data.jsonl"
292
- if data_file.exists():
293
- exp_info.metric_names.append(metric_dir.name)
294
-
295
- # Check for files
296
- files_dir = exp_dir / "files"
297
- if files_dir.exists():
298
- try:
299
- # Count files recursively
300
- exp_info.file_count = sum(1 for _ in files_dir.rglob("*") if _.is_file())
301
-
302
- # Estimate size
303
- exp_info.estimated_size = sum(
304
- f.stat().st_size for f in files_dir.rglob("*") if f.is_file()
305
- )
306
- except (OSError, PermissionError):
307
- pass
308
-
309
- experiments.append(exp_info)
525
+ data_point = json.loads(line)
526
+ # Check for data field
527
+ if "data" not in data_point:
528
+ invalid_lines.append(line_num)
529
+ except json.JSONDecodeError:
530
+ invalid_lines.append(line_num)
531
+
532
+ if invalid_lines:
533
+ count = len(invalid_lines)
534
+ preview = invalid_lines[:5]
535
+ result.warnings.append(
536
+ f"metric '{metric_name}' has {count} invalid lines (e.g., {preview}...) - will skip these"
537
+ )
538
+
539
+ except IOError as e:
540
+ result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
541
+
542
+ def _validate_files(self, exp_info: ExperimentInfo, result: ValidationResult):
543
+ """Validate files existence."""
544
+ files_dir = exp_info.path / "files"
545
+ if not files_dir.exists():
546
+ return
547
+
548
+ metadata_file = files_dir / ".files_metadata.json"
549
+ if not metadata_file.exists():
550
+ return
551
+
552
+ try:
553
+ with open(metadata_file, "r") as f:
554
+ files_metadata = json.load(f)
555
+
556
+ missing_files = []
557
+ for file_id, file_info in files_metadata.items():
558
+ if isinstance(file_info, dict) and file_info.get("deletedAt") is None:
559
+ # Check if file exists
560
+ file_path = (
561
+ files_dir
562
+ / file_info.get("prefix", "")
563
+ / file_id
564
+ / file_info.get("filename", "")
565
+ )
566
+ if not file_path.exists():
567
+ missing_files.append(file_info.get("filename", file_id))
568
+
569
+ if missing_files:
570
+ count = len(missing_files)
571
+ preview = missing_files[:3]
572
+ result.warnings.append(
573
+ f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
574
+ )
310
575
 
311
- return experiments
576
+ except (json.JSONDecodeError, IOError):
577
+ pass # If we can't read metadata, just skip file validation
312
578
 
313
579
 
314
- class ExperimentValidator:
315
- """Validates local experiment data before upload."""
580
+ class ExperimentUploader:
581
+ """Handles uploading a single experiment."""
582
+
583
+ def __init__(
584
+ self,
585
+ local_storage: LocalStorage,
586
+ remote_client: RemoteClient,
587
+ batch_size: int = 100,
588
+ skip_logs: bool = False,
589
+ skip_metrics: bool = False,
590
+ skip_files: bool = False,
591
+ skip_params: bool = False,
592
+ verbose: bool = False,
593
+ progress: Optional[Progress] = None,
594
+ max_concurrent_metrics: int = 5,
595
+ target_prefix: Optional[str] = None,
596
+ ):
597
+ """
598
+ Initialize uploader.
316
599
 
317
- def __init__(self, strict: bool = False):
318
- """
319
- Initialize validator.
600
+ Args:
601
+ local_storage: Local storage instance
602
+ remote_client: Remote client instance
603
+ batch_size: Batch size for logs/metrics
604
+ skip_logs: Skip uploading logs
605
+ skip_metrics: Skip uploading metrics
606
+ skip_files: Skip uploading files
607
+ skip_params: Skip uploading parameters
608
+ verbose: Show verbose output
609
+ progress: Optional rich Progress instance for tracking
610
+ max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
611
+ target_prefix: Target prefix on server (overrides local prefix)
612
+ """
613
+ self.local = local_storage
614
+ self.remote = remote_client
615
+ self.batch_size = batch_size
616
+ self.skip_logs = skip_logs
617
+ self.skip_metrics = skip_metrics
618
+ self.skip_files = skip_files
619
+ self.skip_params = skip_params
620
+ self.verbose = verbose
621
+ self.progress = progress
622
+ self.max_concurrent_metrics = max_concurrent_metrics
623
+ self.target_prefix = target_prefix
624
+ # Thread-safe lock for shared state updates
625
+ self._lock = threading.Lock()
626
+ # Thread-local storage for remote clients (for thread-safe HTTP requests)
627
+ self._thread_local = threading.local()
628
+
629
+ def _get_remote_client(self) -> RemoteClient:
630
+ """Get thread-local remote client for safe concurrent access."""
631
+ if not hasattr(self._thread_local, "client"):
632
+ # Create a new client for this thread
633
+ # Use graphql_base_url (without /api) since RemoteClient.__init__ will add /api
634
+ self._thread_local.client = RemoteClient(
635
+ base_url=self.remote.graphql_base_url, api_key=self.remote.api_key
636
+ )
637
+ return self._thread_local.client
638
+
639
+ def upload_experiment(
640
+ self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
641
+ ) -> UploadResult:
642
+ """
643
+ Upload a single experiment with all its data.
320
644
 
321
- Args:
322
- strict: If True, fail on any validation error
323
- """
324
- self.strict = strict
645
+ Args:
646
+ exp_info: Experiment information
647
+ validation_result: Validation results
648
+ task_id: Optional progress task ID
325
649
 
326
- def validate_experiment(self, exp_info: ExperimentInfo) -> ValidationResult:
327
- """
328
- Validate experiment directory structure and data.
650
+ Returns:
651
+ UploadResult with upload status
652
+ """
653
+ result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")
654
+
655
+ # Calculate total steps for progress tracking
656
+ total_steps = 1 # metadata
657
+ if not self.skip_params and "parameters" in validation_result.valid_data:
658
+ total_steps += 1
659
+ if not self.skip_logs and exp_info.has_logs:
660
+ total_steps += 1
661
+ if not self.skip_metrics and exp_info.metric_names:
662
+ total_steps += len(exp_info.metric_names)
663
+ if not self.skip_files and exp_info.file_count > 0:
664
+ total_steps += exp_info.file_count
665
+
666
+ current_step = 0
667
+
668
+ def update_progress(description: str):
669
+ nonlocal current_step
670
+ current_step += 1
671
+ if self.progress and task_id is not None:
672
+ self.progress.update(
673
+ task_id, completed=current_step, total=total_steps, description=description
674
+ )
329
675
 
330
- Args:
331
- exp_info: Experiment information
676
+ try:
677
+ # 1. Create/update experiment metadata
678
+ update_progress("Creating experiment...")
679
+ if self.verbose:
680
+ console.print(" [dim]Creating experiment...[/dim]")
681
+
682
+ exp_data = validation_result.valid_data
683
+
684
+ # Construct full prefix for server
685
+ # If --target is specified, use it as the base destination prefix
686
+ # Otherwise, preserve the local prefix structure
687
+ if self.target_prefix:
688
+ # User specified a target prefix (like scp destination directory)
689
+ # Append experiment name to it: target_prefix/experiment_name
690
+ full_prefix = f"{self.target_prefix.rstrip('/')}/{exp_info.experiment}"
691
+
692
+ # Extract project from target prefix for API call
693
+ # Target format: owner/project/path...
694
+ target_parts = self.target_prefix.strip("/").split("/")
695
+ if len(target_parts) >= 2:
696
+ target_project = target_parts[1]
697
+ else:
698
+ target_project = exp_info.project # Fallback to original
699
+ elif exp_info.prefix:
700
+ # No target specified, preserve local prefix structure
701
+ full_prefix = f"{exp_info.prefix}/{exp_info.experiment}"
702
+ target_project = exp_info.project
703
+ else:
704
+ full_prefix = exp_info.experiment
705
+ target_project = exp_info.project
706
+
707
+ response = self.remote.create_or_update_experiment(
708
+ project=target_project,
709
+ name=exp_info.experiment,
710
+ description=exp_data.get("description"),
711
+ tags=exp_data.get("tags"),
712
+ bindrs=exp_data.get("bindrs"),
713
+ prefix=full_prefix, # Send full prefix (folder + name) or target prefix
714
+ write_protected=exp_data.get("write_protected", False),
715
+ metadata=exp_data.get("metadata"),
716
+ )
717
+
718
+ # Extract experiment ID from nested response
719
+ experiment_id = response.get("experiment", {}).get("id") or response.get("id")
720
+ if self.verbose:
721
+ console.print(f" [green]✓[/green] Created experiment (id: {experiment_id})")
722
+
723
+ # 2. Upload parameters
724
+ if not self.skip_params and "parameters" in validation_result.valid_data:
725
+ update_progress("Uploading parameters...")
726
+ if self.verbose:
727
+ console.print(" [dim]Uploading parameters...[/dim]")
332
728
 
333
- Returns:
334
- ValidationResult with validation status and messages
335
- """
336
- result = ValidationResult()
337
- result.valid_data = {}
729
+ params = validation_result.valid_data["parameters"]
730
+ self.remote.set_parameters(experiment_id, params)
731
+ result.uploaded["params"] = len(params)
732
+ # Track bytes (approximate JSON size)
733
+ result.bytes_uploaded += len(json.dumps(params).encode("utf-8"))
338
734
 
339
- # 1. Validate experiment metadata (required)
340
- if not self._validate_experiment_metadata(exp_info, result):
341
- result.is_valid = False
342
- return result
735
+ if self.verbose:
736
+ console.print(f" [green]✓[/green] Uploaded {len(params)} parameters")
343
737
 
344
- # 2. Validate parameters (optional)
345
- self._validate_parameters(exp_info, result)
738
+ # 3. Upload logs
739
+ if not self.skip_logs and exp_info.has_logs:
740
+ count = self._upload_logs(
741
+ experiment_id, exp_info, result, task_id, update_progress
742
+ )
743
+ result.uploaded["logs"] = count
346
744
 
347
- # 3. Validate logs (optional)
348
- self._validate_logs(exp_info, result)
745
+ # 4. Upload metrics
746
+ if not self.skip_metrics and exp_info.metric_names:
747
+ count = self._upload_metrics(
748
+ experiment_id, exp_info, result, task_id, update_progress
749
+ )
750
+ result.uploaded["metrics"] = count
349
751
 
350
- # 4. Validate metrics (optional)
351
- self._validate_metrics(exp_info, result)
752
+ # 5. Upload files
753
+ if not self.skip_files and exp_info.file_count > 0:
754
+ count = self._upload_files(
755
+ experiment_id, exp_info, result, task_id, update_progress
756
+ )
757
+ result.uploaded["files"] = count
758
+
759
+ result.success = True
760
+
761
+ except Exception as e:
762
+ result.success = False
763
+ result.errors.append(str(e))
764
+ if self.verbose:
765
+ console.print(f" [red]✗ Error: {e}[/red]")
766
+
767
+ return result
768
+
769
+ def _upload_logs(
770
+ self,
771
+ experiment_id: str,
772
+ exp_info: ExperimentInfo,
773
+ result: UploadResult,
774
+ task_id=None,
775
+ update_progress=None,
776
+ ) -> int:
777
+ """Upload logs in batches."""
778
+ if update_progress:
779
+ update_progress("Uploading logs...")
780
+ if self.verbose:
781
+ console.print(" [dim]Uploading logs...[/dim]")
782
+
783
+ logs_file = exp_info.path / "logs/logs.jsonl"
784
+ logs_batch = []
785
+ total_uploaded = 0
786
+ skipped = 0
787
+
788
+ try:
789
+ with open(logs_file, "r") as f:
790
+ for line in f:
791
+ try:
792
+ log_entry = json.loads(line)
793
+
794
+ # Validate required fields
795
+ if "message" not in log_entry:
796
+ skipped += 1
797
+ continue
798
+
799
+ # Prepare log entry for API
800
+ api_log = {
801
+ "timestamp": log_entry.get("timestamp"),
802
+ "level": log_entry.get("level", "info"),
803
+ "message": log_entry["message"],
804
+ }
805
+ if "metadata" in log_entry:
806
+ api_log["metadata"] = log_entry["metadata"]
352
807
 
353
- # 5. Validate files (optional)
354
- self._validate_files(exp_info, result)
808
+ logs_batch.append(api_log)
809
+ # Track bytes
810
+ result.bytes_uploaded += len(line.encode("utf-8"))
355
811
 
356
- # In strict mode, any warning becomes an error
357
- if self.strict and result.warnings:
358
- result.errors.extend(result.warnings)
359
- result.warnings = []
360
- result.is_valid = False
812
+ # Upload batch
813
+ if len(logs_batch) >= self.batch_size:
814
+ self.remote.create_log_entries(experiment_id, logs_batch)
815
+ total_uploaded += len(logs_batch)
816
+ logs_batch = []
361
817
 
362
- return result
818
+ except json.JSONDecodeError:
819
+ skipped += 1
820
+ continue
363
821
 
364
- def _validate_experiment_metadata(self, exp_info: ExperimentInfo, result: ValidationResult) -> bool:
365
- """Validate experiment.json exists and is valid."""
366
- exp_json = exp_info.path / "experiment.json"
822
+ # Upload remaining logs
823
+ if logs_batch:
824
+ self.remote.create_log_entries(experiment_id, logs_batch)
825
+ total_uploaded += len(logs_batch)
367
826
 
368
- if not exp_json.exists():
369
- result.errors.append("Missing experiment.json")
370
- return False
827
+ if self.verbose:
828
+ msg = f" [green]✓[/green] Uploaded {total_uploaded} log entries"
829
+ if skipped > 0:
830
+ msg += f" (skipped {skipped} invalid)"
831
+ console.print(msg)
371
832
 
372
- try:
373
- with open(exp_json, "r") as f:
374
- metadata = json.load(f)
833
+ except IOError as e:
834
+ result.failed.setdefault("logs", []).append(str(e))
375
835
 
376
- # Check required fields
377
- if "name" not in metadata or "project" not in metadata:
378
- result.errors.append("experiment.json missing required fields (name, project)")
379
- return False
380
-
381
- result.valid_data["metadata"] = metadata
382
- return True
383
-
384
- except json.JSONDecodeError as e:
385
- result.errors.append(f"Invalid JSON in experiment.json: {e}")
386
- return False
387
- except IOError as e:
388
- result.errors.append(f"Cannot read experiment.json: {e}")
389
- return False
390
-
391
- def _validate_parameters(self, exp_info: ExperimentInfo, result: ValidationResult):
392
- """Validate parameters.json format."""
393
- if not exp_info.has_params:
394
- return
395
-
396
- params_file = exp_info.path / "parameters.json"
397
- try:
398
- with open(params_file, "r") as f:
399
- params = json.load(f)
400
-
401
- # Check if it's a dict
402
- if not isinstance(params, dict):
403
- result.warnings.append("parameters.json is not a dict (will skip)")
404
- return
405
-
406
- # Check for valid data key if using versioned format
407
- if "data" in params:
408
- if not isinstance(params["data"], dict):
409
- result.warnings.append("parameters.json data is not a dict (will skip)")
410
- return
411
- result.valid_data["parameters"] = params["data"]
412
- else:
413
- result.valid_data["parameters"] = params
414
-
415
- except json.JSONDecodeError as e:
416
- result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
417
- except IOError as e:
418
- result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
419
-
420
- def _validate_logs(self, exp_info: ExperimentInfo, result: ValidationResult):
421
- """Validate logs.jsonl format."""
422
- if not exp_info.has_logs:
423
- return
424
-
425
- logs_file = exp_info.path / "logs" / "logs.jsonl"
426
- invalid_lines = []
836
+ return total_uploaded
427
837
 
428
- try:
429
- with open(logs_file, "r") as f:
430
- for line_num, line in enumerate(f, start=1):
431
- try:
432
- log_entry = json.loads(line)
433
- # Check required fields
434
- if "message" not in log_entry:
435
- invalid_lines.append(line_num)
436
- except json.JSONDecodeError:
437
- invalid_lines.append(line_num)
438
-
439
- if invalid_lines:
440
- count = len(invalid_lines)
441
- preview = invalid_lines[:5]
442
- result.warnings.append(
443
- f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
444
- )
838
+ def _upload_single_metric(
839
+ self, experiment_id: str, metric_name: str, metric_dir: Path, result: UploadResult
840
+ ) -> Dict[str, Any]:
841
+ """
842
+ Upload a single metric (thread-safe helper).
445
843
 
446
- except IOError as e:
447
- result.warnings.append(f"Cannot read logs.jsonl: {e} (will skip logs)")
844
+ Returns:
845
+ Dict with 'success', 'uploaded', 'skipped', 'bytes', and 'error' keys
846
+ """
847
+ data_file = metric_dir / "data.jsonl"
848
+ data_batch = []
849
+ total_uploaded = 0
850
+ skipped = 0
851
+ bytes_uploaded = 0
852
+
853
+ # Get thread-local client for safe concurrent HTTP requests
854
+ remote_client = self._get_remote_client()
855
+
856
+ try:
857
+ with open(data_file, "r") as f:
858
+ for line in f:
859
+ try:
860
+ data_point = json.loads(line)
861
+
862
+ # Validate required fields
863
+ if "data" not in data_point:
864
+ skipped += 1
865
+ continue
866
+
867
+ data_batch.append(data_point["data"])
868
+ bytes_uploaded += len(line.encode("utf-8"))
869
+
870
+ # Upload batch using thread-local client
871
+ if len(data_batch) >= self.batch_size:
872
+ remote_client.append_batch_to_metric(
873
+ experiment_id, metric_name, data_batch
874
+ )
875
+ total_uploaded += len(data_batch)
876
+ data_batch = []
877
+
878
+ except json.JSONDecodeError:
879
+ skipped += 1
880
+ continue
448
881
 
449
- def _validate_metrics(self, exp_info: ExperimentInfo, result: ValidationResult):
450
- """Validate metrics data."""
451
- if not exp_info.metric_names:
452
- return
882
+ # Upload remaining data points using thread-local client
883
+ if data_batch:
884
+ remote_client.append_batch_to_metric(experiment_id, metric_name, data_batch)
885
+ total_uploaded += len(data_batch)
886
+
887
+ return {
888
+ "success": True,
889
+ "uploaded": total_uploaded,
890
+ "skipped": skipped,
891
+ "bytes": bytes_uploaded,
892
+ "error": None,
893
+ }
894
+
895
+ except Exception as e:
896
+ return {
897
+ "success": False,
898
+ "uploaded": 0,
899
+ "skipped": 0,
900
+ "bytes": 0,
901
+ "error": str(e),
902
+ }
903
+
904
+ def _upload_metrics(
905
+ self,
906
+ experiment_id: str,
907
+ exp_info: ExperimentInfo,
908
+ result: UploadResult,
909
+ task_id=None,
910
+ update_progress=None,
911
+ ) -> int:
912
+ """Upload metrics in parallel with concurrency limit."""
913
+ if not exp_info.metric_names:
914
+ return 0
915
+
916
+ total_metrics = 0
917
+
918
+ # Use ThreadPoolExecutor for parallel uploads
919
+ with ThreadPoolExecutor(max_workers=self.max_concurrent_metrics) as executor:
920
+ # Submit all metric upload tasks
921
+ future_to_metric = {}
922
+ for metric_name in exp_info.metric_names:
923
+ metric_dir = exp_info.path / "metrics" / metric_name
924
+ future = executor.submit(
925
+ self._upload_single_metric, experiment_id, metric_name, metric_dir, result
926
+ )
927
+ future_to_metric[future] = metric_name
453
928
 
454
- for metric_name in exp_info.metric_names:
455
- metric_dir = exp_info.path / "metrics" / metric_name
456
- data_file = metric_dir / "data.jsonl"
929
+ # Process completed uploads as they finish
930
+ for future in as_completed(future_to_metric):
931
+ metric_name = future_to_metric[future]
457
932
 
458
- invalid_lines = []
459
- try:
460
- with open(data_file, "r") as f:
461
- for line_num, line in enumerate(f, start=1):
462
- try:
463
- data_point = json.loads(line)
464
- # Check for data field
465
- if "data" not in data_point:
466
- invalid_lines.append(line_num)
467
- except json.JSONDecodeError:
468
- invalid_lines.append(line_num)
469
-
470
- if invalid_lines:
471
- count = len(invalid_lines)
472
- preview = invalid_lines[:5]
473
- result.warnings.append(
474
- f"metric '{metric_name}' has {count} invalid lines (e.g., {preview}...) - will skip these"
475
- )
476
-
477
- except IOError as e:
478
- result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
479
-
480
- def _validate_files(self, exp_info: ExperimentInfo, result: ValidationResult):
481
- """Validate files existence."""
482
- files_dir = exp_info.path / "files"
483
- if not files_dir.exists():
484
- return
485
-
486
- metadata_file = files_dir / ".files_metadata.json"
487
- if not metadata_file.exists():
488
- return
933
+ # Update progress
934
+ if update_progress:
935
+ update_progress(f"Uploading metric '{metric_name}'...")
489
936
 
490
937
  try:
491
- with open(metadata_file, "r") as f:
492
- files_metadata = json.load(f)
493
-
494
- missing_files = []
495
- for file_id, file_info in files_metadata.items():
496
- if isinstance(file_info, dict) and file_info.get("deletedAt") is None:
497
- # Check if file exists
498
- file_path = files_dir / file_info.get("prefix", "") / file_id / file_info.get("filename", "")
499
- if not file_path.exists():
500
- missing_files.append(file_info.get("filename", file_id))
501
-
502
- if missing_files:
503
- count = len(missing_files)
504
- preview = missing_files[:3]
505
- result.warnings.append(
506
- f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
507
- )
508
-
509
- except (json.JSONDecodeError, IOError):
510
- pass # If we can't read metadata, just skip file validation
938
+ upload_result = future.result()
511
939
 
940
+ # Thread-safe update of shared state
941
+ with self._lock:
942
+ result.bytes_uploaded += upload_result["bytes"]
512
943
 
513
- class ExperimentUploader:
514
- """Handles uploading a single experiment."""
515
-
516
- def __init__(
517
- self,
518
- local_storage: LocalStorage,
519
- remote_client: RemoteClient,
520
- batch_size: int = 100,
521
- skip_logs: bool = False,
522
- skip_metrics: bool = False,
523
- skip_files: bool = False,
524
- skip_params: bool = False,
525
- verbose: bool = False,
526
- progress: Optional[Progress] = None,
527
- max_concurrent_metrics: int = 5,
528
- ):
529
- """
530
- Initialize uploader.
531
-
532
- Args:
533
- local_storage: Local storage instance
534
- remote_client: Remote client instance
535
- batch_size: Batch size for logs/metrics
536
- skip_logs: Skip uploading logs
537
- skip_metrics: Skip uploading metrics
538
- skip_files: Skip uploading files
539
- skip_params: Skip uploading parameters
540
- verbose: Show verbose output
541
- progress: Optional rich Progress instance for tracking
542
- max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
543
- """
544
- self.local = local_storage
545
- self.remote = remote_client
546
- self.batch_size = batch_size
547
- self.skip_logs = skip_logs
548
- self.skip_metrics = skip_metrics
549
- self.skip_files = skip_files
550
- self.skip_params = skip_params
551
- self.verbose = verbose
552
- self.progress = progress
553
- self.max_concurrent_metrics = max_concurrent_metrics
554
- # Thread-safe lock for shared state updates
555
- self._lock = threading.Lock()
556
- # Thread-local storage for remote clients (for thread-safe HTTP requests)
557
- self._thread_local = threading.local()
558
-
559
- def _get_remote_client(self) -> RemoteClient:
560
- """Get thread-local remote client for safe concurrent access."""
561
- if not hasattr(self._thread_local, 'client'):
562
- # Create a new client for this thread
563
- self._thread_local.client = RemoteClient(
564
- base_url=self.remote.base_url,
565
- api_key=self.remote.api_key
566
- )
567
- return self._thread_local.client
568
-
569
- def upload_experiment(
570
- self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
571
- ) -> UploadResult:
572
- """
573
- Upload a single experiment with all its data.
574
-
575
- Args:
576
- exp_info: Experiment information
577
- validation_result: Validation results
578
- task_id: Optional progress task ID
579
-
580
- Returns:
581
- UploadResult with upload status
582
- """
583
- result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")
584
-
585
- # Calculate total steps for progress tracking
586
- total_steps = 1 # metadata
587
- if not self.skip_params and "parameters" in validation_result.valid_data:
588
- total_steps += 1
589
- if not self.skip_logs and exp_info.has_logs:
590
- total_steps += 1
591
- if not self.skip_metrics and exp_info.metric_names:
592
- total_steps += len(exp_info.metric_names)
593
- if not self.skip_files and exp_info.file_count > 0:
594
- total_steps += exp_info.file_count
595
-
596
- current_step = 0
597
-
598
- def update_progress(description: str):
599
- nonlocal current_step
600
- current_step += 1
601
- if self.progress and task_id is not None:
602
- self.progress.update(task_id, completed=current_step, total=total_steps, description=description)
944
+ if upload_result["success"]:
945
+ total_metrics += 1
603
946
 
604
- try:
605
- # 1. Create/update experiment metadata
606
- update_progress("Creating experiment...")
947
+ # Thread-safe console output
607
948
  if self.verbose:
608
- console.print(f" [dim]Creating experiment...[/dim]")
609
-
610
- exp_data = validation_result.valid_data
611
-
612
- # Store folder path in metadata (not as folderId which expects Snowflake ID)
613
- custom_metadata = exp_data.get("metadata") or {}
614
- if exp_data.get("folder"):
615
- custom_metadata["folder"] = exp_data["folder"]
616
-
617
- response = self.remote.create_or_update_experiment(
618
- project=exp_info.project,
619
- name=exp_info.experiment,
620
- description=exp_data.get("description"),
621
- tags=exp_data.get("tags"),
622
- bindrs=exp_data.get("bindrs"),
623
- folder=None, # Don't send folder path as folderId (expects Snowflake ID)
624
- write_protected=exp_data.get("write_protected", False),
625
- metadata=custom_metadata if custom_metadata else None,
626
- )
627
-
628
- # Extract experiment ID from nested response
629
- experiment_id = response.get("experiment", {}).get("id") or response.get("id")
630
- if self.verbose:
631
- console.print(f" [green]✓[/green] Created experiment (id: {experiment_id})")
632
-
633
- # 2. Upload parameters
634
- if not self.skip_params and "parameters" in validation_result.valid_data:
635
- update_progress("Uploading parameters...")
636
- if self.verbose:
637
- console.print(f" [dim]Uploading parameters...[/dim]")
638
-
639
- params = validation_result.valid_data["parameters"]
640
- self.remote.set_parameters(experiment_id, params)
641
- result.uploaded["params"] = len(params)
642
- # Track bytes (approximate JSON size)
643
- result.bytes_uploaded += len(json.dumps(params).encode('utf-8'))
644
-
645
- if self.verbose:
646
- console.print(f" [green]✓[/green] Uploaded {len(params)} parameters")
647
-
648
- # 3. Upload logs
649
- if not self.skip_logs and exp_info.has_logs:
650
- count = self._upload_logs(experiment_id, exp_info, result, task_id, update_progress)
651
- result.uploaded["logs"] = count
652
-
653
- # 4. Upload metrics
654
- if not self.skip_metrics and exp_info.metric_names:
655
- count = self._upload_metrics(experiment_id, exp_info, result, task_id, update_progress)
656
- result.uploaded["metrics"] = count
657
-
658
- # 5. Upload files
659
- if not self.skip_files and exp_info.file_count > 0:
660
- count = self._upload_files(experiment_id, exp_info, result, task_id, update_progress)
661
- result.uploaded["files"] = count
662
-
663
- result.success = True
949
+ msg = f" [green][/green] Uploaded {upload_result['uploaded']} data points for '{metric_name}'"
950
+ if upload_result["skipped"] > 0:
951
+ msg += f" (skipped {upload_result['skipped']} invalid)"
952
+ with self._lock:
953
+ console.print(msg)
954
+ else:
955
+ # Record failure
956
+ error_msg = f"{metric_name}: {upload_result['error']}"
957
+ with self._lock:
958
+ result.failed.setdefault("metrics", []).append(error_msg)
959
+ if self.verbose:
960
+ console.print(
961
+ f" [red]✗[/red] Failed to upload '{metric_name}': {upload_result['error']}"
962
+ )
664
963
 
665
964
  except Exception as e:
666
- result.success = False
667
- result.errors.append(str(e))
668
- if self.verbose:
669
- console.print(f" [red]✗ Error: {e}[/red]")
670
-
671
- return result
672
-
673
- def _upload_logs(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
674
- task_id=None, update_progress=None) -> int:
675
- """Upload logs in batches."""
676
- if update_progress:
677
- update_progress("Uploading logs...")
678
- if self.verbose:
679
- console.print(f" [dim]Uploading logs...[/dim]")
680
-
681
- logs_file = exp_info.path / "logs" / "logs.jsonl"
682
- logs_batch = []
683
- total_uploaded = 0
684
- skipped = 0
685
-
686
- try:
687
- with open(logs_file, "r") as f:
688
- for line in f:
689
- try:
690
- log_entry = json.loads(line)
691
-
692
- # Validate required fields
693
- if "message" not in log_entry:
694
- skipped += 1
695
- continue
696
-
697
- # Prepare log entry for API
698
- api_log = {
699
- "timestamp": log_entry.get("timestamp"),
700
- "level": log_entry.get("level", "info"),
701
- "message": log_entry["message"],
702
- }
703
- if "metadata" in log_entry:
704
- api_log["metadata"] = log_entry["metadata"]
705
-
706
- logs_batch.append(api_log)
707
- # Track bytes
708
- result.bytes_uploaded += len(line.encode('utf-8'))
709
-
710
- # Upload batch
711
- if len(logs_batch) >= self.batch_size:
712
- self.remote.create_log_entries(experiment_id, logs_batch)
713
- total_uploaded += len(logs_batch)
714
- logs_batch = []
715
-
716
- except json.JSONDecodeError:
717
- skipped += 1
718
- continue
719
-
720
- # Upload remaining logs
721
- if logs_batch:
722
- self.remote.create_log_entries(experiment_id, logs_batch)
723
- total_uploaded += len(logs_batch)
724
-
965
+ # Handle unexpected errors
966
+ error_msg = f"{metric_name}: {str(e)}"
967
+ with self._lock:
968
+ result.failed.setdefault("metrics", []).append(error_msg)
725
969
  if self.verbose:
726
- msg = f" [green][/green] Uploaded {total_uploaded} log entries"
727
- if skipped > 0:
728
- msg += f" (skipped {skipped} invalid)"
729
- console.print(msg)
730
-
731
- except IOError as e:
732
- result.failed.setdefault("logs", []).append(str(e))
733
-
734
- return total_uploaded
735
-
736
- def _upload_single_metric(
737
- self,
738
- experiment_id: str,
739
- metric_name: str,
740
- metric_dir: Path,
741
- result: UploadResult
742
- ) -> Dict[str, Any]:
743
- """
744
- Upload a single metric (thread-safe helper).
745
-
746
- Returns:
747
- Dict with 'success', 'uploaded', 'skipped', 'bytes', and 'error' keys
748
- """
749
- data_file = metric_dir / "data.jsonl"
750
- data_batch = []
751
- total_uploaded = 0
752
- skipped = 0
753
- bytes_uploaded = 0
754
-
755
- # Get thread-local client for safe concurrent HTTP requests
756
- remote_client = self._get_remote_client()
970
+ console.print(f" [red][/red] Failed to upload '{metric_name}': {e}")
971
+
972
+ return total_metrics
973
+
974
+ def _upload_files(
975
+ self,
976
+ experiment_id: str,
977
+ exp_info: ExperimentInfo,
978
+ result: UploadResult,
979
+ task_id=None,
980
+ update_progress=None,
981
+ ) -> int:
982
+ """Upload files one by one."""
983
+ files_dir = exp_info.path / "files"
984
+ total_uploaded = 0
985
+
986
+ # Parse prefix to get owner, project, and experiment path
987
+ # Format: owner/project/folder.../experiment
988
+ parts = exp_info.prefix.split("/") if exp_info.prefix else []
989
+ if len(parts) < 3:
990
+ # Invalid prefix format, skip file upload
991
+ return 0
992
+
993
+ owner = parts[0]
994
+ project = parts[1]
995
+ # Note: _get_experiment_dir expects the FULL prefix, not just the experiment part
996
+ # So we pass the full prefix to list_files
997
+ full_prefix = exp_info.prefix
998
+
999
+ # Use LocalStorage to list files
1000
+ try:
1001
+ files_list = self.local.list_files(owner, project, full_prefix)
1002
+
1003
+ # Debug: print file count
1004
+ if self.verbose:
1005
+ print(f"[DEBUG] Found {len(files_list)} files to upload")
1006
+ print(f"[DEBUG] Full prefix: {full_prefix}")
1007
+
1008
+ for file_info in files_list:
1009
+ # Skip deleted files
1010
+ if file_info.get("deletedAt") is not None:
1011
+ continue
757
1012
 
758
1013
  try:
759
- with open(data_file, "r") as f:
760
- for line in f:
761
- try:
762
- data_point = json.loads(line)
763
-
764
- # Validate required fields
765
- if "data" not in data_point:
766
- skipped += 1
767
- continue
768
-
769
- data_batch.append(data_point["data"])
770
- bytes_uploaded += len(line.encode('utf-8'))
771
-
772
- # Upload batch using thread-local client
773
- if len(data_batch) >= self.batch_size:
774
- remote_client.append_batch_to_metric(
775
- experiment_id, metric_name, data_batch
776
- )
777
- total_uploaded += len(data_batch)
778
- data_batch = []
779
-
780
- except json.JSONDecodeError:
781
- skipped += 1
782
- continue
783
-
784
- # Upload remaining data points using thread-local client
785
- if data_batch:
786
- remote_client.append_batch_to_metric(experiment_id, metric_name, data_batch)
787
- total_uploaded += len(data_batch)
788
-
789
- return {
790
- 'success': True,
791
- 'uploaded': total_uploaded,
792
- 'skipped': skipped,
793
- 'bytes': bytes_uploaded,
794
- 'error': None
795
- }
1014
+ if update_progress:
1015
+ update_progress(f"Uploading {file_info['filename']}...")
1016
+
1017
+ # Get file path directly from storage without copying
1018
+ file_id = file_info["id"]
1019
+ experiment_dir = self.local._get_experiment_dir(
1020
+ owner, project, full_prefix
1021
+ )
1022
+ files_dir = experiment_dir / "files"
1023
+
1024
+ # Construct file path
1025
+ file_prefix = file_info["path"].lstrip("/") if file_info["path"] else ""
1026
+ if file_prefix:
1027
+ file_path = files_dir / file_prefix / file_id / file_info["filename"]
1028
+ else:
1029
+ file_path = files_dir / file_id / file_info["filename"]
1030
+
1031
+ # Upload to remote with correct parameters
1032
+ self.remote.upload_file(
1033
+ experiment_id=experiment_id,
1034
+ file_path=str(file_path),
1035
+ prefix=file_info.get("path", ""),
1036
+ filename=file_info["filename"],
1037
+ description=file_info.get("description"),
1038
+ tags=file_info.get("tags", []),
1039
+ metadata=file_info.get("metadata"),
1040
+ checksum=file_info["checksum"],
1041
+ content_type=file_info["contentType"],
1042
+ size_bytes=file_info["sizeBytes"],
1043
+ )
1044
+
1045
+ total_uploaded += 1
1046
+ # Track bytes
1047
+ result.bytes_uploaded += file_info.get("sizeBytes", 0)
1048
+
1049
+ if self.verbose:
1050
+ size_mb = file_info.get("sizeBytes", 0) / (1024 * 1024)
1051
+ console.print(
1052
+ f" [green]✓[/green] {file_info['filename']} ({size_mb:.1f}MB)"
1053
+ )
796
1054
 
797
1055
  except Exception as e:
798
- return {
799
- 'success': False,
800
- 'uploaded': 0,
801
- 'skipped': 0,
802
- 'bytes': 0,
803
- 'error': str(e)
804
- }
1056
+ result.failed.setdefault("files", []).append(f"{file_info['filename']}: {e}")
805
1057
 
806
- def _upload_metrics(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
807
- task_id=None, update_progress=None) -> int:
808
- """Upload metrics in parallel with concurrency limit."""
809
- if not exp_info.metric_names:
810
- return 0
811
-
812
- total_metrics = 0
813
-
814
- # Use ThreadPoolExecutor for parallel uploads
815
- with ThreadPoolExecutor(max_workers=self.max_concurrent_metrics) as executor:
816
- # Submit all metric upload tasks
817
- future_to_metric = {}
818
- for metric_name in exp_info.metric_names:
819
- metric_dir = exp_info.path / "metrics" / metric_name
820
- future = executor.submit(
821
- self._upload_single_metric,
822
- experiment_id,
823
- metric_name,
824
- metric_dir,
825
- result
826
- )
827
- future_to_metric[future] = metric_name
828
-
829
- # Process completed uploads as they finish
830
- for future in as_completed(future_to_metric):
831
- metric_name = future_to_metric[future]
832
-
833
- # Update progress
834
- if update_progress:
835
- update_progress(f"Uploading metric '{metric_name}'...")
836
-
837
- try:
838
- upload_result = future.result()
839
-
840
- # Thread-safe update of shared state
841
- with self._lock:
842
- result.bytes_uploaded += upload_result['bytes']
843
-
844
- if upload_result['success']:
845
- total_metrics += 1
846
-
847
- # Thread-safe console output
848
- if self.verbose:
849
- msg = f" [green]✓[/green] Uploaded {upload_result['uploaded']} data points for '{metric_name}'"
850
- if upload_result['skipped'] > 0:
851
- msg += f" (skipped {upload_result['skipped']} invalid)"
852
- with self._lock:
853
- console.print(msg)
854
- else:
855
- # Record failure
856
- error_msg = f"{metric_name}: {upload_result['error']}"
857
- with self._lock:
858
- result.failed.setdefault("metrics", []).append(error_msg)
859
- if self.verbose:
860
- console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {upload_result['error']}")
861
-
862
- except Exception as e:
863
- # Handle unexpected errors
864
- error_msg = f"{metric_name}: {str(e)}"
865
- with self._lock:
866
- result.failed.setdefault("metrics", []).append(error_msg)
867
- if self.verbose:
868
- console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {e}")
869
-
870
- return total_metrics
871
-
872
- def _upload_files(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
873
- task_id=None, update_progress=None) -> int:
874
- """Upload files one by one."""
875
- files_dir = exp_info.path / "files"
876
- total_uploaded = 0
877
-
878
- # Use LocalStorage to list files
879
- try:
880
- files_list = self.local.list_files(exp_info.project, exp_info.experiment)
881
-
882
- for file_info in files_list:
883
- # Skip deleted files
884
- if file_info.get("deletedAt") is not None:
885
- continue
886
-
887
- try:
888
- if update_progress:
889
- update_progress(f"Uploading {file_info['filename']}...")
890
-
891
- # Get file path directly from storage without copying
892
- file_id = file_info["id"]
893
- experiment_dir = self.local._get_experiment_dir(exp_info.project, exp_info.experiment)
894
- files_dir = experiment_dir / "files"
895
-
896
- # Construct file path
897
- file_prefix = file_info["path"].lstrip("/") if file_info["path"] else ""
898
- if file_prefix:
899
- file_path = files_dir / file_prefix / file_id / file_info["filename"]
900
- else:
901
- file_path = files_dir / file_id / file_info["filename"]
902
-
903
- # Upload to remote with correct parameters
904
- self.remote.upload_file(
905
- experiment_id=experiment_id,
906
- file_path=str(file_path),
907
- prefix=file_info.get("path", ""),
908
- filename=file_info["filename"],
909
- description=file_info.get("description"),
910
- tags=file_info.get("tags", []),
911
- metadata=file_info.get("metadata"),
912
- checksum=file_info["checksum"],
913
- content_type=file_info["contentType"],
914
- size_bytes=file_info["sizeBytes"],
915
- )
916
-
917
- total_uploaded += 1
918
- # Track bytes
919
- result.bytes_uploaded += file_info.get("sizeBytes", 0)
920
-
921
- if self.verbose:
922
- size_mb = file_info.get("sizeBytes", 0) / (1024 * 1024)
923
- console.print(f" [green]✓[/green] {file_info['filename']} ({size_mb:.1f}MB)")
924
-
925
- except Exception as e:
926
- result.failed.setdefault("files", []).append(f"{file_info['filename']}: {e}")
927
-
928
- except Exception as e:
929
- result.failed.setdefault("files", []).append(str(e))
1058
+ except Exception as e:
1059
+ result.failed.setdefault("files", []).append(str(e))
930
1060
 
931
- if self.verbose and not result.failed.get("files"):
932
- console.print(f" [green]✓[/green] Uploaded {total_uploaded} files")
1061
+ if self.verbose and not result.failed.get("files"):
1062
+ console.print(f" [green]✓[/green] Uploaded {total_uploaded} files")
933
1063
 
934
- return total_uploaded
1064
+ return total_uploaded
935
1065
 
936
1066
 
937
1067
def cmd_upload(args: argparse.Namespace) -> int:
    """
    Execute upload command.

    Discovers experiments under ``args.path``, validates them, uploads the
    valid ones to the remote server and prints a summary. The upload state is
    saved to ``args.state_file`` before and after every experiment so that a
    partially failed run can be continued with ``--resume``.

    Args:
        args: Parsed command-line arguments. Reads ``dash_url``, ``api_key``,
            ``path``, ``state_file``, ``resume``, ``pref``, ``verbose``,
            ``dry_run``, ``strict``, ``target``, ``batch_size`` and the
            ``skip_logs`` / ``skip_metrics`` / ``skip_files`` / ``skip_params``
            flags.

    Returns:
        Exit code (0 for success, 1 for error)
    """
    # Kept function-local, as in the original implementation.
    import time

    config = Config()

    # Remote URL precedence: command line > config.
    remote_url = args.dash_url or config.remote_url
    if not remote_url:
        console.print("[red]Error:[/red] --dash-url is required (or set in config)")
        return 1

    # API key precedence: command line > config. When both are empty the
    # value is passed through as-is and RemoteClient is expected to auto-load
    # it from storage.
    api_key = args.api_key or config.api_key

    local_path = Path(args.path)
    if not local_path.exists():
        console.print(
            f"[red]Error:[/red] Local storage path does not exist: {local_path}"
        )
        return 1

    # Resume support: reuse a previous state only if it matches this upload.
    state_file = Path(args.state_file)
    upload_state = None
    if args.resume:
        upload_state = _load_resume_state(state_file, local_path, remote_url)
    if not upload_state:
        upload_state = UploadState(
            local_path=str(local_path.absolute()),
            remote_url=remote_url,
        )

    console.print(f"[bold]Scanning local storage:[/bold] {local_path.absolute()}")
    experiments = discover_experiments(
        local_path,
        project_filter=args.pref,  # Using --prefix/-p argument
        experiment_filter=None,
    )

    if not experiments:
        if args.pref:
            console.print(
                f"[yellow]No experiments found matching pattern:[/yellow] {args.pref}"
            )
        else:
            console.print("[yellow]No experiments found in local storage[/yellow]")
        return 1

    # Filter out already completed experiments when resuming.
    if args.resume and upload_state.completed_experiments:
        already_done = set(upload_state.completed_experiments)
        original_count = len(experiments)
        experiments = [
            exp
            for exp in experiments
            if f"{exp.project}/{exp.experiment}" not in already_done
        ]
        skipped_count = original_count - len(experiments)
        if skipped_count > 0:
            console.print(
                f"[dim]Skipping {skipped_count} already completed experiment(s)[/dim]"
            )
        if not experiments:
            # Everything that matched was uploaded by a previous run. That is
            # a successful outcome, not a "no valid experiments" error.
            console.print(
                "[green]All matching experiments were already uploaded. Nothing to do.[/green]"
            )
            return 0

    console.print(f"[green]Found {len(experiments)} experiment(s) to upload[/green]")

    # Display discovered experiments.
    if args.verbose or args.dry_run:
        console.print("\n[bold]Discovered experiments:[/bold]")
        for exp in experiments:
            details = _describe_experiment(exp)
            console.print(
                f" [cyan]•[/cyan] {exp.project}/{exp.experiment} [dim]({details})[/dim]"
            )

    # Dry-run mode: stop here. Nothing below this point runs in dry-run mode,
    # so no further dry-run checks are needed.
    if args.dry_run:
        console.print("\n[yellow bold]DRY RUN[/yellow bold] - No data will be uploaded")
        console.print("Run without --dry-run to proceed with upload.")
        return 0

    # Validate experiments.
    console.print("\n[bold]Validating experiments...[/bold]")
    validator = ExperimentValidator(strict=args.strict)
    validation_results = {}
    valid_experiments = []
    invalid_experiments = []

    for exp in experiments:
        exp_key = f"{exp.project}/{exp.experiment}"
        validation = validator.validate_experiment(exp)
        validation_results[exp_key] = validation

        if validation.is_valid:
            valid_experiments.append(exp)
        else:
            invalid_experiments.append(exp)

        # Errors are always shown; warnings only in verbose mode.
        if args.verbose or validation.errors:
            if validation.errors:
                console.print(f" [red]✗[/red] {exp_key}:")
                for error in validation.errors:
                    console.print(f" [red]{error}[/red]")
            elif validation.warnings:
                console.print(f" [yellow]⚠[/yellow] {exp_key}:")
                for warning in validation.warnings:
                    console.print(f" [yellow]{warning}[/yellow]")

    if invalid_experiments:
        console.print(
            f"\n[yellow]{len(invalid_experiments)} experiment(s) failed validation and will be skipped[/yellow]"
        )
        if args.strict:
            console.print("[red]Error: Validation failed in --strict mode[/red]")
            return 1

    if not valid_experiments:
        console.print("[red]Error: No valid experiments to upload[/red]")
        return 1

    console.print(
        f"[green]{len(valid_experiments)} experiment(s) ready to upload[/green]"
    )

    # Initialize remote client and local storage.
    remote_client = RemoteClient(base_url=remote_url, api_key=api_key)
    local_storage = LocalStorage(root_path=local_path)

    console.print(f"\n[bold]Uploading to:[/bold] {remote_url}")
    if args.target:
        console.print(f"[bold]Target prefix:[/bold] {args.target}")
    results = []

    # Monotonic clock: the elapsed time must not be affected by wall-clock
    # adjustments made while the upload is running.
    start_time = time.monotonic()

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        console=console,
        transient=not args.verbose,  # Keep progress visible in verbose mode
    ) as progress:
        uploader = ExperimentUploader(
            local_storage=local_storage,
            remote_client=remote_client,
            batch_size=args.batch_size,
            skip_logs=args.skip_logs,
            skip_metrics=args.skip_metrics,
            skip_files=args.skip_files,
            skip_params=args.skip_params,
            verbose=args.verbose,
            progress=progress,
            target_prefix=args.target,
        )

        total = len(valid_experiments)
        for i, exp in enumerate(valid_experiments, start=1):
            exp_key = f"{exp.project}/{exp.experiment}"

            task_id = progress.add_task(
                f"[{i}/{total}] {exp_key}",
                total=100,  # Will be updated with actual steps
            )

            # Mark as in progress before starting the upload.
            upload_state.in_progress_experiment = exp_key
            upload_state.save(state_file)

            result = uploader.upload_experiment(
                exp, validation_results[exp_key], task_id=task_id
            )
            results.append(result)

            # Mark as completed or failed, then save again.
            upload_state.in_progress_experiment = None
            _record_upload_outcome(upload_state, exp_key, result.success)
            upload_state.save(state_file)

            progress.update(task_id, completed=100, total=100)

            if not args.verbose:
                # Brief per-experiment status.
                if result.success:
                    parts = [
                        f"{result.uploaded[kind]} {kind}"
                        for kind in ("params", "logs", "metrics", "files")
                        if result.uploaded.get(kind)
                    ]
                    status = ", ".join(parts) if parts else "metadata only"
                    console.print(f" [green]✓[/green] Uploaded ({status})")
                else:
                    console.print(" [red]✗[/red] Failed")
                    if result.errors:
                        for error in result.errors[:3]:  # Show first 3 errors
                            console.print(f" [red]{error}[/red]")

    elapsed_time = time.monotonic() - start_time
    failed = [r for r in results if not r.success]

    _print_upload_summary(results, failed, elapsed_time)

    # Clean up state file if all uploads succeeded; otherwise point to --resume.
    if not failed and state_file.exists():
        state_file.unlink()
        console.print("\n[dim]Upload complete. State file removed.[/dim]")
    elif failed:
        console.print(
            f"\n[yellow]State saved to {state_file}. Use --resume to retry failed uploads.[/yellow]"
        )

    return 0 if not failed else 1


def _load_resume_state(state_file: Path, local_path: Path, remote_url: str):
    """Load a saved upload state, returning None unless it matches this upload."""
    upload_state = UploadState.load(state_file)
    if not upload_state:
        console.print(
            "[yellow]No previous upload state found. Starting fresh upload.[/yellow]"
        )
        return None

    # A state recorded for another source path or another server must not be
    # used to skip experiments in this run.
    if upload_state.local_path != str(local_path.absolute()):
        console.print(
            "[yellow]Warning:[/yellow] State file local path doesn't match. Starting fresh upload."
        )
        return None
    if upload_state.remote_url != remote_url:
        console.print(
            "[yellow]Warning:[/yellow] State file remote URL doesn't match. Starting fresh upload."
        )
        return None

    console.print(
        f"[green]Resuming previous upload from {upload_state.timestamp}[/green]"
    )
    console.print(
        f" Already completed: {len(upload_state.completed_experiments)} experiments"
    )
    console.print(f" Failed: {len(upload_state.failed_experiments)} experiments")
    return upload_state


def _describe_experiment(exp) -> str:
    """Return a short comma-separated description of what an experiment contains."""
    parts = []
    if exp.has_logs:
        parts.append("logs")
    if exp.has_params:
        parts.append("params")
    if exp.metric_names:
        parts.append(f"{len(exp.metric_names)} metrics")
    if exp.file_count:
        size_mb = exp.estimated_size / (1024 * 1024)
        parts.append(f"{exp.file_count} files ({size_mb:.1f}MB)")
    return ", ".join(parts) if parts else "metadata only"


def _record_upload_outcome(upload_state, exp_key: str, success: bool) -> None:
    """Record one experiment's outcome in the upload state without duplicates."""
    if success:
        if exp_key not in upload_state.completed_experiments:
            upload_state.completed_experiments.append(exp_key)
        # A retry that succeeds must no longer be listed as failed.
        while exp_key in upload_state.failed_experiments:
            upload_state.failed_experiments.remove(exp_key)
    elif exp_key not in upload_state.failed_experiments:
        upload_state.failed_experiments.append(exp_key)


def _print_upload_summary(results, failed, elapsed_time: float) -> None:
    """Print the summary table, the failed experiments and the data totals."""
    total_bytes = sum(r.bytes_uploaded for r in results)
    succeeded_count = len(results) - len(failed)

    console.print()

    summary_table = Table(title="Upload Summary", show_header=True, header_style="bold")
    summary_table.add_column("Status", style="cyan")
    summary_table.add_column("Count", justify="right")

    summary_table.add_row(
        "Successful", f"[green]{succeeded_count}/{len(results)}[/green]"
    )
    if failed:
        summary_table.add_row("Failed", f"[red]{len(failed)}/{len(results)}[/red]")

    summary_table.add_row("Total Time", f"{elapsed_time:.2f}s")

    # Average speed, in KB/s below 1 MB of payload and MB/s otherwise.
    if total_bytes > 0 and elapsed_time > 0:
        if total_bytes < 1024 * 1024:
            speed_kb = (total_bytes / 1024) / elapsed_time
            summary_table.add_row("Avg Speed", f"{speed_kb:.2f} KB/s")
        else:
            speed_mb = (total_bytes / (1024 * 1024)) / elapsed_time
            summary_table.add_row("Avg Speed", f"{speed_mb:.2f} MB/s")

    console.print(summary_table)

    if failed:
        console.print("\n[bold red]Failed Experiments:[/bold red]")
        for result in failed:
            console.print(f" [red]✗[/red] {result.experiment}")
            for error in result.errors:
                console.print(f" [dim]{error}[/dim]")

    total_logs = sum(r.uploaded.get("logs", 0) for r in results)
    total_metrics = sum(r.uploaded.get("metrics", 0) for r in results)
    total_files = sum(r.uploaded.get("files", 0) for r in results)

    if total_logs or total_metrics or total_files:
        data_table = Table(title="Data Uploaded", show_header=True, header_style="bold")
        data_table.add_column("Type", style="cyan")
        data_table.add_column("Count", justify="right", style="green")

        if total_logs:
            data_table.add_row("Logs", f"{total_logs} entries")
        if total_metrics:
            data_table.add_row("Metrics", f"{total_metrics} metrics")
        if total_files:
            data_table.add_row("Files", f"{total_files} files")

        console.print()
        console.print(data_table)