ml-dash 0.6.2rc1__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,20 +2,25 @@
2
2
 
3
3
  import argparse
4
4
  import json
5
- from pathlib import Path
6
- from typing import List, Dict, Any, Optional
7
- from dataclasses import dataclass, field
8
5
  import threading
9
6
  from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Optional
10
10
 
11
11
  from rich.console import Console
12
- from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
12
+ from rich.progress import (
13
+ BarColumn,
14
+ Progress,
15
+ SpinnerColumn,
16
+ TaskProgressColumn,
17
+ TextColumn,
18
+ )
13
19
  from rich.table import Table
14
- from rich.panel import Panel
15
20
 
16
- from ..storage import LocalStorage
17
21
  from ..client import RemoteClient
18
22
  from ..config import Config
23
+ from ..storage import LocalStorage
19
24
 
20
25
  # Initialize rich console
21
26
  console = Console()
@@ -23,1226 +28,1392 @@ console = Console()
23
28
 
24
29
  @dataclass
25
30
  class ExperimentInfo:
26
- """Information about an experiment to upload."""
27
- project: str
28
- experiment: str
29
- path: Path
30
- folder: Optional[str] = None
31
- has_logs: bool = False
32
- has_params: bool = False
33
- metric_names: List[str] = field(default_factory=list)
34
- file_count: int = 0
35
- estimated_size: int = 0 # in bytes
31
+ """Information about an experiment to upload."""
32
+
33
+ project: str
34
+ experiment: str
35
+ path: Path
36
+ prefix: Optional[str] = None
37
+ has_logs: bool = False
38
+ has_params: bool = False
39
+ metric_names: List[str] = field(default_factory=list)
40
+ file_count: int = 0
41
+ estimated_size: int = 0 # in bytes
36
42
 
37
43
 
38
44
  @dataclass
39
45
  class ValidationResult:
40
- """Result of experiment validation."""
41
- is_valid: bool = True
42
- warnings: List[str] = field(default_factory=list)
43
- errors: List[str] = field(default_factory=list)
44
- valid_data: Dict[str, Any] = field(default_factory=dict)
46
+ """Result of experiment validation."""
47
+
48
+ is_valid: bool = True
49
+ warnings: List[str] = field(default_factory=list)
50
+ errors: List[str] = field(default_factory=list)
51
+ valid_data: Dict[str, Any] = field(default_factory=dict)
45
52
 
46
53
 
47
54
  @dataclass
48
55
  class UploadResult:
49
- """Result of uploading an experiment."""
50
- experiment: str
51
- success: bool = False
52
- uploaded: Dict[str, int] = field(default_factory=dict) # {"logs": 100, "metrics": 3}
53
- failed: Dict[str, List[str]] = field(default_factory=dict) # {"files": ["error msg"]}
54
- errors: List[str] = field(default_factory=list)
55
- bytes_uploaded: int = 0 # Total bytes uploaded
56
+ """Result of uploading an experiment."""
57
+
58
+ experiment: str
59
+ success: bool = False
60
+ uploaded: Dict[str, int] = field(default_factory=dict) # {"logs": 100, "metrics": 3}
61
+ failed: Dict[str, List[str]] = field(default_factory=dict) # {"files": ["error msg"]}
62
+ errors: List[str] = field(default_factory=list)
63
+ bytes_uploaded: int = 0 # Total bytes uploaded
56
64
 
57
65
 
58
66
  @dataclass
59
67
  class UploadState:
60
- """Tracks upload state for resume functionality."""
61
- local_path: str
62
- remote_url: str
63
- completed_experiments: List[str] = field(default_factory=list) # ["project/experiment"]
64
- failed_experiments: List[str] = field(default_factory=list)
65
- in_progress_experiment: Optional[str] = None
66
- timestamp: Optional[str] = None
67
-
68
- def to_dict(self) -> Dict[str, Any]:
69
- """Convert to dictionary for JSON serialization."""
70
- return {
71
- "local_path": self.local_path,
72
- "remote_url": self.remote_url,
73
- "completed_experiments": self.completed_experiments,
74
- "failed_experiments": self.failed_experiments,
75
- "in_progress_experiment": self.in_progress_experiment,
76
- "timestamp": self.timestamp,
77
- }
78
-
79
- @classmethod
80
- def from_dict(cls, data: Dict[str, Any]) -> "UploadState":
81
- """Create from dictionary."""
82
- return cls(
83
- local_path=data["local_path"],
84
- remote_url=data["remote_url"],
85
- completed_experiments=data.get("completed_experiments", []),
86
- failed_experiments=data.get("failed_experiments", []),
87
- in_progress_experiment=data.get("in_progress_experiment"),
88
- timestamp=data.get("timestamp"),
89
- )
68
+ """Tracks upload state for resume functionality."""
69
+
70
+ local_path: str
71
+ remote_url: str
72
+ completed_experiments: List[str] = field(
73
+ default_factory=list
74
+ ) # ["project/experiment"]
75
+ failed_experiments: List[str] = field(default_factory=list)
76
+ in_progress_experiment: Optional[str] = None
77
+ timestamp: Optional[str] = None
78
+
79
+ def to_dict(self) -> Dict[str, Any]:
80
+ """Convert to dictionary for JSON serialization."""
81
+ return {
82
+ "local_path": self.local_path,
83
+ "remote_url": self.remote_url,
84
+ "completed_experiments": self.completed_experiments,
85
+ "failed_experiments": self.failed_experiments,
86
+ "in_progress_experiment": self.in_progress_experiment,
87
+ "timestamp": self.timestamp,
88
+ }
89
+
90
+ @classmethod
91
+ def from_dict(cls, data: Dict[str, Any]) -> "UploadState":
92
+ """Create from dictionary."""
93
+ return cls(
94
+ local_path=data["local_path"],
95
+ remote_url=data["remote_url"],
96
+ completed_experiments=data.get("completed_experiments", []),
97
+ failed_experiments=data.get("failed_experiments", []),
98
+ in_progress_experiment=data.get("in_progress_experiment"),
99
+ timestamp=data.get("timestamp"),
100
+ )
90
101
 
91
- def save(self, path: Path):
92
- """Save state to file."""
93
- import datetime
94
- self.timestamp = datetime.datetime.now().isoformat()
95
- with open(path, "w") as f:
96
- json.dump(self.to_dict(), f, indent=2)
97
-
98
- @classmethod
99
- def load(cls, path: Path) -> Optional["UploadState"]:
100
- """Load state from file."""
101
- if not path.exists():
102
- return None
103
- try:
104
- with open(path, "r") as f:
105
- data = json.load(f)
106
- return cls.from_dict(data)
107
- except (json.JSONDecodeError, IOError, KeyError):
108
- return None
102
+ def save(self, path: Path):
103
+ """Save state to file."""
104
+ import datetime
105
+
106
+ self.timestamp = datetime.datetime.now().isoformat()
107
+ with open(path, "w") as f:
108
+ json.dump(self.to_dict(), f, indent=2)
109
+
110
+ @classmethod
111
+ def load(cls, path: Path) -> Optional["UploadState"]:
112
+ """Load state from file."""
113
+ if not path.exists():
114
+ return None
115
+ try:
116
+ with open(path, "r") as f:
117
+ data = json.load(f)
118
+ return cls.from_dict(data)
119
+ except (json.JSONDecodeError, IOError, KeyError):
120
+ return None
109
121
 
110
122
 
111
123
  def add_parser(subparsers) -> argparse.ArgumentParser:
112
- """Add upload command parser."""
113
- parser = subparsers.add_parser(
114
- "upload",
115
- help="Upload local experiments to remote server",
116
- description="Upload locally-stored ML-Dash experiment data to a remote server.",
117
- )
124
+ """Add upload command parser."""
125
+ parser = subparsers.add_parser(
126
+ "upload",
127
+ help="Upload local experiments to remote server",
128
+ description="Upload locally-stored ML-Dash experiment data to a remote server.",
129
+ )
130
+
131
+ # Positional argument
132
+ parser.add_argument(
133
+ "path",
134
+ nargs="?",
135
+ default="./.dash",
136
+ help="Local storage directory to upload from (default: ./.dash)",
137
+ )
138
+
139
+ # Remote configuration
140
+ parser.add_argument(
141
+ "--dash-url",
142
+ type=str,
143
+ help="ML-Dash server URL (defaults to config or https://api.dash.ml)",
144
+ )
145
+ parser.add_argument(
146
+ "--api-key",
147
+ type=str,
148
+ help="JWT token for authentication (optional - auto-loads from 'ml-dash login' if not provided)",
149
+ )
150
+
151
+ """
152
+
153
+ cd .dash/geyang
154
+ cd iclr_2026
155
+
156
+ ml-dash upload -p geyang/new-run * # this uploads all of the folders to geyang/new-run.
157
+
158
+ or
159
+
160
+ ml-dash upload --prefix geyang/new-run/local-results ./* # uploads under the local-results prefix.
161
+
162
+ ml-dash download --prefix geyang/new-run/zehua-results --filter *.mp4 --dryrun --verbose
163
+
164
+ mo-dash list --prefix geyang/new-run/zehua-results --filter xxx-xxx --verbose
165
+
166
+ mo-dash list-exp --prefix geyang/new-run/zehua-results --filter xxx-xxx --verbose
167
+
168
+ """
169
+
170
+ # Scope control
171
+ # Ge: project should be {owner}/{proj_name}
172
+ parser.add_argument(
173
+ "-p",
174
+ "--pref",
175
+ "--prefix",
176
+ "--proj",
177
+ "--project",
178
+ type=str,
179
+ help="Filter experiments by prefix pattern (supports glob: 'tom/*/exp*', 'alice/project-?/baseline')",
180
+ )
181
+
182
+ # Target prefix for server (like scp destination)
183
+ parser.add_argument(
184
+ "-t",
185
+ "--target",
186
+ type=str,
187
+ help="Target prefix/directory on server where experiments will be uploaded (e.g., 'alice/shared-project'). Similar to 'scp local/ remote-path/'",
188
+ )
189
+ # parser.add_argument(
190
+ # "--experiment",
191
+ # type=str,
192
+ # help="Upload only this specific experiment (requires --project)",
193
+ # )
194
+
195
+ # Data filtering
196
+ parser.add_argument(
197
+ "--skip-logs",
198
+ action="store_true",
199
+ help="Don't upload logs",
200
+ )
201
+ parser.add_argument(
202
+ "--skip-metrics",
203
+ action="store_true",
204
+ help="Don't upload metrics",
205
+ )
206
+ parser.add_argument(
207
+ "--skip-files",
208
+ action="store_true",
209
+ help="Don't upload files",
210
+ )
211
+ parser.add_argument(
212
+ "--skip-params",
213
+ action="store_true",
214
+ help="Don't upload parameters",
215
+ )
216
+
217
+ # Behavior control
218
+ parser.add_argument(
219
+ "--dry-run",
220
+ action="store_true",
221
+ help="Show what would be uploaded without uploading",
222
+ )
223
+ parser.add_argument(
224
+ "--strict",
225
+ action="store_true",
226
+ help="Fail on any validation error (default: skip invalid data)",
227
+ )
228
+ parser.add_argument(
229
+ "-v",
230
+ "--verbose",
231
+ action="store_true",
232
+ help="Show detailed progress",
233
+ )
234
+ parser.add_argument(
235
+ "--batch-size",
236
+ type=int,
237
+ default=100,
238
+ help="Batch size for logs/metrics (default: 100)",
239
+ )
240
+ parser.add_argument(
241
+ "--resume",
242
+ action="store_true",
243
+ help="Resume previous interrupted upload",
244
+ )
245
+ parser.add_argument(
246
+ "--state-file",
247
+ type=str,
248
+ default=".dash-upload-state.json",
249
+ help="Path to state file for resume (default: .dash-upload-state.json)",
250
+ )
251
+
252
+ return parser
118
253
 
119
- # Positional argument
120
- parser.add_argument(
121
- "path",
122
- nargs="?",
123
- default="./.ml-dash",
124
- help="Local storage directory to upload from (default: ./.ml-dash)",
125
- )
126
254
 
127
- # Remote configuration
128
- parser.add_argument(
129
- "--remote",
130
- type=str,
131
- help="Remote server URL (required unless set in config)",
132
- )
133
- parser.add_argument(
134
- "--api-key",
135
- type=str,
136
- help="JWT token for authentication (optional - auto-loads from 'ml-dash login' if not provided)",
137
- )
255
+ def discover_experiments(
256
+ local_path: Path,
257
+ project_filter: Optional[str] = None,
258
+ experiment_filter: Optional[str] = None,
259
+ ) -> List[ExperimentInfo]:
260
+ """
261
+ Discover experiments in local storage directory.
262
+
263
+ Supports both flat (local_path/project/experiment) and folder-based
264
+ (local_path/folder/project/experiment) hierarchies.
265
+
266
+ Args:
267
+ local_path: Root path of local storage
268
+ project_filter: Glob pattern to filter experiments by prefix (e.g., "tom/*/exp*")
269
+ experiment_filter: Only discover this experiment (requires project_filter)
270
+
271
+ Returns:
272
+ List of ExperimentInfo objects
273
+ """
274
+ import fnmatch
275
+
276
+ local_path = Path(local_path)
277
+
278
+ if not local_path.exists():
279
+ return []
280
+
281
+ experiments = []
282
+
283
+ # Find all experiment.json files recursively
284
+ for exp_json in local_path.rglob("*/experiment.json"):
285
+ exp_dir = exp_json.parent
286
+
287
+ # Read prefix from experiment.json first
288
+ prefix = None
289
+ try:
290
+ with open(exp_json, "r") as f:
291
+ metadata = json.load(f)
292
+ prefix = metadata.get("prefix")
293
+ except:
294
+ pass
295
+
296
+ # Extract project and experiment names from PREFIX (not path)
297
+ # This handles nested folders correctly
298
+ # Prefix format: owner/project/folder.../experiment
299
+ try:
300
+ relative_path = exp_dir.relative_to(local_path)
301
+ full_relative_path = str(relative_path)
302
+
303
+ if prefix:
304
+ # Parse from prefix for accuracy
305
+ prefix_parts = prefix.strip("/").split("/")
306
+ if len(prefix_parts) < 3:
307
+ continue # Need at least owner/project/experiment
308
+
309
+ # owner = prefix_parts[0]
310
+ project_name = prefix_parts[1]
311
+ exp_name = prefix_parts[-1]
312
+ else:
313
+ # Fallback to path-based parsing (legacy support)
314
+ parts = relative_path.parts
315
+ if len(parts) < 2:
316
+ continue
317
+ exp_name = parts[-1]
318
+ project_name = parts[-2]
319
+
320
+ # Apply filters with glob pattern support
321
+ if project_filter:
322
+ # Support glob pattern matching on the full relative path
323
+ if not fnmatch.fnmatch(full_relative_path, project_filter):
324
+ continue
325
+ if experiment_filter and exp_name != experiment_filter:
326
+ continue
327
+
328
+ # Create experiment info
329
+ exp_info = ExperimentInfo(
330
+ project=project_name,
331
+ experiment=exp_name,
332
+ path=exp_dir,
333
+ prefix=prefix,
334
+ )
335
+ except (ValueError, IndexError):
336
+ continue
337
+
338
+ # Check for parameters
339
+ params_file = exp_dir / "parameters.json"
340
+ exp_info.has_params = params_file.exists()
341
+
342
+ # Check for logs
343
+ logs_file = exp_dir / "logs/logs.jsonl"
344
+ exp_info.has_logs = logs_file.exists()
345
+
346
+ # Check for metrics
347
+ metrics_dir = exp_dir / "metrics"
348
+ if metrics_dir.exists():
349
+ for metric_dir in metrics_dir.iterdir():
350
+ if metric_dir.is_dir():
351
+ data_file = metric_dir / "data.jsonl"
352
+ if data_file.exists():
353
+ exp_info.metric_names.append(metric_dir.name)
354
+
355
+ # Check for files
356
+ files_dir = exp_dir / "files"
357
+ if files_dir.exists():
358
+ try:
359
+ # Count files recursively
360
+ exp_info.file_count = sum(1 for _ in files_dir.rglob("*") if _.is_file())
361
+
362
+ # Estimate size
363
+ exp_info.estimated_size = sum(
364
+ f.stat().st_size for f in files_dir.rglob("*") if f.is_file()
365
+ )
366
+ except (OSError, PermissionError):
367
+ pass
138
368
 
139
- # Scope control
140
- parser.add_argument(
141
- "--project",
142
- type=str,
143
- help="Upload only experiments from this project",
144
- )
145
- parser.add_argument(
146
- "--experiment",
147
- type=str,
148
- help="Upload only this specific experiment (requires --project)",
149
- )
369
+ experiments.append(exp_info)
150
370
 
151
- # Data filtering
152
- parser.add_argument(
153
- "--skip-logs",
154
- action="store_true",
155
- help="Don't upload logs",
156
- )
157
- parser.add_argument(
158
- "--skip-metrics",
159
- action="store_true",
160
- help="Don't upload metrics",
161
- )
162
- parser.add_argument(
163
- "--skip-files",
164
- action="store_true",
165
- help="Don't upload files",
166
- )
167
- parser.add_argument(
168
- "--skip-params",
169
- action="store_true",
170
- help="Don't upload parameters",
171
- )
371
+ return experiments
172
372
 
173
- # Behavior control
174
- parser.add_argument(
175
- "--dry-run",
176
- action="store_true",
177
- help="Show what would be uploaded without uploading",
178
- )
179
- parser.add_argument(
180
- "--strict",
181
- action="store_true",
182
- help="Fail on any validation error (default: skip invalid data)",
183
- )
184
- parser.add_argument(
185
- "-v", "--verbose",
186
- action="store_true",
187
- help="Show detailed progress",
188
- )
189
- parser.add_argument(
190
- "--batch-size",
191
- type=int,
192
- default=100,
193
- help="Batch size for logs/metrics (default: 100)",
194
- )
195
- parser.add_argument(
196
- "--resume",
197
- action="store_true",
198
- help="Resume previous interrupted upload",
199
- )
200
- parser.add_argument(
201
- "--state-file",
202
- type=str,
203
- default=".ml-dash-upload-state.json",
204
- help="Path to state file for resume (default: .ml-dash-upload-state.json)",
205
- )
206
373
 
207
- return parser
374
+ class ExperimentValidator:
375
+ """Validates local experiment data before upload."""
208
376
 
377
+ def __init__(self, strict: bool = False):
378
+ """
379
+ Initialize validator.
209
380
 
210
- def discover_experiments(
211
- local_path: Path,
212
- project_filter: Optional[str] = None,
213
- experiment_filter: Optional[str] = None,
214
- ) -> List[ExperimentInfo]:
381
+ Args:
382
+ strict: If True, fail on any validation error
215
383
  """
216
- Discover experiments in local storage directory.
384
+ self.strict = strict
217
385
 
218
- Supports both flat (local_path/project/experiment) and folder-based
219
- (local_path/folder/project/experiment) hierarchies.
386
+ def validate_experiment(self, exp_info: ExperimentInfo) -> ValidationResult:
387
+ """
388
+ Validate experiment directory structure and data.
220
389
 
221
390
  Args:
222
- local_path: Root path of local storage
223
- project_filter: Only discover experiments in this project
224
- experiment_filter: Only discover this experiment (requires project_filter)
391
+ exp_info: Experiment information
225
392
 
226
393
  Returns:
227
- List of ExperimentInfo objects
394
+ ValidationResult with validation status and messages
228
395
  """
229
- local_path = Path(local_path)
230
-
231
- if not local_path.exists():
232
- return []
233
-
234
- experiments = []
235
-
236
- # Find all experiment.json files recursively
237
- for exp_json in local_path.rglob("*/experiment.json"):
238
- exp_dir = exp_json.parent
239
-
240
- # Extract project and experiment names from path
241
- # Path structure: local_path / [folder] / project / experiment
242
- try:
243
- relative_path = exp_dir.relative_to(local_path)
244
- parts = relative_path.parts
396
+ result = ValidationResult()
397
+ result.valid_data = {}
398
+
399
+ # 1. Validate experiment metadata (required)
400
+ if not self._validate_experiment_metadata(exp_info, result):
401
+ result.is_valid = False
402
+ return result
403
+
404
+ # 2. Validate parameters (optional)
405
+ self._validate_parameters(exp_info, result)
406
+
407
+ # 3. Validate logs (optional)
408
+ self._validate_logs(exp_info, result)
409
+
410
+ # 4. Validate metrics (optional)
411
+ self._validate_metrics(exp_info, result)
412
+
413
+ # 5. Validate files (optional)
414
+ self._validate_files(exp_info, result)
415
+
416
+ # In strict mode, any warning becomes an error
417
+ if self.strict and result.warnings:
418
+ result.errors.extend(result.warnings)
419
+ result.warnings = []
420
+ result.is_valid = False
421
+
422
+ return result
423
+
424
+ def _validate_experiment_metadata(
425
+ self, exp_info: ExperimentInfo, result: ValidationResult
426
+ ) -> bool:
427
+ """Validate experiment.json exists and is valid."""
428
+ exp_json = exp_info.path / "experiment.json"
429
+
430
+ if not exp_json.exists():
431
+ result.errors.append("Missing experiment.json")
432
+ return False
433
+
434
+ try:
435
+ with open(exp_json, "r") as f:
436
+ metadata = json.load(f)
437
+
438
+ # Check required fields
439
+ if "name" not in metadata or "project" not in metadata:
440
+ result.errors.append("experiment.json missing required fields (name, project)")
441
+ return False
442
+
443
+ result.valid_data["metadata"] = metadata
444
+ return True
445
+
446
+ except json.JSONDecodeError as e:
447
+ result.errors.append(f"Invalid JSON in experiment.json: {e}")
448
+ return False
449
+ except IOError as e:
450
+ result.errors.append(f"Cannot read experiment.json: {e}")
451
+ return False
452
+
453
+ def _validate_parameters(self, exp_info: ExperimentInfo, result: ValidationResult):
454
+ """Validate parameters.json format."""
455
+ if not exp_info.has_params:
456
+ return
457
+
458
+ params_file = exp_info.path / "parameters.json"
459
+ try:
460
+ with open(params_file, "r") as f:
461
+ params = json.load(f)
462
+
463
+ # Check if it's a dict
464
+ if not isinstance(params, dict):
465
+ result.warnings.append("parameters.json is not a dict (will skip)")
466
+ return
467
+
468
+ # Check for valid data key if using versioned format
469
+ if "data" in params:
470
+ if not isinstance(params["data"], dict):
471
+ result.warnings.append("parameters.json data is not a dict (will skip)")
472
+ return
473
+ result.valid_data["parameters"] = params["data"]
474
+ else:
475
+ result.valid_data["parameters"] = params
476
+
477
+ except json.JSONDecodeError as e:
478
+ result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
479
+ except IOError as e:
480
+ result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
481
+
482
+ def _validate_logs(self, exp_info: ExperimentInfo, result: ValidationResult):
483
+ """Validate logs.jsonl format."""
484
+ if not exp_info.has_logs:
485
+ return
486
+
487
+ logs_file = exp_info.path / "logs/logs.jsonl"
488
+ invalid_lines = []
489
+
490
+ try:
491
+ with open(logs_file, "r") as f:
492
+ for line_num, line in enumerate(f, start=1):
493
+ try:
494
+ log_entry = json.loads(line)
495
+ # Check required fields
496
+ if "message" not in log_entry:
497
+ invalid_lines.append(line_num)
498
+ except json.JSONDecodeError:
499
+ invalid_lines.append(line_num)
500
+
501
+ if invalid_lines:
502
+ count = len(invalid_lines)
503
+ preview = invalid_lines[:5]
504
+ result.warnings.append(
505
+ f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
506
+ )
245
507
 
246
- if len(parts) < 2:
247
- continue # Need at least project/experiment
508
+ except IOError as e:
509
+ result.warnings.append(f"Cannot read logs.jsonl: {e} (will skip logs)")
248
510
 
249
- # Last two parts are project/experiment
250
- exp_name = parts[-1]
251
- project_name = parts[-2]
511
+ def _validate_metrics(self, exp_info: ExperimentInfo, result: ValidationResult):
512
+ """Validate metrics data."""
513
+ if not exp_info.metric_names:
514
+ return
252
515
 
253
- # Apply filters
254
- if project_filter and project_name != project_filter:
255
- continue
256
- if experiment_filter and exp_name != experiment_filter:
257
- continue
516
+ for metric_name in exp_info.metric_names:
517
+ metric_dir = exp_info.path / "metrics" / metric_name
518
+ data_file = metric_dir / "data.jsonl"
258
519
 
259
- # Read folder from experiment.json
260
- folder = None
520
+ invalid_lines = []
521
+ try:
522
+ with open(data_file, "r") as f:
523
+ for line_num, line in enumerate(f, start=1):
261
524
  try:
262
- with open(exp_json, 'r') as f:
263
- metadata = json.load(f)
264
- folder = metadata.get('folder')
265
- except:
266
- pass
267
-
268
- # Create experiment info
269
- exp_info = ExperimentInfo(
270
- project=project_name,
271
- experiment=exp_name,
272
- path=exp_dir,
273
- folder=folder,
274
- )
275
- except (ValueError, IndexError):
276
- continue
277
-
278
- # Check for parameters
279
- params_file = exp_dir / "parameters.json"
280
- exp_info.has_params = params_file.exists()
281
-
282
- # Check for logs
283
- logs_file = exp_dir / "logs" / "logs.jsonl"
284
- exp_info.has_logs = logs_file.exists()
285
-
286
- # Check for metrics
287
- metrics_dir = exp_dir / "metrics"
288
- if metrics_dir.exists():
289
- for metric_dir in metrics_dir.iterdir():
290
- if metric_dir.is_dir():
291
- data_file = metric_dir / "data.jsonl"
292
- if data_file.exists():
293
- exp_info.metric_names.append(metric_dir.name)
294
-
295
- # Check for files
296
- files_dir = exp_dir / "files"
297
- if files_dir.exists():
298
- try:
299
- # Count files recursively
300
- exp_info.file_count = sum(1 for _ in files_dir.rglob("*") if _.is_file())
301
-
302
- # Estimate size
303
- exp_info.estimated_size = sum(
304
- f.stat().st_size for f in files_dir.rglob("*") if f.is_file()
305
- )
306
- except (OSError, PermissionError):
307
- pass
308
-
309
- experiments.append(exp_info)
525
+ data_point = json.loads(line)
526
+ # Check for data field
527
+ if "data" not in data_point:
528
+ invalid_lines.append(line_num)
529
+ except json.JSONDecodeError:
530
+ invalid_lines.append(line_num)
531
+
532
+ if invalid_lines:
533
+ count = len(invalid_lines)
534
+ preview = invalid_lines[:5]
535
+ result.warnings.append(
536
+ f"metric '{metric_name}' has {count} invalid lines (e.g., {preview}...) - will skip these"
537
+ )
538
+
539
+ except IOError as e:
540
+ result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
541
+
542
+ def _validate_files(self, exp_info: ExperimentInfo, result: ValidationResult):
543
+ """Validate files existence."""
544
+ files_dir = exp_info.path / "files"
545
+ if not files_dir.exists():
546
+ return
547
+
548
+ metadata_file = files_dir / ".files_metadata.json"
549
+ if not metadata_file.exists():
550
+ return
551
+
552
+ try:
553
+ with open(metadata_file, "r") as f:
554
+ files_metadata = json.load(f)
555
+
556
+ missing_files = []
557
+ for file_id, file_info in files_metadata.items():
558
+ if isinstance(file_info, dict) and file_info.get("deletedAt") is None:
559
+ # Check if file exists
560
+ file_path = (
561
+ files_dir
562
+ / file_info.get("prefix", "")
563
+ / file_id
564
+ / file_info.get("filename", "")
565
+ )
566
+ if not file_path.exists():
567
+ missing_files.append(file_info.get("filename", file_id))
568
+
569
+ if missing_files:
570
+ count = len(missing_files)
571
+ preview = missing_files[:3]
572
+ result.warnings.append(
573
+ f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
574
+ )
310
575
 
311
- return experiments
576
+ except (json.JSONDecodeError, IOError):
577
+ pass # If we can't read metadata, just skip file validation
312
578
 
313
579
 
314
- class ExperimentValidator:
315
- """Validates local experiment data before upload."""
580
+ class ExperimentUploader:
581
+ """Handles uploading a single experiment."""
582
+
583
+ def __init__(
584
+ self,
585
+ local_storage: LocalStorage,
586
+ remote_client: RemoteClient,
587
+ batch_size: int = 100,
588
+ skip_logs: bool = False,
589
+ skip_metrics: bool = False,
590
+ skip_files: bool = False,
591
+ skip_params: bool = False,
592
+ verbose: bool = False,
593
+ progress: Optional[Progress] = None,
594
+ max_concurrent_metrics: int = 5,
595
+ target_prefix: Optional[str] = None,
596
+ ):
597
+ """
598
+ Initialize uploader.
316
599
 
317
- def __init__(self, strict: bool = False):
318
- """
319
- Initialize validator.
600
+ Args:
601
+ local_storage: Local storage instance
602
+ remote_client: Remote client instance
603
+ batch_size: Batch size for logs/metrics
604
+ skip_logs: Skip uploading logs
605
+ skip_metrics: Skip uploading metrics
606
+ skip_files: Skip uploading files
607
+ skip_params: Skip uploading parameters
608
+ verbose: Show verbose output
609
+ progress: Optional rich Progress instance for tracking
610
+ max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
611
+ target_prefix: Target prefix on server (overrides local prefix)
612
+ """
613
+ self.local = local_storage
614
+ self.remote = remote_client
615
+ self.batch_size = batch_size
616
+ self.skip_logs = skip_logs
617
+ self.skip_metrics = skip_metrics
618
+ self.skip_files = skip_files
619
+ self.skip_params = skip_params
620
+ self.verbose = verbose
621
+ self.progress = progress
622
+ self.max_concurrent_metrics = max_concurrent_metrics
623
+ self.target_prefix = target_prefix
624
+ # Thread-safe lock for shared state updates
625
+ self._lock = threading.Lock()
626
+ # Thread-local storage for remote clients (for thread-safe HTTP requests)
627
+ self._thread_local = threading.local()
628
+
629
+ def _get_remote_client(self) -> RemoteClient:
630
+ """Get thread-local remote client for safe concurrent access."""
631
+ if not hasattr(self._thread_local, "client"):
632
+ # Create a new client for this thread
633
+ # Use graphql_base_url (without /api) since RemoteClient.__init__ will add /api
634
+ self._thread_local.client = RemoteClient(
635
+ base_url=self.remote.graphql_base_url,
636
+ namespace=self.remote.namespace,
637
+ api_key=self.remote.api_key
638
+ )
639
+ return self._thread_local.client
640
+
641
+ def upload_experiment(
642
+ self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
643
+ ) -> UploadResult:
644
+ """
645
+ Upload a single experiment with all its data.
320
646
 
321
- Args:
322
- strict: If True, fail on any validation error
323
- """
324
- self.strict = strict
647
+ Args:
648
+ exp_info: Experiment information
649
+ validation_result: Validation results
650
+ task_id: Optional progress task ID
325
651
 
326
- def validate_experiment(self, exp_info: ExperimentInfo) -> ValidationResult:
327
- """
328
- Validate experiment directory structure and data.
652
+ Returns:
653
+ UploadResult with upload status
654
+ """
655
+ result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")
656
+
657
+ # Calculate total steps for progress tracking
658
+ total_steps = 1 # metadata
659
+ if not self.skip_params and "parameters" in validation_result.valid_data:
660
+ total_steps += 1
661
+ if not self.skip_logs and exp_info.has_logs:
662
+ total_steps += 1
663
+ if not self.skip_metrics and exp_info.metric_names:
664
+ total_steps += len(exp_info.metric_names)
665
+ if not self.skip_files and exp_info.file_count > 0:
666
+ total_steps += exp_info.file_count
667
+
668
+ current_step = 0
669
+
670
+ def update_progress(description: str):
671
+ nonlocal current_step
672
+ current_step += 1
673
+ if self.progress and task_id is not None:
674
+ self.progress.update(
675
+ task_id, completed=current_step, total=total_steps, description=description
676
+ )
329
677
 
330
- Args:
331
- exp_info: Experiment information
678
+ try:
679
+ # 1. Create/update experiment metadata
680
+ update_progress("Creating experiment...")
681
+ if self.verbose:
682
+ console.print(" [dim]Creating experiment...[/dim]")
683
+
684
+ exp_data = validation_result.valid_data
685
+
686
+ # Construct full prefix for server
687
+ # If --target is specified, use it as the base destination prefix
688
+ # Otherwise, preserve the local prefix structure
689
+ if self.target_prefix:
690
+ # User specified a target prefix (like scp destination directory)
691
+ # Append experiment name to it: target_prefix/experiment_name
692
+ full_prefix = f"{self.target_prefix.rstrip('/')}/{exp_info.experiment}"
693
+
694
+ # Extract project from target prefix for API call
695
+ # Target format: owner/project/path...
696
+ target_parts = self.target_prefix.strip("/").split("/")
697
+ if len(target_parts) >= 2:
698
+ target_project = target_parts[1]
699
+ else:
700
+ target_project = exp_info.project # Fallback to original
701
+ elif exp_info.prefix:
702
+ # No target specified, preserve local prefix structure
703
+ full_prefix = f"{exp_info.prefix}/{exp_info.experiment}"
704
+ target_project = exp_info.project
705
+ else:
706
+ full_prefix = exp_info.experiment
707
+ target_project = exp_info.project
708
+
709
+ response = self.remote.create_or_update_experiment(
710
+ project=target_project,
711
+ name=exp_info.experiment,
712
+ description=exp_data.get("description"),
713
+ tags=exp_data.get("tags"),
714
+ bindrs=exp_data.get("bindrs"),
715
+ prefix=full_prefix, # Send full prefix (folder + name) or target prefix
716
+ write_protected=exp_data.get("write_protected", False),
717
+ metadata=exp_data.get("metadata"),
718
+ )
719
+
720
+ # Extract experiment ID from nested response
721
+ experiment_id = response.get("experiment", {}).get("id") or response.get("id")
722
+ if self.verbose:
723
+ console.print(f" [green]✓[/green] Created experiment (id: {experiment_id})")
724
+
725
+ # 2. Upload parameters
726
+ if not self.skip_params and "parameters" in validation_result.valid_data:
727
+ update_progress("Uploading parameters...")
728
+ if self.verbose:
729
+ console.print(" [dim]Uploading parameters...[/dim]")
332
730
 
333
- Returns:
334
- ValidationResult with validation status and messages
335
- """
336
- result = ValidationResult()
337
- result.valid_data = {}
731
+ params = validation_result.valid_data["parameters"]
732
+ self.remote.set_parameters(experiment_id, params)
733
+ result.uploaded["params"] = len(params)
734
+ # Track bytes (approximate JSON size)
735
+ result.bytes_uploaded += len(json.dumps(params).encode("utf-8"))
338
736
 
339
- # 1. Validate experiment metadata (required)
340
- if not self._validate_experiment_metadata(exp_info, result):
341
- result.is_valid = False
342
- return result
737
+ if self.verbose:
738
+ console.print(f" [green]✓[/green] Uploaded {len(params)} parameters")
343
739
 
344
- # 2. Validate parameters (optional)
345
- self._validate_parameters(exp_info, result)
740
+ # 3. Upload logs
741
+ if not self.skip_logs and exp_info.has_logs:
742
+ count = self._upload_logs(
743
+ experiment_id, exp_info, result, task_id, update_progress
744
+ )
745
+ result.uploaded["logs"] = count
346
746
 
347
- # 3. Validate logs (optional)
348
- self._validate_logs(exp_info, result)
747
+ # 4. Upload metrics
748
+ if not self.skip_metrics and exp_info.metric_names:
749
+ count = self._upload_metrics(
750
+ experiment_id, exp_info, result, task_id, update_progress
751
+ )
752
+ result.uploaded["metrics"] = count
349
753
 
350
- # 4. Validate metrics (optional)
351
- self._validate_metrics(exp_info, result)
754
+ # 5. Upload files
755
+ if not self.skip_files and exp_info.file_count > 0:
756
+ count = self._upload_files(
757
+ experiment_id, exp_info, result, task_id, update_progress
758
+ )
759
+ result.uploaded["files"] = count
760
+
761
+ result.success = True
762
+
763
+ except Exception as e:
764
+ result.success = False
765
+ result.errors.append(str(e))
766
+ if self.verbose:
767
+ console.print(f" [red]✗ Error: {e}[/red]")
768
+
769
+ return result
770
+
771
+ def _upload_logs(
772
+ self,
773
+ experiment_id: str,
774
+ exp_info: ExperimentInfo,
775
+ result: UploadResult,
776
+ task_id=None,
777
+ update_progress=None,
778
+ ) -> int:
779
+ """Upload logs in batches."""
780
+ if update_progress:
781
+ update_progress("Uploading logs...")
782
+ if self.verbose:
783
+ console.print(" [dim]Uploading logs...[/dim]")
784
+
785
+ logs_file = exp_info.path / "logs/logs.jsonl"
786
+ logs_batch = []
787
+ total_uploaded = 0
788
+ skipped = 0
789
+
790
+ try:
791
+ with open(logs_file, "r") as f:
792
+ for line in f:
793
+ try:
794
+ log_entry = json.loads(line)
795
+
796
+ # Validate required fields
797
+ if "message" not in log_entry:
798
+ skipped += 1
799
+ continue
800
+
801
+ # Prepare log entry for API
802
+ api_log = {
803
+ "timestamp": log_entry.get("timestamp"),
804
+ "level": log_entry.get("level", "info"),
805
+ "message": log_entry["message"],
806
+ }
807
+ if "metadata" in log_entry:
808
+ api_log["metadata"] = log_entry["metadata"]
352
809
 
353
- # 5. Validate files (optional)
354
- self._validate_files(exp_info, result)
810
+ logs_batch.append(api_log)
811
+ # Track bytes
812
+ result.bytes_uploaded += len(line.encode("utf-8"))
355
813
 
356
- # In strict mode, any warning becomes an error
357
- if self.strict and result.warnings:
358
- result.errors.extend(result.warnings)
359
- result.warnings = []
360
- result.is_valid = False
814
+ # Upload batch
815
+ if len(logs_batch) >= self.batch_size:
816
+ self.remote.create_log_entries(experiment_id, logs_batch)
817
+ total_uploaded += len(logs_batch)
818
+ logs_batch = []
361
819
 
362
- return result
820
+ except json.JSONDecodeError:
821
+ skipped += 1
822
+ continue
363
823
 
364
- def _validate_experiment_metadata(self, exp_info: ExperimentInfo, result: ValidationResult) -> bool:
365
- """Validate experiment.json exists and is valid."""
366
- exp_json = exp_info.path / "experiment.json"
824
+ # Upload remaining logs
825
+ if logs_batch:
826
+ self.remote.create_log_entries(experiment_id, logs_batch)
827
+ total_uploaded += len(logs_batch)
367
828
 
368
- if not exp_json.exists():
369
- result.errors.append("Missing experiment.json")
370
- return False
829
+ if self.verbose:
830
+ msg = f" [green]✓[/green] Uploaded {total_uploaded} log entries"
831
+ if skipped > 0:
832
+ msg += f" (skipped {skipped} invalid)"
833
+ console.print(msg)
371
834
 
372
- try:
373
- with open(exp_json, "r") as f:
374
- metadata = json.load(f)
835
+ except IOError as e:
836
+ result.failed.setdefault("logs", []).append(str(e))
375
837
 
376
- # Check required fields
377
- if "name" not in metadata or "project" not in metadata:
378
- result.errors.append("experiment.json missing required fields (name, project)")
379
- return False
380
-
381
- result.valid_data["metadata"] = metadata
382
- return True
383
-
384
- except json.JSONDecodeError as e:
385
- result.errors.append(f"Invalid JSON in experiment.json: {e}")
386
- return False
387
- except IOError as e:
388
- result.errors.append(f"Cannot read experiment.json: {e}")
389
- return False
390
-
391
- def _validate_parameters(self, exp_info: ExperimentInfo, result: ValidationResult):
392
- """Validate parameters.json format."""
393
- if not exp_info.has_params:
394
- return
395
-
396
- params_file = exp_info.path / "parameters.json"
397
- try:
398
- with open(params_file, "r") as f:
399
- params = json.load(f)
400
-
401
- # Check if it's a dict
402
- if not isinstance(params, dict):
403
- result.warnings.append("parameters.json is not a dict (will skip)")
404
- return
405
-
406
- # Check for valid data key if using versioned format
407
- if "data" in params:
408
- if not isinstance(params["data"], dict):
409
- result.warnings.append("parameters.json data is not a dict (will skip)")
410
- return
411
- result.valid_data["parameters"] = params["data"]
412
- else:
413
- result.valid_data["parameters"] = params
414
-
415
- except json.JSONDecodeError as e:
416
- result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
417
- except IOError as e:
418
- result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
419
-
420
- def _validate_logs(self, exp_info: ExperimentInfo, result: ValidationResult):
421
- """Validate logs.jsonl format."""
422
- if not exp_info.has_logs:
423
- return
424
-
425
- logs_file = exp_info.path / "logs" / "logs.jsonl"
426
- invalid_lines = []
838
+ return total_uploaded
427
839
 
428
- try:
429
- with open(logs_file, "r") as f:
430
- for line_num, line in enumerate(f, start=1):
431
- try:
432
- log_entry = json.loads(line)
433
- # Check required fields
434
- if "message" not in log_entry:
435
- invalid_lines.append(line_num)
436
- except json.JSONDecodeError:
437
- invalid_lines.append(line_num)
438
-
439
- if invalid_lines:
440
- count = len(invalid_lines)
441
- preview = invalid_lines[:5]
442
- result.warnings.append(
443
- f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
444
- )
840
+ def _upload_single_metric(
841
+ self, experiment_id: str, metric_name: str, metric_dir: Path, result: UploadResult
842
+ ) -> Dict[str, Any]:
843
+ """
844
+ Upload a single metric (thread-safe helper).
445
845
 
446
- except IOError as e:
447
- result.warnings.append(f"Cannot read logs.jsonl: {e} (will skip logs)")
846
+ Returns:
847
+ Dict with 'success', 'uploaded', 'skipped', 'bytes', and 'error' keys
848
+ """
849
+ data_file = metric_dir / "data.jsonl"
850
+ data_batch = []
851
+ total_uploaded = 0
852
+ skipped = 0
853
+ bytes_uploaded = 0
854
+
855
+ # Get thread-local client for safe concurrent HTTP requests
856
+ remote_client = self._get_remote_client()
857
+
858
+ try:
859
+ with open(data_file, "r") as f:
860
+ for line in f:
861
+ try:
862
+ data_point = json.loads(line)
863
+
864
+ # Validate required fields
865
+ if "data" not in data_point:
866
+ skipped += 1
867
+ continue
868
+
869
+ data_batch.append(data_point["data"])
870
+ bytes_uploaded += len(line.encode("utf-8"))
871
+
872
+ # Upload batch using thread-local client
873
+ if len(data_batch) >= self.batch_size:
874
+ remote_client.append_batch_to_metric(
875
+ experiment_id, metric_name, data_batch
876
+ )
877
+ total_uploaded += len(data_batch)
878
+ data_batch = []
879
+
880
+ except json.JSONDecodeError:
881
+ skipped += 1
882
+ continue
448
883
 
449
- def _validate_metrics(self, exp_info: ExperimentInfo, result: ValidationResult):
450
- """Validate metrics data."""
451
- if not exp_info.metric_names:
452
- return
884
+ # Upload remaining data points using thread-local client
885
+ if data_batch:
886
+ remote_client.append_batch_to_metric(experiment_id, metric_name, data_batch)
887
+ total_uploaded += len(data_batch)
888
+
889
+ return {
890
+ "success": True,
891
+ "uploaded": total_uploaded,
892
+ "skipped": skipped,
893
+ "bytes": bytes_uploaded,
894
+ "error": None,
895
+ }
896
+
897
+ except Exception as e:
898
+ return {
899
+ "success": False,
900
+ "uploaded": 0,
901
+ "skipped": 0,
902
+ "bytes": 0,
903
+ "error": str(e),
904
+ }
905
+
906
+ def _upload_metrics(
907
+ self,
908
+ experiment_id: str,
909
+ exp_info: ExperimentInfo,
910
+ result: UploadResult,
911
+ task_id=None,
912
+ update_progress=None,
913
+ ) -> int:
914
+ """Upload metrics in parallel with concurrency limit."""
915
+ if not exp_info.metric_names:
916
+ return 0
917
+
918
+ total_metrics = 0
919
+
920
+ # Use ThreadPoolExecutor for parallel uploads
921
+ with ThreadPoolExecutor(max_workers=self.max_concurrent_metrics) as executor:
922
+ # Submit all metric upload tasks
923
+ future_to_metric = {}
924
+ for metric_name in exp_info.metric_names:
925
+ metric_dir = exp_info.path / "metrics" / metric_name
926
+ future = executor.submit(
927
+ self._upload_single_metric, experiment_id, metric_name, metric_dir, result
928
+ )
929
+ future_to_metric[future] = metric_name
453
930
 
454
- for metric_name in exp_info.metric_names:
455
- metric_dir = exp_info.path / "metrics" / metric_name
456
- data_file = metric_dir / "data.jsonl"
931
+ # Process completed uploads as they finish
932
+ for future in as_completed(future_to_metric):
933
+ metric_name = future_to_metric[future]
457
934
 
458
- invalid_lines = []
459
- try:
460
- with open(data_file, "r") as f:
461
- for line_num, line in enumerate(f, start=1):
462
- try:
463
- data_point = json.loads(line)
464
- # Check for data field
465
- if "data" not in data_point:
466
- invalid_lines.append(line_num)
467
- except json.JSONDecodeError:
468
- invalid_lines.append(line_num)
469
-
470
- if invalid_lines:
471
- count = len(invalid_lines)
472
- preview = invalid_lines[:5]
473
- result.warnings.append(
474
- f"metric '{metric_name}' has {count} invalid lines (e.g., {preview}...) - will skip these"
475
- )
476
-
477
- except IOError as e:
478
- result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
479
-
480
- def _validate_files(self, exp_info: ExperimentInfo, result: ValidationResult):
481
- """Validate files existence."""
482
- files_dir = exp_info.path / "files"
483
- if not files_dir.exists():
484
- return
485
-
486
- metadata_file = files_dir / ".files_metadata.json"
487
- if not metadata_file.exists():
488
- return
935
+ # Update progress
936
+ if update_progress:
937
+ update_progress(f"Uploading metric '{metric_name}'...")
489
938
 
490
939
  try:
491
- with open(metadata_file, "r") as f:
492
- files_metadata = json.load(f)
493
-
494
- missing_files = []
495
- for file_id, file_info in files_metadata.items():
496
- if isinstance(file_info, dict) and file_info.get("deletedAt") is None:
497
- # Check if file exists
498
- file_path = files_dir / file_info.get("prefix", "") / file_id / file_info.get("filename", "")
499
- if not file_path.exists():
500
- missing_files.append(file_info.get("filename", file_id))
501
-
502
- if missing_files:
503
- count = len(missing_files)
504
- preview = missing_files[:3]
505
- result.warnings.append(
506
- f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
507
- )
508
-
509
- except (json.JSONDecodeError, IOError):
510
- pass # If we can't read metadata, just skip file validation
940
+ upload_result = future.result()
511
941
 
942
+ # Thread-safe update of shared state
943
+ with self._lock:
944
+ result.bytes_uploaded += upload_result["bytes"]
512
945
 
513
- class ExperimentUploader:
514
- """Handles uploading a single experiment."""
515
-
516
- def __init__(
517
- self,
518
- local_storage: LocalStorage,
519
- remote_client: RemoteClient,
520
- batch_size: int = 100,
521
- skip_logs: bool = False,
522
- skip_metrics: bool = False,
523
- skip_files: bool = False,
524
- skip_params: bool = False,
525
- verbose: bool = False,
526
- progress: Optional[Progress] = None,
527
- max_concurrent_metrics: int = 5,
528
- ):
529
- """
530
- Initialize uploader.
531
-
532
- Args:
533
- local_storage: Local storage instance
534
- remote_client: Remote client instance
535
- batch_size: Batch size for logs/metrics
536
- skip_logs: Skip uploading logs
537
- skip_metrics: Skip uploading metrics
538
- skip_files: Skip uploading files
539
- skip_params: Skip uploading parameters
540
- verbose: Show verbose output
541
- progress: Optional rich Progress instance for tracking
542
- max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
543
- """
544
- self.local = local_storage
545
- self.remote = remote_client
546
- self.batch_size = batch_size
547
- self.skip_logs = skip_logs
548
- self.skip_metrics = skip_metrics
549
- self.skip_files = skip_files
550
- self.skip_params = skip_params
551
- self.verbose = verbose
552
- self.progress = progress
553
- self.max_concurrent_metrics = max_concurrent_metrics
554
- # Thread-safe lock for shared state updates
555
- self._lock = threading.Lock()
556
- # Thread-local storage for remote clients (for thread-safe HTTP requests)
557
- self._thread_local = threading.local()
558
-
559
- def _get_remote_client(self) -> RemoteClient:
560
- """Get thread-local remote client for safe concurrent access."""
561
- if not hasattr(self._thread_local, 'client'):
562
- # Create a new client for this thread
563
- self._thread_local.client = RemoteClient(
564
- base_url=self.remote.base_url,
565
- api_key=self.remote.api_key
566
- )
567
- return self._thread_local.client
568
-
569
- def upload_experiment(
570
- self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
571
- ) -> UploadResult:
572
- """
573
- Upload a single experiment with all its data.
574
-
575
- Args:
576
- exp_info: Experiment information
577
- validation_result: Validation results
578
- task_id: Optional progress task ID
579
-
580
- Returns:
581
- UploadResult with upload status
582
- """
583
- result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")
584
-
585
- # Calculate total steps for progress tracking
586
- total_steps = 1 # metadata
587
- if not self.skip_params and "parameters" in validation_result.valid_data:
588
- total_steps += 1
589
- if not self.skip_logs and exp_info.has_logs:
590
- total_steps += 1
591
- if not self.skip_metrics and exp_info.metric_names:
592
- total_steps += len(exp_info.metric_names)
593
- if not self.skip_files and exp_info.file_count > 0:
594
- total_steps += exp_info.file_count
595
-
596
- current_step = 0
597
-
598
- def update_progress(description: str):
599
- nonlocal current_step
600
- current_step += 1
601
- if self.progress and task_id is not None:
602
- self.progress.update(task_id, completed=current_step, total=total_steps, description=description)
946
+ if upload_result["success"]:
947
+ total_metrics += 1
603
948
 
604
- try:
605
- # 1. Create/update experiment metadata
606
- update_progress("Creating experiment...")
949
+ # Thread-safe console output
607
950
  if self.verbose:
608
- console.print(f" [dim]Creating experiment...[/dim]")
609
-
610
- exp_data = validation_result.valid_data
611
-
612
- # Store folder path in metadata (not as folderId which expects Snowflake ID)
613
- custom_metadata = exp_data.get("metadata") or {}
614
- if exp_data.get("folder"):
615
- custom_metadata["folder"] = exp_data["folder"]
616
-
617
- response = self.remote.create_or_update_experiment(
618
- project=exp_info.project,
619
- name=exp_info.experiment,
620
- description=exp_data.get("description"),
621
- tags=exp_data.get("tags"),
622
- bindrs=exp_data.get("bindrs"),
623
- folder=None, # Don't send folder path as folderId (expects Snowflake ID)
624
- write_protected=exp_data.get("write_protected", False),
625
- metadata=custom_metadata if custom_metadata else None,
626
- )
627
-
628
- # Extract experiment ID from nested response
629
- experiment_id = response.get("experiment", {}).get("id") or response.get("id")
630
- if self.verbose:
631
- console.print(f" [green]✓[/green] Created experiment (id: {experiment_id})")
632
-
633
- # 2. Upload parameters
634
- if not self.skip_params and "parameters" in validation_result.valid_data:
635
- update_progress("Uploading parameters...")
636
- if self.verbose:
637
- console.print(f" [dim]Uploading parameters...[/dim]")
638
-
639
- params = validation_result.valid_data["parameters"]
640
- self.remote.set_parameters(experiment_id, params)
641
- result.uploaded["params"] = len(params)
642
- # Track bytes (approximate JSON size)
643
- result.bytes_uploaded += len(json.dumps(params).encode('utf-8'))
644
-
645
- if self.verbose:
646
- console.print(f" [green]✓[/green] Uploaded {len(params)} parameters")
647
-
648
- # 3. Upload logs
649
- if not self.skip_logs and exp_info.has_logs:
650
- count = self._upload_logs(experiment_id, exp_info, result, task_id, update_progress)
651
- result.uploaded["logs"] = count
652
-
653
- # 4. Upload metrics
654
- if not self.skip_metrics and exp_info.metric_names:
655
- count = self._upload_metrics(experiment_id, exp_info, result, task_id, update_progress)
656
- result.uploaded["metrics"] = count
657
-
658
- # 5. Upload files
659
- if not self.skip_files and exp_info.file_count > 0:
660
- count = self._upload_files(experiment_id, exp_info, result, task_id, update_progress)
661
- result.uploaded["files"] = count
662
-
663
- result.success = True
951
+ msg = f" [green][/green] Uploaded {upload_result['uploaded']} data points for '{metric_name}'"
952
+ if upload_result["skipped"] > 0:
953
+ msg += f" (skipped {upload_result['skipped']} invalid)"
954
+ with self._lock:
955
+ console.print(msg)
956
+ else:
957
+ # Record failure
958
+ error_msg = f"{metric_name}: {upload_result['error']}"
959
+ with self._lock:
960
+ result.failed.setdefault("metrics", []).append(error_msg)
961
+ if self.verbose:
962
+ console.print(
963
+ f" [red]✗[/red] Failed to upload '{metric_name}': {upload_result['error']}"
964
+ )
664
965
 
665
966
  except Exception as e:
666
- result.success = False
667
- result.errors.append(str(e))
668
- if self.verbose:
669
- console.print(f" [red]✗ Error: {e}[/red]")
670
-
671
- return result
672
-
673
- def _upload_logs(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
674
- task_id=None, update_progress=None) -> int:
675
- """Upload logs in batches."""
676
- if update_progress:
677
- update_progress("Uploading logs...")
678
- if self.verbose:
679
- console.print(f" [dim]Uploading logs...[/dim]")
680
-
681
- logs_file = exp_info.path / "logs" / "logs.jsonl"
682
- logs_batch = []
683
- total_uploaded = 0
684
- skipped = 0
685
-
686
- try:
687
- with open(logs_file, "r") as f:
688
- for line in f:
689
- try:
690
- log_entry = json.loads(line)
691
-
692
- # Validate required fields
693
- if "message" not in log_entry:
694
- skipped += 1
695
- continue
696
-
697
- # Prepare log entry for API
698
- api_log = {
699
- "timestamp": log_entry.get("timestamp"),
700
- "level": log_entry.get("level", "info"),
701
- "message": log_entry["message"],
702
- }
703
- if "metadata" in log_entry:
704
- api_log["metadata"] = log_entry["metadata"]
705
-
706
- logs_batch.append(api_log)
707
- # Track bytes
708
- result.bytes_uploaded += len(line.encode('utf-8'))
709
-
710
- # Upload batch
711
- if len(logs_batch) >= self.batch_size:
712
- self.remote.create_log_entries(experiment_id, logs_batch)
713
- total_uploaded += len(logs_batch)
714
- logs_batch = []
715
-
716
- except json.JSONDecodeError:
717
- skipped += 1
718
- continue
719
-
720
- # Upload remaining logs
721
- if logs_batch:
722
- self.remote.create_log_entries(experiment_id, logs_batch)
723
- total_uploaded += len(logs_batch)
724
-
967
+ # Handle unexpected errors
968
+ error_msg = f"{metric_name}: {str(e)}"
969
+ with self._lock:
970
+ result.failed.setdefault("metrics", []).append(error_msg)
725
971
  if self.verbose:
726
- msg = f" [green][/green] Uploaded {total_uploaded} log entries"
727
- if skipped > 0:
728
- msg += f" (skipped {skipped} invalid)"
729
- console.print(msg)
730
-
731
- except IOError as e:
732
- result.failed.setdefault("logs", []).append(str(e))
733
-
734
- return total_uploaded
735
-
736
- def _upload_single_metric(
737
- self,
738
- experiment_id: str,
739
- metric_name: str,
740
- metric_dir: Path,
741
- result: UploadResult
742
- ) -> Dict[str, Any]:
743
- """
744
- Upload a single metric (thread-safe helper).
745
-
746
- Returns:
747
- Dict with 'success', 'uploaded', 'skipped', 'bytes', and 'error' keys
748
- """
749
- data_file = metric_dir / "data.jsonl"
750
- data_batch = []
751
- total_uploaded = 0
752
- skipped = 0
753
- bytes_uploaded = 0
754
-
755
- # Get thread-local client for safe concurrent HTTP requests
756
- remote_client = self._get_remote_client()
972
+ console.print(f" [red][/red] Failed to upload '{metric_name}': {e}")
973
+
974
+ return total_metrics
975
+
976
+ def _upload_files(
977
+ self,
978
+ experiment_id: str,
979
+ exp_info: ExperimentInfo,
980
+ result: UploadResult,
981
+ task_id=None,
982
+ update_progress=None,
983
+ ) -> int:
984
+ """Upload files one by one."""
985
+ files_dir = exp_info.path / "files"
986
+ total_uploaded = 0
987
+
988
+ # Parse prefix to get owner, project, and experiment path
989
+ # Format: owner/project/folder.../experiment
990
+ parts = exp_info.prefix.split("/") if exp_info.prefix else []
991
+ if len(parts) < 3:
992
+ # Invalid prefix format, skip file upload
993
+ return 0
994
+
995
+ owner = parts[0]
996
+ project = parts[1]
997
+ # Note: _get_experiment_dir expects the FULL prefix, not just the experiment part
998
+ # So we pass the full prefix to list_files
999
+ full_prefix = exp_info.prefix
1000
+
1001
+ # Use LocalStorage to list files
1002
+ try:
1003
+ files_list = self.local.list_files(owner, project, full_prefix)
1004
+
1005
+ # Debug: print file count
1006
+ if self.verbose:
1007
+ print(f"[DEBUG] Found {len(files_list)} files to upload")
1008
+ print(f"[DEBUG] Full prefix: {full_prefix}")
1009
+
1010
+ for file_info in files_list:
1011
+ # Skip deleted files
1012
+ if file_info.get("deletedAt") is not None:
1013
+ continue
757
1014
 
758
1015
  try:
759
- with open(data_file, "r") as f:
760
- for line in f:
761
- try:
762
- data_point = json.loads(line)
763
-
764
- # Validate required fields
765
- if "data" not in data_point:
766
- skipped += 1
767
- continue
768
-
769
- data_batch.append(data_point["data"])
770
- bytes_uploaded += len(line.encode('utf-8'))
771
-
772
- # Upload batch using thread-local client
773
- if len(data_batch) >= self.batch_size:
774
- remote_client.append_batch_to_metric(
775
- experiment_id, metric_name, data_batch
776
- )
777
- total_uploaded += len(data_batch)
778
- data_batch = []
779
-
780
- except json.JSONDecodeError:
781
- skipped += 1
782
- continue
783
-
784
- # Upload remaining data points using thread-local client
785
- if data_batch:
786
- remote_client.append_batch_to_metric(experiment_id, metric_name, data_batch)
787
- total_uploaded += len(data_batch)
788
-
789
- return {
790
- 'success': True,
791
- 'uploaded': total_uploaded,
792
- 'skipped': skipped,
793
- 'bytes': bytes_uploaded,
794
- 'error': None
795
- }
1016
+ if update_progress:
1017
+ update_progress(f"Uploading {file_info['filename']}...")
1018
+
1019
+ # Get file path directly from storage without copying
1020
+ file_id = file_info["id"]
1021
+ experiment_dir = self.local._get_experiment_dir(
1022
+ owner, project, full_prefix
1023
+ )
1024
+ files_dir = experiment_dir / "files"
1025
+
1026
+ # Construct file path
1027
+ file_prefix = file_info["path"].lstrip("/") if file_info["path"] else ""
1028
+ if file_prefix:
1029
+ file_path = files_dir / file_prefix / file_id / file_info["filename"]
1030
+ else:
1031
+ file_path = files_dir / file_id / file_info["filename"]
1032
+
1033
+ # Upload to remote with correct parameters
1034
+ self.remote.upload_file(
1035
+ experiment_id=experiment_id,
1036
+ file_path=str(file_path),
1037
+ prefix=file_info.get("path", ""),
1038
+ filename=file_info["filename"],
1039
+ description=file_info.get("description"),
1040
+ tags=file_info.get("tags", []),
1041
+ metadata=file_info.get("metadata"),
1042
+ checksum=file_info["checksum"],
1043
+ content_type=file_info["contentType"],
1044
+ size_bytes=file_info["sizeBytes"],
1045
+ )
1046
+
1047
+ total_uploaded += 1
1048
+ # Track bytes
1049
+ result.bytes_uploaded += file_info.get("sizeBytes", 0)
1050
+
1051
+ if self.verbose:
1052
+ size_mb = file_info.get("sizeBytes", 0) / (1024 * 1024)
1053
+ console.print(
1054
+ f" [green]✓[/green] {file_info['filename']} ({size_mb:.1f}MB)"
1055
+ )
796
1056
 
797
1057
  except Exception as e:
798
- return {
799
- 'success': False,
800
- 'uploaded': 0,
801
- 'skipped': 0,
802
- 'bytes': 0,
803
- 'error': str(e)
804
- }
1058
+ result.failed.setdefault("files", []).append(f"{file_info['filename']}: {e}")
805
1059
 
806
- def _upload_metrics(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
807
- task_id=None, update_progress=None) -> int:
808
- """Upload metrics in parallel with concurrency limit."""
809
- if not exp_info.metric_names:
810
- return 0
811
-
812
- total_metrics = 0
813
-
814
- # Use ThreadPoolExecutor for parallel uploads
815
- with ThreadPoolExecutor(max_workers=self.max_concurrent_metrics) as executor:
816
- # Submit all metric upload tasks
817
- future_to_metric = {}
818
- for metric_name in exp_info.metric_names:
819
- metric_dir = exp_info.path / "metrics" / metric_name
820
- future = executor.submit(
821
- self._upload_single_metric,
822
- experiment_id,
823
- metric_name,
824
- metric_dir,
825
- result
826
- )
827
- future_to_metric[future] = metric_name
828
-
829
- # Process completed uploads as they finish
830
- for future in as_completed(future_to_metric):
831
- metric_name = future_to_metric[future]
832
-
833
- # Update progress
834
- if update_progress:
835
- update_progress(f"Uploading metric '{metric_name}'...")
836
-
837
- try:
838
- upload_result = future.result()
839
-
840
- # Thread-safe update of shared state
841
- with self._lock:
842
- result.bytes_uploaded += upload_result['bytes']
843
-
844
- if upload_result['success']:
845
- total_metrics += 1
846
-
847
- # Thread-safe console output
848
- if self.verbose:
849
- msg = f" [green]✓[/green] Uploaded {upload_result['uploaded']} data points for '{metric_name}'"
850
- if upload_result['skipped'] > 0:
851
- msg += f" (skipped {upload_result['skipped']} invalid)"
852
- with self._lock:
853
- console.print(msg)
854
- else:
855
- # Record failure
856
- error_msg = f"{metric_name}: {upload_result['error']}"
857
- with self._lock:
858
- result.failed.setdefault("metrics", []).append(error_msg)
859
- if self.verbose:
860
- console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {upload_result['error']}")
861
-
862
- except Exception as e:
863
- # Handle unexpected errors
864
- error_msg = f"{metric_name}: {str(e)}"
865
- with self._lock:
866
- result.failed.setdefault("metrics", []).append(error_msg)
867
- if self.verbose:
868
- console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {e}")
869
-
870
- return total_metrics
871
-
872
- def _upload_files(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
873
- task_id=None, update_progress=None) -> int:
874
- """Upload files one by one."""
875
- files_dir = exp_info.path / "files"
876
- total_uploaded = 0
877
-
878
- # Use LocalStorage to list files
879
- try:
880
- files_list = self.local.list_files(exp_info.project, exp_info.experiment)
881
-
882
- for file_info in files_list:
883
- # Skip deleted files
884
- if file_info.get("deletedAt") is not None:
885
- continue
886
-
887
- try:
888
- if update_progress:
889
- update_progress(f"Uploading {file_info['filename']}...")
890
-
891
- # Get file path directly from storage without copying
892
- file_id = file_info["id"]
893
- experiment_dir = self.local._get_experiment_dir(exp_info.project, exp_info.experiment)
894
- files_dir = experiment_dir / "files"
895
-
896
- # Construct file path
897
- file_prefix = file_info["path"].lstrip("/") if file_info["path"] else ""
898
- if file_prefix:
899
- file_path = files_dir / file_prefix / file_id / file_info["filename"]
900
- else:
901
- file_path = files_dir / file_id / file_info["filename"]
902
-
903
- # Upload to remote with correct parameters
904
- self.remote.upload_file(
905
- experiment_id=experiment_id,
906
- file_path=str(file_path),
907
- prefix=file_info.get("path", ""),
908
- filename=file_info["filename"],
909
- description=file_info.get("description"),
910
- tags=file_info.get("tags", []),
911
- metadata=file_info.get("metadata"),
912
- checksum=file_info["checksum"],
913
- content_type=file_info["contentType"],
914
- size_bytes=file_info["sizeBytes"],
915
- )
916
-
917
- total_uploaded += 1
918
- # Track bytes
919
- result.bytes_uploaded += file_info.get("sizeBytes", 0)
920
-
921
- if self.verbose:
922
- size_mb = file_info.get("sizeBytes", 0) / (1024 * 1024)
923
- console.print(f" [green]✓[/green] {file_info['filename']} ({size_mb:.1f}MB)")
924
-
925
- except Exception as e:
926
- result.failed.setdefault("files", []).append(f"{file_info['filename']}: {e}")
927
-
928
- except Exception as e:
929
- result.failed.setdefault("files", []).append(str(e))
1060
+ except Exception as e:
1061
+ result.failed.setdefault("files", []).append(str(e))
930
1062
 
931
- if self.verbose and not result.failed.get("files"):
932
- console.print(f" [green]✓[/green] Uploaded {total_uploaded} files")
1063
+ if self.verbose and not result.failed.get("files"):
1064
+ console.print(f" [green]✓[/green] Uploaded {total_uploaded} files")
933
1065
 
934
- return total_uploaded
1066
+ return total_uploaded
935
1067
 
936
1068
 
937
1069
  def cmd_upload(args: argparse.Namespace) -> int:
938
- """
939
- Execute upload command.
940
-
941
- Args:
942
- args: Parsed command-line arguments
943
-
944
- Returns:
945
- Exit code (0 for success, 1 for error)
946
- """
947
- # Load config
948
- config = Config()
949
-
950
- # Get remote URL (command line > config)
951
- remote_url = args.remote or config.remote_url
952
- if not remote_url:
953
- console.print("[red]Error:[/red] --remote URL is required (or set in config)")
954
- return 1
955
-
956
- # Get API key (command line > config > auto-load from storage)
957
- # RemoteClient will auto-load from storage if api_key is None
958
- api_key = args.api_key or config.api_key
959
-
960
- # Validate experiment filter requires project
961
- if args.experiment and not args.project:
962
- console.print("[red]Error:[/red] --experiment requires --project")
963
- return 1
964
-
965
- # Discover experiments
966
- local_path = Path(args.path)
967
- if not local_path.exists():
968
- console.print(f"[red]Error:[/red] Local storage path does not exist: {local_path}")
969
- return 1
970
-
971
- # Handle state file for resume functionality
972
- state_file = Path(args.state_file)
973
- upload_state = None
974
-
975
- if args.resume:
976
- upload_state = UploadState.load(state_file)
977
- if upload_state:
978
- # Validate state matches current upload
979
- if upload_state.local_path != str(local_path.absolute()):
980
- console.print("[yellow]Warning:[/yellow] State file local path doesn't match. Starting fresh upload.")
981
- upload_state = None
982
- elif upload_state.remote_url != remote_url:
983
- console.print("[yellow]Warning:[/yellow] State file remote URL doesn't match. Starting fresh upload.")
984
- upload_state = None
985
- else:
986
- console.print(f"[green]Resuming previous upload from {upload_state.timestamp}[/green]")
987
- console.print(f" Already completed: {len(upload_state.completed_experiments)} experiments")
988
- console.print(f" Failed: {len(upload_state.failed_experiments)} experiments")
989
- else:
990
- console.print("[yellow]No previous upload state found. Starting fresh upload.[/yellow]")
991
-
992
- # Create new state if not resuming
993
- if not upload_state:
994
- upload_state = UploadState(
995
- local_path=str(local_path.absolute()),
996
- remote_url=remote_url,
1070
+ """
1071
+ Execute upload command.
1072
+
1073
+ Args:
1074
+ args: Parsed command-line arguments
1075
+
1076
+ Returns:
1077
+ Exit code (0 for success, 1 for error)
1078
+ """
1079
+ # Load config
1080
+ config = Config()
1081
+
1082
+ # Get remote URL (command line > config)
1083
+ remote_url = args.dash_url or config.remote_url
1084
+ if not remote_url:
1085
+ console.print("[red]Error:[/red] --dash-url is required (or set in config)")
1086
+ return 1
1087
+
1088
+ # Get API key (command line > config > auto-load from storage)
1089
+ # RemoteClient will auto-load from storage if api_key is None
1090
+ api_key = args.api_key or config.api_key
1091
+
1092
+ # Discover experiments
1093
+ local_path = Path(args.path)
1094
+ if not local_path.exists():
1095
+ console.print(f"[red]Error:[/red] Local storage path does not exist: {local_path}")
1096
+ return 1
1097
+
1098
+ # Handle state file for resume functionality
1099
+ state_file = Path(args.state_file)
1100
+ upload_state = None
1101
+
1102
+ if args.resume:
1103
+ upload_state = UploadState.load(state_file)
1104
+ if upload_state:
1105
+ # Validate state matches current upload
1106
+ if upload_state.local_path != str(local_path.absolute()):
1107
+ console.print(
1108
+ "[yellow]Warning:[/yellow] State file local path doesn't match. Starting fresh upload."
997
1109
  )
998
-
999
- console.print(f"[bold]Scanning local storage:[/bold] {local_path.absolute()}")
1000
- experiments = discover_experiments(
1001
- local_path,
1002
- project_filter=args.project,
1003
- experiment_filter=args.experiment,
1110
+ upload_state = None
1111
+ elif upload_state.remote_url != remote_url:
1112
+ console.print(
1113
+ "[yellow]Warning:[/yellow] State file remote URL doesn't match. Starting fresh upload."
1114
+ )
1115
+ upload_state = None
1116
+ else:
1117
+ console.print(
1118
+ f"[green]Resuming previous upload from {upload_state.timestamp}[/green]"
1119
+ )
1120
+ console.print(
1121
+ f" Already completed: {len(upload_state.completed_experiments)} experiments"
1122
+ )
1123
+ console.print(f" Failed: {len(upload_state.failed_experiments)} experiments")
1124
+ else:
1125
+ console.print(
1126
+ "[yellow]No previous upload state found. Starting fresh upload.[/yellow]"
1127
+ )
1128
+
1129
+ # Create new state if not resuming
1130
+ if not upload_state:
1131
+ upload_state = UploadState(
1132
+ local_path=str(local_path.absolute()),
1133
+ remote_url=remote_url,
1004
1134
  )
1005
1135
 
1006
- if not experiments:
1007
- if args.project and args.experiment:
1008
- console.print(f"[yellow]No experiment found:[/yellow] {args.project}/{args.experiment}")
1009
- elif args.project:
1010
- console.print(f"[yellow]No experiments found in project:[/yellow] {args.project}")
1011
- else:
1012
- console.print("[yellow]No experiments found in local storage[/yellow]")
1013
- return 1
1014
-
1015
- # Filter out already completed experiments when resuming
1016
- if args.resume and upload_state.completed_experiments:
1017
- original_count = len(experiments)
1018
- experiments = [
1019
- exp for exp in experiments
1020
- if f"{exp.project}/{exp.experiment}" not in upload_state.completed_experiments
1021
- ]
1022
- skipped_count = original_count - len(experiments)
1023
- if skipped_count > 0:
1024
- console.print(f"[dim]Skipping {skipped_count} already completed experiment(s)[/dim]")
1025
-
1026
- console.print(f"[green]Found {len(experiments)} experiment(s) to upload[/green]")
1027
-
1028
- # Display discovered experiments
1029
- if args.verbose or args.dry_run:
1030
- console.print("\n[bold]Discovered experiments:[/bold]")
1031
- for exp in experiments:
1032
- parts = []
1033
- if exp.has_logs:
1034
- parts.append("logs")
1035
- if exp.has_params:
1036
- parts.append("params")
1037
- if exp.metric_names:
1038
- parts.append(f"{len(exp.metric_names)} metrics")
1039
- if exp.file_count:
1040
- size_mb = exp.estimated_size / (1024 * 1024)
1041
- parts.append(f"{exp.file_count} files ({size_mb:.1f}MB)")
1042
-
1043
- details = ", ".join(parts) if parts else "metadata only"
1044
- console.print(f" [cyan]•[/cyan] {exp.project}/{exp.experiment} [dim]({details})[/dim]")
1045
-
1046
- # Dry-run mode: stop here
1047
- if args.dry_run:
1048
- console.print("\n[yellow bold]DRY RUN[/yellow bold] - No data will be uploaded")
1049
- console.print("Run without --dry-run to proceed with upload.")
1050
- return 0
1051
-
1052
- # Validate experiments
1053
- console.print("\n[bold]Validating experiments...[/bold]")
1054
- validator = ExperimentValidator(strict=args.strict)
1055
- validation_results = {}
1056
- valid_experiments = []
1057
- invalid_experiments = []
1058
-
1136
+ console.print(f"[bold]Scanning local storage:[/bold] {local_path.absolute()}")
1137
+ experiments = discover_experiments(
1138
+ local_path,
1139
+ project_filter=args.pref, # Using --prefix/-p argument
1140
+ experiment_filter=None,
1141
+ )
1142
+
1143
+ if not experiments:
1144
+ if args.pref:
1145
+ console.print(f"[yellow]No experiments found matching pattern:[/yellow] {args.pref}")
1146
+ else:
1147
+ console.print("[yellow]No experiments found in local storage[/yellow]")
1148
+ return 1
1149
+
1150
+ # Filter out already completed experiments when resuming
1151
+ if args.resume and upload_state.completed_experiments:
1152
+ original_count = len(experiments)
1153
+ experiments = [
1154
+ exp
1155
+ for exp in experiments
1156
+ if f"{exp.project}/{exp.experiment}" not in upload_state.completed_experiments
1157
+ ]
1158
+ skipped_count = original_count - len(experiments)
1159
+ if skipped_count > 0:
1160
+ console.print(
1161
+ f"[dim]Skipping {skipped_count} already completed experiment(s)[/dim]"
1162
+ )
1163
+
1164
+ console.print(f"[green]Found {len(experiments)} experiment(s) to upload[/green]")
1165
+
1166
+ # Display discovered experiments
1167
+ if args.verbose or args.dry_run:
1168
+ console.print("\n[bold]Discovered experiments:[/bold]")
1059
1169
  for exp in experiments:
1060
- validation = validator.validate_experiment(exp)
1061
- validation_results[f"{exp.project}/{exp.experiment}"] = validation
1170
+ parts = []
1171
+ if exp.has_logs:
1172
+ parts.append("logs")
1173
+ if exp.has_params:
1174
+ parts.append("params")
1175
+ if exp.metric_names:
1176
+ parts.append(f"{len(exp.metric_names)} metrics")
1177
+ if exp.file_count:
1178
+ size_mb = exp.estimated_size / (1024 * 1024)
1179
+ parts.append(f"{exp.file_count} files ({size_mb:.1f}MB)")
1180
+
1181
+ details = ", ".join(parts) if parts else "metadata only"
1182
+ console.print(
1183
+ f" [cyan]•[/cyan] {exp.project}/{exp.experiment} [dim]({details})[/dim]"
1184
+ )
1185
+
1186
+ # Dry-run mode: stop here
1187
+ if args.dry_run:
1188
+ console.print("\n[yellow bold]DRY RUN[/yellow bold] - No data will be uploaded")
1189
+ console.print("Run without --dry-run to proceed with upload.")
1190
+ return 0
1191
+
1192
+ # Validate experiments
1193
+ console.print("\n[bold]Validating experiments...[/bold]")
1194
+ validator = ExperimentValidator(strict=args.strict)
1195
+ validation_results = {}
1196
+ valid_experiments = []
1197
+ invalid_experiments = []
1198
+
1199
+ for exp in experiments:
1200
+ validation = validator.validate_experiment(exp)
1201
+ validation_results[f"{exp.project}/{exp.experiment}"] = validation
1202
+
1203
+ if validation.is_valid:
1204
+ valid_experiments.append(exp)
1205
+ else:
1206
+ invalid_experiments.append(exp)
1207
+
1208
+ # Show warnings and errors
1209
+ if args.verbose or validation.errors:
1210
+ exp_key = f"{exp.project}/{exp.experiment}"
1211
+ if validation.errors:
1212
+ console.print(f" [red]✗[/red] {exp_key}:")
1213
+ for error in validation.errors:
1214
+ console.print(f" [red]{error}[/red]")
1215
+ elif validation.warnings:
1216
+ console.print(f" [yellow]⚠[/yellow] {exp_key}:")
1217
+ for warning in validation.warnings:
1218
+ console.print(f" [yellow]{warning}[/yellow]")
1219
+
1220
+ if invalid_experiments:
1221
+ console.print(
1222
+ f"\n[yellow]{len(invalid_experiments)} experiment(s) failed validation and will be skipped[/yellow]"
1223
+ )
1224
+ if args.strict:
1225
+ console.print("[red]Error: Validation failed in --strict mode[/red]")
1226
+ return 1
1227
+
1228
+ if not valid_experiments:
1229
+ console.print("[red]Error: No valid experiments to upload[/red]")
1230
+ return 1
1231
+
1232
+ console.print(
1233
+ f"[green]{len(valid_experiments)} experiment(s) ready to upload[/green]"
1234
+ )
1235
+
1236
+ # Extract namespace from target or first experiment
1237
+ namespace = None
1238
+ if args.target:
1239
+ # Parse namespace from target prefix (format: "owner/project/...")
1240
+ target_parts = args.target.strip("/").split("/")
1241
+ if len(target_parts) >= 1:
1242
+ namespace = target_parts[0]
1243
+ if not namespace and valid_experiments:
1244
+ # Parse namespace from first experiment's prefix
1245
+ first_prefix = valid_experiments[0].prefix
1246
+ if first_prefix:
1247
+ prefix_parts = first_prefix.strip("/").split("/")
1248
+ if len(prefix_parts) >= 1:
1249
+ namespace = prefix_parts[0]
1250
+
1251
+ if not namespace:
1252
+ console.print("[red]Error:[/red] Could not determine namespace from experiments or target")
1253
+ return 1
1254
+
1255
+ # Initialize remote client and local storage
1256
+ remote_client = RemoteClient(base_url=remote_url, namespace=namespace, api_key=api_key)
1257
+ local_storage = LocalStorage(root_path=local_path)
1258
+
1259
+ # Upload experiments with progress tracking
1260
+ console.print(f"\n[bold]Uploading to:[/bold] {remote_url}")
1261
+ if args.target:
1262
+ console.print(f"[bold]Target prefix:[/bold] {args.target}")
1263
+ results = []
1264
+
1265
+ # Track upload timing
1266
+ import time
1267
+
1268
+ start_time = time.time()
1269
+
1270
+ # Create progress bar for overall upload
1271
+ with Progress(
1272
+ SpinnerColumn(),
1273
+ TextColumn("[progress.description]{task.description}"),
1274
+ BarColumn(),
1275
+ TaskProgressColumn(),
1276
+ console=console,
1277
+ transient=not args.verbose, # Keep progress visible in verbose mode
1278
+ ) as progress:
1279
+ # Create uploader with progress tracking
1280
+ uploader = ExperimentUploader(
1281
+ local_storage=local_storage,
1282
+ remote_client=remote_client,
1283
+ batch_size=args.batch_size,
1284
+ skip_logs=args.skip_logs,
1285
+ skip_metrics=args.skip_metrics,
1286
+ skip_files=args.skip_files,
1287
+ skip_params=args.skip_params,
1288
+ verbose=args.verbose,
1289
+ progress=progress,
1290
+ target_prefix=args.target,
1291
+ )
1062
1292
 
1063
- if validation.is_valid:
1064
- valid_experiments.append(exp)
1293
+ for i, exp in enumerate(valid_experiments, start=1):
1294
+ exp_key = f"{exp.project}/{exp.experiment}"
1295
+
1296
+ # Create task for this experiment
1297
+ task_id = progress.add_task(
1298
+ f"[{i}/{len(valid_experiments)}] {exp_key}",
1299
+ total=100, # Will be updated with actual steps
1300
+ )
1301
+
1302
+ # Update state - mark as in progress
1303
+ upload_state.in_progress_experiment = exp_key
1304
+ if not args.dry_run:
1305
+ upload_state.save(state_file)
1306
+
1307
+ validation = validation_results[exp_key]
1308
+ result = uploader.upload_experiment(exp, validation, task_id=task_id)
1309
+ results.append(result)
1310
+
1311
+ # Update state - mark as completed or failed
1312
+ upload_state.in_progress_experiment = None
1313
+ if result.success:
1314
+ upload_state.completed_experiments.append(exp_key)
1315
+ else:
1316
+ upload_state.failed_experiments.append(exp_key)
1317
+
1318
+ if not args.dry_run:
1319
+ upload_state.save(state_file)
1320
+
1321
+ # Update task to completed
1322
+ progress.update(task_id, completed=100, total=100)
1323
+
1324
+ if not args.verbose:
1325
+ # Show brief status
1326
+ if result.success:
1327
+ parts = []
1328
+ if result.uploaded.get("params"):
1329
+ parts.append(f"{result.uploaded['params']} params")
1330
+ if result.uploaded.get("logs"):
1331
+ parts.append(f"{result.uploaded['logs']} logs")
1332
+ if result.uploaded.get("metrics"):
1333
+ parts.append(f"{result.uploaded['metrics']} metrics")
1334
+ if result.uploaded.get("files"):
1335
+ parts.append(f"{result.uploaded['files']} files")
1336
+ status = ", ".join(parts) if parts else "metadata only"
1337
+ console.print(f" [green]✓[/green] Uploaded ({status})")
1065
1338
  else:
1066
- invalid_experiments.append(exp)
1067
-
1068
- # Show warnings and errors
1069
- if args.verbose or validation.errors:
1070
- exp_key = f"{exp.project}/{exp.experiment}"
1071
- if validation.errors:
1072
- console.print(f" [red]✗[/red] {exp_key}:")
1073
- for error in validation.errors:
1074
- console.print(f" [red]{error}[/red]")
1075
- elif validation.warnings:
1076
- console.print(f" [yellow]⚠[/yellow] {exp_key}:")
1077
- for warning in validation.warnings:
1078
- console.print(f" [yellow]{warning}[/yellow]")
1079
-
1080
- if invalid_experiments:
1081
- console.print(f"\n[yellow]{len(invalid_experiments)} experiment(s) failed validation and will be skipped[/yellow]")
1082
- if args.strict:
1083
- console.print("[red]Error: Validation failed in --strict mode[/red]")
1084
- return 1
1085
-
1086
- if not valid_experiments:
1087
- console.print("[red]Error: No valid experiments to upload[/red]")
1088
- return 1
1089
-
1090
- console.print(f"[green]{len(valid_experiments)} experiment(s) ready to upload[/green]")
1091
-
1092
- # Initialize remote client and local storage
1093
- remote_client = RemoteClient(base_url=remote_url, api_key=api_key)
1094
- local_storage = LocalStorage(root_path=local_path)
1095
-
1096
- # Upload experiments with progress tracking
1097
- console.print(f"\n[bold]Uploading to:[/bold] {remote_url}")
1098
- results = []
1099
-
1100
- # Track upload timing
1101
- import time
1102
- start_time = time.time()
1103
-
1104
- # Create progress bar for overall upload
1105
- with Progress(
1106
- SpinnerColumn(),
1107
- TextColumn("[progress.description]{task.description}"),
1108
- BarColumn(),
1109
- TaskProgressColumn(),
1110
- console=console,
1111
- transient=not args.verbose, # Keep progress visible in verbose mode
1112
- ) as progress:
1113
- # Create uploader with progress tracking
1114
- uploader = ExperimentUploader(
1115
- local_storage=local_storage,
1116
- remote_client=remote_client,
1117
- batch_size=args.batch_size,
1118
- skip_logs=args.skip_logs,
1119
- skip_metrics=args.skip_metrics,
1120
- skip_files=args.skip_files,
1121
- skip_params=args.skip_params,
1122
- verbose=args.verbose,
1123
- progress=progress,
1124
- )
1339
+ console.print(" [red]✗[/red] Failed")
1340
+ if result.errors:
1341
+ for error in result.errors[:3]: # Show first 3 errors
1342
+ console.print(f" [red]{error}[/red]")
1343
+
1344
+ # Calculate timing
1345
+ end_time = time.time()
1346
+ elapsed_time = end_time - start_time
1347
+ total_bytes = sum(r.bytes_uploaded for r in results)
1348
+
1349
+ # Print summary with rich Table
1350
+ console.print()
1351
+
1352
+ successful = [r for r in results if r.success]
1353
+ failed = [r for r in results if not r.success]
1354
+
1355
+ # Create summary table
1356
+ summary_table = Table(title="Upload Summary", show_header=True, header_style="bold")
1357
+ summary_table.add_column("Status", style="cyan")
1358
+ summary_table.add_column("Count", justify="right")
1359
+
1360
+ summary_table.add_row(
1361
+ "Successful", f"[green]{len(successful)}/{len(results)}[/green]"
1362
+ )
1363
+ if failed:
1364
+ summary_table.add_row("Failed", f"[red]{len(failed)}/{len(results)}[/red]")
1365
+
1366
+ # Add timing information
1367
+ summary_table.add_row("Total Time", f"{elapsed_time:.2f}s")
1368
+
1369
+ # Calculate and display upload speed
1370
+ if total_bytes > 0 and elapsed_time > 0:
1371
+ # Convert to appropriate unit
1372
+ if total_bytes < 1024 * 1024: # Less than 1 MB
1373
+ speed_kb = (total_bytes / 1024) / elapsed_time
1374
+ summary_table.add_row("Avg Speed", f"{speed_kb:.2f} KB/s")
1375
+ else: # 1 MB or more
1376
+ speed_mb = (total_bytes / (1024 * 1024)) / elapsed_time
1377
+ summary_table.add_row("Avg Speed", f"{speed_mb:.2f} MB/s")
1378
+
1379
+ console.print(summary_table)
1380
+
1381
+ # Show failed experiments
1382
+ if failed:
1383
+ console.print("\n[bold red]Failed Experiments:[/bold red]")
1384
+ for result in failed:
1385
+ console.print(f" [red]✗[/red] {result.experiment}")
1386
+ for error in result.errors:
1387
+ console.print(f" [dim]{error}[/dim]")
1388
+
1389
+ # Data statistics
1390
+ total_logs = sum(r.uploaded.get("logs", 0) for r in results)
1391
+ total_metrics = sum(r.uploaded.get("metrics", 0) for r in results)
1392
+ total_files = sum(r.uploaded.get("files", 0) for r in results)
1393
+
1394
+ if total_logs or total_metrics or total_files:
1395
+ data_table = Table(title="Data Uploaded", show_header=True, header_style="bold")
1396
+ data_table.add_column("Type", style="cyan")
1397
+ data_table.add_column("Count", justify="right", style="green")
1398
+
1399
+ if total_logs:
1400
+ data_table.add_row("Logs", f"{total_logs} entries")
1401
+ if total_metrics:
1402
+ data_table.add_row("Metrics", f"{total_metrics} metrics")
1403
+ if total_files:
1404
+ data_table.add_row("Files", f"{total_files} files")
1125
1405
 
1126
- for i, exp in enumerate(valid_experiments, start=1):
1127
- exp_key = f"{exp.project}/{exp.experiment}"
1128
-
1129
- # Create task for this experiment
1130
- task_id = progress.add_task(
1131
- f"[{i}/{len(valid_experiments)}] {exp_key}",
1132
- total=100, # Will be updated with actual steps
1133
- )
1134
-
1135
- # Update state - mark as in progress
1136
- upload_state.in_progress_experiment = exp_key
1137
- if not args.dry_run:
1138
- upload_state.save(state_file)
1139
-
1140
- validation = validation_results[exp_key]
1141
- result = uploader.upload_experiment(exp, validation, task_id=task_id)
1142
- results.append(result)
1143
-
1144
- # Update state - mark as completed or failed
1145
- upload_state.in_progress_experiment = None
1146
- if result.success:
1147
- upload_state.completed_experiments.append(exp_key)
1148
- else:
1149
- upload_state.failed_experiments.append(exp_key)
1150
-
1151
- if not args.dry_run:
1152
- upload_state.save(state_file)
1153
-
1154
- # Update task to completed
1155
- progress.update(task_id, completed=100, total=100)
1156
-
1157
- if not args.verbose:
1158
- # Show brief status
1159
- if result.success:
1160
- parts = []
1161
- if result.uploaded.get("params"):
1162
- parts.append(f"{result.uploaded['params']} params")
1163
- if result.uploaded.get("logs"):
1164
- parts.append(f"{result.uploaded['logs']} logs")
1165
- if result.uploaded.get("metrics"):
1166
- parts.append(f"{result.uploaded['metrics']} metrics")
1167
- if result.uploaded.get("files"):
1168
- parts.append(f"{result.uploaded['files']} files")
1169
- status = ", ".join(parts) if parts else "metadata only"
1170
- console.print(f" [green]✓[/green] Uploaded ({status})")
1171
- else:
1172
- console.print(f" [red]✗[/red] Failed")
1173
- if result.errors:
1174
- for error in result.errors[:3]: # Show first 3 errors
1175
- console.print(f" [red]{error}[/red]")
1176
-
1177
- # Calculate timing
1178
- end_time = time.time()
1179
- elapsed_time = end_time - start_time
1180
- total_bytes = sum(r.bytes_uploaded for r in results)
1181
-
1182
- # Print summary with rich Table
1183
1406
  console.print()
1407
+ console.print(data_table)
1408
+
1409
+ # Clean up state file if all uploads succeeded
1410
+ if not args.dry_run and len(failed) == 0 and state_file.exists():
1411
+ state_file.unlink()
1412
+ console.print("\n[dim]Upload complete. State file removed.[/dim]")
1413
+ elif not args.dry_run and failed:
1414
+ console.print(
1415
+ f"\n[yellow]State saved to {state_file}. Use --resume to retry failed uploads.[/yellow]"
1416
+ )
1184
1417
 
1185
- successful = [r for r in results if r.success]
1186
- failed = [r for r in results if not r.success]
1187
-
1188
- # Create summary table
1189
- summary_table = Table(title="Upload Summary", show_header=True, header_style="bold")
1190
- summary_table.add_column("Status", style="cyan")
1191
- summary_table.add_column("Count", justify="right")
1192
-
1193
- summary_table.add_row("Successful", f"[green]{len(successful)}/{len(results)}[/green]")
1194
- if failed:
1195
- summary_table.add_row("Failed", f"[red]{len(failed)}/{len(results)}[/red]")
1196
-
1197
- # Add timing information
1198
- summary_table.add_row("Total Time", f"{elapsed_time:.2f}s")
1199
-
1200
- # Calculate and display upload speed
1201
- if total_bytes > 0 and elapsed_time > 0:
1202
- # Convert to appropriate unit
1203
- if total_bytes < 1024 * 1024: # Less than 1 MB
1204
- speed_kb = (total_bytes / 1024) / elapsed_time
1205
- summary_table.add_row("Avg Speed", f"{speed_kb:.2f} KB/s")
1206
- else: # 1 MB or more
1207
- speed_mb = (total_bytes / (1024 * 1024)) / elapsed_time
1208
- summary_table.add_row("Avg Speed", f"{speed_mb:.2f} MB/s")
1209
-
1210
- console.print(summary_table)
1211
-
1212
- # Show failed experiments
1213
- if failed:
1214
- console.print("\n[bold red]Failed Experiments:[/bold red]")
1215
- for result in failed:
1216
- console.print(f" [red]✗[/red] {result.experiment}")
1217
- for error in result.errors:
1218
- console.print(f" [dim]{error}[/dim]")
1219
-
1220
- # Data statistics
1221
- total_logs = sum(r.uploaded.get("logs", 0) for r in results)
1222
- total_metrics = sum(r.uploaded.get("metrics", 0) for r in results)
1223
- total_files = sum(r.uploaded.get("files", 0) for r in results)
1224
-
1225
- if total_logs or total_metrics or total_files:
1226
- data_table = Table(title="Data Uploaded", show_header=True, header_style="bold")
1227
- data_table.add_column("Type", style="cyan")
1228
- data_table.add_column("Count", justify="right", style="green")
1229
-
1230
- if total_logs:
1231
- data_table.add_row("Logs", f"{total_logs} entries")
1232
- if total_metrics:
1233
- data_table.add_row("Metrics", f"{total_metrics} metrics")
1234
- if total_files:
1235
- data_table.add_row("Files", f"{total_files} files")
1236
-
1237
- console.print()
1238
- console.print(data_table)
1239
-
1240
- # Clean up state file if all uploads succeeded
1241
- if not args.dry_run and len(failed) == 0 and state_file.exists():
1242
- state_file.unlink()
1243
- console.print("\n[dim]Upload complete. State file removed.[/dim]")
1244
- elif not args.dry_run and failed:
1245
- console.print(f"\n[yellow]State saved to {state_file}. Use --resume to retry failed uploads.[/yellow]")
1246
-
1247
- # Return exit code
1248
- return 0 if len(failed) == 0 else 1
1418
+ # Return exit code
1419
+ return 0 if len(failed) == 0 else 1