ml-dash 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1248 @@
+ """Upload command implementation for ML-Dash CLI."""
+
+ import argparse
+ import json
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional
+ from dataclasses import dataclass, field
+ import threading
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ from rich.console import Console
+ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
+ from rich.table import Table
+ from rich.panel import Panel
+
+ from ..storage import LocalStorage
+ from ..client import RemoteClient
+ from ..config import Config
+
+ # Initialize rich console
+ console = Console()
+
+
+ @dataclass
+ class ExperimentInfo:
+     """Information about an experiment to upload."""
+     project: str
+     experiment: str
+     path: Path
+     folder: Optional[str] = None
+     has_logs: bool = False
+     has_params: bool = False
+     metric_names: List[str] = field(default_factory=list)
+     file_count: int = 0
+     estimated_size: int = 0  # in bytes
+
+
+ @dataclass
+ class ValidationResult:
+     """Result of experiment validation."""
+     is_valid: bool = True
+     warnings: List[str] = field(default_factory=list)
+     errors: List[str] = field(default_factory=list)
+     valid_data: Dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class UploadResult:
+     """Result of uploading an experiment."""
+     experiment: str
+     success: bool = False
+     uploaded: Dict[str, int] = field(default_factory=dict)  # {"logs": 100, "metrics": 3}
+     failed: Dict[str, List[str]] = field(default_factory=dict)  # {"files": ["error msg"]}
+     errors: List[str] = field(default_factory=list)
+     bytes_uploaded: int = 0  # Total bytes uploaded
+
+
+ @dataclass
+ class UploadState:
+     """Tracks upload state for resume functionality."""
+     local_path: str
+     remote_url: str
+     completed_experiments: List[str] = field(default_factory=list)  # ["project/experiment"]
+     failed_experiments: List[str] = field(default_factory=list)
+     in_progress_experiment: Optional[str] = None
+     timestamp: Optional[str] = None
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for JSON serialization."""
+         return {
+             "local_path": self.local_path,
+             "remote_url": self.remote_url,
+             "completed_experiments": self.completed_experiments,
+             "failed_experiments": self.failed_experiments,
+             "in_progress_experiment": self.in_progress_experiment,
+             "timestamp": self.timestamp,
+         }
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "UploadState":
+         """Create from dictionary."""
+         return cls(
+             local_path=data["local_path"],
+             remote_url=data["remote_url"],
+             completed_experiments=data.get("completed_experiments", []),
+             failed_experiments=data.get("failed_experiments", []),
+             in_progress_experiment=data.get("in_progress_experiment"),
+             timestamp=data.get("timestamp"),
+         )
+
+     def save(self, path: Path):
+         """Save state to file."""
+         import datetime
+         self.timestamp = datetime.datetime.now().isoformat()
+         with open(path, "w") as f:
+             json.dump(self.to_dict(), f, indent=2)
+
+     @classmethod
+     def load(cls, path: Path) -> Optional["UploadState"]:
+         """Load state from file."""
+         if not path.exists():
+             return None
+         try:
+             with open(path, "r") as f:
+                 data = json.load(f)
+             return cls.from_dict(data)
+         except (json.JSONDecodeError, IOError, KeyError):
+             return None
+
+
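+ # For reference, a saved state file looks roughly like this (keys match
+ # to_dict() above; values are illustrative):
+ #
+ #   {
+ #     "local_path": "/abs/path/to/.ml-dash",
+ #     "remote_url": "https://dash.example.com",
+ #     "completed_experiments": ["my-project/run-1"],
+ #     "failed_experiments": [],
+ #     "in_progress_experiment": null,
+ #     "timestamp": "2025-01-01T12:00:00"
+ #   }
+
+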
+ def add_parser(subparsers) -> argparse.ArgumentParser:
+     """Add upload command parser."""
+     parser = subparsers.add_parser(
+         "upload",
+         help="Upload local experiments to remote server",
+         description="Upload locally-stored ML-Dash experiment data to a remote server.",
+     )
+
+     # Positional argument
+     parser.add_argument(
+         "path",
+         nargs="?",
+         default="./.ml-dash",
+         help="Local storage directory to upload from (default: ./.ml-dash)",
+     )
+
+     # Remote configuration
+     parser.add_argument(
+         "--remote",
+         type=str,
+         help="Remote server URL (required unless set in config)",
+     )
+     parser.add_argument(
+         "--api-key",
+         type=str,
+         help="JWT token for authentication (optional - auto-loads from 'ml-dash login' if not provided)",
+     )
+
+     # Scope control
+     parser.add_argument(
+         "--project",
+         type=str,
+         help="Upload only experiments from this project",
+     )
+     parser.add_argument(
+         "--experiment",
+         type=str,
+         help="Upload only this specific experiment (requires --project)",
+     )
+
+     # Data filtering
+     parser.add_argument(
+         "--skip-logs",
+         action="store_true",
+         help="Don't upload logs",
+     )
+     parser.add_argument(
+         "--skip-metrics",
+         action="store_true",
+         help="Don't upload metrics",
+     )
+     parser.add_argument(
+         "--skip-files",
+         action="store_true",
+         help="Don't upload files",
+     )
+     parser.add_argument(
+         "--skip-params",
+         action="store_true",
+         help="Don't upload parameters",
+     )
+
+     # Behavior control
+     parser.add_argument(
+         "--dry-run",
+         action="store_true",
+         help="Show what would be uploaded without uploading",
+     )
+     parser.add_argument(
+         "--strict",
+         action="store_true",
+         help="Fail on any validation error (default: skip invalid data)",
+     )
+     parser.add_argument(
+         "-v", "--verbose",
+         action="store_true",
+         help="Show detailed progress",
+     )
+     parser.add_argument(
+         "--batch-size",
+         type=int,
+         default=100,
+         help="Batch size for logs/metrics (default: 100)",
+     )
+     parser.add_argument(
+         "--resume",
+         action="store_true",
+         help="Resume previous interrupted upload",
+     )
+     parser.add_argument(
+         "--state-file",
+         type=str,
+         default=".ml-dash-upload-state.json",
+         help="Path to state file for resume (default: .ml-dash-upload-state.json)",
+     )
+
+     return parser
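+
+
+ # Typical invocations (illustrative), assuming the CLI entry point is `ml-dash`:
+ #   ml-dash upload --remote https://dash.example.com
+ #   ml-dash upload ./.ml-dash --project my-project --dry-run
+ #   ml-dash upload --resume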
+
+
+ def discover_experiments(
+     local_path: Path,
+     project_filter: Optional[str] = None,
+     experiment_filter: Optional[str] = None,
+ ) -> List[ExperimentInfo]:
+     """
+     Discover experiments in local storage directory.
+
+     Supports both flat (local_path/project/experiment) and folder-based
+     (local_path/folder/project/experiment) hierarchies.
+
+     Args:
+         local_path: Root path of local storage
+         project_filter: Only discover experiments in this project
+         experiment_filter: Only discover this experiment (requires project_filter)
+
+     Returns:
+         List of ExperimentInfo objects
+     """
+     local_path = Path(local_path)
+
+     if not local_path.exists():
+         return []
+
+     experiments = []
+
+     # Find all experiment.json files recursively
+     for exp_json in local_path.rglob("*/experiment.json"):
+         exp_dir = exp_json.parent
+
+         # Extract project and experiment names from path
+         # Path structure: local_path / [folder] / project / experiment
+         try:
+             relative_path = exp_dir.relative_to(local_path)
+             parts = relative_path.parts
+
+             if len(parts) < 2:
+                 continue  # Need at least project/experiment
+
+             # Last two parts are project/experiment
+             exp_name = parts[-1]
+             project_name = parts[-2]
+
+             # Apply filters
+             if project_filter and project_name != project_filter:
+                 continue
+             if experiment_filter and exp_name != experiment_filter:
+                 continue
+
+             # Read folder from experiment.json
+             folder = None
+             try:
+                 with open(exp_json, 'r') as f:
+                     metadata = json.load(f)
+                 folder = metadata.get('folder')
+             except (json.JSONDecodeError, OSError):
+                 pass
+
+             # Create experiment info
+             exp_info = ExperimentInfo(
+                 project=project_name,
+                 experiment=exp_name,
+                 path=exp_dir,
+                 folder=folder,
+             )
+         except (ValueError, IndexError):
+             continue
+
+         # Check for parameters
+         params_file = exp_dir / "parameters.json"
+         exp_info.has_params = params_file.exists()
+
+         # Check for logs
+         logs_file = exp_dir / "logs" / "logs.jsonl"
+         exp_info.has_logs = logs_file.exists()
+
+         # Check for metrics
+         metrics_dir = exp_dir / "metrics"
+         if metrics_dir.exists():
+             for metric_dir in metrics_dir.iterdir():
+                 if metric_dir.is_dir():
+                     data_file = metric_dir / "data.jsonl"
+                     if data_file.exists():
+                         exp_info.metric_names.append(metric_dir.name)
+
+         # Check for files
+         files_dir = exp_dir / "files"
+         if files_dir.exists():
+             try:
+                 # Count files and estimate size in one recursive pass
+                 for f in files_dir.rglob("*"):
+                     if f.is_file():
+                         exp_info.file_count += 1
+                         exp_info.estimated_size += f.stat().st_size
+             except (OSError, PermissionError):
+                 pass
+
+         experiments.append(exp_info)
+
+     return experiments
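+
+
+ # Expected on-disk layout for a discovered experiment (sketch; optional
+ # pieces may be absent):
+ #
+ #   <local_path>/[folder/]<project>/<experiment>/
+ #       experiment.json              # required: name, project, ...
+ #       parameters.json              # optional key/value parameters
+ #       logs/logs.jsonl              # optional, one JSON object per line
+ #       metrics/<name>/data.jsonl    # optional, one {"data": ...} per line
+ #       files/...                    # optional uploaded artifacts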
+
+
+ class ExperimentValidator:
+     """Validates local experiment data before upload."""
+
+     def __init__(self, strict: bool = False):
+         """
+         Initialize validator.
+
+         Args:
+             strict: If True, fail on any validation error
+         """
+         self.strict = strict
+
+     def validate_experiment(self, exp_info: ExperimentInfo) -> ValidationResult:
+         """
+         Validate experiment directory structure and data.
+
+         Args:
+             exp_info: Experiment information
+
+         Returns:
+             ValidationResult with validation status and messages
+         """
+         result = ValidationResult()
+         result.valid_data = {}
+
+         # 1. Validate experiment metadata (required)
+         if not self._validate_experiment_metadata(exp_info, result):
+             result.is_valid = False
+             return result
+
+         # 2. Validate parameters (optional)
+         self._validate_parameters(exp_info, result)
+
+         # 3. Validate logs (optional)
+         self._validate_logs(exp_info, result)
+
+         # 4. Validate metrics (optional)
+         self._validate_metrics(exp_info, result)
+
+         # 5. Validate files (optional)
+         self._validate_files(exp_info, result)
+
+         # In strict mode, any warning becomes an error
+         if self.strict and result.warnings:
+             result.errors.extend(result.warnings)
+             result.warnings = []
+             result.is_valid = False
+
+         return result
+
+     def _validate_experiment_metadata(self, exp_info: ExperimentInfo, result: ValidationResult) -> bool:
+         """Validate experiment.json exists and is valid."""
+         exp_json = exp_info.path / "experiment.json"
+
+         if not exp_json.exists():
+             result.errors.append("Missing experiment.json")
+             return False
+
+         try:
+             with open(exp_json, "r") as f:
+                 metadata = json.load(f)
+
+             # Check required fields
+             if "name" not in metadata or "project" not in metadata:
+                 result.errors.append("experiment.json missing required fields (name, project)")
+                 return False
+
+             result.valid_data["metadata"] = metadata
+             return True
+
+         except json.JSONDecodeError as e:
+             result.errors.append(f"Invalid JSON in experiment.json: {e}")
+             return False
+         except IOError as e:
+             result.errors.append(f"Cannot read experiment.json: {e}")
+             return False
+
+     def _validate_parameters(self, exp_info: ExperimentInfo, result: ValidationResult):
+         """Validate parameters.json format."""
+         if not exp_info.has_params:
+             return
+
+         params_file = exp_info.path / "parameters.json"
+         try:
+             with open(params_file, "r") as f:
+                 params = json.load(f)
+
+             # Check if it's a dict
+             if not isinstance(params, dict):
+                 result.warnings.append("parameters.json is not a dict (will skip)")
+                 return
+
+             # Check for valid data key if using versioned format
+             if "data" in params:
+                 if not isinstance(params["data"], dict):
+                     result.warnings.append("parameters.json data is not a dict (will skip)")
+                     return
+                 result.valid_data["parameters"] = params["data"]
+             else:
+                 result.valid_data["parameters"] = params
+
+         except json.JSONDecodeError as e:
+             result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
+         except IOError as e:
+             result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
+
+     def _validate_logs(self, exp_info: ExperimentInfo, result: ValidationResult):
+         """Validate logs.jsonl format."""
+         if not exp_info.has_logs:
+             return
+
+         logs_file = exp_info.path / "logs" / "logs.jsonl"
+         invalid_lines = []
+
+         try:
+             with open(logs_file, "r") as f:
+                 for line_num, line in enumerate(f, start=1):
+                     try:
+                         log_entry = json.loads(line)
+                         # Check required fields
+                         if "message" not in log_entry:
+                             invalid_lines.append(line_num)
+                     except json.JSONDecodeError:
+                         invalid_lines.append(line_num)
+
+             if invalid_lines:
+                 count = len(invalid_lines)
+                 preview = invalid_lines[:5]
+                 result.warnings.append(
+                     f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
+                 )
+
+         except IOError as e:
+             result.warnings.append(f"Cannot read logs.jsonl: {e} (will skip logs)")
+
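+     # A well-formed logs.jsonl line looks like this (fields other than
+     # "message" are optional; values illustrative):
+     #   {"timestamp": "2025-01-01T12:00:00", "level": "info", "message": "epoch done", "metadata": {...}}
+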
+     def _validate_metrics(self, exp_info: ExperimentInfo, result: ValidationResult):
+         """Validate metrics data."""
+         if not exp_info.metric_names:
+             return
+
+         for metric_name in exp_info.metric_names:
+             metric_dir = exp_info.path / "metrics" / metric_name
+             data_file = metric_dir / "data.jsonl"
+
+             invalid_lines = []
+             try:
+                 with open(data_file, "r") as f:
+                     for line_num, line in enumerate(f, start=1):
+                         try:
+                             data_point = json.loads(line)
+                             # Check for data field
+                             if "data" not in data_point:
+                                 invalid_lines.append(line_num)
+                         except json.JSONDecodeError:
+                             invalid_lines.append(line_num)
+
+                 if invalid_lines:
+                     count = len(invalid_lines)
+                     preview = invalid_lines[:5]
+                     result.warnings.append(
+                         f"metric '{metric_name}' has {count} invalid lines (e.g., {preview}...) - will skip these"
+                     )
+
+             except IOError as e:
+                 result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
+
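+     # A well-formed metric data point wraps its payload in a "data" key
+     # (payload shape illustrative):
+     #   {"data": {"step": 10, "value": 0.42}}
+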
+     def _validate_files(self, exp_info: ExperimentInfo, result: ValidationResult):
+         """Validate that referenced files exist on disk."""
+         files_dir = exp_info.path / "files"
+         if not files_dir.exists():
+             return
+
+         metadata_file = files_dir / ".files_metadata.json"
+         if not metadata_file.exists():
+             return
+
+         try:
+             with open(metadata_file, "r") as f:
+                 files_metadata = json.load(f)
+
+             missing_files = []
+             for file_id, file_info in files_metadata.items():
+                 if isinstance(file_info, dict) and file_info.get("deletedAt") is None:
+                     # Check if file exists
+                     file_path = files_dir / file_info.get("prefix", "") / file_id / file_info.get("filename", "")
+                     if not file_path.exists():
+                         missing_files.append(file_info.get("filename", file_id))
+
+             if missing_files:
+                 count = len(missing_files)
+                 preview = missing_files[:3]
+                 result.warnings.append(
+                     f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
+                 )
+
+         except (json.JSONDecodeError, IOError):
+             pass  # If we can't read metadata, just skip file validation
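+
+     # For reference, a .files_metadata.json entry looks roughly like this
+     # (keys as consumed by the upload path; values illustrative):
+     #   {"<file-id>": {"filename": "model.pt", "path": "checkpoints/",
+     #                  "checksum": "...", "contentType": "application/octet-stream",
+     #                  "sizeBytes": 1048576, "deletedAt": null, "tags": []}}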
+
+
+ class ExperimentUploader:
+     """Handles uploading a single experiment."""
+
+     def __init__(
+         self,
+         local_storage: LocalStorage,
+         remote_client: RemoteClient,
+         batch_size: int = 100,
+         skip_logs: bool = False,
+         skip_metrics: bool = False,
+         skip_files: bool = False,
+         skip_params: bool = False,
+         verbose: bool = False,
+         progress: Optional[Progress] = None,
+         max_concurrent_metrics: int = 5,
+     ):
+         """
+         Initialize uploader.
+
+         Args:
+             local_storage: Local storage instance
+             remote_client: Remote client instance
+             batch_size: Batch size for logs/metrics
+             skip_logs: Skip uploading logs
+             skip_metrics: Skip uploading metrics
+             skip_files: Skip uploading files
+             skip_params: Skip uploading parameters
+             verbose: Show verbose output
+             progress: Optional rich Progress instance for tracking
+             max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
+         """
+         self.local = local_storage
+         self.remote = remote_client
+         self.batch_size = batch_size
+         self.skip_logs = skip_logs
+         self.skip_metrics = skip_metrics
+         self.skip_files = skip_files
+         self.skip_params = skip_params
+         self.verbose = verbose
+         self.progress = progress
+         self.max_concurrent_metrics = max_concurrent_metrics
+         # Thread-safe lock for shared state updates
+         self._lock = threading.Lock()
+         # Thread-local storage for remote clients (for thread-safe HTTP requests)
+         self._thread_local = threading.local()
+
+     def _get_remote_client(self) -> RemoteClient:
+         """Get thread-local remote client for safe concurrent access."""
+         if not hasattr(self._thread_local, 'client'):
+             # Create a new client for this thread
+             self._thread_local.client = RemoteClient(
+                 base_url=self.remote.base_url,
+                 api_key=self.remote.api_key
+             )
+         return self._thread_local.client
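+
+     # Each worker thread lazily gets its own RemoteClient above; this assumes
+     # the client (and any underlying HTTP session) is not safe to share
+     # across threads.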
+
+     def upload_experiment(
+         self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
+     ) -> UploadResult:
+         """
+         Upload a single experiment with all its data.
+
+         Args:
+             exp_info: Experiment information
+             validation_result: Validation results
+             task_id: Optional progress task ID
+
+         Returns:
+             UploadResult with upload status
+         """
+         result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")
+
+         # Calculate total steps for progress tracking
+         total_steps = 1  # metadata
+         if not self.skip_params and "parameters" in validation_result.valid_data:
+             total_steps += 1
+         if not self.skip_logs and exp_info.has_logs:
+             total_steps += 1
+         if not self.skip_metrics and exp_info.metric_names:
+             total_steps += len(exp_info.metric_names)
+         if not self.skip_files and exp_info.file_count > 0:
+             total_steps += exp_info.file_count
+
+         current_step = 0
+
+         def update_progress(description: str):
+             nonlocal current_step
+             current_step += 1
+             if self.progress and task_id is not None:
+                 self.progress.update(task_id, completed=current_step, total=total_steps, description=description)
+
+         try:
+             # 1. Create/update experiment metadata
+             update_progress("Creating experiment...")
+             if self.verbose:
+                 console.print(" [dim]Creating experiment...[/dim]")
+
+             # experiment.json contents, validated earlier
+             exp_data = validation_result.valid_data.get("metadata", {})
+
+             # Store folder path in metadata (not as folderId which expects Snowflake ID)
+             custom_metadata = exp_data.get("metadata") or {}
+             if exp_data.get("folder"):
+                 custom_metadata["folder"] = exp_data["folder"]
+
+             response = self.remote.create_or_update_experiment(
+                 project=exp_info.project,
+                 name=exp_info.experiment,
+                 description=exp_data.get("description"),
+                 tags=exp_data.get("tags"),
+                 bindrs=exp_data.get("bindrs"),
+                 folder=None,  # Don't send folder path as folderId (expects Snowflake ID)
+                 write_protected=exp_data.get("write_protected", False),
+                 metadata=custom_metadata if custom_metadata else None,
+             )
+
+             # Extract experiment ID from nested response
+             experiment_id = response.get("experiment", {}).get("id") or response.get("id")
+             if self.verbose:
+                 console.print(f" [green]✓[/green] Created experiment (id: {experiment_id})")
+
+             # 2. Upload parameters
+             if not self.skip_params and "parameters" in validation_result.valid_data:
+                 update_progress("Uploading parameters...")
+                 if self.verbose:
+                     console.print(" [dim]Uploading parameters...[/dim]")
+
+                 params = validation_result.valid_data["parameters"]
+                 self.remote.set_parameters(experiment_id, params)
+                 result.uploaded["params"] = len(params)
+                 # Track bytes (approximate JSON size)
+                 result.bytes_uploaded += len(json.dumps(params).encode('utf-8'))
+
+                 if self.verbose:
+                     console.print(f" [green]✓[/green] Uploaded {len(params)} parameters")
+
+             # 3. Upload logs
+             if not self.skip_logs and exp_info.has_logs:
+                 count = self._upload_logs(experiment_id, exp_info, result, task_id, update_progress)
+                 result.uploaded["logs"] = count
+
+             # 4. Upload metrics
+             if not self.skip_metrics and exp_info.metric_names:
+                 count = self._upload_metrics(experiment_id, exp_info, result, task_id, update_progress)
+                 result.uploaded["metrics"] = count
+
+             # 5. Upload files
+             if not self.skip_files and exp_info.file_count > 0:
+                 count = self._upload_files(experiment_id, exp_info, result, task_id, update_progress)
+                 result.uploaded["files"] = count
+
+             result.success = True
+
+         except Exception as e:
+             result.success = False
+             result.errors.append(str(e))
+             if self.verbose:
+                 console.print(f" [red]✗ Error: {e}[/red]")
+
+         return result
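+
+     # Rough call pattern (illustrative):
+     #   uploader = ExperimentUploader(local_storage, remote_client, verbose=True)
+     #   outcome = uploader.upload_experiment(exp_info, validation_result)
+     #   if not outcome.success: print(outcome.errors)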
+
+     def _upload_logs(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
+                      task_id=None, update_progress=None) -> int:
+         """Upload logs in batches."""
+         if update_progress:
+             update_progress("Uploading logs...")
+         if self.verbose:
+             console.print(" [dim]Uploading logs...[/dim]")
+
+         logs_file = exp_info.path / "logs" / "logs.jsonl"
+         logs_batch = []
+         total_uploaded = 0
+         skipped = 0
+
+         try:
+             with open(logs_file, "r") as f:
+                 for line in f:
+                     try:
+                         log_entry = json.loads(line)
+
+                         # Validate required fields
+                         if "message" not in log_entry:
+                             skipped += 1
+                             continue
+
+                         # Prepare log entry for API
+                         api_log = {
+                             "timestamp": log_entry.get("timestamp"),
+                             "level": log_entry.get("level", "info"),
+                             "message": log_entry["message"],
+                         }
+                         if "metadata" in log_entry:
+                             api_log["metadata"] = log_entry["metadata"]
+
+                         logs_batch.append(api_log)
+                         # Track bytes
+                         result.bytes_uploaded += len(line.encode('utf-8'))
+
+                         # Upload batch
+                         if len(logs_batch) >= self.batch_size:
+                             self.remote.create_log_entries(experiment_id, logs_batch)
+                             total_uploaded += len(logs_batch)
+                             logs_batch = []
+
+                     except json.JSONDecodeError:
+                         skipped += 1
+                         continue
+
+             # Upload remaining logs
+             if logs_batch:
+                 self.remote.create_log_entries(experiment_id, logs_batch)
+                 total_uploaded += len(logs_batch)
+
+             if self.verbose:
+                 msg = f" [green]✓[/green] Uploaded {total_uploaded} log entries"
+                 if skipped > 0:
+                     msg += f" (skipped {skipped} invalid)"
+                 console.print(msg)
+
+         except IOError as e:
+             result.failed.setdefault("logs", []).append(str(e))
+
+         return total_uploaded
+
+     def _upload_single_metric(
+         self,
+         experiment_id: str,
+         metric_name: str,
+         metric_dir: Path,
+     ) -> Dict[str, Any]:
+         """
+         Upload a single metric (thread-safe helper).
+
+         Returns:
+             Dict with 'success', 'uploaded', 'skipped', 'bytes', and 'error' keys
+         """
+         data_file = metric_dir / "data.jsonl"
+         data_batch = []
+         total_uploaded = 0
+         skipped = 0
+         bytes_uploaded = 0
+
+         # Get thread-local client for safe concurrent HTTP requests
+         remote_client = self._get_remote_client()
+
+         try:
+             with open(data_file, "r") as f:
+                 for line in f:
+                     try:
+                         data_point = json.loads(line)
+
+                         # Validate required fields
+                         if "data" not in data_point:
+                             skipped += 1
+                             continue
+
+                         data_batch.append(data_point["data"])
+                         bytes_uploaded += len(line.encode('utf-8'))
+
+                         # Upload batch using thread-local client
+                         if len(data_batch) >= self.batch_size:
+                             remote_client.append_batch_to_metric(
+                                 experiment_id, metric_name, data_batch
+                             )
+                             total_uploaded += len(data_batch)
+                             data_batch = []
+
+                     except json.JSONDecodeError:
+                         skipped += 1
+                         continue
+
+             # Upload remaining data points using thread-local client
+             if data_batch:
+                 remote_client.append_batch_to_metric(experiment_id, metric_name, data_batch)
+                 total_uploaded += len(data_batch)
+
+             return {
+                 'success': True,
+                 'uploaded': total_uploaded,
+                 'skipped': skipped,
+                 'bytes': bytes_uploaded,
+                 'error': None
+             }
+
+         except Exception as e:
+             return {
+                 'success': False,
+                 'uploaded': 0,
+                 'skipped': 0,
+                 'bytes': 0,
+                 'error': str(e)
+             }
+
+     def _upload_metrics(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
+                         task_id=None, update_progress=None) -> int:
+         """Upload metrics in parallel with a concurrency limit."""
+         if not exp_info.metric_names:
+             return 0
+
+         total_metrics = 0
+
+         # Use ThreadPoolExecutor for parallel uploads
+         with ThreadPoolExecutor(max_workers=self.max_concurrent_metrics) as executor:
+             # Submit all metric upload tasks
+             future_to_metric = {}
+             for metric_name in exp_info.metric_names:
+                 metric_dir = exp_info.path / "metrics" / metric_name
+                 future = executor.submit(
+                     self._upload_single_metric,
+                     experiment_id,
+                     metric_name,
+                     metric_dir,
+                 )
+                 future_to_metric[future] = metric_name
+
+             # Process completed uploads as they finish
+             for future in as_completed(future_to_metric):
+                 metric_name = future_to_metric[future]
+
+                 # Update progress
+                 if update_progress:
+                     update_progress(f"Uploading metric '{metric_name}'...")
+
+                 try:
+                     upload_result = future.result()
+
+                     # Thread-safe update of shared state
+                     with self._lock:
+                         result.bytes_uploaded += upload_result['bytes']
+
+                     if upload_result['success']:
+                         total_metrics += 1
+
+                         # Thread-safe console output
+                         if self.verbose:
+                             msg = f" [green]✓[/green] Uploaded {upload_result['uploaded']} data points for '{metric_name}'"
+                             if upload_result['skipped'] > 0:
+                                 msg += f" (skipped {upload_result['skipped']} invalid)"
+                             with self._lock:
+                                 console.print(msg)
+                     else:
+                         # Record failure
+                         error_msg = f"{metric_name}: {upload_result['error']}"
+                         with self._lock:
+                             result.failed.setdefault("metrics", []).append(error_msg)
+                         if self.verbose:
+                             console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {upload_result['error']}")
+
+                 except Exception as e:
+                     # Handle unexpected errors
+                     error_msg = f"{metric_name}: {str(e)}"
+                     with self._lock:
+                         result.failed.setdefault("metrics", []).append(error_msg)
+                     if self.verbose:
+                         console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {e}")
+
+         return total_metrics
+
+     def _upload_files(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
+                       task_id=None, update_progress=None) -> int:
+         """Upload files one by one."""
+         total_uploaded = 0
+
+         # Use LocalStorage to list files
+         try:
+             files_list = self.local.list_files(exp_info.project, exp_info.experiment)
+
+             # Resolve the experiment's files directory once; files are read in
+             # place from storage without copying
+             experiment_dir = self.local._get_experiment_dir(exp_info.project, exp_info.experiment)
+             files_dir = experiment_dir / "files"
+
+             for file_info in files_list:
+                 # Skip deleted files
+                 if file_info.get("deletedAt") is not None:
+                     continue
+
+                 try:
+                     if update_progress:
+                         update_progress(f"Uploading {file_info['filename']}...")
+
+                     # Construct file path
+                     file_id = file_info["id"]
+                     file_prefix = (file_info.get("path") or "").lstrip("/")
+                     if file_prefix:
+                         file_path = files_dir / file_prefix / file_id / file_info["filename"]
+                     else:
+                         file_path = files_dir / file_id / file_info["filename"]
+
+                     # Upload to remote with correct parameters
+                     self.remote.upload_file(
+                         experiment_id=experiment_id,
+                         file_path=str(file_path),
+                         prefix=file_info.get("path", ""),
+                         filename=file_info["filename"],
+                         description=file_info.get("description"),
+                         tags=file_info.get("tags", []),
+                         metadata=file_info.get("metadata"),
+                         checksum=file_info["checksum"],
+                         content_type=file_info["contentType"],
+                         size_bytes=file_info["sizeBytes"],
+                     )
+
+                     total_uploaded += 1
+                     # Track bytes
+                     result.bytes_uploaded += file_info.get("sizeBytes", 0)
+
+                     if self.verbose:
+                         size_mb = file_info.get("sizeBytes", 0) / (1024 * 1024)
+                         console.print(f" [green]✓[/green] {file_info['filename']} ({size_mb:.1f}MB)")
+
+                 except Exception as e:
+                     result.failed.setdefault("files", []).append(f"{file_info['filename']}: {e}")
+
+         except Exception as e:
+             result.failed.setdefault("files", []).append(str(e))
+
+         if self.verbose and not result.failed.get("files"):
+             console.print(f" [green]✓[/green] Uploaded {total_uploaded} files")
+
+         return total_uploaded
+
+
+ def cmd_upload(args: argparse.Namespace) -> int:
+     """
+     Execute upload command.
+
+     Args:
+         args: Parsed command-line arguments
+
+     Returns:
+         Exit code (0 for success, 1 for error)
+     """
+     # Load config
+     config = Config()
+
+     # Get remote URL (command line > config)
+     remote_url = args.remote or config.remote_url
+     if not remote_url:
+         console.print("[red]Error:[/red] --remote URL is required (or set in config)")
+         return 1
+
+     # Get API key (command line > config > auto-load from storage)
+     # RemoteClient will auto-load from storage if api_key is None
+     api_key = args.api_key or config.api_key
+
+     # Validate experiment filter requires project
+     if args.experiment and not args.project:
+         console.print("[red]Error:[/red] --experiment requires --project")
+         return 1
+
+     # Discover experiments
+     local_path = Path(args.path)
+     if not local_path.exists():
+         console.print(f"[red]Error:[/red] Local storage path does not exist: {local_path}")
+         return 1
+
+     # Handle state file for resume functionality
+     state_file = Path(args.state_file)
+     upload_state = None
+
+     if args.resume:
+         upload_state = UploadState.load(state_file)
+         if upload_state:
+             # Validate state matches current upload
+             if upload_state.local_path != str(local_path.absolute()):
+                 console.print("[yellow]Warning:[/yellow] State file local path doesn't match. Starting fresh upload.")
+                 upload_state = None
+             elif upload_state.remote_url != remote_url:
+                 console.print("[yellow]Warning:[/yellow] State file remote URL doesn't match. Starting fresh upload.")
+                 upload_state = None
+             else:
+                 console.print(f"[green]Resuming previous upload from {upload_state.timestamp}[/green]")
+                 console.print(f" Already completed: {len(upload_state.completed_experiments)} experiments")
+                 console.print(f" Failed: {len(upload_state.failed_experiments)} experiments")
+         else:
+             console.print("[yellow]No previous upload state found. Starting fresh upload.[/yellow]")
+
+     # Create new state if not resuming
+     if not upload_state:
+         upload_state = UploadState(
+             local_path=str(local_path.absolute()),
+             remote_url=remote_url,
+         )
+
+     console.print(f"[bold]Scanning local storage:[/bold] {local_path.absolute()}")
+     experiments = discover_experiments(
+         local_path,
+         project_filter=args.project,
+         experiment_filter=args.experiment,
+     )
+
+     if not experiments:
+         if args.project and args.experiment:
+             console.print(f"[yellow]No experiment found:[/yellow] {args.project}/{args.experiment}")
+         elif args.project:
+             console.print(f"[yellow]No experiments found in project:[/yellow] {args.project}")
+         else:
+             console.print("[yellow]No experiments found in local storage[/yellow]")
+         return 1
+
+     # Filter out already completed experiments when resuming
+     if args.resume and upload_state.completed_experiments:
+         original_count = len(experiments)
+         experiments = [
+             exp for exp in experiments
+             if f"{exp.project}/{exp.experiment}" not in upload_state.completed_experiments
+         ]
+         skipped_count = original_count - len(experiments)
+         if skipped_count > 0:
+             console.print(f"[dim]Skipping {skipped_count} already completed experiment(s)[/dim]")
+
+     console.print(f"[green]Found {len(experiments)} experiment(s) to upload[/green]")
+
+     # Display discovered experiments
+     if args.verbose or args.dry_run:
+         console.print("\n[bold]Discovered experiments:[/bold]")
+         for exp in experiments:
+             parts = []
+             if exp.has_logs:
+                 parts.append("logs")
+             if exp.has_params:
+                 parts.append("params")
+             if exp.metric_names:
+                 parts.append(f"{len(exp.metric_names)} metrics")
+             if exp.file_count:
+                 size_mb = exp.estimated_size / (1024 * 1024)
+                 parts.append(f"{exp.file_count} files ({size_mb:.1f}MB)")
+
+             details = ", ".join(parts) if parts else "metadata only"
+             console.print(f" [cyan]•[/cyan] {exp.project}/{exp.experiment} [dim]({details})[/dim]")
+
+     # Dry-run mode: stop here
+     if args.dry_run:
+         console.print("\n[yellow bold]DRY RUN[/yellow bold] - No data will be uploaded")
+         console.print("Run without --dry-run to proceed with upload.")
+         return 0
+
+     # Validate experiments
+     console.print("\n[bold]Validating experiments...[/bold]")
+     validator = ExperimentValidator(strict=args.strict)
+     validation_results = {}
+     valid_experiments = []
+     invalid_experiments = []
+
+     for exp in experiments:
+         validation = validator.validate_experiment(exp)
+         validation_results[f"{exp.project}/{exp.experiment}"] = validation
+
+         if validation.is_valid:
+             valid_experiments.append(exp)
+         else:
+             invalid_experiments.append(exp)
+
+         # Show warnings and errors
+         if args.verbose or validation.errors:
+             exp_key = f"{exp.project}/{exp.experiment}"
+             if validation.errors:
+                 console.print(f" [red]✗[/red] {exp_key}:")
+                 for error in validation.errors:
+                     console.print(f" [red]{error}[/red]")
+             elif validation.warnings:
+                 console.print(f" [yellow]⚠[/yellow] {exp_key}:")
+                 for warning in validation.warnings:
+                     console.print(f" [yellow]{warning}[/yellow]")
+
+     if invalid_experiments:
+         console.print(f"\n[yellow]{len(invalid_experiments)} experiment(s) failed validation and will be skipped[/yellow]")
+         if args.strict:
+             console.print("[red]Error: Validation failed in --strict mode[/red]")
+             return 1
+
+     if not valid_experiments:
+         console.print("[red]Error: No valid experiments to upload[/red]")
+         return 1
+
+     console.print(f"[green]{len(valid_experiments)} experiment(s) ready to upload[/green]")
+
+     # Initialize remote client and local storage
+     remote_client = RemoteClient(base_url=remote_url, api_key=api_key)
+     local_storage = LocalStorage(root_path=local_path)
+
+     # Upload experiments with progress tracking
+     console.print(f"\n[bold]Uploading to:[/bold] {remote_url}")
+     results = []
+
+     # Track upload timing
+     import time
+     start_time = time.time()
+
+     # Create progress bar for overall upload
+     with Progress(
+         SpinnerColumn(),
+         TextColumn("[progress.description]{task.description}"),
+         BarColumn(),
+         TaskProgressColumn(),
+         console=console,
+         transient=not args.verbose,  # Keep progress visible in verbose mode
+     ) as progress:
+         # Create uploader with progress tracking
+         uploader = ExperimentUploader(
+             local_storage=local_storage,
+             remote_client=remote_client,
+             batch_size=args.batch_size,
+             skip_logs=args.skip_logs,
+             skip_metrics=args.skip_metrics,
+             skip_files=args.skip_files,
+             skip_params=args.skip_params,
+             verbose=args.verbose,
+             progress=progress,
+         )
+
+         for i, exp in enumerate(valid_experiments, start=1):
+             exp_key = f"{exp.project}/{exp.experiment}"
+
+             # Create task for this experiment
+             task_id = progress.add_task(
+                 f"[{i}/{len(valid_experiments)}] {exp_key}",
+                 total=100,  # Will be updated with actual steps
+             )
+
+             # Update state - mark as in progress
+             upload_state.in_progress_experiment = exp_key
+             if not args.dry_run:
+                 upload_state.save(state_file)
+
+             validation = validation_results[exp_key]
+             result = uploader.upload_experiment(exp, validation, task_id=task_id)
+             results.append(result)
+
+             # Update state - mark as completed or failed
+             upload_state.in_progress_experiment = None
+             if result.success:
+                 upload_state.completed_experiments.append(exp_key)
+             else:
+                 upload_state.failed_experiments.append(exp_key)
+
+             if not args.dry_run:
+                 upload_state.save(state_file)
+
+             # Update task to completed
+             progress.update(task_id, completed=100, total=100)
+
+             if not args.verbose:
+                 # Show brief status
+                 if result.success:
+                     parts = []
+                     if result.uploaded.get("params"):
+                         parts.append(f"{result.uploaded['params']} params")
+                     if result.uploaded.get("logs"):
+                         parts.append(f"{result.uploaded['logs']} logs")
+                     if result.uploaded.get("metrics"):
+                         parts.append(f"{result.uploaded['metrics']} metrics")
+                     if result.uploaded.get("files"):
+                         parts.append(f"{result.uploaded['files']} files")
+                     status = ", ".join(parts) if parts else "metadata only"
+                     console.print(f" [green]✓[/green] Uploaded ({status})")
+                 else:
+                     console.print(" [red]✗[/red] Failed")
+                     if result.errors:
+                         for error in result.errors[:3]:  # Show first 3 errors
+                             console.print(f" [red]{error}[/red]")
+
+     # Calculate timing
+     end_time = time.time()
+     elapsed_time = end_time - start_time
+     total_bytes = sum(r.bytes_uploaded for r in results)
+
+     # Print summary with rich Table
+     console.print()
+
+     successful = [r for r in results if r.success]
+     failed = [r for r in results if not r.success]
+
+     # Create summary table
+     summary_table = Table(title="Upload Summary", show_header=True, header_style="bold")
+     summary_table.add_column("Status", style="cyan")
+     summary_table.add_column("Count", justify="right")
+
+     summary_table.add_row("Successful", f"[green]{len(successful)}/{len(results)}[/green]")
+     if failed:
+         summary_table.add_row("Failed", f"[red]{len(failed)}/{len(results)}[/red]")
+
+     # Add timing information
+     summary_table.add_row("Total Time", f"{elapsed_time:.2f}s")
+
+     # Calculate and display upload speed
+     if total_bytes > 0 and elapsed_time > 0:
+         # Convert to appropriate unit
+         if total_bytes < 1024 * 1024:  # Less than 1 MB
+             speed_kb = (total_bytes / 1024) / elapsed_time
+             summary_table.add_row("Avg Speed", f"{speed_kb:.2f} KB/s")
+         else:  # 1 MB or more
+             speed_mb = (total_bytes / (1024 * 1024)) / elapsed_time
+             summary_table.add_row("Avg Speed", f"{speed_mb:.2f} MB/s")
+
+     console.print(summary_table)
+
+     # Show failed experiments
+     if failed:
+         console.print("\n[bold red]Failed Experiments:[/bold red]")
+         for result in failed:
+             console.print(f" [red]✗[/red] {result.experiment}")
+             for error in result.errors:
+                 console.print(f" [dim]{error}[/dim]")
+
+     # Data statistics
+     total_logs = sum(r.uploaded.get("logs", 0) for r in results)
+     total_metrics = sum(r.uploaded.get("metrics", 0) for r in results)
+     total_files = sum(r.uploaded.get("files", 0) for r in results)
+
+     if total_logs or total_metrics or total_files:
+         data_table = Table(title="Data Uploaded", show_header=True, header_style="bold")
+         data_table.add_column("Type", style="cyan")
+         data_table.add_column("Count", justify="right", style="green")
+
+         if total_logs:
+             data_table.add_row("Logs", f"{total_logs} entries")
+         if total_metrics:
+             data_table.add_row("Metrics", f"{total_metrics} metrics")
+         if total_files:
+             data_table.add_row("Files", f"{total_files} files")
+
+         console.print()
+         console.print(data_table)
+
+     # Clean up state file if all uploads succeeded
+     if not args.dry_run and len(failed) == 0 and state_file.exists():
+         state_file.unlink()
+         console.print("\n[dim]Upload complete. State file removed.[/dim]")
+     elif not args.dry_run and failed:
+         console.print(f"\n[yellow]State saved to {state_file}. Use --resume to retry failed uploads.[/yellow]")
+
+     # Return exit code
+     return 0 if len(failed) == 0 else 1