gradexp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gradexp/__init__.py +2 -0
- gradexp/auth.py +103 -0
- gradexp/cli.py +21 -0
- gradexp/client.py +468 -0
- gradexp-0.1.0.dist-info/METADATA +86 -0
- gradexp-0.1.0.dist-info/RECORD +8 -0
- gradexp-0.1.0.dist-info/WHEEL +4 -0
- gradexp-0.1.0.dist-info/entry_points.txt +2 -0
gradexp/__init__.py
ADDED
gradexp/auth.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import appdirs
|
|
3
|
+
import json
|
|
4
|
+
import webbrowser
|
|
5
|
+
import click
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
# Application name passed to appdirs to locate the per-user config and cache directories.
APP_NAME = "gradexp"
# Author/vendor name passed to appdirs alongside APP_NAME.
APP_AUTHOR = "GradientExplorer"
# Bare host name; callers prepend "https://" (and "api." for backend requests).
BASE_URL = "gradient-explorer.xyz"
|
|
11
|
+
|
|
12
|
+
def get_config_dir():
    """Return the per-user configuration directory reported by appdirs for this app."""
    config_dir = appdirs.user_config_dir(APP_NAME, APP_AUTHOR)
    return config_dir
|
|
14
|
+
|
|
15
|
+
def get_token_path():
    """Return the path of ``secrets.json`` inside the configuration directory."""
    return os.path.join(get_config_dir(), "secrets.json")
|
|
18
|
+
|
|
19
|
+
def save_token(token):
    """
    Persist the API key to ``secrets.json`` in the config directory.

    The file is created with mode 0600 from the start, so there is no
    window in which a freshly written key is readable by other users.

    Args:
        token: The API key to store under the ``"api_key"`` JSON field.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(get_config_dir(), exist_ok=True)

    token_path = get_token_path()

    # Create (or truncate) the file with owner-only permissions up front
    # instead of writing it with default permissions and chmod-ing later.
    fd = os.open(token_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w') as f:
        json.dump({"api_key": token}, f)

    # The mode passed to os.open() only applies to newly created files;
    # tighten a pre-existing file as well (readable only by user, 0600).
    os.chmod(token_path, 0o600)
|
|
30
|
+
|
|
31
|
+
def load_token():
    """
    Load the stored API key.

    Returns:
        The value of the ``"api_key"`` field in ``secrets.json``, or None
        when the file is missing, unreadable, not valid JSON, or does not
        contain a JSON object.
    """
    token_path = get_token_path()

    try:
        with open(token_path, 'r') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError for a corrupted file.
        return None

    # Valid JSON that is not an object (e.g. a list or a string) has no .get().
    if not isinstance(data, dict):
        return None

    return data.get("api_key")
|
|
42
|
+
|
|
43
|
+
def validate_api_key(api_key, debug=False):
    """
    Validate an API key by calling ``GET /api/v1/me`` on the backend.

    Args:
        api_key: The key to send as a Bearer token.
        debug: When true, echo request/response details via click.

    Returns:
        The decoded JSON response body on success, or None when the request
        fails, returns an error status, or the body is not valid JSON.
    """
    url = f"https://api.{BASE_URL}/api/v1/me"
    headers = {"Authorization": f"Bearer {api_key}"}

    if debug:
        click.echo(f"DEBUG: Request URL: {url}")
        # Only show key fragments when they cannot reconstruct the whole key;
        # for short keys the head and tail slices would overlap.
        if len(api_key) > 8:
            masked_key = f"{api_key[:4]}...{api_key[-4:]}"
        else:
            masked_key = "****"
        click.echo(f"DEBUG: Request Headers: Authorization: Bearer {masked_key}")

    try:
        # Without a timeout requests waits indefinitely on an unresponsive server.
        response = requests.get(
            url,
            headers=headers,
            timeout=30,
        )

        if debug:
            click.echo(f"DEBUG: Response Status: {response.status_code}")
            click.echo(f"DEBUG: Response Body: {response.text}")

        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError: older requests versions raise a plain JSON decode error
        # (not a RequestException) when the body is not valid JSON.
        if debug:
            click.echo(f"DEBUG: Request failed: {e}")
        return None
|
|
67
|
+
|
|
68
|
+
def login_flow(debug=False):
    """
    Run the interactive login sequence.

    Opens the authorization page in the default browser, prompts for the
    API key with hidden input, validates the key against the backend and,
    when validation succeeds, stores it with save_token().
    """
    auth_url = f"https://{BASE_URL}/authorize"
    click.echo(f"Opening {auth_url} in your default browser...")
    webbrowser.open(auth_url)

    api_key = click.prompt("Please paste your API key here", hide_input=True)
    if not api_key:
        click.echo("No API key provided. Login failed.")
        return

    # validate_api_key() returns a falsy value when the key is rejected.
    if not validate_api_key(api_key, debug=debug):
        click.secho("Error: Invalid API Key", fg="red")
        return

    save_token(api_key)
    click.secho("Successfully logged in. Your API key is saved.", fg="green")
|
|
95
|
+
|
|
96
|
+
def check_login():
    """Report whether an API key is stored; return True if one is, False otherwise."""
    token = load_token()
    if not token:
        click.echo("Not logged in. Please run `gradexp login`.")
        return False

    # Only the first four characters of the key are echoed.
    click.echo(f"Logged in with token: {token[:4]}..." + "*" * 10)
    return True
|
gradexp/cli.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import click
|
|
2
|
+
from . import auth
|
|
3
|
+
|
|
4
|
+
# Root command group; subcommands attach themselves with @main.command().
@click.group()
@click.version_option()
@click.option('--debug', is_flag=True, help="Enable debug logging")
@click.pass_context
def main(ctx, debug):
    """Gradient Explorer CLI"""
    # Make sure ctx.obj is a dict so the group-level flag can be read by subcommands.
    ctx.ensure_object(dict)
    ctx.obj['DEBUG'] = debug
|
|
12
|
+
|
|
13
|
+
@main.command()
@click.option('--debug', is_flag=True, help="Enable debug logging")
@click.pass_context
def login(ctx, debug):
    """Log in to Gradient Explorer"""
    # Debug output is enabled by either this command's --debug flag or the
    # group-level flag stored in ctx.obj by main().
    auth.login_flow(debug=debug or ctx.obj.get('DEBUG', False))
|
|
19
|
+
|
|
20
|
+
# Run the click command group when this module is executed as a script.
if __name__ == '__main__':
    main()
|
gradexp/client.py
ADDED
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import signal
|
|
4
|
+
import atexit
|
|
5
|
+
import base64
|
|
6
|
+
import json
|
|
7
|
+
import uuid
|
|
8
|
+
import time
|
|
9
|
+
import threading
|
|
10
|
+
import queue
|
|
11
|
+
import shutil
|
|
12
|
+
import appdirs
|
|
13
|
+
import requests
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
from . import auth
|
|
17
|
+
|
|
18
|
+
# Prefix prepended to terminal messages, built from ANSI escape sequences:
# Calm red: \033[38;5;174m
# Reset: \033[0m
GRADEXP_PREFIX = "\033[38;5;174mgradexp\033[0m: "
|
|
21
|
+
|
|
22
|
+
def _term_log(message, end="\n"):
    """
    Print ``message`` to stdout behind the colored 'gradexp:' prefix.

    A string that starts with a newline keeps that newline in front of the
    prefix, so the prefix begins the following line.
    """
    starts_with_newline = isinstance(message, str) and message.startswith("\n")
    if starts_with_newline:
        text = f"\n{GRADEXP_PREFIX}{message[1:]}"
    else:
        text = f"{GRADEXP_PREFIX}{message}"
    print(text, end=end)
|
|
31
|
+
|
|
32
|
+
class Client:
|
|
33
|
+
def __init__(self):
    # Credentials and identifiers; assigned in init().
    self.api_key = None
    self.run_id = None
    self.project_id = None
    self.tensors = {} # {name: {"id": uuid, "step": 0}}
    # True between a successful init() and finish().
    self.active = False
    # HTTP session used for the backend requests made by this client.
    self._session = requests.Session()
    # Items produced by log() and consumed by _upload_worker().
    self._upload_queue = queue.Queue()
    # Set by finish() to let the worker exit once the queue is empty.
    self._stop_event = threading.Event()
    self._worker_thread = None
    # Directory where log() stages tensor bytes before upload; set in init().
    self._cache_dir = None
    # Set on the first interrupt; a second interrupt force-quits.
    self._interrupted = False
    # Signal handlers that were installed before init() replaced them.
    self._original_sigint_handler = None
    self._original_sigterm_handler = None

    # Progress tracking
    self._progress_lock = threading.Lock()
    self._bytes_uploaded = 0
    self._tensors_uploaded = 0
    self._total_tensors_queued = 0
    self._upload_start_time = None
|
|
54
|
+
|
|
55
|
+
def init(self, project_name=None, run_name=None):
    """
    Initialize a new run session.

    Creates the run on the backend (POST /api/v1/runs), starts the
    background upload worker, registers finish() with atexit and installs
    SIGINT/SIGTERM handlers.

    Args:
        project_name: Project name. When None, taken from the active wandb
            run if wandb is already imported, otherwise "default".
        run_name: Run name. When None, taken from the active wandb run if
            wandb is already imported, otherwise "default-run".

    Raises:
        RuntimeError: If no API key is stored.
        requests.exceptions.RequestException: If the run cannot be created
            (other than a 401, which exits the process with status 1).
    """
    # Bound on how many times a 409 (name conflict) is retried with a new
    # name, so a backend that keeps answering 409 cannot loop forever.
    max_name_conflicts = 50
    # Seconds; without a timeout requests waits indefinitely.
    request_timeout = 30

    # Try to pull from wandb if not provided
    if project_name is None or run_name is None:
        if "wandb" in sys.modules:
            import wandb
            if wandb.run:
                if project_name is None:
                    project_name = wandb.run.project
                if run_name is None:
                    run_name = wandb.run.name

    # Fallback to defaults if still None
    if project_name is None:
        project_name = "default"
    if run_name is None:
        run_name = "default-run"

    self.api_key = auth.load_token()
    if not self.api_key:
        raise RuntimeError("Not logged in. Please run 'gradexp login' first.")

    current_run_name = run_name
    name_conflicts = 0
    while True:
        payload = {
            "project_name": project_name,
            "run_name": current_run_name
        }

        try:
            url = f"https://api.{auth.BASE_URL}/api/v1/runs"
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }

            response = self._session.post(
                url, headers=headers, json=payload, timeout=request_timeout
            )
            response.raise_for_status()

            data = response.json()
            self.run_id = data.get("run_id")
            self.project_id = data.get("project_id")
            self.tensors = {}
            self.active = True

            self._cache_dir = appdirs.user_cache_dir(auth.APP_NAME, auth.APP_AUTHOR)
            os.makedirs(self._cache_dir, exist_ok=True)

            self._stop_event.clear()
            self._worker_thread = threading.Thread(target=self._upload_worker, daemon=True)
            self._worker_thread.start()

            base_url = f"https://{auth.BASE_URL}"
            run_url = f"{base_url}/?project={self.project_id}&run={self.run_id}"
            _term_log(f"Initialized. See run progress at {run_url}")

            atexit.register(self.finish)

            try:
                # Remember the previous handlers so finish() can restore
                # them. If our handler is already installed (init() called
                # again without finish()), keep the handlers saved earlier
                # instead of overwriting them with our own.
                previous_sigint = signal.getsignal(signal.SIGINT)
                if previous_sigint != self._handle_interrupt:
                    self._original_sigint_handler = previous_sigint
                previous_sigterm = signal.getsignal(signal.SIGTERM)
                if previous_sigterm != self._handle_interrupt:
                    self._original_sigterm_handler = previous_sigterm
                signal.signal(signal.SIGINT, self._handle_interrupt)
                signal.signal(signal.SIGTERM, self._handle_interrupt)
            except ValueError:
                # signal.signal() only works in the main thread.
                pass

            break  # Success

        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 409 and name_conflicts < max_name_conflicts:
                name_conflicts += 1
                try:
                    err_data = e.response.json()
                    if "suggested_name" in err_data:
                        current_run_name = err_data["suggested_name"]
                        continue
                except Exception:
                    # Best effort: fall back to local renaming when the
                    # conflict body cannot be parsed.
                    pass

                # No usable suggestion: bump a trailing "-<number>" suffix,
                # or append "-2" when there is none.
                if "-" in current_run_name and current_run_name.rsplit("-", 1)[1].isdigit():
                    parts = current_run_name.rsplit("-", 1)
                    base = parts[0]
                    num = int(parts[1]) + 1
                    current_run_name = f"{base}-{num}"
                else:
                    current_run_name = f"{current_run_name}-2"
                continue

            _term_log(f"Failed to initialize: {e}")
            try:
                _term_log(f"Backend error message: {e.response.text}")
            except Exception:
                # Printing the backend message is purely informational.
                pass

            if e.response.status_code == 401:
                _term_log("Authentication failed: Unauthorized. Please run 'gradexp login' to authenticate.")
                sys.exit(1)
            else:
                raise
        except requests.exceptions.RequestException as e:
            _term_log(f"Failed to initialize: {e}")
            raise
|
|
158
|
+
|
|
159
|
+
def _ensure_tensor(self, name, array):
    """
    Ensures a tensor exists on the backend. Returns its ID.

    The ID is cached in ``self.tensors`` so the backend is contacted only
    once per name.

    Args:
        name: Tensor name used as the cache key and sent to the backend.
        array: Array whose ``dtype`` and ``shape`` are sent to the backend.

    Returns:
        The tensor ID, or None when registration failed (a message is
        printed in that case).
    """
    if name in self.tensors:
        return self.tensors[name]["id"]

    try:
        url = f"https://api.{auth.BASE_URL}/api/v1/runs/{self.run_id}/tensors"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "name": name,
            "dtype": str(array.dtype),
            "shape": list(array.shape)
        }
        # Without a timeout requests waits indefinitely and would stall the caller.
        response = self._session.post(url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()

        data = response.json()
        tensor_id = data.get("tensor_id")
        if not tensor_id:
            # Do not cache a missing ID: a cached None would make every
            # later log() for this name return silently.
            _term_log(f"Failed to create tensor '{name}': backend response did not include a tensor_id")
            return None

        self.tensors[name] = {"id": tensor_id, "step": 0}
        return tensor_id
    except Exception as e:
        _term_log(f"Failed to create tensor '{name}': {e}")
        return None
|
|
187
|
+
|
|
188
|
+
def log(self, tensor, name="default"):
    """
    Upload data for the tensor.

    The data is converted to a float32 numpy array, written to a file in
    the cache directory and queued for the background upload worker. Each
    call for the same ``name`` uses the next step index.

    Args:
        tensor: A numpy array or anything ``np.array`` can convert.
        name: Name under which the tensor is registered on the backend.

    Failures are reported on the terminal; no exception is raised.
    """
    if not self.active:
        _term_log("Not initialized. Please call gradexp.init() first.")
        return

    # Convert to numpy if needed
    if not isinstance(tensor, np.ndarray):
        try:
            tensor = np.array(tensor)
        except Exception:
            _term_log("Could not convert input to numpy array.")
            return

    # Cast to float32 BEFORE registering the tensor, so the dtype reported
    # to the backend in _ensure_tensor() matches the bytes that are
    # actually uploaded (the payload is always float32).
    try:
        if tensor.dtype != np.float32:
            tensor = tensor.astype(np.float32)
    except Exception as e:
        _term_log(f"Failed to buffer tensor data: {e}")
        return

    # Ensure tensor is registered
    tensor_id = self._ensure_tensor(name, tensor)
    if not tensor_id:
        return

    try:
        tensor_bytes = tensor.tobytes()

        # Store locally
        unique_id = str(uuid.uuid4())
        file_path = os.path.join(self._cache_dir, f"step_{unique_id}.bin")

        with open(file_path, "wb") as f:
            f.write(tensor_bytes)

        # Get and increment step
        step_index = self.tensors[name]["step"]
        self.tensors[name]["step"] += 1

        # Enqueue for background upload
        with self._progress_lock:
            self._total_tensors_queued += 1
        self._upload_queue.put({
            "file_path": file_path,
            "tensor_id": tensor_id,
            "step": step_index
        })

    except Exception as e:
        _term_log(f"Failed to buffer tensor data: {e}")
|
|
241
|
+
|
|
242
|
+
def _update_status(self, status):
    """
    Updates the status of the run.
    Uses PATCH /api/v1/runs/{run_id}

    Does nothing when no run ID or API key is set. Failures are reported
    on the terminal and never raised.

    Args:
        status: Status value sent to the backend as ``{"status": status}``.
    """
    if not self.run_id or not self.api_key:
        return

    try:
        url = f"https://api.{auth.BASE_URL}/api/v1/runs/{self.run_id}"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {"status": status}
        # This is called on the force-quit path, which must not block:
        # without a timeout requests waits indefinitely.
        response = self._session.patch(url, headers=headers, json=payload, timeout=10)
        response.raise_for_status()
    except Exception as e:
        _term_log(f"Failed to update session status to {status}: {e}")
|
|
261
|
+
|
|
262
|
+
def _print_progress(self):
    """
    Redraw the one-line progress bar on stdout: a 30-character bar, the
    uploaded/queued tensor counts and the average throughput in MB/s.
    """
    # Take a consistent snapshot of the counters under the lock.
    with self._progress_lock:
        done = self._tensors_uploaded
        queued = self._total_tensors_queued
        sent_bytes = self._bytes_uploaded
        started_at = self._upload_start_time

    # Throughput stays 0.0 until an upload has started and moved data.
    rate = 0.0
    if started_at and sent_bytes > 0:
        elapsed = time.time() - started_at
        if elapsed > 0:
            rate = (sent_bytes / (1024 * 1024)) / elapsed

    width = 30
    filled = int(width * done / queued) if queued > 0 else 0
    bar = "█" * filled + "░" * (width - filled)

    # "\r" returns to the start of the line so the bar is overwritten in place.
    sys.stdout.write(f"\r{GRADEXP_PREFIX}[{bar}] {done}/{queued} tensors | {rate:.2f} MB/s ")
    sys.stdout.flush()
|
|
292
|
+
|
|
293
|
+
def _handle_interrupt(self, signum, frame):
    """
    Handles SIGINT (Ctrl+C) or SIGTERM.

    First signal: wait for pending uploads while showing progress, mark
    the run "stopped" and exit with status 0. Second signal: attempt a
    status update and exit immediately with status 1.

    Args:
        signum: Signal number (unused).
        frame: Current stack frame (unused).
    """
    if not self._interrupted:
        self._interrupted = True
        q_size = self._upload_queue.qsize()

        if q_size == 0:
            _term_log("\nInterrupt received. No pending uploads. Shutting down...")
            self.finish("stopped")
            sys.exit(0)
        else:
            _term_log(f"\nInterrupt received. Waiting for {q_size} pending uploads to complete.")
            _term_log("Press Ctrl+C again to force quit.\n")

            # Let the worker exit as soon as the queue is drained. Without
            # this, a single failed upload (uploaded < queued forever)
            # would keep the loop below spinning indefinitely.
            self._stop_event.set()

            # Show progress while waiting for uploads
            try:
                # Only wait while a live worker can still make progress; a
                # dead or missing worker will never empty the queue.
                while (
                    self._worker_thread
                    and self._worker_thread.is_alive()
                    and (
                        not self._upload_queue.empty()
                        or self._tensors_uploaded < self._total_tensors_queued
                    )
                ):
                    self._print_progress()
                    time.sleep(0.2)
                self._print_progress()  # Final update
                print()  # Newline after progress bar
                self.finish("stopped")
                sys.exit(0)
            except KeyboardInterrupt:
                # Second Ctrl+C during progress display - force quit
                _term_log("\nForce quitting...")
                self._update_status("stopped")
                os._exit(1)
    else:
        _term_log("\nForce quitting...")
        # Attempt a quick status update if possible, but don't block
        try:
            # Direct status update attempt to backend, skipping the queue
            # This might fail if the session is already half-closed but worth a try
            self._update_status("stopped")
        except Exception:
            # Best effort only: the process exits right below either way.
            pass
        os._exit(1)  # Immediate exit
|
|
333
|
+
|
|
334
|
+
def _upload_worker(self):
|
|
335
|
+
"""
|
|
336
|
+
Background worker that uploads data from the queue.
|
|
337
|
+
"""
|
|
338
|
+
while not self._stop_event.is_set() or not self._upload_queue.empty():
|
|
339
|
+
try:
|
|
340
|
+
# Use a timeout to occasionally check the stop event
|
|
341
|
+
item = self._upload_queue.get(timeout=1.0)
|
|
342
|
+
except queue.Empty:
|
|
343
|
+
continue
|
|
344
|
+
|
|
345
|
+
file_path = item["file_path"]
|
|
346
|
+
tensor_id = item["tensor_id"]
|
|
347
|
+
step = item["step"]
|
|
348
|
+
|
|
349
|
+
try:
|
|
350
|
+
if not os.path.exists(file_path):
|
|
351
|
+
self._upload_queue.task_done()
|
|
352
|
+
continue
|
|
353
|
+
|
|
354
|
+
with open(file_path, "rb") as f:
|
|
355
|
+
content = f.read()
|
|
356
|
+
|
|
357
|
+
content_len = len(content)
|
|
358
|
+
|
|
359
|
+
# Set upload start time on first upload
|
|
360
|
+
with self._progress_lock:
|
|
361
|
+
if self._upload_start_time is None:
|
|
362
|
+
self._upload_start_time = time.time()
|
|
363
|
+
|
|
364
|
+
url = f"https://api.{auth.BASE_URL}/api/v1/tensors/{tensor_id}/step"
|
|
365
|
+
headers = {
|
|
366
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
367
|
+
"Content-Type": "application/octet-stream"
|
|
368
|
+
}
|
|
369
|
+
params = {
|
|
370
|
+
"step": step,
|
|
371
|
+
"index": step # Kept for backward compatibility if needed
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
response = self._session.post(url, headers=headers, data=content, params=params)
|
|
375
|
+
response.raise_for_status()
|
|
376
|
+
|
|
377
|
+
# Success: update progress and delete the local file
|
|
378
|
+
with self._progress_lock:
|
|
379
|
+
self._bytes_uploaded += content_len
|
|
380
|
+
self._tensors_uploaded += 1
|
|
381
|
+
os.remove(file_path)
|
|
382
|
+
|
|
383
|
+
except requests.exceptions.RequestException as e:
|
|
384
|
+
_term_log(f"Failed to upload tensor step in background: {e}")
|
|
385
|
+
if os.path.exists(file_path):
|
|
386
|
+
os.remove(file_path)
|
|
387
|
+
except Exception as e:
|
|
388
|
+
_term_log(f"Unexpected error in upload worker: {e}")
|
|
389
|
+
if os.path.exists(file_path):
|
|
390
|
+
os.remove(file_path)
|
|
391
|
+
finally:
|
|
392
|
+
self._upload_queue.task_done()
|
|
393
|
+
|
|
394
|
+
def finish(self, status="complete"):
    """
    Marks the session as finished and flushes the upload queue.

    Does nothing when the session is not active. Otherwise signals the
    worker to stop, waits for pending uploads (with a progress bar),
    reports ``status`` to the backend and restores the signal handlers
    that were in place before init().

    Args:
        status: Final run status sent to the backend.
    """
    if not self.active:
        return

    # Signal the worker to stop after processing remaining items
    self._stop_event.set()

    if self._worker_thread and self._worker_thread.is_alive():
        # Check if there are pending uploads
        with self._progress_lock:
            pending = self._total_tensors_queued - self._tensors_uploaded

        if pending > 0:
            _term_log("finishing... Waiting for background uploads to complete.")
            # Show progress while waiting
            while self._worker_thread.is_alive() and self._tensors_uploaded < self._total_tensors_queued:
                self._print_progress()
                time.sleep(0.2)
            self._print_progress()  # Final update
            print()  # Newline after progress bar
        else:
            # No pending uploads, just wait for thread to finish
            self._worker_thread.join()

    # Update status on backend
    self._update_status(status)

    # Restore signal handlers
    try:
        # Compare against None rather than testing truthiness:
        # signal.SIG_DFL has the integer value 0 and is therefore falsy,
        # so a truthiness test never restores a default handler.
        if self._original_sigint_handler is not None:
            signal.signal(signal.SIGINT, self._original_sigint_handler)
        if self._original_sigterm_handler is not None:
            signal.signal(signal.SIGTERM, self._original_sigterm_handler)
    except ValueError:
        # signal.signal() only works in the main thread.
        pass

    self._interrupted = False
    self.active = False
    _term_log(f"Session {status}.")
|
|
436
|
+
|
|
437
|
+
# Singleton instance used by the module-level init()/log()/finish() functions.
_client = Client()
|
|
439
|
+
|
|
440
|
+
def _excepthook(type, value, traceback):
    """
    Exception hook that finishes an active session with status "stopped"
    and then delegates to the interpreter's default hook.
    """
    if _client.active:
        _client.finish("stopped")
    sys.__excepthook__(type, value, traceback)
|
|
444
|
+
|
|
445
|
+
def init(project_name=None, run_name=None):
    """
    Start a run on the shared client and install an exception hook that
    marks the run "stopped" when an uncaught exception ends the program.

    Args:
        project_name: Forwarded to ``Client.init``.
        run_name: Forwarded to ``Client.init``.
    """
    global _original_excepthook

    current_hook = sys.excepthook
    # Install the hook only once. Wrapping on every call would make the
    # global _original_excepthook point at one of our own hooks, and an
    # uncaught exception would then recurse without end.
    if not getattr(current_hook, "_gradexp_hook", False):
        # Bind the previous hook in the closure so the installed hook never
        # depends on later reassignments of the module global.
        previous_hook = current_hook
        _original_excepthook = previous_hook

        def tagged_excepthook(type, value, traceback):
            if _client.active:
                _client.finish("stopped")
            previous_hook(type, value, traceback)

        # Marker used by the check above to recognise our own hook.
        tagged_excepthook._gradexp_hook = True
        sys.excepthook = tagged_excepthook

    _client.init(project_name=project_name, run_name=run_name)
|
|
463
|
+
|
|
464
|
+
def log(tensor, name="default"):
    # Module-level convenience wrapper: forwards to the shared client instance.
    _client.log(tensor, name=name)
|
|
466
|
+
|
|
467
|
+
def finish(status="complete"):
    # Module-level convenience wrapper: forwards to the shared client instance.
    _client.finish(status)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gradexp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Gradient Explorer Client Library
|
|
5
|
+
Author-email: Misha Obu <misha@parallel-ocean.xyz>
|
|
6
|
+
Requires-Python: >=3.7
|
|
7
|
+
Requires-Dist: appdirs
|
|
8
|
+
Requires-Dist: click
|
|
9
|
+
Requires-Dist: numpy
|
|
10
|
+
Requires-Dist: requests
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
source ./venv/bin/activate
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
This repo is intended to reimplement some key features of wandb for our own purposes. This is the pythonic frontend for our wandb-for-tensors service.
|
|
18
|
+
|
|
19
|
+
Repo features:
|
|
20
|
+
pip install gradexp
|
|
21
|
+
gradexp login -> opens webpage -> gets token and stores it in permanent context
|
|
22
|
+
In python file:
|
|
23
|
+
```
|
|
24
|
+
import gradexp
|
|
25
|
+
gradexp.init("project name")
|
|
26
|
+
|
|
27
|
+
...
|
|
28
|
+
|
|
29
|
+
# Automatically starts tracking run id, step, etc
|
|
30
|
+
gradexp.log(TODO ... )
|
|
31
|
+
|
|
32
|
+
...
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
App-wide features:
|
|
37
|
+
|
|
38
|
+
1) Local staging + asynchronous streaming
|
|
39
|
+
When you call wandb.log() in your training script, the client SDK:
|
|
40
|
+
|
|
41
|
+
collects the metric/media payload locally (in memory and also writes to local run files).
|
|
42
|
+
|
|
43
|
+
hands it off to a separate streaming thread/process that runs alongside your main process.
|
|
44
|
+
|
|
45
|
+
this uploader process asynchronously batches and sends events over the network. This avoids blocking your training process.
|
|
46
|
+
|
|
47
|
+
events are queued in memory and written to disk if needed (e.g., offline mode) and then eventually synced.
|
|
48
|
+
|
|
49
|
+
if you set WANDB_MODE=offline, nothing is sent to the server until a sync is triggered.
|
|
50
|
+
This behavior effectively decouples ingestion from your training loop. It’s not “direct write to bucket” from your app — it goes through this upload pipeline first.
|
|
51
|
+
Weights & Biases Documentation
|
|
52
|
+
|
|
53
|
+
2) Upload endpoints and storage target
|
|
54
|
+
On the server side (hosted or self-managed):
|
|
55
|
+
|
|
56
|
+
incoming streaming events (metrics, history tuples) arrive via W&B’s application backend.
|
|
57
|
+
|
|
58
|
+
metadata and small event records (like per-step metrics) are stored relationally (MySQL in self-managed reference architecture).
|
|
59
|
+
|
|
60
|
+
larger blobs (logs, media files, artifacts) are written out to object storage buckets (e.g., S3 or your own BYOB bucket).
|
|
61
|
+
You don’t stream directly into the bucket in small per-log increments — the backend receives the event first and the service layer persists the data.
|
|
62
|
+
Weights & Biases Documentation
|
|
63
|
+
+1
|
|
64
|
+
|
|
65
|
+
3) How “appending” works
|
|
66
|
+
Object storage (S3/compatible) isn’t a traditional file system — you can’t literally open and append to an existing file like with a local file. Instead:
|
|
67
|
+
|
|
68
|
+
W&B will write each piece of logged data as a separate object or part of a structured object.
|
|
69
|
+
|
|
70
|
+
for metric history exports (e.g., after a run completes), W&B creates Parquet exports and pushes those into the bucket as artifacts/history files.
|
|
71
|
+
|
|
72
|
+
this is a batch write rather than incremental byte-level append.
|
|
73
|
+
The UI and service layer then stitch these pieces together logically for history views.
|
|
74
|
+
(This pattern is common across object-store backed systems — you don’t append bytes to objects at every metric call in practice.)
|
|
75
|
+
Weights & Biases Documentation
|
|
76
|
+
|
|
77
|
+
4) Back-end buffering & batching
|
|
78
|
+
Even though metrics are “live” in the UI, there’s batching:
|
|
79
|
+
|
|
80
|
+
the SDK batches small updates and flushes over HTTP.
|
|
81
|
+
|
|
82
|
+
on the server side, those are ingested through API layers and persisted into the database or stored as discrete objects.
|
|
83
|
+
|
|
84
|
+
visualization updates pull from the latest ingested state.
|
|
85
|
+
The “real-time” aspect is achieved by frequent flushes and update propagation — not by writing to a monolithic streaming file.
|
|
86
|
+
W&B’s process model means it tolerates network latency/outages by buffering locally and then syncing when available.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
gradexp/__init__.py,sha256=kZfL0MotOvzx9JGu4rOv_WQkhVcOk4f__8-6M2lWZW4,60
|
|
2
|
+
gradexp/auth.py,sha256=I3idBycevSCYC02znm13cRvqIJdYnEndMZz6JfToHHw,2915
|
|
3
|
+
gradexp/cli.py,sha256=a9Y4b7o09oKcFBB2N9eB_CRB8tKuKcLjTVYAQYR_nGY,540
|
|
4
|
+
gradexp/client.py,sha256=vqzPhzIGjNuIE56k4p-JWVxKlX7m0NtOcrDoGQ1zQ8U,17336
|
|
5
|
+
gradexp-0.1.0.dist-info/METADATA,sha256=_tsPJaRgHM6oz0oAfqkqyE_dQaxNFRFjWgOFMLLlE80,3550
|
|
6
|
+
gradexp-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
7
|
+
gradexp-0.1.0.dist-info/entry_points.txt,sha256=8Azz89F2lTdJNq0x8lItv713gIj6YSk4ePS_C5gncEk,45
|
|
8
|
+
gradexp-0.1.0.dist-info/RECORD,,
|