wandb 0.20.1__py3-none-any.whl → 0.20.2rc20250616__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +3 -6
- wandb/__init__.pyi +1 -1
- wandb/analytics/sentry.py +2 -2
- wandb/apis/importers/internals/internal.py +0 -3
- wandb/apis/public/api.py +2 -2
- wandb/apis/public/registries/{utils.py → _utils.py} +12 -12
- wandb/apis/public/registries/registries_search.py +2 -2
- wandb/apis/public/registries/registry.py +19 -18
- wandb/bin/gpu_stats +0 -0
- wandb/cli/beta.py +1 -7
- wandb/cli/cli.py +0 -30
- wandb/env.py +0 -6
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v6/wandb_settings_pb2.py +2 -2
- wandb/proto/v6/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +42 -1
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/internal/handler.py +1 -69
- wandb/sdk/lib/printer.py +6 -7
- wandb/sdk/lib/progress.py +1 -3
- wandb/sdk/lib/service/ipc_support.py +13 -0
- wandb/sdk/lib/{service_connection.py → service/service_connection.py} +20 -56
- wandb/sdk/lib/service/service_port_file.py +105 -0
- wandb/sdk/lib/service/service_process.py +111 -0
- wandb/sdk/lib/service/service_token.py +164 -0
- wandb/sdk/lib/sock_client.py +8 -12
- wandb/sdk/wandb_init.py +0 -3
- wandb/sdk/wandb_require.py +9 -20
- wandb/sdk/wandb_run.py +0 -24
- wandb/sdk/wandb_settings.py +0 -9
- wandb/sdk/wandb_setup.py +2 -13
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/METADATA +1 -3
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/RECORD +41 -67
- wandb/sdk/internal/flow_control.py +0 -263
- wandb/sdk/internal/internal.py +0 -401
- wandb/sdk/internal/internal_util.py +0 -97
- wandb/sdk/internal/system/__init__.py +0 -0
- wandb/sdk/internal/system/assets/__init__.py +0 -25
- wandb/sdk/internal/system/assets/aggregators.py +0 -31
- wandb/sdk/internal/system/assets/asset_registry.py +0 -20
- wandb/sdk/internal/system/assets/cpu.py +0 -163
- wandb/sdk/internal/system/assets/disk.py +0 -210
- wandb/sdk/internal/system/assets/gpu.py +0 -416
- wandb/sdk/internal/system/assets/gpu_amd.py +0 -233
- wandb/sdk/internal/system/assets/interfaces.py +0 -205
- wandb/sdk/internal/system/assets/ipu.py +0 -177
- wandb/sdk/internal/system/assets/memory.py +0 -166
- wandb/sdk/internal/system/assets/network.py +0 -125
- wandb/sdk/internal/system/assets/open_metrics.py +0 -293
- wandb/sdk/internal/system/assets/tpu.py +0 -154
- wandb/sdk/internal/system/assets/trainium.py +0 -393
- wandb/sdk/internal/system/env_probe_helpers.py +0 -13
- wandb/sdk/internal/system/system_info.py +0 -248
- wandb/sdk/internal/system/system_monitor.py +0 -224
- wandb/sdk/internal/writer.py +0 -204
- wandb/sdk/lib/service_token.py +0 -93
- wandb/sdk/service/__init__.py +0 -0
- wandb/sdk/service/_startup_debug.py +0 -22
- wandb/sdk/service/port_file.py +0 -53
- wandb/sdk/service/server.py +0 -107
- wandb/sdk/service/server_sock.py +0 -286
- wandb/sdk/service/service.py +0 -252
- wandb/sdk/service/streams.py +0 -425
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/WHEEL +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/entry_points.txt +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/licenses/LICENSE +0 -0
@@ -1,393 +0,0 @@
|
|
1
|
-
import collections
|
2
|
-
import dataclasses
|
3
|
-
import json
|
4
|
-
import logging
|
5
|
-
import os
|
6
|
-
import pathlib
|
7
|
-
import shutil
|
8
|
-
import subprocess
|
9
|
-
import tempfile
|
10
|
-
import threading
|
11
|
-
import time
|
12
|
-
from collections import deque
|
13
|
-
from typing import TYPE_CHECKING, Any, Dict, Final, List, Optional, Tuple, Union
|
14
|
-
|
15
|
-
from wandb.sdk.lib import telemetry
|
16
|
-
|
17
|
-
from .aggregators import aggregate_mean
|
18
|
-
from .asset_registry import asset_registry
|
19
|
-
from .interfaces import Interface, Metric, MetricsMonitor
|
20
|
-
|
21
|
-
if TYPE_CHECKING:
|
22
|
-
from typing import Deque
|
23
|
-
|
24
|
-
from wandb.sdk.internal.settings_static import SettingsStatic
|
25
|
-
|
26
|
-
|
27
|
-
logger = logging.getLogger(__name__)
|
28
|
-
|
29
|
-
|
30
|
-
# Default neuron-monitor configuration. NeuronCoreStats serializes this dict to
# a JSON file and hands that file to neuron-monitor through its "-c" option.
NEURON_MONITOR_DEFAULT_CONFIG: Final[dict] = {
    "period": "1s",
    "neuron_runtimes": [
        {
            "tag_filter": ".*",
            "metrics": [
                {"type": "neuroncore_counters"},
                {"type": "memory_used"},
                {"type": "neuron_runtime_vcpu_usage"},
                # {"type": "execution_stats"},
            ],
        }
    ],
    "system_metrics": [
        {"type": "vcpu_usage"},
        {"type": "memory_info"},
        {"type": "neuron_hw_counters"},
    ],
}

# todo: once a python sdk is released with the Neuron utils, rewrite this
# Command line used to list Neuron devices as JSON ("-j"); falls back to the
# fixed /opt/aws/neuron/bin location when the tool is not found on PATH.
NEURON_LS_COMMAND: Final[Tuple[str, str]] = (
    shutil.which("neuron-ls") or "/opt/aws/neuron/bin/neuron-ls",
    "-j",
)
# Path of the neuron-monitor executable, resolved with the same PATH-then-
# fixed-location fallback as neuron-ls above.
NEURON_MONITOR_PATH: Final[str] = (
    shutil.which("neuron-monitor") or "/opt/aws/neuron/bin/neuron-monitor"
)
|
58
|
-
|
59
|
-
|
60
|
-
@dataclasses.dataclass
class _NeuronCoreMemoryUsage:
    """Memory usage breakdown for a single neuron core.

    Built from one value of the ``neuroncore_memory_usage`` mapping in a
    neuron-monitor report, so the field names must match the keys of that
    mapping exactly.
    """

    # NOTE(review): values are presumably bytes (the parent report key is
    # "neuron_runtime_used_bytes") -- confirm against the neuron-monitor docs.
    constants: int
    model_code: int
    model_shared_scratchpad: int
    runtime_memory: int
    tensors: int
|
67
|
-
|
68
|
-
|
69
|
-
@dataclasses.dataclass
class _HostMemoryUsage:
    """Host-side memory usage breakdown.

    Built from the ``host`` entry of ``usage_breakdown`` in a neuron-monitor
    report, so the field names must match the keys of that entry exactly.
    """

    # NOTE(review): values are presumably bytes (the parent report key is
    # "neuron_runtime_used_bytes") -- confirm against the neuron-monitor docs.
    application_memory: int
    constants: int
    dma_buffers: int
    tensors: int
|
75
|
-
|
76
|
-
|
77
|
-
@dataclasses.dataclass
class _Stats:
    """One parsed neuron-monitor sample.

    Dict-valued fields are keyed by the integer neuron core id.
    """

    neuroncore_utilization: Dict[int, float]  # per neuron core utilization
    host_total_memory_usage: int  # total memory usage in bytes
    neuron_device_total_memory_usage: int  # total memory usage
    host_memory_usage: _HostMemoryUsage  # host memory usage breakdown
    neuroncore_memory_usage: Dict[
        int, _NeuronCoreMemoryUsage
    ]  # per core memory usage breakdown
|
86
|
-
|
87
|
-
|
88
|
-
class NeuronCoreStats:
    """AWS Trainium stats.

    Runs ``neuron-monitor`` in a background thread, keeps its most recent raw
    JSON output lines, parses the latest line into a ``_Stats`` sample on
    every :meth:`sample` call and averages the collected samples in
    :meth:`aggregate`.
    """

    name: str = "trn.{key}"
    samples: "Deque[_Stats]"

    def __init__(
        self,
        pid: int,
        neuron_monitor_config_path: Optional[str],
    ) -> None:
        """Initialize the collector.

        Args:
            pid: Process id whose neuron runtime entries should be sampled.
            neuron_monitor_config_path: Where to write the neuron-monitor
                config file. When empty or None a temporary file is created.
        """
        self.pid = pid
        # neuron-monitor requires a config file (json)
        # we provide an option to supply a custom config file path
        # in case the default temp file path is not writable
        if neuron_monitor_config_path:
            self.neuron_monitor_config_path = neuron_monitor_config_path
        else:
            # mkstemp returns an open descriptor; close it explicitly instead
            # of relying on garbage collection to release the handle. The
            # (empty) file itself is kept and overwritten with the config.
            fd, temp_path = tempfile.mkstemp()
            os.close(fd)
            self.neuron_monitor_config_path = temp_path
        # Only the latest line is ever parsed; a small bound keeps memory flat.
        self.raw_samples: "Deque[bytes]" = deque(maxlen=10)
        self.samples: "Deque[_Stats]" = deque()
        self.shutdown_event = threading.Event()

        self.neuron_monitor_thread: Optional[threading.Thread] = None

    def write_neuron_monitor_config(self) -> None:
        """Write neuron monitor config file."""
        # mkdir if not exists
        pathlib.Path(self.neuron_monitor_config_path).parent.mkdir(
            parents=True, exist_ok=True
        )
        # write default config
        with open(self.neuron_monitor_config_path, "w") as f:
            json.dump(NEURON_MONITOR_DEFAULT_CONFIG, f, indent=4)

    def neuron_monitor(self) -> None:
        """Run neuron-monitor in a separate process to collect raw data.

        Intended to run on the thread started by :meth:`setup`. Loops until
        the shutdown event is set or neuron-monitor exits; any failure is
        logged and ends the loop.
        """
        self.write_neuron_monitor_config()

        try:
            command = [
                NEURON_MONITOR_PATH,
                "-c",
                self.neuron_monitor_config_path,
            ]
            with subprocess.Popen(
                command,
                stdout=subprocess.PIPE,
                stderr=None,
            ) as process:
                try:
                    while not self.shutdown_event.is_set():
                        if process.stdout is None:
                            self.shutdown_event.wait(0.1)
                            continue

                        raw_data = process.stdout.readline()
                        if raw_data:
                            self.raw_samples.append(raw_data)
                        elif process.poll() is not None:
                            # EOF and the child is gone: readline() would now
                            # return b"" immediately forever, so stop instead
                            # of spinning until shutdown.
                            logger.debug(
                                "neuron-monitor exited with code %s",
                                process.returncode,
                            )
                            break
                        else:
                            # EOF on stdout but the child is still alive;
                            # back off rather than busy-loop.
                            self.shutdown_event.wait(0.1)
                finally:
                    # Always reap the child, also when the loop raised, so
                    # leaving the `with` block cannot wait on a live process.
                    process.kill()
                    process.wait()
        except Exception:
            logger.exception("neuron-monitor failed")

    def setup(self) -> None:
        """Start the neuron-monitor thread for collecting raw data."""
        if self.neuron_monitor_thread is not None:
            return

        logger.debug("Starting neuron-monitor thread")
        self.shutdown_event.clear()
        self.neuron_monitor_thread = threading.Thread(
            name="NeuronCoreMntr",
            target=self.neuron_monitor,
            daemon=True,
        )
        self.neuron_monitor_thread.start()

    def teardown(self) -> None:
        """Stop the neuron-monitor thread (no-op if it was never started)."""
        logger.debug("Stopping neuron-monitor thread")
        thread = self.neuron_monitor_thread
        try:
            self.shutdown_event.set()
            # Explicit check instead of `assert`, which is stripped under -O.
            if thread is not None:
                thread.join()
        except Exception:
            logger.exception("neuron-monitor thread failed to stop")
        finally:
            self.neuron_monitor_thread = None

    def _is_matching_entry(self, entry: dict) -> bool:
        """Check if the entry should be saved.

        Checks if the pid in the entry matches the pid of the process.
        If not (as in the case of multi-process training with torchrun),
        checks if the LOCAL_RANK environment variable is set.

        todo: add matching by neuron_runtime_tag
        """
        return (int(entry["pid"]) == int(self.pid)) or "LOCAL_RANK" in os.environ

    def sample(self) -> None:
        """Parse the latest raw neuron-monitor line into a ``_Stats`` sample.

        Best effort: when there is no data yet, no matching runtime entry, or
        the line cannot be parsed, no sample is added and nothing is raised.
        """
        if not self.raw_samples:
            # neuron-monitor has not produced any output yet
            return

        try:
            raw_stats = json.loads(self.raw_samples[-1])
            # there should be only one entry with the pid
            neuron_runtime_data = next(
                (
                    entry["report"]
                    for entry in raw_stats["neuron_runtime_data"]
                    if self._is_matching_entry(entry)
                ),
                None,
            )
            if neuron_runtime_data is None:
                return

            neuroncores_in_use = neuron_runtime_data["neuroncore_counters"][
                "neuroncores_in_use"
            ]
            # per-core utilization stats:
            neuroncore_utilization = {
                int(k): v["neuroncore_utilization"]
                for k, v in neuroncores_in_use.items()
            }
            # memory usage
            neuron_runtime_used_bytes = neuron_runtime_data["memory_used"][
                "neuron_runtime_used_bytes"
            ]
            # memory usage totals
            host_total_memory_usage = neuron_runtime_used_bytes["host"]
            neuron_device_total_memory_usage = neuron_runtime_used_bytes[
                "neuron_device"
            ]
            # memory usage breakdown
            usage_breakdown = neuron_runtime_used_bytes["usage_breakdown"]
            host_memory_usage = _HostMemoryUsage(**usage_breakdown["host"])
            neuroncore_memory_usage = {
                int(k): _NeuronCoreMemoryUsage(**v)
                for k, v in usage_breakdown["neuroncore_memory_usage"].items()
            }

            # When the training script is executed with torchrun,
            # we only want to keep the relevant LOCAL_RANK stats
            local_rank = int(os.environ.get("LOCAL_RANK", -1337))
            if local_rank >= 0:
                neuroncore_utilization = {
                    local_rank: neuroncore_utilization[local_rank]
                }
                neuroncore_memory_usage = {
                    local_rank: neuroncore_memory_usage[local_rank]
                }

            stats: "_Stats" = _Stats(
                neuroncore_utilization=neuroncore_utilization,
                host_total_memory_usage=host_total_memory_usage,
                neuron_device_total_memory_usage=neuron_device_total_memory_usage,
                host_memory_usage=host_memory_usage,
                neuroncore_memory_usage=neuroncore_memory_usage,
            )
            self.samples.append(stats)

        except Exception:
            # Still best effort, but leave a trace instead of hiding the cause.
            logger.debug("Failed to parse neuron-monitor sample", exc_info=True)

    def clear(self) -> None:
        """Drop all collected samples."""
        self.samples.clear()

    @staticmethod
    def flatten_stats(sample: "_Stats") -> dict:
        """Flatten _Stats object into a flat dict of numbers.

        Nested keys are joined with dots. Integer dict keys and list indices
        are placed in front of the parent key (``"0.neuroncore_utilization"``).
        """
        flattened = {}

        def helper(key: str, value: Any) -> None:
            if isinstance(value, (int, float)):
                flattened[key] = value
            elif isinstance(value, dict):
                for kk, vv in value.items():
                    if isinstance(kk, int):
                        # top-level keys are neuron core ids,
                        # so we swap the order to comply with the
                        # frontend expectations
                        helper(f"{kk}.{key}", vv)
                    else:
                        helper(f"{key}.{kk}", vv)
            elif isinstance(value, list):
                for i, val in enumerate(value):
                    helper(f"{i}.{key}", val)

        for kkk, vvv in dataclasses.asdict(sample).items():
            helper(kkk, vvv)

        return flattened

    def aggregate(self) -> dict:
        """Return the mean of every flattened metric over the stored samples.

        Returns:
            A dict keyed by ``name.format(key=<flattened key>)``; empty when
            no samples were collected.
        """
        if not self.samples:
            return {}

        stats = {}

        # Stats could be: numbers or dataclass objects or lists of such.
        # In the latter case that means per-core stats.
        # The dataclass objects are flat containers of numbers.

        # flatten samples and merge the corresponding values into lists
        merged_samples: Dict[str, List[Union[int, float]]] = collections.defaultdict(
            list
        )
        for flattened_sample in (self.flatten_stats(sample) for sample in self.samples):
            for k, v in flattened_sample.items():
                merged_samples[k].append(v)

        # aggregate the lists
        for k, v in merged_samples.items():
            stats[self.name.format(key=k)] = aggregate_mean(v)

        return stats
|
299
|
-
|
300
|
-
|
301
|
-
@asset_registry.register
class Trainium:
    """Asset that reports AWS Trainium (Neuron) metrics and hardware info."""

    def __init__(
        self,
        interface: "Interface",
        settings: "SettingsStatic",
        shutdown_event: threading.Event,
    ) -> None:
        """Create the metrics monitor and publish a Trainium telemetry flag.

        Args:
            interface: Used to build the metrics monitor and to publish the
                telemetry record.
            settings: Supplies ``x_stats_pid`` and
                ``x_stats_neuron_monitor_config_path``.
            shutdown_event: Passed through to the metrics monitor.
        """
        self.name = self.__class__.__name__.lower()
        self.metrics: List[Metric] = [
            NeuronCoreStats(
                settings.x_stats_pid,
                settings.x_stats_neuron_monitor_config_path,
            ),
        ]
        self.metrics_monitor = MetricsMonitor(
            self.name,
            self.metrics,
            interface,
            settings,
            shutdown_event,
        )
        telemetry_record = telemetry.TelemetryRecord()
        telemetry_record.env.trainium = True
        interface._publish_telemetry(telemetry_record)

    @classmethod
    def is_available(cls) -> bool:
        """Return True when neuron-ls exists and reports a non-empty JSON result."""
        # todo: check if neuron-ls is available and if yes, what it reports. see:
        # https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-ls.html
        if not pathlib.Path(NEURON_LS_COMMAND[0]).exists():
            return False
        # need to be extra careful as neuron tools could be pre-installed
        # on some systems that do not have the hardware
        try:
            # redirect stderr to null to avoid printing errors to the console
            # todo: alternative: check /dev/neuron0 ? sysfs support coming soon in neuron tools
            output = subprocess.check_output(
                NEURON_LS_COMMAND,
                universal_newlines=True,
                stderr=subprocess.DEVNULL,
            ).strip()
            if len(json.loads(output)) > 0:
                return True
        except (OSError, ValueError, TypeError, subprocess.CalledProcessError):
            pass

        return False

    def start(self) -> None:
        """Start the metrics monitor."""
        self.metrics_monitor.start()

    def finish(self) -> None:
        """Stop the metrics monitor."""
        self.metrics_monitor.finish()

    def probe(self) -> dict:
        """Read static hardware info from the first neuron-monitor report.

        Returns:
            ``{self.name: <neuron_hardware_info without "error">}``. The inner
            dict is empty when neuron-monitor exits without output. Returns
            ``{}`` when anything fails.
        """
        try:
            neuron_stats = self.metrics[0]
            # The config file must exist before neuron-monitor is launched.
            # The method is write_neuron_monitor_config; the previously called
            # check_neuron_monitor_config is not defined on NeuronCoreStats,
            # which made every probe fail with AttributeError.
            neuron_stats.write_neuron_monitor_config()  # type: ignore
            neuron_hardware_info: dict = {}
            command = [
                NEURON_MONITOR_PATH,
                "-c",
                neuron_stats.neuron_monitor_config_path,  # type: ignore
            ]
            with subprocess.Popen(
                command,
                stdout=subprocess.PIPE,
                stderr=None,
            ) as process:
                try:
                    # stdout is a pipe because of stdout=PIPE above; the None
                    # check only guards against having nothing to read.
                    while process.stdout is not None:
                        raw_data = process.stdout.readline()
                        if raw_data:
                            parsed_data = json.loads(raw_data)
                            neuron_hardware_info = parsed_data.get(
                                "neuron_hardware_info", {}
                            )
                            neuron_hardware_info.pop("error", None)
                            break

                        if process.poll() is not None:
                            # EOF and the child has exited: there will never
                            # be a report, so stop instead of looping forever.
                            break
                        time.sleep(0.1)
                finally:
                    # Reap the child on every path (including a JSON error) so
                    # leaving the `with` block cannot wait on a live process.
                    try:
                        process.kill()
                        process.wait()
                    except Exception:
                        logger.debug(
                            "Failed to stop neuron-monitor", exc_info=True
                        )

            return {self.name: neuron_hardware_info}
        except Exception:
            logger.exception("neuron-monitor failed")
            return {}
|
@@ -1,13 +0,0 @@
|
|
1
|
-
import logging
|
2
|
-
|
3
|
-
from sentry_sdk.integrations.aws_lambda import get_lambda_bootstrap # type: ignore
|
4
|
-
|
5
|
-
logger = logging.getLogger(__name__)
|
6
|
-
|
7
|
-
|
8
|
-
def is_aws_lambda() -> bool:
    """Tell whether the current process runs inside AWS Lambda.

    True only when a lambda bootstrap object is found and it exposes
    ``handle_event_request``.
    """
    bootstrap = get_lambda_bootstrap()
    return bool(bootstrap) and hasattr(bootstrap, "handle_event_request")
|
@@ -1,248 +0,0 @@
|
|
1
|
-
# Information about the system and the environment
|
2
|
-
import datetime
|
3
|
-
import glob
|
4
|
-
import json
|
5
|
-
import logging
|
6
|
-
import os
|
7
|
-
import subprocess
|
8
|
-
import sys
|
9
|
-
from shutil import copyfile
|
10
|
-
from typing import Any, Dict, List, Optional
|
11
|
-
from urllib.parse import unquote
|
12
|
-
|
13
|
-
from wandb.sdk.internal.settings_static import SettingsStatic
|
14
|
-
from wandb.sdk.lib import filesystem
|
15
|
-
from wandb.sdk.lib.filenames import CONDA_ENVIRONMENTS_FNAME, DIFF_FNAME, METADATA_FNAME
|
16
|
-
from wandb.sdk.lib.gitlib import GitRepo
|
17
|
-
|
18
|
-
from .assets.interfaces import Interface
|
19
|
-
|
20
|
-
logger = logging.getLogger(__name__)
|
21
|
-
|
22
|
-
|
23
|
-
class SystemInfo:
    """Collects run metadata (environment, program, git) and publishes it.

    ``probe`` builds the metadata dict; ``publish`` writes it to the metadata
    file in the run's files directory, optionally saves the conda environment,
    the program file and git patches, and announces the files to the backend.
    """

    # todo: this is mostly a copy of the legacy Meta class, but it should be refactored
    def __init__(self, settings: "SettingsStatic", interface: "Interface") -> None:
        """Store settings and the backend interface and set up git access."""
        logger.debug("System info init")
        self.settings = settings

        self.metadata_file_name = os.path.join(self.settings.files_dir, METADATA_FNAME)
        self.backend_interface = interface
        self.git = GitRepo(
            root=self.settings.git_root,
            remote=self.settings.git_remote,
            remote_url=self.settings.git_remote_url,
            commit=self.settings.git_commit,
        )
        # Location under "code" directory in files where program was saved.
        self.saved_program: Optional[str] = None
        # Locations under files directory where diff patches were saved.
        self.saved_patches: List[str] = []
        logger.debug("System info init done")

    def _save_code(self) -> None:
        """Copy the program file into ``<files_dir>/code/<program_relpath>``.

        Does nothing (besides logging a warning) when the program path is
        unknown or the file does not exist.
        """
        logger.debug("Saving code")
        if not self.settings.program_relpath:
            logger.warning("unable to save code -- program entry not found")
            return None

        root: str = self.git.root or os.getcwd()
        program_relative: str = self.settings.program_relpath
        filesystem.mkdir_exists_ok(
            os.path.join(
                self.settings.files_dir, "code", os.path.dirname(program_relative)
            )
        )
        program_absolute = os.path.join(root, program_relative)
        if not os.path.exists(program_absolute):
            logger.warning(f"unable to save code -- can't find {program_absolute}")
            return None
        saved_program = os.path.join(self.settings.files_dir, "code", program_relative)
        self.saved_program = program_relative

        if not os.path.exists(saved_program):
            copyfile(program_absolute, saved_program)
        logger.debug("Saving code done")

    def _save_patches(self) -> None:
        """Save the current state of this repository to one or more patches.

        Makes one patch against HEAD and another one against the most recent
        commit that occurs in an upstream branch. This way we can be robust
        to history editing as long as the user never does "push -f" to break
        history on an upstream branch.

        Writes the first patch to <files_dir>/<DIFF_FNAME> and the second to
        <files_dir>/upstream_diff_<commit_id>.patch.

        """
        if not self.git.enabled:
            return None

        logger.debug("Saving git patches")
        try:
            root = self.git.root
            diff_args = ["git", "diff"]
            if self.git.has_submodule_diff:
                diff_args.append("--submodule=diff")

            if self.git.dirty:
                patch_path = os.path.join(self.settings.files_dir, DIFF_FNAME)
                with open(patch_path, "wb") as patch:
                    # we diff against HEAD to ensure we get changes in the index
                    subprocess.check_call(
                        diff_args + ["HEAD"], stdout=patch, cwd=root, timeout=5
                    )
                self.saved_patches.append(
                    os.path.relpath(patch_path, start=self.settings.files_dir)
                )

            upstream_commit = self.git.get_upstream_fork_point()
            if upstream_commit and upstream_commit != self.git.repo.head.commit:  # type: ignore
                sha = upstream_commit.hexsha
                upstream_patch_path = os.path.join(
                    self.settings.files_dir, f"upstream_diff_{sha}.patch"
                )
                with open(upstream_patch_path, "wb") as upstream_patch:
                    subprocess.check_call(
                        diff_args + [sha], stdout=upstream_patch, cwd=root, timeout=5
                    )
                self.saved_patches.append(
                    os.path.relpath(
                        upstream_patch_path, start=self.settings.files_dir
                    )
                )
        # TODO: A customer saw `ValueError: Reference at 'refs/remotes/origin/foo'
        # does not exist` so we now catch ValueError. Catching this error feels
        # too generic.
        # OSError covers a missing `git` executable or an unwritable files
        # directory; saving patches is optional and must not abort publish().
        except (
            ValueError,
            OSError,
            subprocess.CalledProcessError,
            subprocess.TimeoutExpired,
        ):
            logger.exception("Error generating diff.")
        logger.debug("Saving git patches done")

    def _probe_git(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Add git remote/commit, email and root to ``data`` and return it.

        ``data`` is returned untouched when git or machine info is disabled,
        or when the repo is not enabled and was auto-detected.
        """
        if self.settings.disable_git or self.settings.x_disable_machine_info:
            return data

        # in case of manually passing the git repo info, `enabled` would be False,
        # but we still want to save the git repo info
        if not self.git.enabled and self.git.auto:
            return data

        logger.debug("Probing git")

        data["git"] = {
            "remote": self.git.remote_url,
            "commit": self.git.last_commit,
        }
        data["email"] = self.git.email
        data["root"] = self.git.root or data.get("root") or os.getcwd()
        logger.debug("Probing git done")

        return data

    def probe(self) -> Dict[str, Any]:
        """Probe the system for information about the current environment.

        Returns:
            A dict with os/python/time/docker/args/state entries plus, depending
            on settings, program, code path, notebook, git and host details.
        """
        # todo: refactor
        logger.debug("Probing system")
        data: Dict[str, Any] = dict()

        data["os"] = self.settings._os
        data["python"] = self.settings._python
        # utcnow()/utcfromtimestamp() are deprecated since Python 3.12. Build
        # aware UTC datetimes and drop tzinfo so the ISO strings keep the same
        # naive format (no "+00:00" suffix) as before.
        data["heartbeatAt"] = (
            datetime.datetime.now(datetime.timezone.utc)
            .replace(tzinfo=None)
            .isoformat()
        )
        data["startedAt"] = (
            datetime.datetime.fromtimestamp(
                self.settings.x_start_time, datetime.timezone.utc
            )
            .replace(tzinfo=None)
            .isoformat()
            if self.settings.x_start_time
            else None
        )

        data["docker"] = self.settings.docker

        data["args"] = tuple(self.settings._args or ())
        data["state"] = "running"

        # NOTE(review): the nesting of the following branches was reconstructed
        # from a source view without indentation -- confirm against the
        # original file (in particular where the "root" assignment and the
        # _probe_git call sit).
        if self.settings.program is not None:
            data["program"] = self.settings.program
            # Used during artifact-job creation, always points to the relpath
            # of code execution, even when in a git repo
            data["codePathLocal"] = self.settings._code_path_local
        if not (self.settings.disable_code or self.settings.x_disable_machine_info):
            if self.settings.program_relpath:
                data["codePath"] = self.settings.program_relpath
            elif self.settings._jupyter:
                if self.settings.notebook_name:
                    data["program"] = self.settings.notebook_name
                elif self.settings.x_jupyter_path:
                    if self.settings.x_jupyter_path.startswith("fileId="):
                        unescaped = unquote(self.settings.x_jupyter_path)
                        data["colab"] = (
                            "https://colab.research.google.com/notebook#" + unescaped
                        )
                        data["program"] = self.settings.x_jupyter_name
                    else:
                        data["program"] = self.settings.x_jupyter_path
                        data["root"] = self.settings.x_jupyter_root
            # get the git repo info
            data = self._probe_git(data)

        if self.settings.anonymous not in ["allow", "must"]:
            data["host"] = self.settings.host
            data["username"] = self.settings.username
            data["executable"] = sys.executable
        else:
            # anonymous runs must not carry identifying details
            data.pop("email", None)
            data.pop("root", None)

        logger.debug("Probing system done")

        return data

    def _save_conda(self) -> None:
        """Export the active conda environment into the files directory.

        Skipped when the interpreter prefix has no ``conda-meta`` directory.
        Failures are logged, never raised.
        """
        current_shell_is_conda = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
        if not current_shell_is_conda:
            return None

        logger.debug(
            "Saving list of conda packages installed into the current environment"
        )
        try:
            with open(
                os.path.join(self.settings.files_dir, CONDA_ENVIRONMENTS_FNAME), "w"
            ) as f:
                subprocess.call(
                    ["conda", "env", "export"],
                    stdout=f,
                    stderr=subprocess.DEVNULL,
                    timeout=15,  # add timeout since conda env export could take a really long time
                )
        except Exception:
            logger.exception("Error saving conda packages")
        logger.debug("Saving conda packages done")

    def publish(self, system_info: dict) -> None:
        """Write ``system_info`` to the metadata file and publish run files.

        Args:
            system_info: JSON-serializable metadata, typically from ``probe``.
        """
        # save pip, conda, code patches to disk
        # NOTE(review): nesting reconstructed from a source view without
        # indentation -- confirm that _save_patches belongs under `save_code`
        # and that patches are published independently of saved_program.
        if self.settings.x_save_requirements:
            self._save_conda()
        if self.settings.save_code:
            self._save_code()
            self._save_patches()

        # save system_info to disk
        with open(self.metadata_file_name, "w") as f:
            s = json.dumps(system_info, indent=4)
            f.write(s)
            f.write("\n")
        base_name = os.path.basename(self.metadata_file_name)
        files = dict(files=[(base_name, "now")])

        # paths are glob-escaped before being handed to publish_files
        if self.saved_program:
            saved_program = os.path.join("code", self.saved_program)
            files["files"].append((glob.escape(saved_program), "now"))
        for patch in self.saved_patches:
            files["files"].append((glob.escape(patch), "now"))

        # publish files to the backend
        self.backend_interface.publish_files(files)  # type: ignore
|