wandb 0.20.1__py3-none-any.whl → 0.20.2rc20250616__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (71)
  1. wandb/__init__.py +3 -6
  2. wandb/__init__.pyi +1 -1
  3. wandb/analytics/sentry.py +2 -2
  4. wandb/apis/importers/internals/internal.py +0 -3
  5. wandb/apis/public/api.py +2 -2
  6. wandb/apis/public/registries/{utils.py → _utils.py} +12 -12
  7. wandb/apis/public/registries/registries_search.py +2 -2
  8. wandb/apis/public/registries/registry.py +19 -18
  9. wandb/bin/gpu_stats +0 -0
  10. wandb/cli/beta.py +1 -7
  11. wandb/cli/cli.py +0 -30
  12. wandb/env.py +0 -6
  13. wandb/proto/v3/wandb_settings_pb2.py +2 -2
  14. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  15. wandb/proto/v4/wandb_settings_pb2.py +2 -2
  16. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  17. wandb/proto/v5/wandb_settings_pb2.py +2 -2
  18. wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
  19. wandb/proto/v6/wandb_settings_pb2.py +2 -2
  20. wandb/proto/v6/wandb_telemetry_pb2.py +10 -10
  21. wandb/sdk/artifacts/storage_handlers/s3_handler.py +42 -1
  22. wandb/sdk/backend/backend.py +1 -1
  23. wandb/sdk/internal/handler.py +1 -69
  24. wandb/sdk/lib/printer.py +6 -7
  25. wandb/sdk/lib/progress.py +1 -3
  26. wandb/sdk/lib/service/ipc_support.py +13 -0
  27. wandb/sdk/lib/{service_connection.py → service/service_connection.py} +20 -56
  28. wandb/sdk/lib/service/service_port_file.py +105 -0
  29. wandb/sdk/lib/service/service_process.py +111 -0
  30. wandb/sdk/lib/service/service_token.py +164 -0
  31. wandb/sdk/lib/sock_client.py +8 -12
  32. wandb/sdk/wandb_init.py +0 -3
  33. wandb/sdk/wandb_require.py +9 -20
  34. wandb/sdk/wandb_run.py +0 -24
  35. wandb/sdk/wandb_settings.py +0 -9
  36. wandb/sdk/wandb_setup.py +2 -13
  37. {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/METADATA +1 -3
  38. {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/RECORD +41 -67
  39. wandb/sdk/internal/flow_control.py +0 -263
  40. wandb/sdk/internal/internal.py +0 -401
  41. wandb/sdk/internal/internal_util.py +0 -97
  42. wandb/sdk/internal/system/__init__.py +0 -0
  43. wandb/sdk/internal/system/assets/__init__.py +0 -25
  44. wandb/sdk/internal/system/assets/aggregators.py +0 -31
  45. wandb/sdk/internal/system/assets/asset_registry.py +0 -20
  46. wandb/sdk/internal/system/assets/cpu.py +0 -163
  47. wandb/sdk/internal/system/assets/disk.py +0 -210
  48. wandb/sdk/internal/system/assets/gpu.py +0 -416
  49. wandb/sdk/internal/system/assets/gpu_amd.py +0 -233
  50. wandb/sdk/internal/system/assets/interfaces.py +0 -205
  51. wandb/sdk/internal/system/assets/ipu.py +0 -177
  52. wandb/sdk/internal/system/assets/memory.py +0 -166
  53. wandb/sdk/internal/system/assets/network.py +0 -125
  54. wandb/sdk/internal/system/assets/open_metrics.py +0 -293
  55. wandb/sdk/internal/system/assets/tpu.py +0 -154
  56. wandb/sdk/internal/system/assets/trainium.py +0 -393
  57. wandb/sdk/internal/system/env_probe_helpers.py +0 -13
  58. wandb/sdk/internal/system/system_info.py +0 -248
  59. wandb/sdk/internal/system/system_monitor.py +0 -224
  60. wandb/sdk/internal/writer.py +0 -204
  61. wandb/sdk/lib/service_token.py +0 -93
  62. wandb/sdk/service/__init__.py +0 -0
  63. wandb/sdk/service/_startup_debug.py +0 -22
  64. wandb/sdk/service/port_file.py +0 -53
  65. wandb/sdk/service/server.py +0 -107
  66. wandb/sdk/service/server_sock.py +0 -286
  67. wandb/sdk/service/service.py +0 -252
  68. wandb/sdk/service/streams.py +0 -425
  69. {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/WHEEL +0 -0
  70. {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/entry_points.txt +0 -0
  71. {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/licenses/LICENSE +0 -0
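The listing above was generated by the registry's diff viewer, but a rough local equivalent can be produced from the two wheels directly. A minimal sketch (not the viewer's actual implementation), assuming both files were fetched beforehand, e.g. with `pip download wandb==0.20.1 --no-deps`:

import difflib
import zipfile

def wheel_diff(old_whl: str, new_whl: str) -> None:
    """Print a unified diff of the text files added, removed, or changed between two wheels."""
    with zipfile.ZipFile(old_whl) as old, zipfile.ZipFile(new_whl) as new:
        old_names, new_names = set(old.namelist()), set(new.namelist())
        for name in sorted(old_names | new_names):
            old_data = old.read(name) if name in old_names else b""
            new_data = new.read(name) if name in new_names else b""
            if b"\x00" in old_data or b"\x00" in new_data:
                continue  # skip binaries such as wandb/bin/gpu_stats
            diff = difflib.unified_diff(
                old_data.decode("utf-8", "replace").splitlines(),
                new_data.decode("utf-8", "replace").splitlines(),
                fromfile=name,
                tofile=name,
                lineterm="",
            )
            for line in diff:
                print(line)

wheel_diff("wandb-0.20.1-py3-none-any.whl", "wandb-0.20.2rc20250616-py3-none-any.whl")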
wandb/sdk/internal/system/assets/trainium.py (deleted)
@@ -1,393 +0,0 @@
- import collections
- import dataclasses
- import json
- import logging
- import os
- import pathlib
- import shutil
- import subprocess
- import tempfile
- import threading
- import time
- from collections import deque
- from typing import TYPE_CHECKING, Any, Dict, Final, List, Optional, Tuple, Union
-
- from wandb.sdk.lib import telemetry
-
- from .aggregators import aggregate_mean
- from .asset_registry import asset_registry
- from .interfaces import Interface, Metric, MetricsMonitor
-
- if TYPE_CHECKING:
-     from typing import Deque
-
-     from wandb.sdk.internal.settings_static import SettingsStatic
-
-
- logger = logging.getLogger(__name__)
-
-
- NEURON_MONITOR_DEFAULT_CONFIG: Final[dict] = {
-     "period": "1s",
-     "neuron_runtimes": [
-         {
-             "tag_filter": ".*",
-             "metrics": [
-                 {"type": "neuroncore_counters"},
-                 {"type": "memory_used"},
-                 {"type": "neuron_runtime_vcpu_usage"},
-                 # {"type": "execution_stats"},
-             ],
-         }
-     ],
-     "system_metrics": [
-         {"type": "vcpu_usage"},
-         {"type": "memory_info"},
-         {"type": "neuron_hw_counters"},
-     ],
- }
-
- # todo: once a python sdk is released with the Neuron utils, rewrite this
- NEURON_LS_COMMAND: Final[Tuple[str, str]] = (
-     shutil.which("neuron-ls") or "/opt/aws/neuron/bin/neuron-ls",
-     "-j",
- )
- NEURON_MONITOR_PATH: Final[str] = (
-     shutil.which("neuron-monitor") or "/opt/aws/neuron/bin/neuron-monitor"
- )
-
-
- @dataclasses.dataclass
- class _NeuronCoreMemoryUsage:
-     constants: int
-     model_code: int
-     model_shared_scratchpad: int
-     runtime_memory: int
-     tensors: int
-
-
- @dataclasses.dataclass
- class _HostMemoryUsage:
-     application_memory: int
-     constants: int
-     dma_buffers: int
-     tensors: int
-
-
- @dataclasses.dataclass
- class _Stats:
-     neuroncore_utilization: Dict[int, float]  # per neuron core utilization
-     host_total_memory_usage: int  # total memory usage in bytes
-     neuron_device_total_memory_usage: int  # total memory usage
-     host_memory_usage: _HostMemoryUsage  # host memory usage breakdown
-     neuroncore_memory_usage: Dict[
-         int, _NeuronCoreMemoryUsage
-     ]  # per core memory usage breakdown
-
-
- class NeuronCoreStats:
-     """AWS Trainium stats."""
-
-     name: str = "trn.{key}"
-     samples: "Deque[_Stats]"
-
-     def write_neuron_monitor_config(self) -> None:
-         """Write neuron monitor config file."""
-         # mkdir if not exists
-         pathlib.Path(self.neuron_monitor_config_path).parent.mkdir(
-             parents=True, exist_ok=True
-         )
-         # write default config
-         with open(self.neuron_monitor_config_path, "w") as f:
-             json.dump(NEURON_MONITOR_DEFAULT_CONFIG, f, indent=4)
-
-     def neuron_monitor(self) -> None:
-         """Run neuron-monitor in a separate process to collect raw data."""
-         self.write_neuron_monitor_config()
-
-         try:
-             command = [
-                 NEURON_MONITOR_PATH,
-                 "-c",
-                 self.neuron_monitor_config_path,
-             ]
-             with subprocess.Popen(
-                 command,
-                 stdout=subprocess.PIPE,
-                 stderr=None,
-             ) as process:
-                 while not self.shutdown_event.is_set():
-                     if process.stdout is None:
-                         self.shutdown_event.wait(0.1)
-                         continue
-
-                     raw_data = process.stdout.readline()
-                     if raw_data:
-                         self.raw_samples.append(raw_data)
-                 process.kill()
-                 process.wait()
-         except Exception:
-             logger.exception("neuron-monitor failed")
-
-     def __init__(
-         self,
-         pid: int,
-         neuron_monitor_config_path: Optional[str],
-     ) -> None:
-         self.pid = pid
-         # neuron-monitor requires a config file (json)
-         # we provide an option to supply a custom config file path
-         # in case the default temp file path is not writable
-         self.neuron_monitor_config_path = (
-             neuron_monitor_config_path or tempfile.NamedTemporaryFile(delete=False).name
-         )
-         self.raw_samples: Deque[bytes] = deque(maxlen=10)
-         self.samples: Deque[_Stats] = deque()
-         self.shutdown_event = threading.Event()
-
-         self.neuron_monitor_thread: Optional[threading.Thread] = None
-
-     def setup(self) -> None:
-         """Start the neuron-monitor thread for collecting raw data."""
-         if self.neuron_monitor_thread is not None:
-             return
-
-         logger.debug("Starting neuron-monitor thread")
-         self.shutdown_event.clear()
-         self.neuron_monitor_thread = threading.Thread(
-             name="NeuronCoreMntr",
-             target=self.neuron_monitor,
-             daemon=True,
-         )
-         self.neuron_monitor_thread.start()
-
-     def teardown(self) -> None:
-         """Stop the neuron-monitor thread."""
-         logger.debug("Stopping neuron-monitor thread")
-         try:
-             self.shutdown_event.set()
-             assert self.neuron_monitor_thread is not None
-             self.neuron_monitor_thread.join()
-         except Exception:
-             logger.exception("neuron-monitor thread failed to stop")
-         finally:
-             self.neuron_monitor_thread = None
-
-     def _is_matching_entry(self, entry: dict) -> bool:
-         """Check if the entry should be saved.
-
-         Checks if the pid in the entry matches the pid of the process.
-         If not (as in the case of multi-process training with torchrun),
-         checks if the LOCAL_RANK environment variable is set.
-
-         todo: add matching by neuron_runtime_tag
-         """
-         return (int(entry["pid"]) == int(self.pid)) or "LOCAL_RANK" in os.environ
-
-     def sample(self) -> None:
-         try:
-             raw_stats = json.loads(self.raw_samples[-1])
-             neuron_runtime_data = [
-                 entry["report"]
-                 for entry in raw_stats["neuron_runtime_data"]
-                 if self._is_matching_entry(entry)
-             ][0]  # there should be only one entry with the pid
-
-             neuroncores_in_use = neuron_runtime_data["neuroncore_counters"][
-                 "neuroncores_in_use"
-             ]
-             # per-core utilization stats:
-             neuroncore_utilization = {
-                 int(k): v["neuroncore_utilization"]
-                 for k, v in neuroncores_in_use.items()
-             }
-             # memory usage
-             neuron_runtime_used_bytes = neuron_runtime_data["memory_used"][
-                 "neuron_runtime_used_bytes"
-             ]
-             # memory usage totals
-             host_total_memory_usage = neuron_runtime_used_bytes["host"]
-             neuron_device_total_memory_usage = neuron_runtime_used_bytes[
-                 "neuron_device"
-             ]
-             # memory usage breakdown
-             usage_breakdown = neuron_runtime_used_bytes["usage_breakdown"]
-             host_memory_usage = _HostMemoryUsage(**usage_breakdown["host"])
-             neuroncore_memory_usage = {
-                 int(k): _NeuronCoreMemoryUsage(**v)
-                 for k, v in usage_breakdown["neuroncore_memory_usage"].items()
-             }
-
-             # When the training script is executed with torchrun,
-             # we only want to keep the relevant LOCAL_RANK stats
-             local_rank = int(os.environ.get("LOCAL_RANK", -1337))
-             if local_rank >= 0:
-                 neuroncore_utilization = {
-                     local_rank: neuroncore_utilization[local_rank]
-                 }
-                 neuroncore_memory_usage = {
-                     local_rank: neuroncore_memory_usage[local_rank]
-                 }
-
-             stats: _Stats = _Stats(
-                 neuroncore_utilization=neuroncore_utilization,
-                 host_total_memory_usage=host_total_memory_usage,
-                 neuron_device_total_memory_usage=neuron_device_total_memory_usage,
-                 host_memory_usage=host_memory_usage,
-                 neuroncore_memory_usage=neuroncore_memory_usage,
-             )
-             self.samples.append(stats)
-
-         except Exception as e:  # noqa
-             pass
-
-     def clear(self) -> None:
-         self.samples.clear()
-
-     @staticmethod
-     def flatten_stats(sample: _Stats) -> dict:
-         """Flatten _Stats object into a flat dict of numbers."""
-         flattened = {}
-
-         def helper(key: str, value: Any) -> None:
-             if isinstance(value, (int, float)):
-                 ret = {f"{key}": value}
-                 flattened.update(ret)
-                 return
-             elif isinstance(value, dict):
-                 for kk, vv in value.items():
-                     if isinstance(kk, int):
-                         # top-level keys are neuron core ids,
-                         # so we swap the order to comply with the
-                         # frontend expectations
-                         helper(f"{kk}.{key}", vv)
-                     else:
-                         helper(f"{key}.{kk}", vv)
-                 return
-             elif isinstance(value, list):
-                 for i, val in enumerate(value):
-                     helper(f"{i}.{key}", val)
-
-         for kkk, vvv in dataclasses.asdict(sample).items():
-             helper(kkk, vvv)
-
-         return flattened
-
-     def aggregate(self) -> dict:
-         if not self.samples:
-             return {}
-
-         stats = {}
-
-         # Stats could be: numbers or dataclass objects or lists of such.
-         # In the latter case that means per-core stats.
-         # The dataclass objects are flat containers of numbers.
-
-         # flatten samples and merge the corresponding values into lists
-         merged_samples: Dict[str, List[Union[int, float]]] = collections.defaultdict(
-             list
-         )
-         for flattened_sample in (self.flatten_stats(sample) for sample in self.samples):
-             for k, v in flattened_sample.items():
-                 merged_samples[k].append(v)
-
-         # aggregate the lists
-         for k, v in merged_samples.items():
-             stats[self.name.format(key=k)] = aggregate_mean(v)
-
-         return stats
-
-
- @asset_registry.register
- class Trainium:
-     def __init__(
-         self,
-         interface: "Interface",
-         settings: "SettingsStatic",
-         shutdown_event: threading.Event,
-     ) -> None:
-         self.name = self.__class__.__name__.lower()
-         self.metrics: List[Metric] = [
-             NeuronCoreStats(
-                 settings.x_stats_pid,
-                 settings.x_stats_neuron_monitor_config_path,
-             ),
-         ]
-         self.metrics_monitor = MetricsMonitor(
-             self.name,
-             self.metrics,
-             interface,
-             settings,
-             shutdown_event,
-         )
-         telemetry_record = telemetry.TelemetryRecord()
-         telemetry_record.env.trainium = True
-         interface._publish_telemetry(telemetry_record)
-
-     @classmethod
-     def is_available(cls) -> bool:
-         # todo: check if neuron-ls is available and if yes, what it reports. see:
-         # https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-ls.html
-         if not pathlib.Path(NEURON_LS_COMMAND[0]).exists():
-             return False
-         # need to be extra careful as neuron tools could be pre-installed
-         # on some systems that do not have the hardware
-         try:
-             # redirect stderr to null to avoid printing errors to the console
-             # todo: alternative: check /dev/neuron0 ? sysfs support coming soon in neuron tools
-             output = subprocess.check_output(
-                 NEURON_LS_COMMAND,
-                 universal_newlines=True,
-                 stderr=subprocess.DEVNULL,
-             ).strip()
-             if len(json.loads(output)) > 0:
-                 return True
-         except (OSError, ValueError, TypeError, subprocess.CalledProcessError):
-             pass
-
-         return False
-
-     def start(self) -> None:
-         self.metrics_monitor.start()
-
-     def finish(self) -> None:
-         self.metrics_monitor.finish()
-
-     def probe(self) -> dict:
-         try:
-             self.metrics[0].check_neuron_monitor_config()  # type: ignore
-             neuron_hardware_info: dict = {}
-             command = [
-                 NEURON_MONITOR_PATH,
-                 "-c",
-                 self.metrics[0].neuron_monitor_config_path,  # type: ignore
-             ]
-             with subprocess.Popen(
-                 command,
-                 stdout=subprocess.PIPE,
-                 stderr=None,
-             ) as process:
-                 while True:
-                     if process.stdout is None:
-                         time.sleep(0.1)
-                         continue
-
-                     raw_data = process.stdout.readline()
-                     if raw_data:
-                         parsed_data = json.loads(raw_data)
-                         neuron_hardware_info = parsed_data.get(
-                             "neuron_hardware_info", {}
-                         )
-                         neuron_hardware_info.pop("error", None)
-                         break
-
-             try:
-                 process.kill()
-                 process.wait()
-             except:  # noqa
-                 pass
-
-             return {self.name: neuron_hardware_info}
-         except Exception:
-             logger.exception("neuron-monitor failed")
-             return {}
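The deleted collector reports under the `trn.{key}` template: `flatten_stats` turns each `_Stats` sample into dotted keys (moving integer neuron-core ids to the front), and `aggregate` averages the values and applies the template. A standalone illustration of the resulting metric names, using hypothetical sample values:

# Hypothetical values; only the key-naming convention is taken from
# NeuronCoreStats.flatten_stats / aggregate above.
name = "trn.{key}"
flattened = {
    "0.neuroncore_utilization": 91.5,  # core id 0 swapped in front of the field name
    "host_memory_usage.tensors": 1024,
}
metrics = {name.format(key=k): v for k, v in flattened.items()}
print(metrics)
# {'trn.0.neuroncore_utilization': 91.5, 'trn.host_memory_usage.tensors': 1024}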
wandb/sdk/internal/system/env_probe_helpers.py (deleted)
@@ -1,13 +0,0 @@
- import logging
-
- from sentry_sdk.integrations.aws_lambda import get_lambda_bootstrap  # type: ignore
-
- logger = logging.getLogger(__name__)
-
-
- def is_aws_lambda() -> bool:
-     """Check if we are running in a lambda environment."""
-     lambda_bootstrap = get_lambda_bootstrap()
-     if not lambda_bootstrap or not hasattr(lambda_bootstrap, "handle_event_request"):
-         return False
-     return True
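The deleted helper leaned on sentry_sdk's `get_lambda_bootstrap` to detect AWS Lambda. For comparison, a dependency-free sketch using the `AWS_LAMBDA_FUNCTION_NAME` environment variable that the Lambda runtime sets; this is an alternative heuristic, not the implementation removed above:

import os

def is_aws_lambda() -> bool:
    # The AWS Lambda runtime exports AWS_LAMBDA_FUNCTION_NAME; checking it
    # avoids importing sentry_sdk just to probe the environment.
    return "AWS_LAMBDA_FUNCTION_NAME" in os.environ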
wandb/sdk/internal/system/system_info.py (deleted)
@@ -1,248 +0,0 @@
- # Information about the system and the environment
- import datetime
- import glob
- import json
- import logging
- import os
- import subprocess
- import sys
- from shutil import copyfile
- from typing import Any, Dict, List, Optional
- from urllib.parse import unquote
-
- from wandb.sdk.internal.settings_static import SettingsStatic
- from wandb.sdk.lib import filesystem
- from wandb.sdk.lib.filenames import CONDA_ENVIRONMENTS_FNAME, DIFF_FNAME, METADATA_FNAME
- from wandb.sdk.lib.gitlib import GitRepo
-
- from .assets.interfaces import Interface
-
- logger = logging.getLogger(__name__)
-
-
- class SystemInfo:
-     # todo: this is mostly a copy of the legacy Meta class, but it should be refactored
-     def __init__(self, settings: SettingsStatic, interface: Interface) -> None:
-         logger.debug("System info init")
-         self.settings = settings
-
-         self.metadata_file_name = os.path.join(self.settings.files_dir, METADATA_FNAME)
-         self.backend_interface = interface
-         self.git = GitRepo(
-             root=self.settings.git_root,
-             remote=self.settings.git_remote,
-             remote_url=self.settings.git_remote_url,
-             commit=self.settings.git_commit,
-         )
-         # Location under "code" directory in files where program was saved.
-         self.saved_program: Optional[os.PathLike] = None
-         # Locations under files directory where diff patches were saved.
-         self.saved_patches: List[str] = []
-         logger.debug("System info init done")
-
-     def _save_code(self) -> None:
-         logger.debug("Saving code")
-         if not self.settings.program_relpath:
-             logger.warning("unable to save code -- program entry not found")
-             return None
-
-         root: str = self.git.root or os.getcwd()
-         program_relative: str = self.settings.program_relpath
-         filesystem.mkdir_exists_ok(
-             os.path.join(
-                 self.settings.files_dir, "code", os.path.dirname(program_relative)
-             )
-         )
-         program_absolute = os.path.join(root, program_relative)
-         if not os.path.exists(program_absolute):
-             logger.warning(f"unable to save code -- can't find {program_absolute}")
-             return None
-         saved_program = os.path.join(self.settings.files_dir, "code", program_relative)
-         self.saved_program = program_relative  # type: ignore
-
-         if not os.path.exists(saved_program):
-             copyfile(program_absolute, saved_program)
-         logger.debug("Saving code done")
-
-     def _save_patches(self) -> None:
-         """Save the current state of this repository to one or more patches.
-
-         Makes one patch against HEAD and another one against the most recent
-         commit that occurs in an upstream branch. This way we can be robust
-         to history editing as long as the user never does "push -f" to break
-         history on an upstream branch.
-
-         Writes the first patch to <files_dir>/<DIFF_FNAME> and the second to
-         <files_dir>/upstream_diff_<commit_id>.patch.
-
-         """
-         if not self.git.enabled:
-             return None
-
-         logger.debug("Saving git patches")
-         try:
-             root = self.git.root
-             diff_args = ["git", "diff"]
-             if self.git.has_submodule_diff:
-                 diff_args.append("--submodule=diff")
-
-             if self.git.dirty:
-                 patch_path = os.path.join(self.settings.files_dir, DIFF_FNAME)
-                 with open(patch_path, "wb") as patch:
-                     # we diff against HEAD to ensure we get changes in the index
-                     subprocess.check_call(
-                         diff_args + ["HEAD"], stdout=patch, cwd=root, timeout=5
-                     )
-                 self.saved_patches.append(
-                     os.path.relpath(patch_path, start=self.settings.files_dir)
-                 )
-
-             upstream_commit = self.git.get_upstream_fork_point()
-             if upstream_commit and upstream_commit != self.git.repo.head.commit:  # type: ignore
-                 sha = upstream_commit.hexsha
-                 upstream_patch_path = os.path.join(
-                     self.settings.files_dir, f"upstream_diff_{sha}.patch"
-                 )
-                 with open(upstream_patch_path, "wb") as upstream_patch:
-                     subprocess.check_call(
-                         diff_args + [sha], stdout=upstream_patch, cwd=root, timeout=5
-                     )
-                 self.saved_patches.append(
-                     os.path.relpath(
-                         upstream_patch_path, start=self.settings.files_dir
-                     )
-                 )
-         # TODO: A customer saw `ValueError: Reference at 'refs/remotes/origin/foo'
-         # does not exist` so we now catch ValueError. Catching this error feels
-         # too generic.
-         except (
-             ValueError,
-             subprocess.CalledProcessError,
-             subprocess.TimeoutExpired,
-         ):
-             logger.exception("Error generating diff.")
-         logger.debug("Saving git patches done")
-
-     def _probe_git(self, data: Dict[str, Any]) -> Dict[str, Any]:
-         if self.settings.disable_git or self.settings.x_disable_machine_info:
-             return data
-
-         # in case of manually passing the git repo info, `enabled` would be False,
-         # but we still want to save the git repo info
-         if not self.git.enabled and self.git.auto:
-             return data
-
-         logger.debug("Probing git")
-
-         data["git"] = {
-             "remote": self.git.remote_url,
-             "commit": self.git.last_commit,
-         }
-         data["email"] = self.git.email
-         data["root"] = self.git.root or data.get("root") or os.getcwd()
-         logger.debug("Probing git done")
-
-         return data
-
-     def probe(self) -> Dict[str, Any]:
-         """Probe the system for information about the current environment."""
-         # todo: refactor this quality code 🤮🤮🤮🤮🤮
-         logger.debug("Probing system")
-         data: Dict[str, Any] = dict()
-
-         data["os"] = self.settings._os
-         data["python"] = self.settings._python
-         data["heartbeatAt"] = datetime.datetime.utcnow().isoformat()
-         data["startedAt"] = (
-             datetime.datetime.utcfromtimestamp(self.settings.x_start_time).isoformat()
-             if self.settings.x_start_time
-             else None
-         )
-
-         data["docker"] = self.settings.docker
-
-         data["args"] = tuple(self.settings._args or ())
-         data["state"] = "running"
-
-         if self.settings.program is not None:
-             data["program"] = self.settings.program
-             # Used during artifact-job creation, always points to the relpath
-             # of code execution, even when in a git repo
-             data["codePathLocal"] = self.settings._code_path_local
-         if not (self.settings.disable_code or self.settings.x_disable_machine_info):
-             if self.settings.program_relpath:
-                 data["codePath"] = self.settings.program_relpath
-             elif self.settings._jupyter:
-                 if self.settings.notebook_name:
-                     data["program"] = self.settings.notebook_name
-                 elif self.settings.x_jupyter_path:
-                     if self.settings.x_jupyter_path.startswith("fileId="):
-                         unescaped = unquote(self.settings.x_jupyter_path)
-                         data["colab"] = (
-                             "https://colab.research.google.com/notebook#" + unescaped
-                         )
-                         data["program"] = self.settings.x_jupyter_name
-                     else:
-                         data["program"] = self.settings.x_jupyter_path
-                 data["root"] = self.settings.x_jupyter_root
-             # get the git repo info
-             data = self._probe_git(data)
-
-         if self.settings.anonymous not in ["allow", "must"]:
-             data["host"] = self.settings.host
-             data["username"] = self.settings.username
-             data["executable"] = sys.executable
-         else:
-             data.pop("email", None)
-             data.pop("root", None)
-
-         logger.debug("Probing system done")
-
-         return data
-
-     def _save_conda(self) -> None:
-         current_shell_is_conda = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
-         if not current_shell_is_conda:
-             return None
-
-         logger.debug(
-             "Saving list of conda packages installed into the current environment"
-         )
-         try:
-             with open(
-                 os.path.join(self.settings.files_dir, CONDA_ENVIRONMENTS_FNAME), "w"
-             ) as f:
-                 subprocess.call(
-                     ["conda", "env", "export"],
-                     stdout=f,
-                     stderr=subprocess.DEVNULL,
-                     timeout=15,  # add timeout since conda env export could take a really long time
-                 )
-         except Exception:
-             logger.exception("Error saving conda packages")
-         logger.debug("Saving conda packages done")
-
-     def publish(self, system_info: dict) -> None:
-         # save pip, conda, code patches to disk
-         if self.settings.x_save_requirements:
-             self._save_conda()
-         if self.settings.save_code:
-             self._save_code()
-             self._save_patches()
-
-         # save system_info to disk
-         with open(self.metadata_file_name, "w") as f:
-             s = json.dumps(system_info, indent=4)
-             f.write(s)
-             f.write("\n")
-         base_name = os.path.basename(self.metadata_file_name)
-         files = dict(files=[(base_name, "now")])
-
-         if self.saved_program:
-             saved_program = os.path.join("code", self.saved_program)
-             files["files"].append((glob.escape(saved_program), "now"))
-         for patch in self.saved_patches:
-             files["files"].append((glob.escape(patch), "now"))
-
-         # publish files to the backend
-         self.backend_interface.publish_files(files)  # type: ignore
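The deleted `SystemInfo.publish` serialized the `probe()` result into a metadata file in the run's files directory before handing it to the backend. A minimal standalone sketch of that write, assuming `METADATA_FNAME` resolves to `wandb-metadata.json` and substituting `platform` calls for the settings-derived fields:

import datetime
import json
import platform
import sys

# Stand-in for SystemInfo.probe(); the real fields come from run settings.
metadata = {
    "os": platform.platform(),
    "python": platform.python_version(),
    "heartbeatAt": datetime.datetime.utcnow().isoformat(),
    "state": "running",
    "executable": sys.executable,
}

# Equivalent of the json.dumps + trailing newline in publish() above.
with open("wandb-metadata.json", "w") as f:
    json.dump(metadata, f, indent=4)
    f.write("\n")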