matyan-client 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matyan_client-0.1.0/PKG-INFO +92 -0
- matyan_client-0.1.0/README.md +0 -0
- matyan_client-0.1.0/pyproject.toml +129 -0
- matyan_client-0.1.0/setup.cfg +4 -0
- matyan_client-0.1.0/src/matyan_client/__init__.py +15 -0
- matyan_client-0.1.0/src/matyan_client/_blob_uploader.py +94 -0
- matyan_client-0.1.0/src/matyan_client/_cache.py +43 -0
- matyan_client-0.1.0/src/matyan_client/_resource_tracker.py +309 -0
- matyan_client-0.1.0/src/matyan_client/_system_params.py +94 -0
- matyan_client-0.1.0/src/matyan_client/_types.py +9 -0
- matyan_client-0.1.0/src/matyan_client/adapters/__init__.py +24 -0
- matyan_client-0.1.0/src/matyan_client/adapters/_exception_resistant.py +89 -0
- matyan_client-0.1.0/src/matyan_client/adapters/_keras_mixins.py +55 -0
- matyan_client-0.1.0/src/matyan_client/adapters/_utils.py +33 -0
- matyan_client-0.1.0/src/matyan_client/adapters/acme.py +101 -0
- matyan_client-0.1.0/src/matyan_client/adapters/catboost.py +99 -0
- matyan_client-0.1.0/src/matyan_client/adapters/distributed_hugging_face.py +325 -0
- matyan_client-0.1.0/src/matyan_client/adapters/fastai.py +172 -0
- matyan_client-0.1.0/src/matyan_client/adapters/hugging_face.py +184 -0
- matyan_client-0.1.0/src/matyan_client/adapters/keras.py +59 -0
- matyan_client-0.1.0/src/matyan_client/adapters/keras_tuner.py +80 -0
- matyan_client-0.1.0/src/matyan_client/adapters/lightgbm.py +99 -0
- matyan_client-0.1.0/src/matyan_client/adapters/mxnet.py +232 -0
- matyan_client-0.1.0/src/matyan_client/adapters/optuna.py +144 -0
- matyan_client-0.1.0/src/matyan_client/adapters/paddle.py +95 -0
- matyan_client-0.1.0/src/matyan_client/adapters/prophet.py +99 -0
- matyan_client-0.1.0/src/matyan_client/adapters/pytorch.py +56 -0
- matyan_client-0.1.0/src/matyan_client/adapters/pytorch_ignite.py +179 -0
- matyan_client-0.1.0/src/matyan_client/adapters/pytorch_lightning.py +205 -0
- matyan_client-0.1.0/src/matyan_client/adapters/sb3.py +108 -0
- matyan_client-0.1.0/src/matyan_client/adapters/tensorflow.py +60 -0
- matyan_client-0.1.0/src/matyan_client/adapters/xgboost.py +103 -0
- matyan_client-0.1.0/src/matyan_client/config.py +20 -0
- matyan_client-0.1.0/src/matyan_client/objects/__init__.py +7 -0
- matyan_client-0.1.0/src/matyan_client/objects/audio.py +112 -0
- matyan_client-0.1.0/src/matyan_client/objects/distribution.py +98 -0
- matyan_client-0.1.0/src/matyan_client/objects/figure.py +66 -0
- matyan_client-0.1.0/src/matyan_client/objects/image.py +114 -0
- matyan_client-0.1.0/src/matyan_client/objects/text.py +24 -0
- matyan_client-0.1.0/src/matyan_client/repo.py +222 -0
- matyan_client-0.1.0/src/matyan_client/run.py +555 -0
- matyan_client-0.1.0/src/matyan_client/transport/__init__.py +4 -0
- matyan_client-0.1.0/src/matyan_client/transport/http.py +318 -0
- matyan_client-0.1.0/src/matyan_client/transport/ws.py +456 -0
- matyan_client-0.1.0/src/matyan_client.egg-info/PKG-INFO +92 -0
- matyan_client-0.1.0/src/matyan_client.egg-info/SOURCES.txt +57 -0
- matyan_client-0.1.0/src/matyan_client.egg-info/dependency_links.txt +1 -0
- matyan_client-0.1.0/src/matyan_client.egg-info/requires.txt +114 -0
- matyan_client-0.1.0/src/matyan_client.egg-info/top_level.txt +1 -0
- matyan_client-0.1.0/tests/test_blob_uploader.py +151 -0
- matyan_client-0.1.0/tests/test_cache.py +94 -0
- matyan_client-0.1.0/tests/test_config.py +34 -0
- matyan_client-0.1.0/tests/test_http_transport.py +514 -0
- matyan_client-0.1.0/tests/test_repo.py +308 -0
- matyan_client-0.1.0/tests/test_resource_tracker.py +505 -0
- matyan_client-0.1.0/tests/test_run.py +794 -0
- matyan_client-0.1.0/tests/test_system_params.py +158 -0
- matyan_client-0.1.0/tests/test_types.py +17 -0
- matyan_client-0.1.0/tests/test_ws_transport.py +657 -0
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: matyan-client
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Matyan SDK — Aim-compatible client for experiment tracking
|
|
5
|
+
Author-email: Tigran Grigoryan <grigoryan.tigran119@gmail.com>
|
|
6
|
+
Requires-Python: <4,>=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: httpx~=0.28.0
|
|
9
|
+
Requires-Dist: loguru~=0.7.0
|
|
10
|
+
Requires-Dist: matyan-api-models~=0.1.0
|
|
11
|
+
Requires-Dist: psutil>=5.9
|
|
12
|
+
Requires-Dist: pydantic~=2.0
|
|
13
|
+
Requires-Dist: pydantic-settings~=2.0
|
|
14
|
+
Requires-Dist: websockets~=15.0
|
|
15
|
+
Provides-Extra: image
|
|
16
|
+
Requires-Dist: Pillow>=9.0; extra == "image"
|
|
17
|
+
Requires-Dist: numpy>=1.21; extra == "image"
|
|
18
|
+
Provides-Extra: audio
|
|
19
|
+
Requires-Dist: numpy>=1.21; extra == "audio"
|
|
20
|
+
Provides-Extra: figure
|
|
21
|
+
Requires-Dist: plotly>=5.0; extra == "figure"
|
|
22
|
+
Provides-Extra: matplotlib
|
|
23
|
+
Requires-Dist: matplotlib>=3.5; extra == "matplotlib"
|
|
24
|
+
Requires-Dist: plotly>=5.0; extra == "matplotlib"
|
|
25
|
+
Provides-Extra: numpy
|
|
26
|
+
Requires-Dist: numpy>=1.21; extra == "numpy"
|
|
27
|
+
Provides-Extra: msgpack
|
|
28
|
+
Requires-Dist: msgpack>=1.0; extra == "msgpack"
|
|
29
|
+
Provides-Extra: gpu
|
|
30
|
+
Requires-Dist: nvidia-ml-py>=11.0; extra == "gpu"
|
|
31
|
+
Provides-Extra: keras
|
|
32
|
+
Requires-Dist: keras>=2.0; extra == "keras"
|
|
33
|
+
Provides-Extra: tensorflow
|
|
34
|
+
Requires-Dist: tensorflow>=2.0; extra == "tensorflow"
|
|
35
|
+
Provides-Extra: pytorch
|
|
36
|
+
Requires-Dist: torch>=1.9; extra == "pytorch"
|
|
37
|
+
Provides-Extra: pytorch-lightning
|
|
38
|
+
Requires-Dist: lightning>=2.0; python_version >= "3.10" and extra == "pytorch-lightning"
|
|
39
|
+
Requires-Dist: omegaconf>=2.0; extra == "pytorch-lightning"
|
|
40
|
+
Provides-Extra: pytorch-ignite
|
|
41
|
+
Requires-Dist: pytorch-ignite>=0.4; extra == "pytorch-ignite"
|
|
42
|
+
Requires-Dist: omegaconf>=2.0; extra == "pytorch-ignite"
|
|
43
|
+
Provides-Extra: hugging-face
|
|
44
|
+
Requires-Dist: transformers>=4.0; extra == "hugging-face"
|
|
45
|
+
Provides-Extra: distributed-hugging-face
|
|
46
|
+
Requires-Dist: transformers>=4.0; extra == "distributed-hugging-face"
|
|
47
|
+
Requires-Dist: accelerate>=0.20; extra == "distributed-hugging-face"
|
|
48
|
+
Provides-Extra: xgboost
|
|
49
|
+
Requires-Dist: xgboost>=1.0; extra == "xgboost"
|
|
50
|
+
Provides-Extra: lightgbm
|
|
51
|
+
Requires-Dist: lightgbm>=3.0; extra == "lightgbm"
|
|
52
|
+
Provides-Extra: catboost
|
|
53
|
+
Requires-Dist: catboost>=1.0; extra == "catboost"
|
|
54
|
+
Provides-Extra: optuna
|
|
55
|
+
Requires-Dist: optuna>=3.0; extra == "optuna"
|
|
56
|
+
Provides-Extra: keras-tuner
|
|
57
|
+
Requires-Dist: keras-tuner>=1.0; extra == "keras-tuner"
|
|
58
|
+
Provides-Extra: prophet
|
|
59
|
+
Requires-Dist: prophet>=1.0; extra == "prophet"
|
|
60
|
+
Provides-Extra: sb3
|
|
61
|
+
Requires-Dist: stable-baselines3>=1.0; extra == "sb3"
|
|
62
|
+
Provides-Extra: fastai
|
|
63
|
+
Requires-Dist: fastai>=2.0; extra == "fastai"
|
|
64
|
+
Requires-Dist: ipython; extra == "fastai"
|
|
65
|
+
Provides-Extra: mxnet
|
|
66
|
+
Requires-Dist: mxnet>=1.9; extra == "mxnet"
|
|
67
|
+
Provides-Extra: adapters-all
|
|
68
|
+
Requires-Dist: keras>=2.0; extra == "adapters-all"
|
|
69
|
+
Requires-Dist: tensorflow>=2.0; extra == "adapters-all"
|
|
70
|
+
Requires-Dist: torch>=1.9; extra == "adapters-all"
|
|
71
|
+
Requires-Dist: lightning>=2.0; python_version >= "3.10" and extra == "adapters-all"
|
|
72
|
+
Requires-Dist: pytorch-ignite>=0.4; extra == "adapters-all"
|
|
73
|
+
Requires-Dist: omegaconf>=2.0; extra == "adapters-all"
|
|
74
|
+
Requires-Dist: transformers>=4.0; extra == "adapters-all"
|
|
75
|
+
Requires-Dist: accelerate>=0.20; extra == "adapters-all"
|
|
76
|
+
Requires-Dist: xgboost>=1.0; extra == "adapters-all"
|
|
77
|
+
Requires-Dist: lightgbm>=3.0; extra == "adapters-all"
|
|
78
|
+
Requires-Dist: catboost>=1.0; extra == "adapters-all"
|
|
79
|
+
Requires-Dist: optuna>=3.0; extra == "adapters-all"
|
|
80
|
+
Requires-Dist: keras-tuner>=1.0; extra == "adapters-all"
|
|
81
|
+
Requires-Dist: prophet>=1.0; extra == "adapters-all"
|
|
82
|
+
Requires-Dist: stable-baselines3>=1.0; extra == "adapters-all"
|
|
83
|
+
Requires-Dist: fastai>=2.0; extra == "adapters-all"
|
|
84
|
+
Requires-Dist: ipython; extra == "adapters-all"
|
|
85
|
+
Requires-Dist: mxnet>=1.9; extra == "adapters-all"
|
|
86
|
+
Provides-Extra: extended
|
|
87
|
+
Requires-Dist: Pillow>=9.0; extra == "extended"
|
|
88
|
+
Requires-Dist: numpy>=1.21; extra == "extended"
|
|
89
|
+
Requires-Dist: plotly>=5.0; extra == "extended"
|
|
90
|
+
Requires-Dist: matplotlib>=3.5; extra == "extended"
|
|
91
|
+
Requires-Dist: msgpack>=1.0; extra == "extended"
|
|
92
|
+
Requires-Dist: nvidia-ml-py>=11.0; extra == "extended"
|
|
File without changes
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "matyan-client"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Matyan SDK — Aim-compatible client for experiment tracking"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{ name = "Tigran Grigoryan", email = "grigoryan.tigran119@gmail.com" }
|
|
12
|
+
]
|
|
13
|
+
requires-python = ">=3.10, <4"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"httpx~=0.28.0",
|
|
16
|
+
"loguru~=0.7.0",
|
|
17
|
+
"matyan-api-models~=0.1.0",
|
|
18
|
+
"psutil>=5.9",
|
|
19
|
+
"pydantic~=2.0",
|
|
20
|
+
"pydantic-settings~=2.0",
|
|
21
|
+
"websockets~=15.0",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
image = ["Pillow>=9.0", "numpy>=1.21"]
|
|
26
|
+
audio = ["numpy>=1.21"]
|
|
27
|
+
figure = ["plotly>=5.0"]
|
|
28
|
+
matplotlib = ["matplotlib>=3.5", "plotly>=5.0"]
|
|
29
|
+
numpy = ["numpy>=1.21"]
|
|
30
|
+
msgpack = ["msgpack>=1.0"]
|
|
31
|
+
gpu = ["nvidia-ml-py>=11.0"]
|
|
32
|
+
# -- adapter framework extras --
|
|
33
|
+
keras = ["keras>=2.0"]
|
|
34
|
+
tensorflow = ["tensorflow>=2.0"]
|
|
35
|
+
pytorch = ["torch>=1.9"]
|
|
36
|
+
pytorch-lightning = ["lightning>=2.0; python_version>='3.10'", "omegaconf>=2.0"]
|
|
37
|
+
pytorch-ignite = ["pytorch-ignite>=0.4", "omegaconf>=2.0"]
|
|
38
|
+
hugging-face = ["transformers>=4.0"]
|
|
39
|
+
distributed-hugging-face = ["transformers>=4.0", "accelerate>=0.20"]
|
|
40
|
+
xgboost = ["xgboost>=1.0"]
|
|
41
|
+
lightgbm = ["lightgbm>=3.0"]
|
|
42
|
+
catboost = ["catboost>=1.0"]
|
|
43
|
+
optuna = ["optuna>=3.0"]
|
|
44
|
+
keras-tuner = ["keras-tuner>=1.0"]
|
|
45
|
+
prophet = ["prophet>=1.0"]
|
|
46
|
+
sb3 = ["stable-baselines3>=1.0"]
|
|
47
|
+
# acme = ["dm-acme>=0.4"]
|
|
48
|
+
fastai = ["fastai>=2.0", "ipython"]
|
|
49
|
+
# paddle = ["paddlepaddle>=2.0"]
|
|
50
|
+
mxnet = ["mxnet>=1.9"]
|
|
51
|
+
adapters-all = [
|
|
52
|
+
"keras>=2.0",
|
|
53
|
+
"tensorflow>=2.0",
|
|
54
|
+
"torch>=1.9",
|
|
55
|
+
"lightning>=2.0; python_version>='3.10'",
|
|
56
|
+
"pytorch-ignite>=0.4",
|
|
57
|
+
"omegaconf>=2.0",
|
|
58
|
+
"transformers>=4.0",
|
|
59
|
+
"accelerate>=0.20",
|
|
60
|
+
"xgboost>=1.0",
|
|
61
|
+
"lightgbm>=3.0",
|
|
62
|
+
"catboost>=1.0",
|
|
63
|
+
"optuna>=3.0",
|
|
64
|
+
"keras-tuner>=1.0",
|
|
65
|
+
"prophet>=1.0",
|
|
66
|
+
"stable-baselines3>=1.0",
|
|
67
|
+
# "dm-acme>=0.4",
|
|
68
|
+
"fastai>=2.0",
|
|
69
|
+
"ipython",
|
|
70
|
+
# "paddlepaddle>=2.0",
|
|
71
|
+
"mxnet>=1.9",
|
|
72
|
+
]
|
|
73
|
+
extended = [
|
|
74
|
+
"Pillow>=9.0",
|
|
75
|
+
"numpy>=1.21",
|
|
76
|
+
"plotly>=5.0",
|
|
77
|
+
"matplotlib>=3.5",
|
|
78
|
+
"msgpack>=1.0",
|
|
79
|
+
"nvidia-ml-py>=11.0",
|
|
80
|
+
]
|
|
81
|
+
|
|
82
|
+
[dependency-groups]
|
|
83
|
+
dev = [
|
|
84
|
+
"matyan-api-models",
|
|
85
|
+
"mypy~=1.15",
|
|
86
|
+
"pytest~=8.0",
|
|
87
|
+
"pytest-cov~=6.0",
|
|
88
|
+
"pytest-asyncio~=0.25",
|
|
89
|
+
"respx~=0.22",
|
|
90
|
+
"Pillow>=9.0",
|
|
91
|
+
"numpy>=1.21",
|
|
92
|
+
"plotly>=5.0",
|
|
93
|
+
"datasets~=4.6",
|
|
94
|
+
]
|
|
95
|
+
|
|
96
|
+
[tool.ruff]
|
|
97
|
+
line-length = 120
|
|
98
|
+
target-version = "py310"
|
|
99
|
+
exclude = ["*.ipynb"]
|
|
100
|
+
|
|
101
|
+
[tool.ruff.lint]
|
|
102
|
+
select = ["ALL"]
|
|
103
|
+
ignore = [
|
|
104
|
+
"FBT001",
|
|
105
|
+
"FBT002",
|
|
106
|
+
"D100",
|
|
107
|
+
"D101",
|
|
108
|
+
"D102",
|
|
109
|
+
"D103",
|
|
110
|
+
"D104",
|
|
111
|
+
"D105",
|
|
112
|
+
"D106",
|
|
113
|
+
"D107",
|
|
114
|
+
"D203",
|
|
115
|
+
"D213",
|
|
116
|
+
"D417",
|
|
117
|
+
"FBT003",
|
|
118
|
+
"EXE002",
|
|
119
|
+
"S311",
|
|
120
|
+
"TD003",
|
|
121
|
+
"D205",
|
|
122
|
+
"PLR2004",
|
|
123
|
+
"PLR0913",
|
|
124
|
+
"S101",
|
|
125
|
+
"S104"
|
|
126
|
+
]
|
|
127
|
+
|
|
128
|
+
[tool.uv.sources]
|
|
129
|
+
matyan-api-models = { path = "../matyan-api-models", editable = true }
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Matyan Client SDK — Aim-compatible experiment tracking."""
|
|
2
|
+
|
|
3
|
+
from .objects import Audio, Distribution, Figure, Image, Text
|
|
4
|
+
from .repo import Repo
|
|
5
|
+
from .run import Run
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"Audio",
|
|
9
|
+
"Distribution",
|
|
10
|
+
"Figure",
|
|
11
|
+
"Image",
|
|
12
|
+
"Repo",
|
|
13
|
+
"Run",
|
|
14
|
+
"Text",
|
|
15
|
+
]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Background blob uploader using a thread pool.
|
|
2
|
+
|
|
3
|
+
Presigned-URL acquisition and S3 PUT uploads run in worker threads so that
|
|
4
|
+
``run.track(Image(...))`` returns immediately without blocking the training
|
|
5
|
+
loop.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from concurrent.futures import Future, ThreadPoolExecutor
|
|
11
|
+
from typing import TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
import httpx
|
|
14
|
+
from loguru import logger
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from .transport.http import HttpTransport
|
|
18
|
+
|
|
19
|
+
_MAX_WORKERS = 4
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class BlobUploader:
    """Manages background S3 blob uploads via presigned URLs.

    ``submit()`` returns a deterministic S3 key immediately; the presign
    request and the actual PUT run on a small worker pool so callers
    (e.g. ``run.track(Image(...))``) never block on network I/O.
    """

    def __init__(
        self,
        http: HttpTransport,
        frontier_url: str,
        run_id: str,
    ) -> None:
        """Store transport/identifiers and spin up the upload worker pool."""
        self._http = http
        self._frontier_url = frontier_url
        self._run_id = run_id
        # Uploads are I/O-bound, so a handful of threads is plenty.
        self._pool = ThreadPoolExecutor(max_workers=_MAX_WORKERS, thread_name_prefix="blob-upload")
        self._futures: list[Future[None]] = []

    def submit(
        self,
        raw_bytes: bytes,
        artifact_path: str,
        content_type: str,
    ) -> str:
        """Submit a blob upload and return the deterministic ``s3_key`` immediately."""
        s3_key = f"{self._run_id}/{artifact_path}"
        fut = self._pool.submit(self._upload, raw_bytes, artifact_path, content_type, s3_key)
        self._futures.append(fut)
        return s3_key

    def _upload(self, raw_bytes: bytes, artifact_path: str, content_type: str, s3_key: str) -> None:
        """Worker-thread body: presign, PUT the bytes, propagate HTTP failures."""
        try:
            resp = self._http.presign_artifact(
                self._frontier_url,
                self._run_id,
                artifact_path,
                content_type=content_type,
            )
            upload_resp = httpx.put(
                resp["upload_url"],
                content=raw_bytes,
                headers={"Content-Type": content_type},
                timeout=120,
            )
            upload_resp.raise_for_status()
        except httpx.HTTPError:
            logger.exception("Background blob upload failed", s3_key=s3_key)
            # Re-raise so the failure is recorded on the future and counted by
            # drain(). Previously the error was swallowed here, which made
            # drain()'s httpx.HTTPError branch unreachable and its failure
            # count always miss HTTP errors.
            raise

    def drain(self, timeout: float = 60.0) -> int:
        """Wait for all pending uploads to finish. Returns the number of failures.

        ``timeout`` applies per pending future, not to the drain as a whole.
        """
        # On Python 3.10 (this package's floor), Future.result raises
        # concurrent.futures.TimeoutError, which is NOT the builtin
        # TimeoutError — they were only unified in Python 3.11. Catch both so
        # a timeout is reported as a timeout on every supported version.
        from concurrent.futures import TimeoutError as _FuturesTimeoutError  # noqa: PLC0415

        failures = 0
        for fut in self._futures:
            try:
                fut.result(timeout=timeout)
            except (TimeoutError, _FuturesTimeoutError):  # noqa: PERF203
                failures += 1
                logger.warning("Blob upload timed out after {:.0f}s", timeout)
            except httpx.HTTPError:
                failures += 1
                logger.exception("Blob upload failed with HTTP error")
            except Exception:  # noqa: BLE001
                failures += 1
                logger.exception("Blob upload failed with unexpected error")
        self._futures.clear()
        return failures

    @property
    def pending(self) -> int:
        """Number of submitted uploads that have not completed yet."""
        return sum(1 for f in self._futures if not f.done())

    def shutdown(self, timeout: float = 60.0) -> None:
        """Drain pending uploads and shut down the thread pool."""
        failures = self.drain(timeout=timeout)
        if failures:
            logger.warning("{} blob upload(s) failed during shutdown", failures)
        self._pool.shutdown(wait=False)
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Client-side write cache for read-after-write consistency.
|
|
2
|
+
|
|
3
|
+
All writes (track, __setitem__, property setters) are stored locally so
|
|
4
|
+
that subsequent reads return the latest value even though ingestion to
|
|
5
|
+
FoundationDB is asynchronous.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import threading
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class WriteCache:
|
|
15
|
+
"""Thread-safe in-memory cache keyed by ``(run_hash, dotted_path)``."""
|
|
16
|
+
|
|
17
|
+
def __init__(self) -> None:
|
|
18
|
+
self._data: dict[tuple[str, str], Any] = {}
|
|
19
|
+
self._lock = threading.Lock()
|
|
20
|
+
|
|
21
|
+
def set(self, run_hash: str, path: str, value: Any) -> None: # noqa: ANN401
|
|
22
|
+
with self._lock:
|
|
23
|
+
self._data[(run_hash, path)] = value
|
|
24
|
+
|
|
25
|
+
def get(self, run_hash: str, path: str, default: Any = None) -> Any: # noqa: ANN401
|
|
26
|
+
with self._lock:
|
|
27
|
+
return self._data.get((run_hash, path), default)
|
|
28
|
+
|
|
29
|
+
def has(self, run_hash: str, path: str) -> bool:
|
|
30
|
+
with self._lock:
|
|
31
|
+
return (run_hash, path) in self._data
|
|
32
|
+
|
|
33
|
+
def get_tree(self, run_hash: str) -> dict[str, Any]:
|
|
34
|
+
"""Return all cached key-value pairs for *run_hash* as a flat dict."""
|
|
35
|
+
with self._lock:
|
|
36
|
+
return {path: val for (rh, path), val in self._data.items() if rh == run_hash}
|
|
37
|
+
|
|
38
|
+
def clear(self, run_hash: str | None = None) -> None:
|
|
39
|
+
with self._lock:
|
|
40
|
+
if run_hash is None:
|
|
41
|
+
self._data.clear()
|
|
42
|
+
else:
|
|
43
|
+
self._data = {k: v for k, v in self._data.items() if k[0] != run_hash}
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
"""Background resource tracker for system metrics and terminal log capture.
|
|
2
|
+
|
|
3
|
+
Adapted from ``aim/aim/ext/resource/tracker.py``. Runs a single daemon
|
|
4
|
+
thread that periodically collects CPU/memory/disk/GPU stats and (optionally)
|
|
5
|
+
captures ``sys.stdout`` / ``sys.stderr`` output.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import io
|
|
11
|
+
import re
|
|
12
|
+
import sys
|
|
13
|
+
import time
|
|
14
|
+
from threading import Event, Thread
|
|
15
|
+
from typing import TYPE_CHECKING
|
|
16
|
+
from weakref import WeakValueDictionary
|
|
17
|
+
|
|
18
|
+
from loguru import logger
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from collections.abc import Callable
|
|
22
|
+
|
|
23
|
+
METRIC_PREFIX = "__system__"
|
|
24
|
+
|
|
25
|
+
STAT_INTERVAL_MIN = 0.1
|
|
26
|
+
STAT_INTERVAL_MAX = 86400.0 # 24 hours
|
|
27
|
+
_LOG_CAPTURE_INTERVAL = 1.0 # seconds
|
|
28
|
+
_TICK = 0.1 # main-loop sleep granularity
|
|
29
|
+
|
|
30
|
+
_ANSI_CSI_RE = re.compile(rb"\001?\033\[((?:\d|;)*)([a-dA-D])\002?")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ResourceTracker:
    """Collects system resource metrics and/or terminal logs in the background.

    A single daemon thread ticks every ``_TICK`` seconds and, on separate
    cadences, samples CPU/memory/disk/GPU stats and flushes captured
    stdout/stderr lines. Log capture works by monkey-patching
    ``sys.stdout.write`` / ``sys.stderr.write`` once per process (class
    level) and fanning each write out to every live tracker's byte buffer.

    Parameters
    ----------
    track_fn:
        ``(value, name, context) -> None`` — called for each metric sample.
    interval:
        Seconds between system-stat collections. ``None`` disables stats.
    capture_logs:
        If ``True``, intercept ``sys.stdout`` / ``sys.stderr``.
    send_terminal_line_fn:
        ``(line, step) -> None`` — called for each captured terminal line.
    log_offset:
        Starting step number for terminal lines (for resume).

    """

    # Maps id(tracker) -> that tracker's capture buffer. Weak values so a
    # dropped tracker's buffer disappears without explicit deregistration.
    _buffer_registry: WeakValueDictionary[int, io.BytesIO] = WeakValueDictionary()
    # Originals saved so the patches can be undone by _uninstall_stream_patches.
    _old_out_write: Callable | None = None
    _old_err_write: Callable | None = None
    _patches_installed: bool = False

    # ------------------------------------------------------------------
    # Class-level stdout/stderr patching
    # ------------------------------------------------------------------

    @classmethod
    def _install_stream_patches(cls) -> None:
        # Idempotent: installing twice would capture our own wrapper as the
        # "old" write and recurse on every print.
        if cls._patches_installed:
            return
        cls._old_out_write = sys.stdout.write
        cls._old_err_write = sys.stderr.write

        def _new_out_write(data: str) -> int:
            # Delegate to the real write first so user output is never lost,
            # then tee the bytes into every registered tracker buffer.
            result = cls._old_out_write(data)  # type: ignore[misc]
            raw = data.encode() if isinstance(data, str) else data
            for buf in cls._buffer_registry.values():
                buf.write(raw)
            return result

        def _new_err_write(data: str) -> int:
            result = cls._old_err_write(data)  # type: ignore[misc]
            raw = data.encode() if isinstance(data, str) else data
            for buf in cls._buffer_registry.values():
                buf.write(raw)
            return result

        sys.stdout.write = _new_out_write  # type: ignore[assignment]
        sys.stderr.write = _new_err_write  # type: ignore[assignment]
        cls._patches_installed = True

    @classmethod
    def _uninstall_stream_patches(cls) -> None:
        if not cls._patches_installed:
            return
        if cls._old_out_write is not None:
            sys.stdout.write = cls._old_out_write  # type: ignore[assignment]
        if cls._old_err_write is not None:
            sys.stderr.write = cls._old_err_write  # type: ignore[assignment]
        cls._patches_installed = False

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------

    @classmethod
    def check_interval(cls, interval: float | None, *, warn: bool = True) -> bool:
        """Return ``True`` if *interval* is a valid stat-collection interval."""
        # None is the documented "disabled" value, so it is invalid but silent.
        if interval is None:
            return False
        if not isinstance(interval, (int, float)) or not (STAT_INTERVAL_MIN <= interval <= STAT_INTERVAL_MAX):
            if warn:
                logger.warning(
                    "system_tracking_interval must be between {} and {} seconds",
                    STAT_INTERVAL_MIN,
                    STAT_INTERVAL_MAX,
                )
            return False
        return True

    # ------------------------------------------------------------------
    # Init / lifecycle
    # ------------------------------------------------------------------

    def __init__(
        self,
        *,
        track_fn: Callable[[float, str, dict | None], None],
        interval: float | None = None,
        capture_logs: bool = False,
        send_terminal_line_fn: Callable[[str, int], None] | None = None,
        log_offset: int = 0,
    ) -> None:
        self._track_fn = track_fn
        # warn=False: an out-of-range interval silently disables stats here;
        # callers are expected to have validated (and warned) already.
        self._stat_interval: float | None = interval if self.check_interval(interval, warn=False) else None
        self._capture_logs = capture_logs
        self._send_line_fn = send_terminal_line_fn
        # Step number assigned to the next flushed terminal line.
        self._line_counter = log_offset

        # Per-instance capture buffer, registered in _buffer_registry on start().
        self._io_buffer = io.BytesIO()
        self._shutdown = Event()
        self._started = False
        self._thread: Thread | None = None

        self._process = None
        if self._stat_interval is not None:
            try:
                import psutil  # noqa: PLC0415

                self._process = psutil.Process()
                # Prime psutil's CPU counters: the first cpu_percent() call
                # after this returns a meaningful delta instead of 0.0.
                psutil.cpu_percent(0.0)
            except Exception:  # noqa: BLE001
                logger.debug("psutil unavailable; system stats disabled")
                self._stat_interval = None

    def start(self) -> None:
        """Start the background collection thread (no-op if nothing to collect)."""
        if self._started:
            return
        if self._stat_interval is None and not self._capture_logs:
            return
        self._started = True
        if self._capture_logs:
            # First live tracker installs the process-wide stream patches.
            if not self._buffer_registry:
                self._install_stream_patches()
            self._buffer_registry[id(self)] = self._io_buffer
        self._thread = Thread(target=self._loop, daemon=True, name="matyan-resource-tracker")
        self._thread.start()

    def stop(self) -> None:
        """Stop the thread, flush remaining logs, and unpatch streams if last."""
        if not self._started:
            return
        self._shutdown.set()
        if self._thread is not None:
            self._thread.join(timeout=5)
        if self._capture_logs:
            # Final flush so output written after the last tick is not lost.
            self._flush_logs()
            self._buffer_registry.pop(id(self), None)
            # Last live tracker restores the original stdout/stderr writes.
            if not self._buffer_registry:
                self._uninstall_stream_patches()
        self._started = False

    # ------------------------------------------------------------------
    # Main collection loop
    # ------------------------------------------------------------------

    def _loop(self) -> None:
        # Two independent cadences (stats vs. log flush) driven off one
        # fine-grained tick so shutdown is noticed within ~_TICK seconds.
        stat_elapsed = 0.0
        log_elapsed = 0.0

        # Emit one immediate sample so short runs still get at least one point.
        if self._stat_interval is not None:
            self._collect_stats()

        while not self._shutdown.is_set():
            time.sleep(_TICK)
            stat_elapsed += _TICK
            log_elapsed += _TICK

            if self._stat_interval is not None and stat_elapsed >= self._stat_interval:
                self._collect_stats()
                stat_elapsed = 0.0

            if self._capture_logs and log_elapsed >= _LOG_CAPTURE_INTERVAL:
                self._flush_logs()
                log_elapsed = 0.0

    # ------------------------------------------------------------------
    # System stats
    # ------------------------------------------------------------------

    def _collect_stats(self) -> None:
        # Each collector is isolated so a GPU failure never blocks CPU stats
        # (and vice versa); errors are debug-logged, never raised.
        try:
            self._collect_system_stats()
        except Exception:  # noqa: BLE001
            logger.debug("Error collecting system stats")
        try:
            self._collect_gpu_stats()
        except Exception:  # noqa: BLE001
            logger.debug("GPU stats collection failed", exc_info=True)

    def _collect_system_stats(self) -> None:
        import psutil  # noqa: PLC0415

        proc = self._process
        if proc is None:
            return

        metrics = {
            "cpu": round(proc.cpu_percent(0.0), 5),
            "p_memory_percent": round(proc.memory_percent(), 5),
            "memory_percent": round(psutil.virtual_memory().percent, 5),
            "disk_percent": round(psutil.disk_usage("/").percent, 5),
        }
        for name, value in metrics.items():
            # Metric names carry the "__system__" prefix; context is None for
            # host-wide metrics (GPU metrics carry a per-device context).
            self._track_fn(value, f"{METRIC_PREFIX}{name}", None)

    def _collect_gpu_stats(self) -> None:
        try:
            import pynvml  # noqa: PLC0415 # pyright: ignore[reportMissingImports]
        except ImportError:
            logger.debug("pynvml unavailable; GPU stats disabled")
            return

        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError:
            logger.debug("NVML initialization failed", exc_info=True)
            return

        try:
            count = pynvml.nvmlDeviceGetCount()
            for idx in range(count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
                ctx = {"gpu": idx}
                # Each query is wrapped individually: a device that cannot
                # report, say, power still contributes its other metrics.
                try:
                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    self._track_fn(round(util.gpu, 5), f"{METRIC_PREFIX}gpu", ctx)
                except pynvml.NVMLError:
                    logger.debug("GPU {} utilization query failed", idx, exc_info=True)
                try:
                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    pct = round(mem.used * 100 / mem.total, 5) if mem.total else 0.0
                    self._track_fn(pct, f"{METRIC_PREFIX}gpu_memory_percent", ctx)
                except pynvml.NVMLError:
                    logger.debug("GPU {} memory query failed", idx, exc_info=True)
                try:
                    temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                    self._track_fn(round(float(temp), 5), f"{METRIC_PREFIX}gpu_temp", ctx)
                except pynvml.NVMLError:
                    logger.debug("GPU {} temperature query failed", idx, exc_info=True)
                try:
                    # nvmlDeviceGetPowerUsage reports milliwatts; convert to watts.
                    power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000
                    self._track_fn(round(power, 5), f"{METRIC_PREFIX}gpu_power_watts", ctx)
                except pynvml.NVMLError:
                    logger.debug("GPU {} power query failed", idx, exc_info=True)
        finally:
            try:
                pynvml.nvmlShutdown()
            except pynvml.NVMLError:
                logger.debug("NVML shutdown failed", exc_info=True)

    # ------------------------------------------------------------------
    # Terminal log capture
    # ------------------------------------------------------------------

    def _flush_logs(self) -> None:
        """Drain the capture buffer and emit complete lines via ``_send_line_fn``."""
        if self._send_line_fn is None:
            return
        # tell() is the write position, i.e. the number of captured bytes.
        buf_size = self._io_buffer.tell()
        if not buf_size:
            return

        self._io_buffer.seek(0)
        data = self._io_buffer.read(buf_size)
        self._io_buffer.seek(0)
        self._io_buffer.truncate()

        lines = data.split(b"\n")
        for line in lines:
            # Strip ANSI cursor-movement sequences, then keep only the text
            # after the last carriage return (progress bars rewrite the line).
            line = self._strip_ansi(line)  # noqa: PLW2901
            line = line.rsplit(b"\r", maxsplit=1)[-1]  # noqa: PLW2901
            try:
                text = line.decode("utf-8", errors="replace")
            except Exception:  # noqa: BLE001
                logger.debug("Failed to decode terminal line", exc_info=True)
                continue
            self._send_line_fn(text, self._line_counter)
            self._line_counter += 1

        # last line without trailing newline stays in counter
        # NOTE(review): the final split element (partial line, or b"" when the
        # data ended in a newline) was already sent above; rewinding the
        # counter makes the next flush re-send that line at the same step —
        # presumably so an in-progress line is overwritten rather than
        # duplicated. Confirm against the server's step semantics.
        self._line_counter -= 1
        if lines[-1] != b"":
            # Re-buffer the incomplete tail so it can be completed next flush.
            self._io_buffer.write(lines[-1])

    @staticmethod
    def _strip_ansi(line: bytes) -> bytes:
        """Remove ANSI CSI cursor-movement escape sequences from *line*."""
        return re.sub(_ANSI_CSI_RE, b"", line)
|