matyan-client 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. matyan_client-0.1.0/PKG-INFO +92 -0
  2. matyan_client-0.1.0/README.md +0 -0
  3. matyan_client-0.1.0/pyproject.toml +129 -0
  4. matyan_client-0.1.0/setup.cfg +4 -0
  5. matyan_client-0.1.0/src/matyan_client/__init__.py +15 -0
  6. matyan_client-0.1.0/src/matyan_client/_blob_uploader.py +94 -0
  7. matyan_client-0.1.0/src/matyan_client/_cache.py +43 -0
  8. matyan_client-0.1.0/src/matyan_client/_resource_tracker.py +309 -0
  9. matyan_client-0.1.0/src/matyan_client/_system_params.py +94 -0
  10. matyan_client-0.1.0/src/matyan_client/_types.py +9 -0
  11. matyan_client-0.1.0/src/matyan_client/adapters/__init__.py +24 -0
  12. matyan_client-0.1.0/src/matyan_client/adapters/_exception_resistant.py +89 -0
  13. matyan_client-0.1.0/src/matyan_client/adapters/_keras_mixins.py +55 -0
  14. matyan_client-0.1.0/src/matyan_client/adapters/_utils.py +33 -0
  15. matyan_client-0.1.0/src/matyan_client/adapters/acme.py +101 -0
  16. matyan_client-0.1.0/src/matyan_client/adapters/catboost.py +99 -0
  17. matyan_client-0.1.0/src/matyan_client/adapters/distributed_hugging_face.py +325 -0
  18. matyan_client-0.1.0/src/matyan_client/adapters/fastai.py +172 -0
  19. matyan_client-0.1.0/src/matyan_client/adapters/hugging_face.py +184 -0
  20. matyan_client-0.1.0/src/matyan_client/adapters/keras.py +59 -0
  21. matyan_client-0.1.0/src/matyan_client/adapters/keras_tuner.py +80 -0
  22. matyan_client-0.1.0/src/matyan_client/adapters/lightgbm.py +99 -0
  23. matyan_client-0.1.0/src/matyan_client/adapters/mxnet.py +232 -0
  24. matyan_client-0.1.0/src/matyan_client/adapters/optuna.py +144 -0
  25. matyan_client-0.1.0/src/matyan_client/adapters/paddle.py +95 -0
  26. matyan_client-0.1.0/src/matyan_client/adapters/prophet.py +99 -0
  27. matyan_client-0.1.0/src/matyan_client/adapters/pytorch.py +56 -0
  28. matyan_client-0.1.0/src/matyan_client/adapters/pytorch_ignite.py +179 -0
  29. matyan_client-0.1.0/src/matyan_client/adapters/pytorch_lightning.py +205 -0
  30. matyan_client-0.1.0/src/matyan_client/adapters/sb3.py +108 -0
  31. matyan_client-0.1.0/src/matyan_client/adapters/tensorflow.py +60 -0
  32. matyan_client-0.1.0/src/matyan_client/adapters/xgboost.py +103 -0
  33. matyan_client-0.1.0/src/matyan_client/config.py +20 -0
  34. matyan_client-0.1.0/src/matyan_client/objects/__init__.py +7 -0
  35. matyan_client-0.1.0/src/matyan_client/objects/audio.py +112 -0
  36. matyan_client-0.1.0/src/matyan_client/objects/distribution.py +98 -0
  37. matyan_client-0.1.0/src/matyan_client/objects/figure.py +66 -0
  38. matyan_client-0.1.0/src/matyan_client/objects/image.py +114 -0
  39. matyan_client-0.1.0/src/matyan_client/objects/text.py +24 -0
  40. matyan_client-0.1.0/src/matyan_client/repo.py +222 -0
  41. matyan_client-0.1.0/src/matyan_client/run.py +555 -0
  42. matyan_client-0.1.0/src/matyan_client/transport/__init__.py +4 -0
  43. matyan_client-0.1.0/src/matyan_client/transport/http.py +318 -0
  44. matyan_client-0.1.0/src/matyan_client/transport/ws.py +456 -0
  45. matyan_client-0.1.0/src/matyan_client.egg-info/PKG-INFO +92 -0
  46. matyan_client-0.1.0/src/matyan_client.egg-info/SOURCES.txt +57 -0
  47. matyan_client-0.1.0/src/matyan_client.egg-info/dependency_links.txt +1 -0
  48. matyan_client-0.1.0/src/matyan_client.egg-info/requires.txt +114 -0
  49. matyan_client-0.1.0/src/matyan_client.egg-info/top_level.txt +1 -0
  50. matyan_client-0.1.0/tests/test_blob_uploader.py +151 -0
  51. matyan_client-0.1.0/tests/test_cache.py +94 -0
  52. matyan_client-0.1.0/tests/test_config.py +34 -0
  53. matyan_client-0.1.0/tests/test_http_transport.py +514 -0
  54. matyan_client-0.1.0/tests/test_repo.py +308 -0
  55. matyan_client-0.1.0/tests/test_resource_tracker.py +505 -0
  56. matyan_client-0.1.0/tests/test_run.py +794 -0
  57. matyan_client-0.1.0/tests/test_system_params.py +158 -0
  58. matyan_client-0.1.0/tests/test_types.py +17 -0
  59. matyan_client-0.1.0/tests/test_ws_transport.py +657 -0
@@ -0,0 +1,92 @@
1
+ Metadata-Version: 2.4
2
+ Name: matyan-client
3
+ Version: 0.1.0
4
+ Summary: Matyan SDK — Aim-compatible client for experiment tracking
5
+ Author-email: Tigran Grigoryan <grigoryan.tigran119@gmail.com>
6
+ Requires-Python: <4,>=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: httpx~=0.28.0
9
+ Requires-Dist: loguru~=0.7.0
10
+ Requires-Dist: matyan-api-models~=0.1.0
11
+ Requires-Dist: psutil>=5.9
12
+ Requires-Dist: pydantic~=2.0
13
+ Requires-Dist: pydantic-settings~=2.0
14
+ Requires-Dist: websockets~=15.0
15
+ Provides-Extra: image
16
+ Requires-Dist: Pillow>=9.0; extra == "image"
17
+ Requires-Dist: numpy>=1.21; extra == "image"
18
+ Provides-Extra: audio
19
+ Requires-Dist: numpy>=1.21; extra == "audio"
20
+ Provides-Extra: figure
21
+ Requires-Dist: plotly>=5.0; extra == "figure"
22
+ Provides-Extra: matplotlib
23
+ Requires-Dist: matplotlib>=3.5; extra == "matplotlib"
24
+ Requires-Dist: plotly>=5.0; extra == "matplotlib"
25
+ Provides-Extra: numpy
26
+ Requires-Dist: numpy>=1.21; extra == "numpy"
27
+ Provides-Extra: msgpack
28
+ Requires-Dist: msgpack>=1.0; extra == "msgpack"
29
+ Provides-Extra: gpu
30
+ Requires-Dist: nvidia-ml-py>=11.0; extra == "gpu"
31
+ Provides-Extra: keras
32
+ Requires-Dist: keras>=2.0; extra == "keras"
33
+ Provides-Extra: tensorflow
34
+ Requires-Dist: tensorflow>=2.0; extra == "tensorflow"
35
+ Provides-Extra: pytorch
36
+ Requires-Dist: torch>=1.9; extra == "pytorch"
37
+ Provides-Extra: pytorch-lightning
38
+ Requires-Dist: lightning>=2.0; python_version >= "3.10" and extra == "pytorch-lightning"
39
+ Requires-Dist: omegaconf>=2.0; extra == "pytorch-lightning"
40
+ Provides-Extra: pytorch-ignite
41
+ Requires-Dist: pytorch-ignite>=0.4; extra == "pytorch-ignite"
42
+ Requires-Dist: omegaconf>=2.0; extra == "pytorch-ignite"
43
+ Provides-Extra: hugging-face
44
+ Requires-Dist: transformers>=4.0; extra == "hugging-face"
45
+ Provides-Extra: distributed-hugging-face
46
+ Requires-Dist: transformers>=4.0; extra == "distributed-hugging-face"
47
+ Requires-Dist: accelerate>=0.20; extra == "distributed-hugging-face"
48
+ Provides-Extra: xgboost
49
+ Requires-Dist: xgboost>=1.0; extra == "xgboost"
50
+ Provides-Extra: lightgbm
51
+ Requires-Dist: lightgbm>=3.0; extra == "lightgbm"
52
+ Provides-Extra: catboost
53
+ Requires-Dist: catboost>=1.0; extra == "catboost"
54
+ Provides-Extra: optuna
55
+ Requires-Dist: optuna>=3.0; extra == "optuna"
56
+ Provides-Extra: keras-tuner
57
+ Requires-Dist: keras-tuner>=1.0; extra == "keras-tuner"
58
+ Provides-Extra: prophet
59
+ Requires-Dist: prophet>=1.0; extra == "prophet"
60
+ Provides-Extra: sb3
61
+ Requires-Dist: stable-baselines3>=1.0; extra == "sb3"
62
+ Provides-Extra: fastai
63
+ Requires-Dist: fastai>=2.0; extra == "fastai"
64
+ Requires-Dist: ipython; extra == "fastai"
65
+ Provides-Extra: mxnet
66
+ Requires-Dist: mxnet>=1.9; extra == "mxnet"
67
+ Provides-Extra: adapters-all
68
+ Requires-Dist: keras>=2.0; extra == "adapters-all"
69
+ Requires-Dist: tensorflow>=2.0; extra == "adapters-all"
70
+ Requires-Dist: torch>=1.9; extra == "adapters-all"
71
+ Requires-Dist: lightning>=2.0; python_version >= "3.10" and extra == "adapters-all"
72
+ Requires-Dist: pytorch-ignite>=0.4; extra == "adapters-all"
73
+ Requires-Dist: omegaconf>=2.0; extra == "adapters-all"
74
+ Requires-Dist: transformers>=4.0; extra == "adapters-all"
75
+ Requires-Dist: accelerate>=0.20; extra == "adapters-all"
76
+ Requires-Dist: xgboost>=1.0; extra == "adapters-all"
77
+ Requires-Dist: lightgbm>=3.0; extra == "adapters-all"
78
+ Requires-Dist: catboost>=1.0; extra == "adapters-all"
79
+ Requires-Dist: optuna>=3.0; extra == "adapters-all"
80
+ Requires-Dist: keras-tuner>=1.0; extra == "adapters-all"
81
+ Requires-Dist: prophet>=1.0; extra == "adapters-all"
82
+ Requires-Dist: stable-baselines3>=1.0; extra == "adapters-all"
83
+ Requires-Dist: fastai>=2.0; extra == "adapters-all"
84
+ Requires-Dist: ipython; extra == "adapters-all"
85
+ Requires-Dist: mxnet>=1.9; extra == "adapters-all"
86
+ Provides-Extra: extended
87
+ Requires-Dist: Pillow>=9.0; extra == "extended"
88
+ Requires-Dist: numpy>=1.21; extra == "extended"
89
+ Requires-Dist: plotly>=5.0; extra == "extended"
90
+ Requires-Dist: matplotlib>=3.5; extra == "extended"
91
+ Requires-Dist: msgpack>=1.0; extra == "extended"
92
+ Requires-Dist: nvidia-ml-py>=11.0; extra == "extended"
File without changes
@@ -0,0 +1,129 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "matyan-client"
7
+ version = "0.1.0"
8
+ description = "Matyan SDK — Aim-compatible client for experiment tracking"
9
+ readme = "README.md"
10
+ authors = [
11
+ { name = "Tigran Grigoryan", email = "grigoryan.tigran119@gmail.com" }
12
+ ]
13
+ requires-python = ">=3.10, <4"
14
+ dependencies = [
15
+ "httpx~=0.28.0",
16
+ "loguru~=0.7.0",
17
+ "matyan-api-models~=0.1.0",
18
+ "psutil>=5.9",
19
+ "pydantic~=2.0",
20
+ "pydantic-settings~=2.0",
21
+ "websockets~=15.0",
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ image = ["Pillow>=9.0", "numpy>=1.21"]
26
+ audio = ["numpy>=1.21"]
27
+ figure = ["plotly>=5.0"]
28
+ matplotlib = ["matplotlib>=3.5", "plotly>=5.0"]
29
+ numpy = ["numpy>=1.21"]
30
+ msgpack = ["msgpack>=1.0"]
31
+ gpu = ["nvidia-ml-py>=11.0"]
32
+ # -- adapter framework extras --
33
+ keras = ["keras>=2.0"]
34
+ tensorflow = ["tensorflow>=2.0"]
35
+ pytorch = ["torch>=1.9"]
36
+ pytorch-lightning = ["lightning>=2.0; python_version>='3.10'", "omegaconf>=2.0"]
37
+ pytorch-ignite = ["pytorch-ignite>=0.4", "omegaconf>=2.0"]
38
+ hugging-face = ["transformers>=4.0"]
39
+ distributed-hugging-face = ["transformers>=4.0", "accelerate>=0.20"]
40
+ xgboost = ["xgboost>=1.0"]
41
+ lightgbm = ["lightgbm>=3.0"]
42
+ catboost = ["catboost>=1.0"]
43
+ optuna = ["optuna>=3.0"]
44
+ keras-tuner = ["keras-tuner>=1.0"]
45
+ prophet = ["prophet>=1.0"]
46
+ sb3 = ["stable-baselines3>=1.0"]
47
+ # acme = ["dm-acme>=0.4"]
48
+ fastai = ["fastai>=2.0", "ipython"]
49
+ # paddle = ["paddlepaddle>=2.0"]
50
+ mxnet = ["mxnet>=1.9"]
51
+ adapters-all = [
52
+ "keras>=2.0",
53
+ "tensorflow>=2.0",
54
+ "torch>=1.9",
55
+ "lightning>=2.0; python_version>='3.10'",
56
+ "pytorch-ignite>=0.4",
57
+ "omegaconf>=2.0",
58
+ "transformers>=4.0",
59
+ "accelerate>=0.20",
60
+ "xgboost>=1.0",
61
+ "lightgbm>=3.0",
62
+ "catboost>=1.0",
63
+ "optuna>=3.0",
64
+ "keras-tuner>=1.0",
65
+ "prophet>=1.0",
66
+ "stable-baselines3>=1.0",
67
+ # "dm-acme>=0.4",
68
+ "fastai>=2.0",
69
+ "ipython",
70
+ # "paddlepaddle>=2.0",
71
+ "mxnet>=1.9",
72
+ ]
73
+ extended = [
74
+ "Pillow>=9.0",
75
+ "numpy>=1.21",
76
+ "plotly>=5.0",
77
+ "matplotlib>=3.5",
78
+ "msgpack>=1.0",
79
+ "nvidia-ml-py>=11.0",
80
+ ]
81
+
82
+ [dependency-groups]
83
+ dev = [
84
+ "matyan-api-models",
85
+ "mypy~=1.15",
86
+ "pytest~=8.0",
87
+ "pytest-cov~=6.0",
88
+ "pytest-asyncio~=0.25",
89
+ "respx~=0.22",
90
+ "Pillow>=9.0",
91
+ "numpy>=1.21",
92
+ "plotly>=5.0",
93
+ "datasets~=4.6",
94
+ ]
95
+
96
+ [tool.ruff]
97
+ line-length = 120
98
+ target-version = "py310"
99
+ exclude = ["*.ipynb"]
100
+
101
+ [tool.ruff.lint]
102
+ select = ["ALL"]
103
+ ignore = [
104
+ "FBT001",
105
+ "FBT002",
106
+ "D100",
107
+ "D101",
108
+ "D102",
109
+ "D103",
110
+ "D104",
111
+ "D105",
112
+ "D106",
113
+ "D107",
114
+ "D203",
115
+ "D213",
116
+ "D417",
117
+ "FBT003",
118
+ "EXE002",
119
+ "S311",
120
+ "TD003",
121
+ "D205",
122
+ "PLR2004",
123
+ "PLR0913",
124
+ "S101",
125
+ "S104"
126
+ ]
127
+
128
+ [tool.uv.sources]
129
+ matyan-api-models = { path = "../matyan-api-models", editable = true }
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,15 @@
1
+ """Matyan Client SDK — Aim-compatible experiment tracking."""
2
+
3
+ from .objects import Audio, Distribution, Figure, Image, Text
4
+ from .repo import Repo
5
+ from .run import Run
6
+
7
+ __all__ = [
8
+ "Audio",
9
+ "Distribution",
10
+ "Figure",
11
+ "Image",
12
+ "Repo",
13
+ "Run",
14
+ "Text",
15
+ ]
@@ -0,0 +1,94 @@
1
+ """Background blob uploader using a thread pool.
2
+
3
+ Presigned-URL acquisition and S3 PUT uploads run in worker threads so that
4
+ ``run.track(Image(...))`` returns immediately without blocking the training
5
+ loop.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from concurrent.futures import Future, ThreadPoolExecutor
11
+ from typing import TYPE_CHECKING
12
+
13
+ import httpx
14
+ from loguru import logger
15
+
16
+ if TYPE_CHECKING:
17
+ from .transport.http import HttpTransport
18
+
19
+ _MAX_WORKERS = 4
20
+
21
+
22
class BlobUploader:
    """Manages background S3 blob uploads via presigned URLs.

    ``submit`` returns a deterministic S3 key immediately while the
    presign + PUT work runs on a small thread pool, so tracking large
    artifacts never blocks the caller's training loop.
    """

    def __init__(
        self,
        http: HttpTransport,
        frontier_url: str,
        run_id: str,
    ) -> None:
        self._http = http
        self._frontier_url = frontier_url
        self._run_id = run_id
        self._pool = ThreadPoolExecutor(max_workers=_MAX_WORKERS, thread_name_prefix="blob-upload")
        # Futures of all uploads submitted since the last drain().
        self._futures: list[Future[None]] = []

    def submit(
        self,
        raw_bytes: bytes,
        artifact_path: str,
        content_type: str,
    ) -> str:
        """Submit a blob upload and return the deterministic ``s3_key`` immediately.

        The upload runs on a worker thread; call :meth:`drain` (or
        :meth:`shutdown`) to wait for completion and learn about failures.
        """
        s3_key = f"{self._run_id}/{artifact_path}"
        fut = self._pool.submit(self._upload, raw_bytes, artifact_path, content_type, s3_key)
        self._futures.append(fut)
        return s3_key

    def _upload(self, raw_bytes: bytes, artifact_path: str, content_type: str, s3_key: str) -> None:
        """Worker-thread body: presign the artifact, then PUT the bytes to S3."""
        try:
            resp = self._http.presign_artifact(
                self._frontier_url,
                self._run_id,
                artifact_path,
                content_type=content_type,
            )
            upload_resp = httpx.put(
                resp["upload_url"],
                content=raw_bytes,
                headers={"Content-Type": content_type},
                timeout=120,
            )
            upload_resp.raise_for_status()
        except httpx.HTTPError:
            # Non-HTTP errors deliberately propagate into the Future so that
            # drain() can still count and log them as failures.
            logger.exception("Background blob upload failed", s3_key=s3_key)

    def drain(self, timeout: float = 60.0) -> int:
        """Wait for all pending uploads to finish. Returns the number of failures.

        ``timeout`` applies per upload, not to the whole drain.
        """
        # Before Python 3.11, concurrent.futures raised its own TimeoutError
        # (not the builtin), so on 3.10 — the package's minimum — a bare
        # ``except TimeoutError`` misses it and the timeout is misreported as
        # an "unexpected error". Catch both; on 3.11+ they are the same class.
        from concurrent.futures import TimeoutError as _FuturesTimeout  # noqa: PLC0415, A004

        failures = 0
        for fut in self._futures:
            try:
                fut.result(timeout=timeout)
            except (TimeoutError, _FuturesTimeout):  # noqa: PERF203
                failures += 1
                logger.warning("Blob upload timed out after {:.0f}s", timeout)
            except httpx.HTTPError:
                failures += 1
                logger.exception("Blob upload failed with HTTP error")
            except Exception:  # noqa: BLE001
                failures += 1
                logger.exception("Blob upload failed with unexpected error")
        self._futures.clear()
        return failures

    @property
    def pending(self) -> int:
        """Number of submitted uploads that have not completed yet."""
        return sum(1 for f in self._futures if not f.done())

    def shutdown(self, timeout: float = 60.0) -> None:
        """Drain pending uploads and shut down the thread pool."""
        failures = self.drain(timeout=timeout)
        if failures:
            logger.warning("{} blob upload(s) failed during shutdown", failures)
        # drain() already waited; don't block a second time on the pool.
        self._pool.shutdown(wait=False)
@@ -0,0 +1,43 @@
1
+ """Client-side write cache for read-after-write consistency.
2
+
3
+ All writes (track, __setitem__, property setters) are stored locally so
4
+ that subsequent reads return the latest value even though ingestion to
5
+ FoundationDB is asynchronous.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import threading
11
+ from typing import Any
12
+
13
+
14
class WriteCache:
    """Thread-safe in-memory cache keyed by ``(run_hash, dotted_path)``."""

    def __init__(self) -> None:
        # Flat mapping from (run_hash, dotted_path) to the latest written value.
        self._data: dict[tuple[str, str], Any] = {}
        self._lock = threading.Lock()

    def set(self, run_hash: str, path: str, value: Any) -> None:  # noqa: ANN401
        """Record *value* as the most recent write for ``(run_hash, path)``."""
        key = (run_hash, path)
        with self._lock:
            self._data[key] = value

    def get(self, run_hash: str, path: str, default: Any = None) -> Any:  # noqa: ANN401
        """Return the cached value for ``(run_hash, path)``, or *default*."""
        key = (run_hash, path)
        with self._lock:
            return self._data.get(key, default)

    def has(self, run_hash: str, path: str) -> bool:
        """Return ``True`` if a value has been cached for ``(run_hash, path)``."""
        key = (run_hash, path)
        with self._lock:
            return key in self._data

    def get_tree(self, run_hash: str) -> dict[str, Any]:
        """Return all cached key-value pairs for *run_hash* as a flat dict."""
        with self._lock:
            snapshot = list(self._data.items())
        return {path: value for (owner, path), value in snapshot if owner == run_hash}

    def clear(self, run_hash: str | None = None) -> None:
        """Drop all entries for *run_hash*, or everything when it is ``None``."""
        with self._lock:
            if run_hash is None:
                self._data.clear()
                return
            self._data = {key: value for key, value in self._data.items() if key[0] != run_hash}
@@ -0,0 +1,309 @@
1
+ """Background resource tracker for system metrics and terminal log capture.
2
+
3
+ Adapted from ``aim/aim/ext/resource/tracker.py``. Runs a single daemon
4
+ thread that periodically collects CPU/memory/disk/GPU stats and (optionally)
5
+ captures ``sys.stdout`` / ``sys.stderr`` output.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import io
11
+ import re
12
+ import sys
13
+ import time
14
+ from threading import Event, Thread
15
+ from typing import TYPE_CHECKING
16
+ from weakref import WeakValueDictionary
17
+
18
+ from loguru import logger
19
+
20
+ if TYPE_CHECKING:
21
+ from collections.abc import Callable
22
+
23
+ METRIC_PREFIX = "__system__"
24
+
25
+ STAT_INTERVAL_MIN = 0.1
26
+ STAT_INTERVAL_MAX = 86400.0 # 24 hours
27
+ _LOG_CAPTURE_INTERVAL = 1.0 # seconds
28
+ _TICK = 0.1 # main-loop sleep granularity
29
+
30
+ _ANSI_CSI_RE = re.compile(rb"\001?\033\[((?:\d|;)*)([a-dA-D])\002?")
31
+
32
+
33
class ResourceTracker:
    """Collects system resource metrics and/or terminal logs in the background.

    A single daemon thread wakes every ``_TICK`` seconds and, on the
    configured cadences, samples system/GPU stats and flushes any captured
    terminal output.

    Parameters
    ----------
    track_fn:
        ``(value, name, context) -> None`` — called for each metric sample.
    interval:
        Seconds between system-stat collections. ``None`` disables stats.
    capture_logs:
        If ``True``, intercept ``sys.stdout`` / ``sys.stderr``.
    send_terminal_line_fn:
        ``(line, step) -> None`` — called for each captured terminal line.
    log_offset:
        Starting step number for terminal lines (for resume).

    """

    # Class-level (process-wide) state: each started tracker registers its
    # capture buffer here, and the patched stream writers fan every write out
    # to all registered buffers. The WeakValueDictionary lets a buffer drop
    # out of the registry once its tracker is garbage collected.
    _buffer_registry: WeakValueDictionary[int, io.BytesIO] = WeakValueDictionary()
    _old_out_write: Callable | None = None  # original sys.stdout.write, kept for restore
    _old_err_write: Callable | None = None  # original sys.stderr.write, kept for restore
    _patches_installed: bool = False

    # ------------------------------------------------------------------
    # Class-level stdout/stderr patching
    # ------------------------------------------------------------------

    @classmethod
    def _install_stream_patches(cls) -> None:
        """Monkey-patch ``sys.stdout.write``/``sys.stderr.write`` to mirror
        output into every registered capture buffer.

        The original write runs first, so console output is unaffected even
        if buffer mirroring fails. Idempotent via ``_patches_installed``.
        NOTE(review): installation is not lock-guarded — presumably start()
        is only called from one thread at a time; confirm.
        """
        if cls._patches_installed:
            return
        cls._old_out_write = sys.stdout.write
        cls._old_err_write = sys.stderr.write

        def _new_out_write(data: str) -> int:
            # Delegate to the real stdout first, then mirror the raw bytes.
            result = cls._old_out_write(data)  # type: ignore[misc]
            raw = data.encode() if isinstance(data, str) else data
            for buf in cls._buffer_registry.values():
                buf.write(raw)
            return result

        def _new_err_write(data: str) -> int:
            # Same as _new_out_write, but for stderr.
            result = cls._old_err_write(data)  # type: ignore[misc]
            raw = data.encode() if isinstance(data, str) else data
            for buf in cls._buffer_registry.values():
                buf.write(raw)
            return result

        sys.stdout.write = _new_out_write  # type: ignore[assignment]
        sys.stderr.write = _new_err_write  # type: ignore[assignment]
        cls._patches_installed = True

    @classmethod
    def _uninstall_stream_patches(cls) -> None:
        """Restore the original stream ``write`` methods saved at install time."""
        if not cls._patches_installed:
            return
        if cls._old_out_write is not None:
            sys.stdout.write = cls._old_out_write  # type: ignore[assignment]
        if cls._old_err_write is not None:
            sys.stderr.write = cls._old_err_write  # type: ignore[assignment]
        cls._patches_installed = False

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------

    @classmethod
    def check_interval(cls, interval: float | None, *, warn: bool = True) -> bool:
        """Return ``True`` if *interval* is a valid stat-collection interval.

        Valid means a number within [STAT_INTERVAL_MIN, STAT_INTERVAL_MAX].
        ``None`` is treated as "stats disabled" and returns ``False`` silently.
        """
        if interval is None:
            return False
        if not isinstance(interval, (int, float)) or not (STAT_INTERVAL_MIN <= interval <= STAT_INTERVAL_MAX):
            if warn:
                logger.warning(
                    "system_tracking_interval must be between {} and {} seconds",
                    STAT_INTERVAL_MIN,
                    STAT_INTERVAL_MAX,
                )
            return False
        return True

    # ------------------------------------------------------------------
    # Init / lifecycle
    # ------------------------------------------------------------------

    def __init__(
        self,
        *,
        track_fn: Callable[[float, str, dict | None], None],
        interval: float | None = None,
        capture_logs: bool = False,
        send_terminal_line_fn: Callable[[str, int], None] | None = None,
        log_offset: int = 0,
    ) -> None:
        self._track_fn = track_fn
        # Invalid intervals are silently normalized to None (stats disabled);
        # callers who want the warning use check_interval(..., warn=True).
        self._stat_interval: float | None = interval if self.check_interval(interval, warn=False) else None
        self._capture_logs = capture_logs
        self._send_line_fn = send_terminal_line_fn
        self._line_counter = log_offset

        self._io_buffer = io.BytesIO()  # per-instance terminal capture buffer
        self._shutdown = Event()
        self._started = False
        self._thread: Thread | None = None

        self._process = None
        if self._stat_interval is not None:
            try:
                import psutil  # noqa: PLC0415

                self._process = psutil.Process()
                # Prime the interval-less CPU sampler so the first real
                # reading reflects usage since this call rather than 0.0.
                psutil.cpu_percent(0.0)
            except Exception:  # noqa: BLE001
                # Best-effort: missing/broken psutil degrades to no stats.
                logger.debug("psutil unavailable; system stats disabled")
                self._stat_interval = None

    def start(self) -> None:
        """Start the background thread; no-op if already started or if both
        stats and log capture are disabled."""
        if self._started:
            return
        if self._stat_interval is None and not self._capture_logs:
            return
        self._started = True
        if self._capture_logs:
            # Install the process-wide stream patches only for the first
            # capturing tracker; later trackers just register their buffer.
            if not self._buffer_registry:
                self._install_stream_patches()
            self._buffer_registry[id(self)] = self._io_buffer
        self._thread = Thread(target=self._loop, daemon=True, name="matyan-resource-tracker")
        self._thread.start()

    def stop(self) -> None:
        """Stop the background thread, flush remaining logs, and unpatch the
        streams once no capturing tracker remains."""
        if not self._started:
            return
        self._shutdown.set()
        if self._thread is not None:
            self._thread.join(timeout=5)
        if self._capture_logs:
            # Final flush so output written since the last periodic flush
            # is not lost.
            self._flush_logs()
            self._buffer_registry.pop(id(self), None)
            if not self._buffer_registry:
                self._uninstall_stream_patches()
        self._started = False

    # ------------------------------------------------------------------
    # Main collection loop
    # ------------------------------------------------------------------

    def _loop(self) -> None:
        """Daemon-thread body: tick every ``_TICK`` seconds and fire stat
        collection / log flushing when their cadence elapses.

        NOTE(review): elapsed time accumulates nominal _TICK increments, not
        measured wall time, so the cadence can drift if sleep overruns —
        presumably acceptable for coarse sampling.
        """
        stat_elapsed = 0.0
        log_elapsed = 0.0

        # Take one immediate sample so short runs still record stats.
        if self._stat_interval is not None:
            self._collect_stats()

        while not self._shutdown.is_set():
            time.sleep(_TICK)
            stat_elapsed += _TICK
            log_elapsed += _TICK

            if self._stat_interval is not None and stat_elapsed >= self._stat_interval:
                self._collect_stats()
                stat_elapsed = 0.0

            if self._capture_logs and log_elapsed >= _LOG_CAPTURE_INTERVAL:
                self._flush_logs()
                log_elapsed = 0.0

    # ------------------------------------------------------------------
    # System stats
    # ------------------------------------------------------------------

    def _collect_stats(self) -> None:
        """Collect one round of system and GPU stats, swallowing all errors so
        the tracker thread never dies."""
        try:
            self._collect_system_stats()
        except Exception:  # noqa: BLE001
            # NOTE(review): unlike the other handlers this one omits
            # exc_info=True — confirm whether the traceback was meant to be
            # dropped here.
            logger.debug("Error collecting system stats")
        try:
            self._collect_gpu_stats()
        except Exception:  # noqa: BLE001
            logger.debug("GPU stats collection failed", exc_info=True)

    def _collect_system_stats(self) -> None:
        """Sample process CPU/memory plus host memory/disk and emit each as a
        ``__system__``-prefixed metric via ``track_fn``."""
        import psutil  # noqa: PLC0415

        proc = self._process
        if proc is None:
            return

        metrics = {
            "cpu": round(proc.cpu_percent(0.0), 5),
            "p_memory_percent": round(proc.memory_percent(), 5),
            "memory_percent": round(psutil.virtual_memory().percent, 5),
            "disk_percent": round(psutil.disk_usage("/").percent, 5),
        }
        for name, value in metrics.items():
            # Context is None for host-level metrics (GPU metrics carry one).
            self._track_fn(value, f"{METRIC_PREFIX}{name}", None)

    def _collect_gpu_stats(self) -> None:
        """Sample utilization, memory, temperature, and power for each NVIDIA
        GPU via NVML; every query is individually best-effort.

        NVML is initialized and shut down per collection cycle.
        """
        try:
            import pynvml  # noqa: PLC0415 # pyright: ignore[reportMissingImports]
        except ImportError:
            logger.debug("pynvml unavailable; GPU stats disabled")
            return

        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError:
            logger.debug("NVML initialization failed", exc_info=True)
            return

        try:
            count = pynvml.nvmlDeviceGetCount()
            for idx in range(count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
                ctx = {"gpu": idx}  # distinguishes metrics per device
                try:
                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    self._track_fn(round(util.gpu, 5), f"{METRIC_PREFIX}gpu", ctx)
                except pynvml.NVMLError:
                    logger.debug("GPU {} utilization query failed", idx, exc_info=True)
                try:
                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    # Guard against a zero total to avoid division by zero.
                    pct = round(mem.used * 100 / mem.total, 5) if mem.total else 0.0
                    self._track_fn(pct, f"{METRIC_PREFIX}gpu_memory_percent", ctx)
                except pynvml.NVMLError:
                    logger.debug("GPU {} memory query failed", idx, exc_info=True)
                try:
                    temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                    self._track_fn(round(float(temp), 5), f"{METRIC_PREFIX}gpu_temp", ctx)
                except pynvml.NVMLError:
                    logger.debug("GPU {} temperature query failed", idx, exc_info=True)
                try:
                    # NVML reports power in milliwatts; convert to watts.
                    power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000
                    self._track_fn(round(power, 5), f"{METRIC_PREFIX}gpu_power_watts", ctx)
                except pynvml.NVMLError:
                    logger.debug("GPU {} power query failed", idx, exc_info=True)
        finally:
            try:
                pynvml.nvmlShutdown()
            except pynvml.NVMLError:
                logger.debug("NVML shutdown failed", exc_info=True)

    # ------------------------------------------------------------------
    # Terminal log capture
    # ------------------------------------------------------------------

    def _flush_logs(self) -> None:
        """Drain the capture buffer and send each completed line (with its
        step number) through ``send_terminal_line_fn``.

        A trailing partial line (no newline yet) is sent anyway, then the
        step counter is rolled back by one and the partial bytes are written
        back into the buffer — so the next flush re-sends the completed line
        at the SAME step, overwriting the partial one downstream.
        NOTE(review): when the data ends exactly on a newline, the final
        split element is empty, so an empty line is sent and then overwritten
        at the same step on the next flush — confirm the server treats that
        as a transient placeholder.
        """
        if self._send_line_fn is None:
            return
        buf_size = self._io_buffer.tell()
        if not buf_size:
            return

        # Read everything accumulated so far, then reset the buffer. The
        # stream patches may append concurrently; only buf_size bytes are
        # consumed here.
        self._io_buffer.seek(0)
        data = self._io_buffer.read(buf_size)
        self._io_buffer.seek(0)
        self._io_buffer.truncate()

        lines = data.split(b"\n")
        for line in lines:
            line = self._strip_ansi(line)  # noqa: PLW2901
            # Keep only text after the last carriage return, emulating how a
            # terminal renders \r-overwritten progress bars.
            line = line.rsplit(b"\r", maxsplit=1)[-1]  # noqa: PLW2901
            try:
                text = line.decode("utf-8", errors="replace")
            except Exception:  # noqa: BLE001
                logger.debug("Failed to decode terminal line", exc_info=True)
                continue
            self._send_line_fn(text, self._line_counter)
            self._line_counter += 1

        # last line without trailing newline stays in counter
        self._line_counter -= 1
        if lines[-1] != b"":
            self._io_buffer.write(lines[-1])

    @staticmethod
    def _strip_ansi(line: bytes) -> bytes:
        """Remove ANSI CSI escape sequences (cursor movement/color) from a raw line."""
        return re.sub(_ANSI_CSI_RE, b"", line)