gpu-usage-audit 1.1.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/PKG-INFO +1 -1
- gpu_usage_audit-1.2.0/docs/work-specs/0002-daemon-cloud-mode.ko.md +43 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/pyproject.toml +1 -1
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/__main__.py +57 -2
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/daemon.py +18 -3
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_cloud_cli.py +63 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_daemon.py +49 -1
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/uv.lock +1 -1
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/.github/workflows/ci.yml +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/.github/workflows/release.yml +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/.gitignore +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/CHANGELOG.md +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/LICENSE +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/README.ko.md +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/README.md +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/docs/work-specs/0001-gua-board-cloud-sync.ko.md +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/projects/bare-metal-1.0/handoff.ko.md +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/projects/bare-metal-1.0/plan.ko.md +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/projects/bare-metal-1.0/status.ko.md +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/scripts/check-tag-version.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/scripts/smoke-dist-wheel.sh +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/__init__.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/classify.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/cloud/__init__.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/cloud/client.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/cloud/config.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/cloud/snapshot.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/db.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/doctor.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/identity.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/model.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/nvml.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/paths.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/render.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/report.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/summarize.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/src/gpu_usage_audit/tier.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/__init__.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_classify.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_cloud_client.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_cloud_config.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_cloud_snapshot.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_db.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_doctor.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_identity.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_nvml.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_render.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_report.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_smoke.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_summarize.py +0 -0
- {gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/tests/test_tier.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-usage-audit
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: Single-host daemon that surfaces 'idle-held' NVIDIA GPU memory — the embarrassing category conventional dashboards miss.
|
|
5
5
|
Project-URL: Homepage, https://github.com/AI-Ocean/gpu-usage-audit
|
|
6
6
|
Project-URL: Issues, https://github.com/AI-Ocean/gpu-usage-audit/issues
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# 0002 Daemon Cloud Mode (continuous push)
|
|
2
|
+
|
|
3
|
+
상태: draft
|
|
4
|
+
관련: 0001 (enroll + sync-once)
|
|
5
|
+
목표: `gua daemon --cloud` — 데몬이 매 틱 local DB 에 기록한 뒤 latest snapshot 을 GUA Board 로 push 한다. 일회성 `sync-once` 를 연속 운영으로 확장(없으면 보드가 stale).
|
|
6
|
+
|
|
7
|
+
## 배경
|
|
8
|
+
|
|
9
|
+
0001 에서 enroll + sync-once(1회 수집→local write→push)가 생겼다. 보드가 살아있으려면 호스트가 주기적으로 latest 를 올려야 한다. 기존 `gua daemon` 루프(anti-drift, 시그널 종료, local-write-first)를 재사용해 cloud push 를 얹는다.
|
|
10
|
+
|
|
11
|
+
## 범위
|
|
12
|
+
|
|
13
|
+
포함:
|
|
14
|
+
|
|
15
|
+
- `daemon` 에 `--cloud` + `--config` 플래그(두 CLI 파서 모두: `gua`, `gpu-usage-audit`).
|
|
16
|
+
- `daemon.run_daemon`/`_tick` 에 optional `on_tick(snap, ts)` 후크. **daemon 모듈은 cloud 를 import 하지 않는다** — CLI 가 콜백 주입(결합도 분리). 후크는 local write *이후* 호출, 실패해도 로그만 남기고 다음 틱 계속(local-write-first 불변식).
|
|
17
|
+
- `_cmd_daemon`: `--cloud` 면 NVML 열기 *전에* `load_cloud_config` 검증(미enroll → exit 2). push 콜백 = `build_observation_payload` + `post_observation`(0001 재사용).
|
|
18
|
+
- `gua daemon --cloud`(백그라운드)면 spawn 커맨드에 `--cloud --config` 전파.
|
|
19
|
+
|
|
20
|
+
제외:
|
|
21
|
+
|
|
22
|
+
- pull/명령 채널, 재시도 백오프 정교화, 오프라인 큐잉(실패 틱은 다음 틱이 latest 로 덮음 — replay 안 함).
|
|
23
|
+
- systemd 유닛 패키징(설치 UX 별도).
|
|
24
|
+
|
|
25
|
+
## Acceptance
|
|
26
|
+
|
|
27
|
+
- `gua daemon --cloud`(enrolled): 매 틱 local 기록 + latest push, 보드에 호스트/GPU 표시.
|
|
28
|
+
- push 실패(네트워크/CloudError/payload ValueError)는 데몬을 멈추지 않고 local 기록도 보존.
|
|
29
|
+
- `--cloud` + 미enroll → exit 2(NVML 열기 전).
|
|
30
|
+
- `--cloud` 없으면 기존 동작 그대로(push 없음).
|
|
31
|
+
- 백그라운드 `gua daemon --cloud` 가 자식 프로세스로 옵션 전파.
|
|
32
|
+
|
|
33
|
+
## Verification
|
|
34
|
+
|
|
35
|
+
- `tests/test_daemon.py`: `on_tick` 매 틱 local write 이후 호출 + raise 해도 데몬 계속·local 보존.
|
|
36
|
+
- `tests/test_cloud_cli.py`: `--cloud` 미enroll → exit 2(`run \`gua enroll\``); 백그라운드 spawn 커맨드에 `--cloud/--config` 포함.
|
|
37
|
+
- 전체 `pytest` 163 passed, `ruff` clean.
|
|
38
|
+
|
|
39
|
+
## Implementation Notes
|
|
40
|
+
|
|
41
|
+
- on_tick 후크 타입 `OnTick = Callable[[Snapshot, datetime], None]` (daemon.py).
|
|
42
|
+
- 실패 처리: `_tick` 이 on_tick 을 try/except 로 감싸 `logger.exception` 후 계속(틱 자체는 성공으로 간주).
|
|
43
|
+
- "push latest only, no replay" — 실패 틱을 재전송하지 않고 다음 틱 latest 가 보드를 갱신.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "gpu-usage-audit"
|
|
3
|
-
version = "1.1.0"
|
|
3
|
+
version = "1.2.0"
|
|
4
4
|
description = "Single-host daemon that surfaces 'idle-held' NVIDIA GPU memory — the embarrassing category conventional dashboards miss."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -43,7 +43,7 @@ from .daemon import install_signal_handlers, resolve_proc_identities, run_daemon
|
|
|
43
43
|
from .db import open_db, write_snapshot
|
|
44
44
|
from .doctor import build_doctor_report, doctor_report_to_dict, render_doctor
|
|
45
45
|
from .identity import system_process_name_lookup, system_user_lookup
|
|
46
|
-
from .model import HostMeta
|
|
46
|
+
from .model import HostMeta, Snapshot
|
|
47
47
|
from .nvml import NVMLNotAvailableError, NVMLTier
|
|
48
48
|
from .paths import (
|
|
49
49
|
DEFAULT_CLOUD_CONFIG_PATH,
|
|
@@ -121,6 +121,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
121
121
|
default=timedelta(seconds=30),
|
|
122
122
|
help="Tick interval (e.g. 30s, 1m, 200ms) [default: 30s]",
|
|
123
123
|
)
|
|
124
|
+
_add_cloud_args(p_daemon)
|
|
124
125
|
p_daemon.set_defaults(func=_cmd_daemon)
|
|
125
126
|
|
|
126
127
|
p_report = sub.add_parser(
|
|
@@ -198,6 +199,19 @@ def _add_daemon_args(parser: argparse.ArgumentParser) -> None:
|
|
|
198
199
|
)
|
|
199
200
|
|
|
200
201
|
|
|
202
|
+
def _add_cloud_args(parser: argparse.ArgumentParser) -> None:
|
|
203
|
+
parser.add_argument(
|
|
204
|
+
"--cloud",
|
|
205
|
+
action="store_true",
|
|
206
|
+
help="After each tick, push the latest snapshot to GUA Board (requires `gua enroll`)",
|
|
207
|
+
)
|
|
208
|
+
parser.add_argument(
|
|
209
|
+
"--config",
|
|
210
|
+
default=str(DEFAULT_CLOUD_CONFIG_PATH),
|
|
211
|
+
help=f"Cloud config path (used with --cloud) [default: {DEFAULT_CLOUD_CONFIG_PATH}]",
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
|
|
201
215
|
def _add_report_args(parser: argparse.ArgumentParser) -> None:
|
|
202
216
|
parser.add_argument(
|
|
203
217
|
"--db",
|
|
@@ -294,6 +308,7 @@ def build_gua_parser() -> argparse.ArgumentParser:
|
|
|
294
308
|
)
|
|
295
309
|
_add_daemon_args(p_daemon)
|
|
296
310
|
_add_runtime_file_args(p_daemon)
|
|
311
|
+
_add_cloud_args(p_daemon)
|
|
297
312
|
p_daemon.add_argument(
|
|
298
313
|
"--foreground",
|
|
299
314
|
action="store_true",
|
|
@@ -307,6 +322,7 @@ def build_gua_parser() -> argparse.ArgumentParser:
|
|
|
307
322
|
)
|
|
308
323
|
_add_daemon_args(p_start)
|
|
309
324
|
_add_runtime_file_args(p_start)
|
|
325
|
+
_add_cloud_args(p_start)
|
|
310
326
|
p_start.set_defaults(func=_cmd_gua_start)
|
|
311
327
|
|
|
312
328
|
p_status = sub.add_parser(
|
|
@@ -598,6 +614,9 @@ def _cmd_gua_start(args: argparse.Namespace) -> int:
|
|
|
598
614
|
"--interval",
|
|
599
615
|
_duration_cli_value(args.interval),
|
|
600
616
|
]
|
|
617
|
+
# cloud sync 옵션을 백그라운드 프로세스로 전파한다.
|
|
618
|
+
if getattr(args, "cloud", False):
|
|
619
|
+
command += ["--cloud", "--config", str(args.config)]
|
|
601
620
|
env = os.environ.copy()
|
|
602
621
|
env[DISPLAY_COMMAND_ENV] = "gua daemon --foreground"
|
|
603
622
|
with log_path.open("ab") as log:
|
|
@@ -712,6 +731,16 @@ def _cmd_daemon(args: argparse.Namespace) -> int:
|
|
|
712
731
|
return 2
|
|
713
732
|
if is_default_db_path(db_path):
|
|
714
733
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
734
|
+
|
|
735
|
+
# cloud sync 가 켜졌으면 enroll 설정을 *먼저* 검증한다(미enroll 이면 NVML 열기 전에 중단).
|
|
736
|
+
cloud_config = None
|
|
737
|
+
if getattr(args, "cloud", False):
|
|
738
|
+
try:
|
|
739
|
+
cloud_config = load_cloud_config(args.config)
|
|
740
|
+
except CloudConfigError as exc:
|
|
741
|
+
print(f"{display_command}: {exc}", file=sys.stderr)
|
|
742
|
+
return 2
|
|
743
|
+
|
|
715
744
|
tier = NVMLTier()
|
|
716
745
|
try:
|
|
717
746
|
try:
|
|
@@ -721,12 +750,37 @@ def _cmd_daemon(args: argparse.Namespace) -> int:
|
|
|
721
750
|
return 1
|
|
722
751
|
conn = open_db(db_path)
|
|
723
752
|
try:
|
|
753
|
+
hostname = socket.gethostname() or "unknown"
|
|
724
754
|
host = HostMeta(
|
|
725
|
-
hostname=
|
|
755
|
+
hostname=hostname,
|
|
726
756
|
env_kind=LOCAL_ENV_KIND,
|
|
727
757
|
driver_version=driver,
|
|
728
758
|
first_seen=datetime.now(UTC),
|
|
729
759
|
)
|
|
760
|
+
|
|
761
|
+
# cloud on_tick 후크: local write 이후 latest snapshot 을 board 로 push.
|
|
762
|
+
# 빌드/푸시 실패는 daemon._tick 이 잡아 로그만 남기고 다음 틱을 계속한다.
|
|
763
|
+
on_tick = None
|
|
764
|
+
if cloud_config is not None:
|
|
765
|
+
|
|
766
|
+
def push_snapshot(snap: Snapshot, ts: datetime) -> None:
|
|
767
|
+
payload = build_observation_payload(
|
|
768
|
+
snapshot=snap,
|
|
769
|
+
hostname=hostname,
|
|
770
|
+
driver_version=driver,
|
|
771
|
+
agent_version=__version__,
|
|
772
|
+
observed_at=ts,
|
|
773
|
+
host_id=cloud_config.host_id,
|
|
774
|
+
display_name=cloud_config.display_name,
|
|
775
|
+
)
|
|
776
|
+
post_observation(cloud_config, payload)
|
|
777
|
+
|
|
778
|
+
on_tick = push_snapshot
|
|
779
|
+
print(
|
|
780
|
+
f"{display_command}: cloud sync enabled -> "
|
|
781
|
+
f"{cloud_config.display_name} ({cloud_config.server_url})"
|
|
782
|
+
)
|
|
783
|
+
|
|
730
784
|
stop = threading.Event()
|
|
731
785
|
install_signal_handlers(stop)
|
|
732
786
|
run_daemon(
|
|
@@ -737,6 +791,7 @@ def _cmd_daemon(args: argparse.Namespace) -> int:
|
|
|
737
791
|
lookup=system_user_lookup,
|
|
738
792
|
name_lookup=system_process_name_lookup,
|
|
739
793
|
stop=stop,
|
|
794
|
+
on_tick=on_tick,
|
|
740
795
|
)
|
|
741
796
|
total = conn.execute("SELECT COUNT(*) FROM gpu_sample").fetchone()[0]
|
|
742
797
|
print(f"\n{args.db}: {total} total gpu_sample rows")
|
|
@@ -22,13 +22,16 @@ from datetime import UTC, datetime, timedelta
|
|
|
22
22
|
from typing import TextIO
|
|
23
23
|
|
|
24
24
|
from .db import start_daemon_run, write_snapshot
|
|
25
|
-
from .model import HostMeta, ProcSample
|
|
25
|
+
from .model import HostMeta, ProcSample, Snapshot
|
|
26
26
|
from .summarize import summarize
|
|
27
27
|
from .tier import Tier
|
|
28
28
|
|
|
29
29
|
logger = logging.getLogger(__name__)
|
|
30
30
|
|
|
31
31
|
UserLookup = Callable[[int], str | None]
|
|
32
|
+
# 틱 후크: local write *이후* 의 부가 작업(예: cloud push). 데몬 모듈은 cloud 를
|
|
33
|
+
# 모르고, CLI 가 콜백을 주입한다 — 결합도 분리.
|
|
34
|
+
OnTick = Callable[[Snapshot, datetime], None]
|
|
32
35
|
|
|
33
36
|
|
|
34
37
|
def _noop_lookup(_pid: int) -> str | None:
|
|
@@ -78,8 +81,13 @@ def _tick(
|
|
|
78
81
|
n: int,
|
|
79
82
|
out: TextIO,
|
|
80
83
|
run_id: int,
|
|
84
|
+
on_tick: OnTick | None = None,
|
|
81
85
|
) -> None:
|
|
82
|
-
"""한 틱: tier.collect → loginuid/process_name 해석 → 적재 → 한 줄 로그."""
|
|
86
|
+
"""한 틱: tier.collect → loginuid/process_name 해석 → 적재 → 한 줄 로그.
|
|
87
|
+
|
|
88
|
+
on_tick 이 있으면 local write *이후* 호출한다(예: cloud push). 후크 실패는
|
|
89
|
+
이미 커밋된 local write 와 다음 틱을 막지 않는다 — 로그만 남기고 계속.
|
|
90
|
+
"""
|
|
83
91
|
snap = tier.collect(ts)
|
|
84
92
|
# ProcSample 이 mutable slots — *제자리* 갱신.
|
|
85
93
|
resolve_proc_identities(snap.procs, lookup, name_lookup)
|
|
@@ -89,6 +97,12 @@ def _tick(
|
|
|
89
97
|
ts_short = ts.strftime("%H:%M:%S.") + f"{ts.microsecond // 1000:03d}"
|
|
90
98
|
print(f"Tick {n} ts={ts_short} {classes}", file=out)
|
|
91
99
|
|
|
100
|
+
if on_tick is not None:
|
|
101
|
+
try:
|
|
102
|
+
on_tick(snap, ts)
|
|
103
|
+
except Exception:
|
|
104
|
+
logger.exception("tick %d on_tick hook failed; continuing", n)
|
|
105
|
+
|
|
92
106
|
|
|
93
107
|
def run_daemon(
|
|
94
108
|
*,
|
|
@@ -101,6 +115,7 @@ def run_daemon(
|
|
|
101
115
|
stop: threading.Event | None = None,
|
|
102
116
|
max_ticks: int | None = None,
|
|
103
117
|
out: TextIO | None = None,
|
|
118
|
+
on_tick: OnTick | None = None,
|
|
104
119
|
) -> int:
|
|
105
120
|
"""ctx 캔슬까지 interval 간격으로 한 틱씩 반복. 적재한 틱 총 수 반환.
|
|
106
121
|
|
|
@@ -135,7 +150,7 @@ def run_daemon(
|
|
|
135
150
|
run_id = start_daemon_run(db, datetime.now(UTC), interval)
|
|
136
151
|
|
|
137
152
|
try:
|
|
138
|
-
_tick(tier, db, host, lookup, name_lookup, datetime.now(UTC), n, out, run_id)
|
|
153
|
+
_tick(tier, db, host, lookup, name_lookup, datetime.now(UTC), n, out, run_id, on_tick)
|
|
139
154
|
except Exception:
|
|
140
155
|
logger.exception("tick %d failed; continuing", n)
|
|
141
156
|
n += 1
|
|
@@ -273,3 +273,66 @@ def test_sync_once_without_enrollment_exits_2(
|
|
|
273
273
|
)
|
|
274
274
|
assert rc == 2
|
|
275
275
|
assert "run `gua enroll`" in capsys.readouterr().err
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
# ── daemon --cloud ───────────────────────────────────────────────
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def test_daemon_cloud_without_enrollment_exits_2(
|
|
282
|
+
tmp_path: Path,
|
|
283
|
+
capsys: pytest.CaptureFixture[str],
|
|
284
|
+
) -> None:
|
|
285
|
+
# --cloud 인데 enroll 안 됨 → NVML 열기 *전에* 설정 검증 실패로 종료.
|
|
286
|
+
rc = gua_main(
|
|
287
|
+
[
|
|
288
|
+
"daemon",
|
|
289
|
+
"--foreground",
|
|
290
|
+
"--cloud",
|
|
291
|
+
"--db",
|
|
292
|
+
str(tmp_path / "gua.db"),
|
|
293
|
+
"--config",
|
|
294
|
+
str(tmp_path / "absent.json"),
|
|
295
|
+
]
|
|
296
|
+
)
|
|
297
|
+
assert rc == 2
|
|
298
|
+
assert "run `gua enroll`" in capsys.readouterr().err
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def test_daemon_start_propagates_cloud_flags_to_background(
|
|
302
|
+
monkeypatch: pytest.MonkeyPatch,
|
|
303
|
+
tmp_path: Path,
|
|
304
|
+
) -> None:
|
|
305
|
+
# 백그라운드 spawn 커맨드에 --cloud/--config 가 실려야 데몬이 push 한다.
|
|
306
|
+
captured: dict[str, Any] = {}
|
|
307
|
+
|
|
308
|
+
class FakePopen:
|
|
309
|
+
def __init__(self, command: list[str], **_kwargs: Any) -> None:
|
|
310
|
+
captured["command"] = command
|
|
311
|
+
self.pid = 4242
|
|
312
|
+
|
|
313
|
+
def poll(self) -> int | None:
|
|
314
|
+
return None
|
|
315
|
+
|
|
316
|
+
monkeypatch.setattr("gpu_usage_audit.__main__.subprocess.Popen", FakePopen)
|
|
317
|
+
monkeypatch.setattr("gpu_usage_audit.__main__.time.sleep", lambda *_a, **_k: None)
|
|
318
|
+
|
|
319
|
+
config_path = tmp_path / "cloud.json"
|
|
320
|
+
rc = gua_main(
|
|
321
|
+
[
|
|
322
|
+
"daemon",
|
|
323
|
+
"--cloud",
|
|
324
|
+
"--db",
|
|
325
|
+
str(tmp_path / "gua.db"),
|
|
326
|
+
"--config",
|
|
327
|
+
str(config_path),
|
|
328
|
+
"--pid-file",
|
|
329
|
+
str(tmp_path / "daemon.pid"),
|
|
330
|
+
"--log-file",
|
|
331
|
+
str(tmp_path / "daemon.log"),
|
|
332
|
+
]
|
|
333
|
+
)
|
|
334
|
+
assert rc == 0
|
|
335
|
+
command = captured["command"]
|
|
336
|
+
assert "--cloud" in command
|
|
337
|
+
assert "--config" in command
|
|
338
|
+
assert str(config_path) in command
|
|
@@ -13,7 +13,7 @@ import pytest
|
|
|
13
13
|
|
|
14
14
|
from gpu_usage_audit.daemon import run_daemon
|
|
15
15
|
from gpu_usage_audit.db import open_db
|
|
16
|
-
from gpu_usage_audit.model import HostMeta
|
|
16
|
+
from gpu_usage_audit.model import HostMeta, Snapshot
|
|
17
17
|
from gpu_usage_audit.tier import FakeTier
|
|
18
18
|
|
|
19
19
|
INTERVAL = timedelta(milliseconds=20)
|
|
@@ -110,3 +110,51 @@ def test_run_daemon_lookup_resolves_loginuid(db: sqlite3.Connection, host: HostM
|
|
|
110
110
|
assert 1234 not in lookup_calls
|
|
111
111
|
assert 5678 not in lookup_calls
|
|
112
112
|
assert 9999 in lookup_calls
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def test_run_daemon_invokes_on_tick_after_local_write(
|
|
116
|
+
db: sqlite3.Connection, host: HostMeta
|
|
117
|
+
) -> None:
|
|
118
|
+
# on_tick 은 매 틱 local write *이후* 호출된다(cloud push 가 얹히는 자리).
|
|
119
|
+
calls: list[int] = []
|
|
120
|
+
|
|
121
|
+
def on_tick(snap: Snapshot, _ts: datetime) -> None:
|
|
122
|
+
# 콜백 시점엔 이미 이번 틱이 DB 에 기록돼 있어야 한다.
|
|
123
|
+
calls.append(db.execute("SELECT COUNT(*) FROM gpu_sample").fetchone()[0])
|
|
124
|
+
assert len(snap.gpus) == 3 # FakeTier 의 스냅샷이 그대로 전달된다.
|
|
125
|
+
|
|
126
|
+
n = run_daemon(
|
|
127
|
+
tier=FakeTier(),
|
|
128
|
+
db=db,
|
|
129
|
+
host=host,
|
|
130
|
+
interval=INTERVAL,
|
|
131
|
+
max_ticks=3,
|
|
132
|
+
out=io.StringIO(),
|
|
133
|
+
on_tick=on_tick,
|
|
134
|
+
)
|
|
135
|
+
assert n == 3
|
|
136
|
+
# 매 틱 호출되고, 호출 시점의 누적 행 수는 3, 6, 9 (틱당 3 GPU).
|
|
137
|
+
assert calls == [3, 6, 9]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def test_run_daemon_continues_when_on_tick_raises(db: sqlite3.Connection, host: HostMeta) -> None:
|
|
141
|
+
# on_tick(예: cloud push) 실패는 local write 와 다음 틱을 막지 않는다.
|
|
142
|
+
attempts: list[int] = []
|
|
143
|
+
|
|
144
|
+
def boom(_snap: Snapshot, _ts: datetime) -> None:
|
|
145
|
+
attempts.append(1)
|
|
146
|
+
raise RuntimeError("cloud push failed")
|
|
147
|
+
|
|
148
|
+
n = run_daemon(
|
|
149
|
+
tier=FakeTier(),
|
|
150
|
+
db=db,
|
|
151
|
+
host=host,
|
|
152
|
+
interval=INTERVAL,
|
|
153
|
+
max_ticks=3,
|
|
154
|
+
out=io.StringIO(),
|
|
155
|
+
on_tick=boom,
|
|
156
|
+
)
|
|
157
|
+
assert n == 3
|
|
158
|
+
assert len(attempts) == 3 # 매 틱 호출(예외에도 멈추지 않음).
|
|
159
|
+
# local write 는 전부 보존: 3 틱 * 3 GPU = 9 행.
|
|
160
|
+
assert db.execute("SELECT COUNT(*) FROM gpu_sample").fetchone()[0] == 9
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_usage_audit-1.1.0 → gpu_usage_audit-1.2.0}/docs/work-specs/0001-gua-board-cloud-sync.ko.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|