gitm-labs 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitm_labs-0.0.1/.github/workflows/workflow.yml +31 -0
- gitm_labs-0.0.1/.gitignore +39 -0
- gitm_labs-0.0.1/Dockerfile +59 -0
- gitm_labs-0.0.1/PKG-INFO +264 -0
- gitm_labs-0.0.1/README.md +232 -0
- gitm_labs-0.0.1/assets/arch.png +0 -0
- gitm_labs-0.0.1/benchmarks/Makefile.common +81 -0
- gitm_labs-0.0.1/benchmarks/README.md +92 -0
- gitm_labs-0.0.1/benchmarks/__init__.py +1 -0
- gitm_labs-0.0.1/benchmarks/_smoke_harness.py +101 -0
- gitm_labs-0.0.1/benchmarks/_templates/datasets.md +25 -0
- gitm_labs-0.0.1/benchmarks/_templates/spec.md +21 -0
- gitm_labs-0.0.1/benchmarks/biotech/Makefile +7 -0
- gitm_labs-0.0.1/benchmarks/biotech/__init__.py +1 -0
- gitm_labs-0.0.1/benchmarks/biotech/bench.smoke.toml +24 -0
- gitm_labs-0.0.1/benchmarks/biotech/bench.toml +31 -0
- gitm_labs-0.0.1/benchmarks/biotech/datasets.md +37 -0
- gitm_labs-0.0.1/benchmarks/biotech/fetch.py +226 -0
- gitm_labs-0.0.1/benchmarks/biotech/harness.py +194 -0
- gitm_labs-0.0.1/benchmarks/biotech/spec.md +31 -0
- gitm_labs-0.0.1/benchmarks/edge/Makefile +15 -0
- gitm_labs-0.0.1/benchmarks/edge/__init__.py +1 -0
- gitm_labs-0.0.1/benchmarks/edge/bench.smoke.toml +24 -0
- gitm_labs-0.0.1/benchmarks/edge/bench.toml +29 -0
- gitm_labs-0.0.1/benchmarks/edge/build_manifest.py +125 -0
- gitm_labs-0.0.1/benchmarks/edge/datasets.md +256 -0
- gitm_labs-0.0.1/benchmarks/edge/fetch.py +143 -0
- gitm_labs-0.0.1/benchmarks/edge/harness.py +293 -0
- gitm_labs-0.0.1/benchmarks/edge/kitti_source.py +101 -0
- gitm_labs-0.0.1/benchmarks/edge/nuscenes_source.py +178 -0
- gitm_labs-0.0.1/benchmarks/edge/spec.md +35 -0
- gitm_labs-0.0.1/benchmarks/hft/Makefile +10 -0
- gitm_labs-0.0.1/benchmarks/hft/__init__.py +1 -0
- gitm_labs-0.0.1/benchmarks/hft/bench.smoke.toml +26 -0
- gitm_labs-0.0.1/benchmarks/hft/bench.toml +30 -0
- gitm_labs-0.0.1/benchmarks/hft/datasets.md +46 -0
- gitm_labs-0.0.1/benchmarks/hft/gen_manifest.py +24 -0
- gitm_labs-0.0.1/benchmarks/hft/generate.py +181 -0
- gitm_labs-0.0.1/benchmarks/hft/generator/CMakeLists.txt +23 -0
- gitm_labs-0.0.1/benchmarks/hft/generator/main.cpp +102 -0
- gitm_labs-0.0.1/benchmarks/hft/harness.py +180 -0
- gitm_labs-0.0.1/benchmarks/hft/manifest.yaml +16 -0
- gitm_labs-0.0.1/benchmarks/hft/spec.md +34 -0
- gitm_labs-0.0.1/benchmarks/kitti/results.md +46 -0
- gitm_labs-0.0.1/benchmarks/kitti/spec.md +80 -0
- gitm_labs-0.0.1/benchmarks/skeleton/__init__.py +1 -0
- gitm_labs-0.0.1/benchmarks/skeleton/measure_overhead.py +104 -0
- gitm_labs-0.0.1/benchmarks/skeleton/overhead.md +45 -0
- gitm_labs-0.0.1/constraints.txt +24 -0
- gitm_labs-0.0.1/docs/invariants.md +53 -0
- gitm_labs-0.0.1/docs/scoring/input_contract.md +43 -0
- gitm_labs-0.0.1/gitm/__init__.py +9 -0
- gitm_labs-0.0.1/gitm/_paths.py +109 -0
- gitm_labs-0.0.1/gitm/agents/__init__.py +12 -0
- gitm_labs-0.0.1/gitm/agents/policy.py +48 -0
- gitm_labs-0.0.1/gitm/api.py +37 -0
- gitm_labs-0.0.1/gitm/bench/__init__.py +30 -0
- gitm_labs-0.0.1/gitm/bench/__main__.py +10 -0
- gitm_labs-0.0.1/gitm/bench/baseline.py +169 -0
- gitm_labs-0.0.1/gitm/bench/cli.py +290 -0
- gitm_labs-0.0.1/gitm/bench/edge_manifest.py +138 -0
- gitm_labs-0.0.1/gitm/bench/manifest.py +183 -0
- gitm_labs-0.0.1/gitm/bench/profile.py +299 -0
- gitm_labs-0.0.1/gitm/bench/reproduce.py +120 -0
- gitm_labs-0.0.1/gitm/bench/results.py +68 -0
- gitm_labs-0.0.1/gitm/bench/runner.py +137 -0
- gitm_labs-0.0.1/gitm/bench/schema.py +168 -0
- gitm_labs-0.0.1/gitm/bench/templates/results.md.j2 +37 -0
- gitm_labs-0.0.1/gitm/benchmarks/__init__.py +0 -0
- gitm_labs-0.0.1/gitm/benchmarks/kitti/__init__.py +9 -0
- gitm_labs-0.0.1/gitm/benchmarks/kitti/baseline.py +249 -0
- gitm_labs-0.0.1/gitm/benchmarks/kitti/workunit.py +223 -0
- gitm_labs-0.0.1/gitm/cli.py +120 -0
- gitm_labs-0.0.1/gitm/doctor.py +29 -0
- gitm_labs-0.0.1/gitm/kernels/__init__.py +13 -0
- gitm_labs-0.0.1/gitm/kernels/library.py +24 -0
- gitm_labs-0.0.1/gitm/kernels/library.yaml +345 -0
- gitm_labs-0.0.1/gitm/kernels/spec.py +51 -0
- gitm_labs-0.0.1/gitm/optimizer/__init__.py +32 -0
- gitm_labs-0.0.1/gitm/optimizer/apply.py +206 -0
- gitm_labs-0.0.1/gitm/optimizer/attribution.py +90 -0
- gitm_labs-0.0.1/gitm/optimizer/dr.py +154 -0
- gitm_labs-0.0.1/gitm/optimizer/invariants.py +45 -0
- gitm_labs-0.0.1/gitm/optimizer/monitor.py +164 -0
- gitm_labs-0.0.1/gitm/optimizer/multibasis.py +86 -0
- gitm_labs-0.0.1/gitm/optimizer/qualification.py +77 -0
- gitm_labs-0.0.1/gitm/optimizer/replay.py +59 -0
- gitm_labs-0.0.1/gitm/optimizer/replay_validation.py +125 -0
- gitm_labs-0.0.1/gitm/optimizer/report.py +110 -0
- gitm_labs-0.0.1/gitm/optimizer/templates/report.md.j2 +41 -0
- gitm_labs-0.0.1/gitm/planner/__init__.py +22 -0
- gitm_labs-0.0.1/gitm/planner/graph.py +117 -0
- gitm_labs-0.0.1/gitm/planner/roofline.py +96 -0
- gitm_labs-0.0.1/gitm/routing/__init__.py +0 -0
- gitm_labs-0.0.1/gitm/routing/scorer_v0.py +89 -0
- gitm_labs-0.0.1/gitm/scheduler/__init__.py +18 -0
- gitm_labs-0.0.1/gitm/scheduler/loop.py +205 -0
- gitm_labs-0.0.1/gitm/telemetry/__init__.py +18 -0
- gitm_labs-0.0.1/gitm/telemetry/backends/__init__.py +13 -0
- gitm_labs-0.0.1/gitm/telemetry/backends/amd.py +40 -0
- gitm_labs-0.0.1/gitm/telemetry/backends/base.py +27 -0
- gitm_labs-0.0.1/gitm/telemetry/backends/discover.py +43 -0
- gitm_labs-0.0.1/gitm/telemetry/backends/nvidia.py +141 -0
- gitm_labs-0.0.1/gitm/telemetry/collector.py +85 -0
- gitm_labs-0.0.1/gitm/telemetry/schema.py +78 -0
- gitm_labs-0.0.1/gitm/telemetry/sinks/__init__.py +50 -0
- gitm_labs-0.0.1/gitm/telemetry/sinks/jsonl.py +33 -0
- gitm_labs-0.0.1/gitm/telemetry/sinks/otlp.py +54 -0
- gitm_labs-0.0.1/gitm/telemetry/sinks/prometheus.py +48 -0
- gitm_labs-0.0.1/gitm/telemetry/sinks/s3.py +52 -0
- gitm_labs-0.0.1/gitm/tracer/__init__.py +20 -0
- gitm_labs-0.0.1/gitm/tracer/_cupti/__init__.py +24 -0
- gitm_labs-0.0.1/gitm/tracer/_cupti/build.py +193 -0
- gitm_labs-0.0.1/gitm/tracer/_cupti/cupti_shim.c +294 -0
- gitm_labs-0.0.1/gitm/tracer/_cupti_decode.py +123 -0
- gitm_labs-0.0.1/gitm/tracer/capture.py +108 -0
- gitm_labs-0.0.1/gitm/tracer/cupti.py +46 -0
- gitm_labs-0.0.1/gitm/tracer/schema.py +80 -0
- gitm_labs-0.0.1/harness/gen_kitti_manifest.py +122 -0
- gitm_labs-0.0.1/harness/run_baselines.sh +90 -0
- gitm_labs-0.0.1/harness/setup_openpcdet.sh +119 -0
- gitm_labs-0.0.1/harness/smoke_kitti.py +115 -0
- gitm_labs-0.0.1/harness/verify_manifest.py +100 -0
- gitm_labs-0.0.1/pyproject.toml +60 -0
- gitm_labs-0.0.1/scripts/compare_results.py +82 -0
- gitm_labs-0.0.1/scripts/emit_report.py +113 -0
- gitm_labs-0.0.1/scripts/gpu_live_capture.py +74 -0
- gitm_labs-0.0.1/scripts/gpu_setup.sh +49 -0
- gitm_labs-0.0.1/scripts/run_under_runtime.py +340 -0
- gitm_labs-0.0.1/scripts/ship_to_pod.sh +34 -0
- gitm_labs-0.0.1/scripts/verify_infra.sh +127 -0
- gitm_labs-0.0.1/scripts/w2_on_real_trace.py +101 -0
- gitm_labs-0.0.1/skills/gitm-internal-status-loop.md +178 -0
- gitm_labs-0.0.1/tests/__init__.py +0 -0
- gitm_labs-0.0.1/tests/conftest.py +113 -0
- gitm_labs-0.0.1/tests/golden/report_basic.md +32 -0
- gitm_labs-0.0.1/tests/test_apply_rollback.py +155 -0
- gitm_labs-0.0.1/tests/test_bench.py +380 -0
- gitm_labs-0.0.1/tests/test_bench_datasets.py +193 -0
- gitm_labs-0.0.1/tests/test_cupti.py +227 -0
- gitm_labs-0.0.1/tests/test_framework_harnesses.py +161 -0
- gitm_labs-0.0.1/tests/test_hft_harness.py +117 -0
- gitm_labs-0.0.1/tests/test_kitti_benchmark.py +126 -0
- gitm_labs-0.0.1/tests/test_planner_roofline.py +114 -0
- gitm_labs-0.0.1/tests/test_qualification_fingerprint.py +117 -0
- gitm_labs-0.0.1/tests/test_report_snapshot.py +112 -0
- gitm_labs-0.0.1/tests/test_scorer.py +51 -0
- gitm_labs-0.0.1/tests/test_smoke.py +121 -0
- gitm_labs-0.0.1/tests/test_tracer_jsonl.py +176 -0
- gitm_labs-0.0.1/tests/test_w2_runtime.py +157 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch: # allows manual trigger
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write # Required for OIDC trusted publishing
|
|
13
|
+
contents: read
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Set up Python
|
|
19
|
+
uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.x"
|
|
22
|
+
|
|
23
|
+
- name: Install build tools
|
|
24
|
+
run: pip install build
|
|
25
|
+
|
|
26
|
+
- name: Build package
|
|
27
|
+
run: python -m build
|
|
28
|
+
|
|
29
|
+
- name: Publish to PyPI
|
|
30
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
31
|
+
# No API token needed — OIDC handles auth automatically
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# GITM convention: scratch/data dirs live outside the repo (see gitm/_paths.py:
|
|
2
|
+
# $GITM_S3_ROOT / $GITM_SCRATCH). Leading slash anchors these to the repo ROOT so
|
|
3
|
+
# they don't also match code packages like gitm/telemetry/.
|
|
4
|
+
/datasets/
|
|
5
|
+
/runs/
|
|
6
|
+
/traces/
|
|
7
|
+
/telemetry/
|
|
8
|
+
/data/
|
|
9
|
+
|
|
10
|
+
# Python
|
|
11
|
+
__pycache__/
|
|
12
|
+
*.py[cod]
|
|
13
|
+
*$py.class
|
|
14
|
+
*.egg-info/
|
|
15
|
+
*.egg
|
|
16
|
+
.eggs/
|
|
17
|
+
build/
|
|
18
|
+
dist/
|
|
19
|
+
.venv/
|
|
20
|
+
venv/
|
|
21
|
+
.env
|
|
22
|
+
|
|
23
|
+
# Tooling
|
|
24
|
+
.pytest_cache/
|
|
25
|
+
.mypy_cache/
|
|
26
|
+
.ruff_cache/
|
|
27
|
+
htmlcov/
|
|
28
|
+
.coverage
|
|
29
|
+
.coverage.*
|
|
30
|
+
|
|
31
|
+
# IDE
|
|
32
|
+
.vscode/
|
|
33
|
+
.idea/
|
|
34
|
+
*.swp
|
|
35
|
+
.DS_Store
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
.claude
|
|
39
|
+
venv
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# GITM reproducible runtime image — the shared environment for Adit + interns.
|
|
2
|
+
#
|
|
3
|
+
# Build ONCE, push, and share the resulting image *digest*. Everyone who runs
|
|
4
|
+
# that digest gets a byte-identical software stack, so results match (perf within
|
|
5
|
+
# the 2% spread gate on the same GPU SKU; everything else exactly). See
|
|
6
|
+
# docs/REPRODUCIBILITY.md.
|
|
7
|
+
#
|
|
8
|
+
# docker build -t <registry>/gitm:<tag> .
|
|
9
|
+
# docker push <registry>/gitm:<tag>
|
|
10
|
+
# docker inspect --format='{{index .RepoDigests 0}}' <registry>/gitm:<tag> # share this
|
|
11
|
+
#
|
|
12
|
+
# Needs the CUDA *devel* base for nvcc + the CUPTI headers the tracer shim links.
|
|
13
|
+
|
|
14
|
+
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
|
|
15
|
+
|
|
16
|
+
ENV DEBIAN_FRONTEND=noninteractive \
|
|
17
|
+
CUDA_HOME=/usr/local/cuda \
|
|
18
|
+
PYTHONDONTWRITEBYTECODE=1 \
|
|
19
|
+
PIP_NO_CACHE_DIR=1
|
|
20
|
+
|
|
21
|
+
# Python 3.12 (matches the tested stack) + build toolchain for the CUPTI shim.
|
|
22
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
23
|
+
software-properties-common ca-certificates git build-essential \
|
|
24
|
+
&& add-apt-repository ppa:deadsnakes/ppa \
|
|
25
|
+
&& apt-get update && apt-get install -y --no-install-recommends \
|
|
26
|
+
python3.12 python3.12-dev python3.12-venv curl \
|
|
27
|
+
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 \
|
|
28
|
+
&& update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 \
|
|
29
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
30
|
+
|
|
31
|
+
WORKDIR /opt/gitm
|
|
32
|
+
|
|
33
|
+
# Pinned GPU compute stack, tied to the CUDA 12.4 base. Confirm these resolve on
|
|
34
|
+
# first build; once the image is pushed the digest freezes them for everyone.
|
|
35
|
+
ARG TORCH_VERSION=2.4.1
|
|
36
|
+
ARG CUDF_VERSION=24.10.01
|
|
37
|
+
ARG CUPY_VERSION=13.3.0
|
|
38
|
+
RUN python -m pip install --upgrade pip \
|
|
39
|
+
&& python -m pip install "torch==${TORCH_VERSION}" \
|
|
40
|
+
--index-url https://download.pytorch.org/whl/cu124 \
|
|
41
|
+
&& python -m pip install --extra-index-url=https://pypi.nvidia.com \
|
|
42
|
+
"cudf-cu12==${CUDF_VERSION}" "cupy-cuda12x==${CUPY_VERSION}"
|
|
43
|
+
|
|
44
|
+
# The package + its CPU deps, pinned via constraints.txt for reproducibility.
|
|
45
|
+
COPY pyproject.toml constraints.txt README.md ./
|
|
46
|
+
COPY gitm ./gitm
|
|
47
|
+
COPY benchmarks ./benchmarks
|
|
48
|
+
COPY tests ./tests
|
|
49
|
+
COPY docs ./docs
|
|
50
|
+
COPY scripts ./scripts
|
|
51
|
+
RUN python -m pip install -e ".[dev,bench,nvidia]" -c constraints.txt
|
|
52
|
+
|
|
53
|
+
# Build the CUPTI tracer shim against this image's CUDA toolkit.
|
|
54
|
+
RUN python -m gitm.tracer._cupti.build
|
|
55
|
+
|
|
56
|
+
# Freeze the fully-resolved stack into the image for auditing / exact re-pin.
|
|
57
|
+
RUN python -m pip freeze > /opt/gitm/requirements.lock
|
|
58
|
+
|
|
59
|
+
CMD ["./scripts/verify_infra.sh"]
|
gitm_labs-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gitm-labs
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Behavioral compiler and intervention runtime for vLLM decode workloads.
|
|
5
|
+
Author: GITM
|
|
6
|
+
License: Proprietary
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: jinja2>=3.1
|
|
9
|
+
Requires-Dist: numpy>=1.26
|
|
10
|
+
Requires-Dist: pydantic>=2.6
|
|
11
|
+
Requires-Dist: pyyaml>=6.0
|
|
12
|
+
Requires-Dist: statsmodels>=0.14
|
|
13
|
+
Provides-Extra: bench
|
|
14
|
+
Requires-Dist: pandas>=2.0; extra == 'bench'
|
|
15
|
+
Requires-Dist: pyarrow>=15; extra == 'bench'
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
18
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
19
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
20
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
21
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
22
|
+
Provides-Extra: nvidia
|
|
23
|
+
Requires-Dist: pynvml>=11.5; extra == 'nvidia'
|
|
24
|
+
Provides-Extra: otlp
|
|
25
|
+
Requires-Dist: opentelemetry-exporter-otlp>=1.25; extra == 'otlp'
|
|
26
|
+
Requires-Dist: opentelemetry-sdk>=1.25; extra == 'otlp'
|
|
27
|
+
Provides-Extra: prometheus
|
|
28
|
+
Requires-Dist: prometheus-client>=0.20; extra == 'prometheus'
|
|
29
|
+
Provides-Extra: s3
|
|
30
|
+
Requires-Dist: boto3>=1.34; extra == 's3'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# runtime
|
|
34
|
+
|
|
35
|
+
Behavioral compiler + intervention runtime for vLLM decode workloads.
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install -e .
|
|
39
|
+
gitm run --workload vllm-decode --budget 24h --target 15%
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Embedded:
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from gitm import optimize
|
|
46
|
+
optimize(engine, budget="24h", target=0.15)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Data layout
|
|
50
|
+
|
|
51
|
+
Two roots — set both before running anything:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
export GITM_S3_ROOT="s3://gitm-data/prod" # canonical store (datasets + archives)
|
|
55
|
+
export GITM_SCRATCH="/mnt/nvme/gitm" # local ephemeral run dir (defaults to ~/.cache/gitm)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Canonical layout under `$GITM_S3_ROOT` (S3):
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
datasets/{hft,biotech,edge}/ # benchmark inputs (immutable, sha256-pinned)
|
|
62
|
+
runs/ # durable baseline + pilot outputs
|
|
63
|
+
traces/ # captured event-telemetry traces
|
|
64
|
+
telemetry/ # state-telemetry samples (1Hz GPU state)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Local layout under `$GITM_SCRATCH` (ephemeral, synced to S3 after a run):
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
staging/ # datasets staged in from S3 for the active run, then evicted
|
|
71
|
+
runs/ # this run's outputs (small) before archival
|
|
72
|
+
traces/ # this run's trace before archival
|
|
73
|
+
telemetry/ # this run's samples before archival
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Architecture
|
|
77
|
+
|
|
78
|
+
See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md). The runtime is structured as
|
|
79
|
+
seven subpackages mirroring the data flow:
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
gitm/
|
|
83
|
+
telemetry/ # state telemetry: NVML / ROCm SMI, 1Hz GPU state samples
|
|
84
|
+
tracer/ # event telemetry: CUPTI / rocprof per-kernel records
|
|
85
|
+
planner/ # behavioral compiler: predicted execution graph (roofline)
|
|
86
|
+
optimizer/ # deviation monitor, attribution, replay, qualification, report
|
|
87
|
+
kernels/ # curated intervention library (the levers)
|
|
88
|
+
scheduler/ # 24-hour loop phase orchestration
|
|
89
|
+
agents/ # autonomous decision policy (intervention selection)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Invariants
|
|
93
|
+
|
|
94
|
+
The deviation monitor checks observed-vs-predicted against three invariants:
|
|
95
|
+
|
|
96
|
+
1. **Kernel-time invariant** — per-kernel duration must lie within roofline.
|
|
97
|
+
2. **Memory-traffic invariant** — per-kernel bytes-moved must match predicted.
|
|
98
|
+
3. **Stream-concurrency invariant** — predicted-concurrent kernels must overlap.
|
|
99
|
+
|
|
100
|
+
See [docs/invariants.md](docs/invariants.md).
|
|
101
|
+
|
|
102
|
+
## The 24-hour loop
|
|
103
|
+
|
|
104
|
+
| Phase | Hours | Module |
|
|
105
|
+
|---|---|---|
|
|
106
|
+
| 1. Capture trace, fingerprint workload, predict graph | 0–2 | `tracer`, `telemetry`, `planner` |
|
|
107
|
+
| 2. Compute residuals + causal attribution | 2–6 | `optimizer.monitor`, `optimizer.attribution` |
|
|
108
|
+
| 3. Query library, rank via counterfactual replay | 6–12 | `kernels`, `optimizer.replay` |
|
|
109
|
+
| 4. Apply top-N interventions with rollback gates | 12–20 | `agents`, `optimizer` |
|
|
110
|
+
| 5. Stabilize, write provenance report | 20–24 | `optimizer.report` |
|
|
111
|
+
|
|
112
|
+
## Architecture
|
|
113
|
+
|
|
114
|
+
GITM separates the **empirical** half (what happened) from the **predicted**
|
|
115
|
+
half (what should have happened). Everything downstream operates on residuals
|
|
116
|
+
— the difference between the two.
|
|
117
|
+
|
|
118
|
+
### Two telemetry planes
|
|
119
|
+
|
|
120
|
+
GITM never conflates these.
|
|
121
|
+
|
|
122
|
+
#### State telemetry (`gitm.telemetry`)
|
|
123
|
+
|
|
124
|
+
Point-in-time samples of GPU state at ~1 Hz:
|
|
125
|
+
|
|
126
|
+
- Utilization, memory used, power, clocks, temperature
|
|
127
|
+
- Throttle reasons (canonical bitmask across vendors)
|
|
128
|
+
- NVLink throughput, ECC counters
|
|
129
|
+
|
|
130
|
+
Source: **NVML** on NVIDIA, **ROCm SMI** on AMD.
|
|
131
|
+
Cost: ~microseconds per sample.
|
|
132
|
+
Shape: summary, not trace.
|
|
133
|
+
|
|
134
|
+
#### Event telemetry (`gitm.tracer`)
|
|
135
|
+
|
|
136
|
+
Per-kernel activity records with start/end timestamps, stream IDs, memory
|
|
137
|
+
transfer events.
|
|
138
|
+
|
|
139
|
+
Source: **CUPTI** on NVIDIA, **rocprof** on AMD.
|
|
140
|
+
Cost: per-kernel callbacks.
|
|
141
|
+
Shape: structurally a trace — required for the kernel-time invariant.
|
|
142
|
+
|
|
143
|
+
### Components
|
|
144
|
+
|
|
145
|
+

|
|
146
|
+
|
|
147
|
+
### Module responsibilities
|
|
148
|
+
|
|
149
|
+
| Module | Responsibility |
|
|
150
|
+
|---|---|
|
|
151
|
+
| `gitm.telemetry` | Vendor-backend autodiscovery, NVML/ROCm SMI samples, pluggable sinks |
|
|
152
|
+
| `gitm.tracer` | Event-telemetry capture (CUPTI/rocprof), trace schema, context manager |
|
|
153
|
+
| `gitm.planner` | Behavioral Compiler — roofline-based predicted execution graph |
|
|
154
|
+
| `gitm.optimizer.monitor` | Deviation monitor — residuals against 3 invariants |
|
|
155
|
+
| `gitm.optimizer.attribution` | Granger + doubly-robust on residual subgraph |
|
|
156
|
+
| `gitm.optimizer.replay` | Counterfactual replay for predicted intervention delta |
|
|
157
|
+
| `gitm.optimizer.qualification` | Workload fingerprint gate (commit / diagnose) |
|
|
158
|
+
| `gitm.optimizer.report` | Provenance chain renderer (claim → evidence → intervention → delta) |
|
|
159
|
+
| `gitm.kernels` | Curated intervention library — 15–20 levers with applicability + safety |
|
|
160
|
+
| `gitm.agents` | Autonomous policy — selects interventions, drives rollback |
|
|
161
|
+
| `gitm.scheduler` | 24-hour loop phase orchestration |
|
|
162
|
+
|
|
163
|
+
### Interfaces are contracts
|
|
164
|
+
|
|
165
|
+
The five primary interfaces below are the load-bearing contracts. W2 swarm
|
|
166
|
+
extends behind these without rewriting upstream code.
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
# tracer
|
|
170
|
+
with gitm.tracer.capture(out_path: Path) -> ContextManager[Trace]: ...
|
|
171
|
+
|
|
172
|
+
# planner
|
|
173
|
+
graph = gitm.planner.predict_graph(model: ModelSpec, hw: HardwareSpec, batch: BatchConfig) -> Graph
|
|
174
|
+
|
|
175
|
+
# monitor
|
|
176
|
+
residuals = gitm.optimizer.monitor.residuals(trace: Trace, graph: Graph) -> Residuals
|
|
177
|
+
violations = gitm.optimizer.monitor.check_invariants(residuals, invariants) -> list[Violation]
|
|
178
|
+
|
|
179
|
+
# attribution
|
|
180
|
+
hypotheses = gitm.optimizer.attribution.attribute(residuals: Residuals, graph: Graph) -> RankedHypotheses
|
|
181
|
+
|
|
182
|
+
# report
|
|
183
|
+
report_md = gitm.optimizer.report.write(claims: list[Claim], provenance: Provenance) -> str
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Onboarding
|
|
187
|
+
|
|
188
|
+
This document is load-bearing for Day 6 — the six benchmark interns rotate
|
|
189
|
+
onto the skeleton using these steps. Every command here is expected to work
|
|
190
|
+
on a clean checkout.
|
|
191
|
+
|
|
192
|
+
### 1. Environment
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
git clone git@github.com:gitm-labs/runtime.git
|
|
196
|
+
cd runtime
|
|
197
|
+
python -m venv .venv
|
|
198
|
+
source .venv/bin/activate
|
|
199
|
+
pip install -e ".[dev]"
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
NVIDIA box additionally:
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
pip install -e ".[nvidia]"
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Point at the canonical S3 store and a local scratch dir (see
|
|
209
|
+
[Data layout](#data-layout) — datasets live in S3, never on local disk):
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
export GITM_S3_ROOT="s3://gitm-data/prod" # canonical store
|
|
213
|
+
export GITM_SCRATCH="/mnt/nvme/gitm" # local scratch (defaults to ~/.cache/gitm)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
`gitm doctor` reports both, plus discovered GPUs. Scratch subdirs are created
|
|
217
|
+
on first run.
|
|
218
|
+
|
|
219
|
+
### 2. Smoke test
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
gitm --help
|
|
223
|
+
gitm run --help
|
|
224
|
+
pytest -q
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
All three should pass on a clean checkout.
|
|
228
|
+
|
|
229
|
+
### 3. The 24-hour loop
|
|
230
|
+
|
|
231
|
+
The CLI entry point composes seven subpackages in order. Read the source in
|
|
232
|
+
this order — it mirrors the data flow:
|
|
233
|
+
|
|
234
|
+
1. [`gitm/telemetry`](gitm/telemetry/) — state telemetry (1 Hz GPU samples)
|
|
235
|
+
2. [`gitm/tracer`](gitm/tracer/) — event telemetry (per-kernel records)
|
|
236
|
+
3. [`gitm/planner`](gitm/planner/) — Behavioral Compiler (predicted graph)
|
|
237
|
+
4. [`gitm/optimizer`](gitm/optimizer/) — monitor, attribution, replay, report
|
|
238
|
+
5. [`gitm/kernels`](gitm/kernels/) — intervention library
|
|
239
|
+
6. [`gitm/agents`](gitm/agents/) — selection policy
|
|
240
|
+
7. [`gitm/scheduler`](gitm/scheduler/) — phase orchestration
|
|
241
|
+
|
|
242
|
+
Building a runtime system
|
|
243
|
+
gitm-labs/runtime, runtime/scheduler/, runtime/tracer/, runtime/optimizer/, runtime/kernels/, runtime/planner/, runtime/telemetry/, runtime/agents/
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
### 4. Where things live
|
|
247
|
+
|
|
248
|
+
| Concern | Path |
|
|
249
|
+
|---|---|
|
|
250
|
+
| Code | `gitm/` |
|
|
251
|
+
| Tests | `tests/` |
|
|
252
|
+
| Docs | `docs/` |
|
|
253
|
+
| Datasets, traces, runs | `$GITM_S3_ROOT/` (S3, canonical) · `$GITM_SCRATCH/` (local, ephemeral) |
|
|
254
|
+
| Intervention library | `gitm/kernels/library.yaml` |
|
|
255
|
+
| Report template | `gitm/optimizer/templates/report.md.j2` |
|
|
256
|
+
| Trace schema | `gitm/tracer/schema.py` (pydantic) |
|
|
257
|
+
| Telemetry schema | `gitm/telemetry/schema.py` (pydantic) |
|
|
258
|
+
|
|
259
|
+
### 5. Contributing a new component
|
|
260
|
+
|
|
261
|
+
Every new module hangs off one of the seven subpackages and exposes its public
|
|
262
|
+
surface through `__init__.py`. The five primary interfaces in
|
|
263
|
+
[ARCHITECTURE.md](docs/ARCHITECTURE.md) are contracts — extend behind them; do not
|
|
264
|
+
change them without Adit's sign-off.
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# runtime
|
|
2
|
+
|
|
3
|
+
Behavioral compiler + intervention runtime for vLLM decode workloads.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install -e .
|
|
7
|
+
gitm run --workload vllm-decode --budget 24h --target 15%
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Embedded:
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
from gitm import optimize
|
|
14
|
+
optimize(engine, budget="24h", target=0.15)
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Data layout
|
|
18
|
+
|
|
19
|
+
Two roots — set both before running anything:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
export GITM_S3_ROOT="s3://gitm-data/prod" # canonical store (datasets + archives)
|
|
23
|
+
export GITM_SCRATCH="/mnt/nvme/gitm" # local ephemeral run dir (defaults to ~/.cache/gitm)
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Canonical layout under `$GITM_S3_ROOT` (S3):
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
datasets/{hft,biotech,edge}/ # benchmark inputs (immutable, sha256-pinned)
|
|
30
|
+
runs/ # durable baseline + pilot outputs
|
|
31
|
+
traces/ # captured event-telemetry traces
|
|
32
|
+
telemetry/ # state-telemetry samples (1Hz GPU state)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Local layout under `$GITM_SCRATCH` (ephemeral, synced to S3 after a run):
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
staging/ # datasets staged in from S3 for the active run, then evicted
|
|
39
|
+
runs/ # this run's outputs (small) before archival
|
|
40
|
+
traces/ # this run's trace before archival
|
|
41
|
+
telemetry/ # this run's samples before archival
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Architecture
|
|
45
|
+
|
|
46
|
+
See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md). The runtime is structured as
|
|
47
|
+
seven subpackages mirroring the data flow:
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
gitm/
|
|
51
|
+
telemetry/ # state telemetry: NVML / ROCm SMI, 1Hz GPU state samples
|
|
52
|
+
tracer/ # event telemetry: CUPTI / rocprof per-kernel records
|
|
53
|
+
planner/ # behavioral compiler: predicted execution graph (roofline)
|
|
54
|
+
optimizer/ # deviation monitor, attribution, replay, qualification, report
|
|
55
|
+
kernels/ # curated intervention library (the levers)
|
|
56
|
+
scheduler/ # 24-hour loop phase orchestration
|
|
57
|
+
agents/ # autonomous decision policy (intervention selection)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Invariants
|
|
61
|
+
|
|
62
|
+
The deviation monitor checks observed-vs-predicted against three invariants:
|
|
63
|
+
|
|
64
|
+
1. **Kernel-time invariant** — per-kernel duration must lie within roofline.
|
|
65
|
+
2. **Memory-traffic invariant** — per-kernel bytes-moved must match predicted.
|
|
66
|
+
3. **Stream-concurrency invariant** — predicted-concurrent kernels must overlap.
|
|
67
|
+
|
|
68
|
+
See [docs/invariants.md](docs/invariants.md).
|
|
69
|
+
|
|
70
|
+
## The 24-hour loop
|
|
71
|
+
|
|
72
|
+
| Phase | Hours | Module |
|
|
73
|
+
|---|---|---|
|
|
74
|
+
| 1. Capture trace, fingerprint workload, predict graph | 0–2 | `tracer`, `telemetry`, `planner` |
|
|
75
|
+
| 2. Compute residuals + causal attribution | 2–6 | `optimizer.monitor`, `optimizer.attribution` |
|
|
76
|
+
| 3. Query library, rank via counterfactual replay | 6–12 | `kernels`, `optimizer.replay` |
|
|
77
|
+
| 4. Apply top-N interventions with rollback gates | 12–20 | `agents`, `optimizer` |
|
|
78
|
+
| 5. Stabilize, write provenance report | 20–24 | `optimizer.report` |
|
|
79
|
+
|
|
80
|
+
## Architecture
|
|
81
|
+
|
|
82
|
+
GITM separates the **empirical** half (what happened) from the **predicted**
|
|
83
|
+
half (what should have happened). Everything downstream operates on residuals
|
|
84
|
+
— the difference between the two.
|
|
85
|
+
|
|
86
|
+
### Two telemetry planes
|
|
87
|
+
|
|
88
|
+
GITM never conflates these.
|
|
89
|
+
|
|
90
|
+
#### State telemetry (`gitm.telemetry`)
|
|
91
|
+
|
|
92
|
+
Point-in-time samples of GPU state at ~1 Hz:
|
|
93
|
+
|
|
94
|
+
- Utilization, memory used, power, clocks, temperature
|
|
95
|
+
- Throttle reasons (canonical bitmask across vendors)
|
|
96
|
+
- NVLink throughput, ECC counters
|
|
97
|
+
|
|
98
|
+
Source: **NVML** on NVIDIA, **ROCm SMI** on AMD.
|
|
99
|
+
Cost: ~microseconds per sample.
|
|
100
|
+
Shape: summary, not trace.
|
|
101
|
+
|
|
102
|
+
#### Event telemetry (`gitm.tracer`)
|
|
103
|
+
|
|
104
|
+
Per-kernel activity records with start/end timestamps, stream IDs, memory
|
|
105
|
+
transfer events.
|
|
106
|
+
|
|
107
|
+
Source: **CUPTI** on NVIDIA, **rocprof** on AMD.
|
|
108
|
+
Cost: per-kernel callbacks.
|
|
109
|
+
Shape: structurally a trace — required for the kernel-time invariant.
|
|
110
|
+
|
|
111
|
+
### Components
|
|
112
|
+
|
|
113
|
+

|
|
114
|
+
|
|
115
|
+
### Module responsibilities
|
|
116
|
+
|
|
117
|
+
| Module | Responsibility |
|
|
118
|
+
|---|---|
|
|
119
|
+
| `gitm.telemetry` | Vendor-backend autodiscovery, NVML/ROCm SMI samples, pluggable sinks |
|
|
120
|
+
| `gitm.tracer` | Event-telemetry capture (CUPTI/rocprof), trace schema, context manager |
|
|
121
|
+
| `gitm.planner` | Behavioral Compiler — roofline-based predicted execution graph |
|
|
122
|
+
| `gitm.optimizer.monitor` | Deviation monitor — residuals against 3 invariants |
|
|
123
|
+
| `gitm.optimizer.attribution` | Granger + doubly-robust on residual subgraph |
|
|
124
|
+
| `gitm.optimizer.replay` | Counterfactual replay for predicted intervention delta |
|
|
125
|
+
| `gitm.optimizer.qualification` | Workload fingerprint gate (commit / diagnose) |
|
|
126
|
+
| `gitm.optimizer.report` | Provenance chain renderer (claim → evidence → intervention → delta) |
|
|
127
|
+
| `gitm.kernels` | Curated intervention library — 15–20 levers with applicability + safety |
|
|
128
|
+
| `gitm.agents` | Autonomous policy — selects interventions, drives rollback |
|
|
129
|
+
| `gitm.scheduler` | 24-hour loop phase orchestration |
|
|
130
|
+
|
|
131
|
+
### Interfaces are contracts
|
|
132
|
+
|
|
133
|
+
The five primary interfaces below are the load-bearing contracts. W2 swarm
|
|
134
|
+
extends behind these without rewriting upstream code.
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
# tracer
|
|
138
|
+
with gitm.tracer.capture(out_path: Path) -> ContextManager[Trace]: ...
|
|
139
|
+
|
|
140
|
+
# planner
|
|
141
|
+
graph = gitm.planner.predict_graph(model: ModelSpec, hw: HardwareSpec, batch: BatchConfig) -> Graph
|
|
142
|
+
|
|
143
|
+
# monitor
|
|
144
|
+
residuals = gitm.optimizer.monitor.residuals(trace: Trace, graph: Graph) -> Residuals
|
|
145
|
+
violations = gitm.optimizer.monitor.check_invariants(residuals, invariants) -> list[Violation]
|
|
146
|
+
|
|
147
|
+
# attribution
|
|
148
|
+
hypotheses = gitm.optimizer.attribution.attribute(residuals: Residuals, graph: Graph) -> RankedHypotheses
|
|
149
|
+
|
|
150
|
+
# report
|
|
151
|
+
report_md = gitm.optimizer.report.write(claims: list[Claim], provenance: Provenance) -> str
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Onboarding
|
|
155
|
+
|
|
156
|
+
This document is load-bearing for Day 6 — the six benchmark interns rotate
|
|
157
|
+
onto the skeleton using these steps. Every command here is expected to work
|
|
158
|
+
on a clean checkout.
|
|
159
|
+
|
|
160
|
+
### 1. Environment
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
git clone git@github.com:gitm-labs/runtime.git
|
|
164
|
+
cd runtime
|
|
165
|
+
python -m venv .venv
|
|
166
|
+
source .venv/bin/activate
|
|
167
|
+
pip install -e ".[dev]"
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
NVIDIA box additionally:
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
pip install -e ".[nvidia]"
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Point at the canonical S3 store and a local scratch dir (see
|
|
177
|
+
[Data layout](#data-layout) — datasets live in S3, never on local disk):
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
export GITM_S3_ROOT="s3://gitm-data/prod" # canonical store
|
|
181
|
+
export GITM_SCRATCH="/mnt/nvme/gitm" # local scratch (defaults to ~/.cache/gitm)
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
`gitm doctor` reports both, plus discovered GPUs. Scratch subdirs are created
|
|
185
|
+
on first run.
|
|
186
|
+
|
|
187
|
+
### 2. Smoke test
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
gitm --help
|
|
191
|
+
gitm run --help
|
|
192
|
+
pytest -q
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
All three should pass on a clean checkout.
|
|
196
|
+
|
|
197
|
+
### 3. The 24-hour loop
|
|
198
|
+
|
|
199
|
+
The CLI entry point composes seven subpackages in order. Read the source in
|
|
200
|
+
this order — it mirrors the data flow:
|
|
201
|
+
|
|
202
|
+
1. [`gitm/telemetry`](gitm/telemetry/) — state telemetry (1 Hz GPU samples)
|
|
203
|
+
2. [`gitm/tracer`](gitm/tracer/) — event telemetry (per-kernel records)
|
|
204
|
+
3. [`gitm/planner`](gitm/planner/) — Behavioral Compiler (predicted graph)
|
|
205
|
+
4. [`gitm/optimizer`](gitm/optimizer/) — monitor, attribution, replay, report
|
|
206
|
+
5. [`gitm/kernels`](gitm/kernels/) — intervention library
|
|
207
|
+
6. [`gitm/agents`](gitm/agents/) — selection policy
|
|
208
|
+
7. [`gitm/scheduler`](gitm/scheduler/) — phase orchestration
|
|
209
|
+
|
|
210
|
+
Building a runtime system
|
|
211
|
+
gitm-labs/runtime, runtime/scheduler/, runtime/tracer/, runtime/optimizer/, runtime/kernels/, runtime/planner/, runtime/telemetry/, runtime/agents/
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
### 4. Where things live
|
|
215
|
+
|
|
216
|
+
| Concern | Path |
|
|
217
|
+
|---|---|
|
|
218
|
+
| Code | `gitm/` |
|
|
219
|
+
| Tests | `tests/` |
|
|
220
|
+
| Docs | `docs/` |
|
|
221
|
+
| Datasets, traces, runs | `$GITM_S3_ROOT/` (S3, canonical) · `$GITM_SCRATCH/` (local, ephemeral) |
|
|
222
|
+
| Intervention library | `gitm/kernels/library.yaml` |
|
|
223
|
+
| Report template | `gitm/optimizer/templates/report.md.j2` |
|
|
224
|
+
| Trace schema | `gitm/tracer/schema.py` (pydantic) |
|
|
225
|
+
| Telemetry schema | `gitm/telemetry/schema.py` (pydantic) |
|
|
226
|
+
|
|
227
|
+
### 5. Contributing a new component
|
|
228
|
+
|
|
229
|
+
Every new module hangs off one of the seven subpackages and exposes its public
|
|
230
|
+
surface through `__init__.py`. The five primary interfaces in
|
|
231
|
+
[ARCHITECTURE.md](docs/ARCHITECTURE.md) are contracts — extend behind them; do not
|
|
232
|
+
change them without Adit's sign-off.
|
|
Binary file
|