modekeeper 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modekeeper-0.1.1/LICENSE +6 -0
- modekeeper-0.1.1/PKG-INFO +119 -0
- modekeeper-0.1.1/README.md +103 -0
- modekeeper-0.1.1/pyproject.toml +52 -0
- modekeeper-0.1.1/setup.cfg +4 -0
- modekeeper-0.1.1/src/modekeeper/__init__.py +4 -0
- modekeeper-0.1.1/src/modekeeper/actuators/knobs.py +49 -0
- modekeeper-0.1.1/src/modekeeper/adapters/kubernetes.py +33 -0
- modekeeper-0.1.1/src/modekeeper/adapters/lightning.py +77 -0
- modekeeper-0.1.1/src/modekeeper/audit/__init__.py +2 -0
- modekeeper-0.1.1/src/modekeeper/audit/decision_trace.py +27 -0
- modekeeper-0.1.1/src/modekeeper/chords/__init__.py +2 -0
- modekeeper-0.1.1/src/modekeeper/chords/catalog.py +157 -0
- modekeeper-0.1.1/src/modekeeper/chords/catalog_v1.json +138 -0
- modekeeper-0.1.1/src/modekeeper/chords/v1.py +18 -0
- modekeeper-0.1.1/src/modekeeper/cli.py +4827 -0
- modekeeper-0.1.1/src/modekeeper/core/analysis.py +82 -0
- modekeeper-0.1.1/src/modekeeper/core/cost_model.py +19 -0
- modekeeper-0.1.1/src/modekeeper/core/modes.py +8 -0
- modekeeper-0.1.1/src/modekeeper/core/opportunity.py +69 -0
- modekeeper-0.1.1/src/modekeeper/core/passport.py +22 -0
- modekeeper-0.1.1/src/modekeeper/core/state_machine.py +21 -0
- modekeeper-0.1.1/src/modekeeper/core/summary.py +116 -0
- modekeeper-0.1.1/src/modekeeper/core/value_summary.py +83 -0
- modekeeper-0.1.1/src/modekeeper/demo/mk068_demo.py +167 -0
- modekeeper-0.1.1/src/modekeeper/demo/runner.py +30 -0
- modekeeper-0.1.1/src/modekeeper/fleet/__init__.py +2 -0
- modekeeper-0.1.1/src/modekeeper/fleet/inventory.py +149 -0
- modekeeper-0.1.1/src/modekeeper/fleet/policy_propagation.py +184 -0
- modekeeper-0.1.1/src/modekeeper/governance/__init__.py +2 -0
- modekeeper-0.1.1/src/modekeeper/governance/approval.py +19 -0
- modekeeper-0.1.1/src/modekeeper/k8s/__init__.py +2 -0
- modekeeper-0.1.1/src/modekeeper/k8s/rbac_diagnostics.py +93 -0
- modekeeper-0.1.1/src/modekeeper/license/__init__.py +5 -0
- modekeeper-0.1.1/src/modekeeper/license/canonical.py +15 -0
- modekeeper-0.1.1/src/modekeeper/license/public_keys.json +4 -0
- modekeeper-0.1.1/src/modekeeper/license/public_keys.py +41 -0
- modekeeper-0.1.1/src/modekeeper/license/verify.py +247 -0
- modekeeper-0.1.1/src/modekeeper/passports/__init__.py +15 -0
- modekeeper-0.1.1/src/modekeeper/passports/observe_max.py +174 -0
- modekeeper-0.1.1/src/modekeeper/passports/templates/__init__.py +1 -0
- modekeeper-0.1.1/src/modekeeper/passports/templates/comm.json +47 -0
- modekeeper-0.1.1/src/modekeeper/passports/templates/cost.json +46 -0
- modekeeper-0.1.1/src/modekeeper/passports/templates/io.json +46 -0
- modekeeper-0.1.1/src/modekeeper/passports/templates/perf.json +50 -0
- modekeeper-0.1.1/src/modekeeper/passports/templates/pilot.json +40 -0
- modekeeper-0.1.1/src/modekeeper/passports/templates/recovery.json +45 -0
- modekeeper-0.1.1/src/modekeeper/passports/templates/safe.json +50 -0
- modekeeper-0.1.1/src/modekeeper/passports/v0.py +207 -0
- modekeeper-0.1.1/src/modekeeper/policy/actions.py +19 -0
- modekeeper-0.1.1/src/modekeeper/policy/bundle.py +74 -0
- modekeeper-0.1.1/src/modekeeper/policy/chords.py +16 -0
- modekeeper-0.1.1/src/modekeeper/policy/rules.py +132 -0
- modekeeper-0.1.1/src/modekeeper/policy/scalar.py +38 -0
- modekeeper-0.1.1/src/modekeeper/roi/__init__.py +2 -0
- modekeeper-0.1.1/src/modekeeper/roi/estimate.py +83 -0
- modekeeper-0.1.1/src/modekeeper/roi/mk074_before_after.py +165 -0
- modekeeper-0.1.1/src/modekeeper/safety/explain.py +21 -0
- modekeeper-0.1.1/src/modekeeper/safety/guards.py +482 -0
- modekeeper-0.1.1/src/modekeeper/safety/rollback.py +37 -0
- modekeeper-0.1.1/src/modekeeper/telemetry/collector.py +12 -0
- modekeeper-0.1.1/src/modekeeper/telemetry/file_source.py +243 -0
- modekeeper-0.1.1/src/modekeeper/telemetry/k8s_log_source.py +366 -0
- modekeeper-0.1.1/src/modekeeper/telemetry/models.py +18 -0
- modekeeper-0.1.1/src/modekeeper/telemetry/raw_recorder.py +57 -0
- modekeeper-0.1.1/src/modekeeper/telemetry/sources.py +56 -0
- modekeeper-0.1.1/src/modekeeper/trainer/__init__.py +5 -0
- modekeeper-0.1.1/src/modekeeper/trainer/__main__.py +89 -0
- modekeeper-0.1.1/src/modekeeper/trainer/knobs.py +36 -0
- modekeeper-0.1.1/src/modekeeper.egg-info/PKG-INFO +119 -0
- modekeeper-0.1.1/src/modekeeper.egg-info/SOURCES.txt +133 -0
- modekeeper-0.1.1/src/modekeeper.egg-info/dependency_links.txt +1 -0
- modekeeper-0.1.1/src/modekeeper.egg-info/entry_points.txt +2 -0
- modekeeper-0.1.1/src/modekeeper.egg-info/requires.txt +4 -0
- modekeeper-0.1.1/src/modekeeper.egg-info/top_level.txt +1 -0
- modekeeper-0.1.1/tests/test_chords_recover_relock.py +102 -0
- modekeeper-0.1.1/tests/test_cli_artifacts.py +239 -0
- modekeeper-0.1.1/tests/test_cli_policy_scalar.py +45 -0
- modekeeper-0.1.1/tests/test_cli_value_summary.py +36 -0
- modekeeper-0.1.1/tests/test_closed_loop_apply_pipeline.py +349 -0
- modekeeper-0.1.1/tests/test_closed_loop_k8s_observe.py +264 -0
- modekeeper-0.1.1/tests/test_closed_loop_watch.py +163 -0
- modekeeper-0.1.1/tests/test_demo_mk068.py +56 -0
- modekeeper-0.1.1/tests/test_duration_parse.py +25 -0
- modekeeper-0.1.1/tests/test_file_source_worker_latencies.py +54 -0
- modekeeper-0.1.1/tests/test_gpu_saturated.py +28 -0
- modekeeper-0.1.1/tests/test_k8s_apply_blocked.py +112 -0
- modekeeper-0.1.1/tests/test_k8s_apply_errors.py +71 -0
- modekeeper-0.1.1/tests/test_k8s_apply_real.py +148 -0
- modekeeper-0.1.1/tests/test_k8s_log_source_container_auto.py +50 -0
- modekeeper-0.1.1/tests/test_k8s_multi_object_plan.py +217 -0
- modekeeper-0.1.1/tests/test_k8s_preflight.py +189 -0
- modekeeper-0.1.1/tests/test_k8s_render.py +142 -0
- modekeeper-0.1.1/tests/test_k8s_render_errors.py +88 -0
- modekeeper-0.1.1/tests/test_k8s_verify.py +501 -0
- modekeeper-0.1.1/tests/test_k8s_verify_errors.py +59 -0
- modekeeper-0.1.1/tests/test_k8s_verify_helpers.py +35 -0
- modekeeper-0.1.1/tests/test_lightning_adapter_import.py +18 -0
- modekeeper-0.1.1/tests/test_mk062_chords_v1.py +55 -0
- modekeeper-0.1.1/tests/test_mk068_demo.py +39 -0
- modekeeper-0.1.1/tests/test_mk074_before_after.py +80 -0
- modekeeper-0.1.1/tests/test_mk075_decision_trace.py +55 -0
- modekeeper-0.1.1/tests/test_mk076_approval_gate.py +39 -0
- modekeeper-0.1.1/tests/test_mk077_inventory.py +63 -0
- modekeeper-0.1.1/tests/test_mk078_policy_propagation.py +86 -0
- modekeeper-0.1.1/tests/test_mk080_roi_estimate.py +56 -0
- modekeeper-0.1.1/tests/test_mk082_license_gates.py +289 -0
- modekeeper-0.1.1/tests/test_mk082_license_verify.py +100 -0
- modekeeper-0.1.1/tests/test_mk083_policy_bundle.py +93 -0
- modekeeper-0.1.1/tests/test_mk084_chord_catalog_validate.py +103 -0
- modekeeper-0.1.1/tests/test_mk084_guardrails_envelope.py +37 -0
- modekeeper-0.1.1/tests/test_mk085_killswitch_absolute.py +336 -0
- modekeeper-0.1.1/tests/test_mk086_license_kid_and_rotation.py +91 -0
- modekeeper-0.1.1/tests/test_mk089_telemetry_and_watch.py +163 -0
- modekeeper-0.1.1/tests/test_mk091_environment_fingerprint.py +127 -0
- modekeeper-0.1.1/tests/test_mk093_customer_eval.py +68 -0
- modekeeper-0.1.1/tests/test_mk096_roi_report.py +144 -0
- modekeeper-0.1.1/tests/test_mk097_export_bundle.py +114 -0
- modekeeper-0.1.1/tests/test_mk098_stdout_jsonl_ingest.py +61 -0
- modekeeper-0.1.1/tests/test_observe_file_source.py +61 -0
- modekeeper-0.1.1/tests/test_observe_k8s_source.py +61 -0
- modekeeper-0.1.1/tests/test_observe_summary.py +19 -0
- modekeeper-0.1.1/tests/test_opportunity_estimate.py +43 -0
- modekeeper-0.1.1/tests/test_passport_observe_max.py +75 -0
- modekeeper-0.1.1/tests/test_passport_observe_max_redaction.py +51 -0
- modekeeper-0.1.1/tests/test_passport_observe_max_report_only.py +47 -0
- modekeeper-0.1.1/tests/test_passports_v0.py +52 -0
- modekeeper-0.1.1/tests/test_policy_scalar_baseline.py +48 -0
- modekeeper-0.1.1/tests/test_rbac_diagnostics_parse.py +54 -0
- modekeeper-0.1.1/tests/test_record_replay.py +449 -0
- modekeeper-0.1.1/tests/test_report_contracts.py +94 -0
- modekeeper-0.1.1/tests/test_safety_guardrails.py +221 -0
- modekeeper-0.1.1/tests/test_state_machine.py +26 -0
- modekeeper-0.1.1/tests/test_trainer_knobs_parse.py +22 -0
- modekeeper-0.1.1/tests/test_value_summary.py +74 -0
modekeeper-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: modekeeper
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: ModeKeeper: self-serve observability and safe closed-loop tuning
|
|
5
|
+
Author: ModeKeeper
|
|
6
|
+
License-Expression: LicenseRef-Proprietary
|
|
7
|
+
Keywords: mlops,observability,autotuning,safety
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest>=7.4; extra == "dev"
|
|
14
|
+
Requires-Dist: ruff>=0.4.8; extra == "dev"
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# ModeKeeper
|
|
18
|
+
<!-- modekeeper:product-intro:start -->
|
|
19
|
+
|
|
20
|
+
## Для кого и зачем
|
|
21
|
+
|
|
22
|
+
**ModeKeeper** — инструмент для платформенных команд (**SRE / MLOps / FinOps**), которые запускают обучение/инференс в **Kubernetes** и хотят **снижать стоимость и нестабильность** без риска и без “ручного тюнинга”.
|
|
23
|
+
|
|
24
|
+
Что делает (в двух словах):
|
|
25
|
+
- **наблюдает** метрики/состояние ворклоада и собирает контекст;
|
|
26
|
+
- строит **план изменений** (**plan-only**, ничего не применяет);
|
|
27
|
+
- умеет **проверить применимость** плана (**verify**, тоже без изменений);
|
|
28
|
+
- в **платном режиме** может **применить** план (**apply**) внутри ModeKeeper — только при активной лицензии и после успешного verify.
|
|
29
|
+
|
|
30
|
+
Режимы:
|
|
31
|
+
- **Free:** observe + closed-loop dry-run + k8s render/verify (**ничего не меняет**)
|
|
32
|
+
- **Paid:** one-shot `mk closed-loop run --apply` под `MODEKEEPER_PAID=1`; детали — `docs/WORKFLOW.md`.
|
|
33
|
+
|
|
34
|
+
См. подробное продуктовое описание: `docs/product.md`.
|
|
35
|
+
См. технический snapshot для продолжения: `docs/SNAPSHOT.md`.
|
|
36
|
+
См. workflow (plan-only + verify + paid apply skeleton): `docs/WORKFLOW.md`.
|
|
37
|
+
|
|
38
|
+
<!-- modekeeper:product-intro:end -->
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
ModeKeeper — self-serve слой контроля для ML-систем. Сначала работает в режиме **OBSERVE_ONLY** (бесплатная неделя): собирает телеметрию и формирует отчет «теряете ли вы деньги». Затем, по желанию, включается режим **CLOSED_LOOP** для безопасного автотюнинга с guardrails и откатами.
|
|
42
|
+
|
|
43
|
+
## Возможности
|
|
44
|
+
- Два режима: `OBSERVE_ONLY` и `CLOSED_LOOP`.
|
|
45
|
+
- Модульная архитектура: core, telemetry, actuators, policy, safety, adapters.
|
|
46
|
+
- Explain-log везде: каждое решение, ограничение и действие фиксируется.
|
|
47
|
+
- Локальные демо-сценарии без внешних сервисов.
|
|
48
|
+
|
|
49
|
+
## Установка
|
|
50
|
+
```bash
|
|
51
|
+
python -m venv .venv
|
|
52
|
+
. .venv/bin/activate
|
|
53
|
+
pip install -e .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Быстрый старт
|
|
57
|
+
```bash
|
|
58
|
+
mk observe --duration 250ms --out report/_observe_quick
|
|
59
|
+
mk demo run --scenario drift
|
|
60
|
+
mk closed-loop run --scenario drift --dry-run --out report/_golden_dryrun
|
|
61
|
+
```
|
|
62
|
+
Canonical paid e2e (kind) + scripts: `docs/WORKFLOW.md`.
|
|
63
|
+
|
|
64
|
+
## Quickstart
|
|
65
|
+
Observe-only quickstart + RBAC verify-only replay:
|
|
66
|
+
`docs/QUICKSTART.md`
|
|
67
|
+
|
|
68
|
+
## E2E (kind)
|
|
69
|
+
Canonical workflow: `docs/WORKFLOW.md` (paid path is one-shot `closed-loop run --apply`).
|
|
70
|
+
Continuation snapshot: `docs/SNAPSHOT.md`.
|
|
71
|
+
|
|
72
|
+
Canonical scripts:
|
|
73
|
+
- `./scripts/e2e-smoke-kind.sh` (safe: kill switch blocks apply)
|
|
74
|
+
- `./scripts/e2e-apply-kind.sh` (REAL kubectl patches)
|
|
75
|
+
|
|
76
|
+
Warning: `./scripts/e2e-apply-kind.sh` performs real `kubectl` patches in the kind cluster.
|
|
77
|
+
|
|
78
|
+
Параметры команды `mk observe`:
|
|
79
|
+
- `--source` synthetic|file (default: synthetic)
|
|
80
|
+
- `--path` обязателен для `--source file`
|
|
81
|
+
- `--duration` принимает `1.5s`, `250ms`, `10m`; без суффикса = секунды
|
|
82
|
+
- формат входа: `jsonl`/`csv` с полями `ts`, `step_time_ms`, `loss` (optional)
|
|
83
|
+
|
|
84
|
+
## Демо
|
|
85
|
+
```bash
|
|
86
|
+
mk demo run --scenario straggler
|
|
87
|
+
mk demo run --scenario burst
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Выходные данные (контракт v0):
|
|
91
|
+
- `report/` содержит JSON-отчеты и `explain.jsonl` (JSONL, `ensure_ascii=False`).
|
|
92
|
+
- `observe_latest.json`, `demo_latest.json`, `closed_loop_latest.json` — копии последних отчетов.
|
|
93
|
+
- Подробности k8s утилит/полей — см. `docs/WORKFLOW.md` и `docs/SNAPSHOT.md`.
|
|
94
|
+
- Все отчеты имеют поля верхнего уровня: `schema_version`, `started_at`, `finished_at`, `duration_s`, `out_dir`.
|
|
95
|
+
- В `OBSERVE_ONLY` добавляется `summary`:
|
|
96
|
+
- `money_leak_risk` (low|medium|high)
|
|
97
|
+
- `top_symptoms`
|
|
98
|
+
- `recommendations`
|
|
99
|
+
- В `CLOSED_LOOP` дополнительно создаётся `summary.md` (короткий человекочитаемый итог прогона).
|
|
100
|
+
- В `CLOSED_LOOP` также пишутся plan-only артефакты: `k8s_plan.json` и `k8s_plan.kubectl.sh` (в dry-run kubectl **не выполняется**).
|
|
101
|
+
- В `CLOSED_LOOP` добавляются поля:
|
|
102
|
+
- `decision_summary` (RU)
|
|
103
|
+
- `proposed` (предлагаемые действия)
|
|
104
|
+
- `applied` (результаты применения/блокировки)
|
|
105
|
+
- `status` (например, `"ok"`)
|
|
106
|
+
- `kill_switch_active` (true/false)
|
|
107
|
+
- `blocked_reasons` (агрегация причин блокировок)
|
|
108
|
+
- `applied_reasons` (агрегация причин применений)
|
|
109
|
+
- `k8s_plan_path` (путь к `k8s_plan.json`)
|
|
110
|
+
- `k8s_plan_items` (кол-во items в плане)
|
|
111
|
+
- `k8s_kubectl_plan_path` (путь к `k8s_plan.kubectl.sh`)
|
|
112
|
+
- `k8s_namespace` (целевой namespace для скрипта/плана)
|
|
113
|
+
- `k8s_deployment` (целевой deployment для скрипта/плана)
|
|
114
|
+
|
|
115
|
+
## Примечания
|
|
116
|
+
- `closed-loop` по умолчанию работает в dry-run; для paid-apply используйте one-shot `closed-loop run --apply` (см. `docs/WORKFLOW.md`).
|
|
117
|
+
- В dry-run `closed-loop` **не выполняет** kubectl: он только генерирует `k8s_plan.kubectl.sh`.
|
|
118
|
+
- Для принудительной блокировки `--apply` используйте `MODEKEEPER_KILL_SWITCH=1` (в отчёте `blocked_reasons` будет `kill_switch`).
|
|
119
|
+
- Все safety-ограничения локальные, настраиваемые и аудируемые.
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# ModeKeeper
|
|
2
|
+
<!-- modekeeper:product-intro:start -->
|
|
3
|
+
|
|
4
|
+
## Для кого и зачем
|
|
5
|
+
|
|
6
|
+
**ModeKeeper** — инструмент для платформенных команд (**SRE / MLOps / FinOps**), которые запускают обучение/инференс в **Kubernetes** и хотят **снижать стоимость и нестабильность** без риска и без “ручного тюнинга”.
|
|
7
|
+
|
|
8
|
+
Что делает (в двух словах):
|
|
9
|
+
- **наблюдает** метрики/состояние ворклоада и собирает контекст;
|
|
10
|
+
- строит **план изменений** (**plan-only**, ничего не применяет);
|
|
11
|
+
- умеет **проверить применимость** плана (**verify**, тоже без изменений);
|
|
12
|
+
- в **платном режиме** может **применить** план (**apply**) внутри ModeKeeper — только при активной лицензии и после успешного verify.
|
|
13
|
+
|
|
14
|
+
Режимы:
|
|
15
|
+
- **Free:** observe + closed-loop dry-run + k8s render/verify (**ничего не меняет**)
|
|
16
|
+
- **Paid:** one-shot `mk closed-loop run --apply` под `MODEKEEPER_PAID=1`; детали — `docs/WORKFLOW.md`.
|
|
17
|
+
|
|
18
|
+
См. подробное продуктовое описание: `docs/product.md`.
|
|
19
|
+
См. технический snapshot для продолжения: `docs/SNAPSHOT.md`.
|
|
20
|
+
См. workflow (plan-only + verify + paid apply skeleton): `docs/WORKFLOW.md`.
|
|
21
|
+
|
|
22
|
+
<!-- modekeeper:product-intro:end -->
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
ModeKeeper — self-serve слой контроля для ML-систем. Сначала работает в режиме **OBSERVE_ONLY** (бесплатная неделя): собирает телеметрию и формирует отчет «теряете ли вы деньги». Затем, по желанию, включается режим **CLOSED_LOOP** для безопасного автотюнинга с guardrails и откатами.
|
|
26
|
+
|
|
27
|
+
## Возможности
|
|
28
|
+
- Два режима: `OBSERVE_ONLY` и `CLOSED_LOOP`.
|
|
29
|
+
- Модульная архитектура: core, telemetry, actuators, policy, safety, adapters.
|
|
30
|
+
- Explain-log везде: каждое решение, ограничение и действие фиксируется.
|
|
31
|
+
- Локальные демо-сценарии без внешних сервисов.
|
|
32
|
+
|
|
33
|
+
## Установка
|
|
34
|
+
```bash
|
|
35
|
+
python -m venv .venv
|
|
36
|
+
. .venv/bin/activate
|
|
37
|
+
pip install -e .
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Быстрый старт
|
|
41
|
+
```bash
|
|
42
|
+
mk observe --duration 250ms --out report/_observe_quick
|
|
43
|
+
mk demo run --scenario drift
|
|
44
|
+
mk closed-loop run --scenario drift --dry-run --out report/_golden_dryrun
|
|
45
|
+
```
|
|
46
|
+
Canonical paid e2e (kind) + scripts: `docs/WORKFLOW.md`.
|
|
47
|
+
|
|
48
|
+
## Quickstart
|
|
49
|
+
Observe-only quickstart + RBAC verify-only replay:
|
|
50
|
+
`docs/QUICKSTART.md`
|
|
51
|
+
|
|
52
|
+
## E2E (kind)
|
|
53
|
+
Canonical workflow: `docs/WORKFLOW.md` (paid path is one-shot `closed-loop run --apply`).
|
|
54
|
+
Continuation snapshot: `docs/SNAPSHOT.md`.
|
|
55
|
+
|
|
56
|
+
Canonical scripts:
|
|
57
|
+
- `./scripts/e2e-smoke-kind.sh` (safe: kill switch blocks apply)
|
|
58
|
+
- `./scripts/e2e-apply-kind.sh` (REAL kubectl patches)
|
|
59
|
+
|
|
60
|
+
Warning: `./scripts/e2e-apply-kind.sh` performs real `kubectl` patches in the kind cluster.
|
|
61
|
+
|
|
62
|
+
Параметры команды `mk observe`:
|
|
63
|
+
- `--source` synthetic|file (default: synthetic)
|
|
64
|
+
- `--path` обязателен для `--source file`
|
|
65
|
+
- `--duration` принимает `1.5s`, `250ms`, `10m`; без суффикса = секунды
|
|
66
|
+
- формат входа: `jsonl`/`csv` с полями `ts`, `step_time_ms`, `loss` (optional)
|
|
67
|
+
|
|
68
|
+
## Демо
|
|
69
|
+
```bash
|
|
70
|
+
mk demo run --scenario straggler
|
|
71
|
+
mk demo run --scenario burst
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Выходные данные (контракт v0):
|
|
75
|
+
- `report/` содержит JSON-отчеты и `explain.jsonl` (JSONL, `ensure_ascii=False`).
|
|
76
|
+
- `observe_latest.json`, `demo_latest.json`, `closed_loop_latest.json` — копии последних отчетов.
|
|
77
|
+
- Подробности k8s утилит/полей — см. `docs/WORKFLOW.md` и `docs/SNAPSHOT.md`.
|
|
78
|
+
- Все отчеты имеют поля верхнего уровня: `schema_version`, `started_at`, `finished_at`, `duration_s`, `out_dir`.
|
|
79
|
+
- В `OBSERVE_ONLY` добавляется `summary`:
|
|
80
|
+
- `money_leak_risk` (low|medium|high)
|
|
81
|
+
- `top_symptoms`
|
|
82
|
+
- `recommendations`
|
|
83
|
+
- В `CLOSED_LOOP` дополнительно создаётся `summary.md` (короткий человекочитаемый итог прогона).
|
|
84
|
+
- В `CLOSED_LOOP` также пишутся plan-only артефакты: `k8s_plan.json` и `k8s_plan.kubectl.sh` (в dry-run kubectl **не выполняется**).
|
|
85
|
+
- В `CLOSED_LOOP` добавляются поля:
|
|
86
|
+
- `decision_summary` (RU)
|
|
87
|
+
- `proposed` (предлагаемые действия)
|
|
88
|
+
- `applied` (результаты применения/блокировки)
|
|
89
|
+
- `status` (например, `"ok"`)
|
|
90
|
+
- `kill_switch_active` (true/false)
|
|
91
|
+
- `blocked_reasons` (агрегация причин блокировок)
|
|
92
|
+
- `applied_reasons` (агрегация причин применений)
|
|
93
|
+
- `k8s_plan_path` (путь к `k8s_plan.json`)
|
|
94
|
+
- `k8s_plan_items` (кол-во items в плане)
|
|
95
|
+
- `k8s_kubectl_plan_path` (путь к `k8s_plan.kubectl.sh`)
|
|
96
|
+
- `k8s_namespace` (целевой namespace для скрипта/плана)
|
|
97
|
+
- `k8s_deployment` (целевой deployment для скрипта/плана)
|
|
98
|
+
|
|
99
|
+
## Примечания
|
|
100
|
+
- `closed-loop` по умолчанию работает в dry-run; для paid-apply используйте one-shot `closed-loop run --apply` (см. `docs/WORKFLOW.md`).
|
|
101
|
+
- В dry-run `closed-loop` **не выполняет** kubectl: он только генерирует `k8s_plan.kubectl.sh`.
|
|
102
|
+
- Для принудительной блокировки `--apply` используйте `MODEKEEPER_KILL_SWITCH=1` (в отчёте `blocked_reasons` будет `kill_switch`).
|
|
103
|
+
- Все safety-ограничения локальные, настраиваемые и аудируемые.
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "modekeeper"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "ModeKeeper: self-serve observability and safe closed-loop tuning"
|
|
9
|
+
readme = {file = "README.md", content-type = "text/markdown"}
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "LicenseRef-Proprietary"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [{name = "ModeKeeper"}]
|
|
14
|
+
keywords = ["mlops", "observability", "autotuning", "safety"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
dependencies = []
|
|
20
|
+
|
|
21
|
+
[project.optional-dependencies]
|
|
22
|
+
dev = [
|
|
23
|
+
"pytest>=7.4",
|
|
24
|
+
"ruff>=0.4.8",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.scripts]
|
|
28
|
+
mk = "modekeeper.cli:main"
|
|
29
|
+
|
|
30
|
+
[tool.setuptools]
|
|
31
|
+
package-dir = {"" = "src"}
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["src"]
|
|
35
|
+
|
|
36
|
+
[tool.setuptools.package-data]
|
|
37
|
+
"modekeeper.passports.templates" = ["*.json"]
|
|
38
|
+
"modekeeper.chords" = ["*.json"]
|
|
39
|
+
"modekeeper.license" = ["*.json"]
|
|
40
|
+
|
|
41
|
+
[tool.ruff]
|
|
42
|
+
line-length = 100
|
|
43
|
+
select = ["E", "F", "I", "W"]
|
|
44
|
+
|
|
45
|
+
[tool.ruff.format]
|
|
46
|
+
quote-style = "double"
|
|
47
|
+
indent-style = "space"
|
|
48
|
+
line-ending = "lf"
|
|
49
|
+
|
|
50
|
+
[tool.pytest.ini_options]
|
|
51
|
+
addopts = "-q"
|
|
52
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class Knob:
    """Integer tuning knob with inclusive bounds and a last-change timestamp."""

    name: str
    min_value: int
    max_value: int
    step: int
    value: int
    # Set by apply() to the current UTC time; stays None until the first apply().
    last_changed_at: datetime | None = None

    def clamp(self, target: int) -> int:
        """Return *target*, replaced by the nearest bound when it lies outside them."""
        if target < self.min_value:
            return self.min_value
        return self.max_value if target > self.max_value else target

    def apply(self, target: int) -> int:
        """Store the clamped *target*, record the change time and return the stored value."""
        self.value = self.clamp(target)
        self.last_changed_at = datetime.now(timezone.utc)
        return self.value
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ActuatorRegistry:
    """Name-indexed collection of knobs with snapshot/restore of their values."""

    def __init__(self) -> None:
        self._knobs: dict[str, Knob] = {}

    def register(self, knob: Knob) -> None:
        """Store *knob* under ``knob.name``, replacing any previous entry with that name."""
        self._knobs[knob.name] = knob

    def get(self, name: str) -> Knob | None:
        """Return the knob registered as *name*, or None when there is none."""
        return self._knobs.get(name)

    def snapshot(self) -> dict[str, int]:
        """Return a new mapping of knob name to its current value."""
        current: dict[str, int] = {}
        for knob_name, knob in self._knobs.items():
            current[knob_name] = knob.value
        return current

    def restore(self, snapshot: dict[str, int]) -> None:
        """Write snapshot values back onto registered knobs.

        Names that are not registered are ignored. Values are assigned
        directly: no clamping is applied and ``last_changed_at`` is not touched.
        """
        for knob_name, saved_value in snapshot.items():
            knob = self._knobs.get(knob_name)
            if knob is not None:
                knob.value = saved_value

    def list_names(self) -> list[str]:
        """Return the registered knob names in registration order."""
        return list(self._knobs)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def build_k8s_plan(
    proposed_actions: list[object],
    *,
    namespace: str,
    deployment: str,
) -> list[dict]:
    """Coalesce proposed actions into a single Deployment annotation patch.

    Each action contributes one annotation ``modekeeper/knob.<knob>`` whose
    value is the string form of the action's ``target``; a later action for
    the same knob overwrites an earlier one. The annotation is written both
    on the Deployment metadata and on the pod template metadata.

    Attributes are read with ``getattr(..., None)``, so an action lacking
    ``knob`` or ``target`` yields the literal text ``None`` in the key/value.

    Returns an empty list when there are no actions, otherwise a one-item list.
    """
    object_annotations: dict[str, str] = {}
    template_annotations: dict[str, str] = {}
    saw_action = False

    for action in proposed_actions:
        saw_action = True
        knob = getattr(action, "knob", None)
        target = getattr(action, "target", None)
        annotation_key = f"modekeeper/knob.{knob}"
        rendered_target = f"{target}"
        object_annotations[annotation_key] = rendered_target
        template_annotations[annotation_key] = rendered_target

    if not saw_action:
        return []

    # All actions target the same (namespace, deployment), hence exactly one plan item.
    return [
        {
            "apiVersion": "apps/v1",
            "kind": "Deployment",
            "namespace": namespace,
            "name": deployment,
            "reason": "coalesced",
            "patch": {
                "metadata": {"annotations": object_annotations},
                "spec": {"template": {"metadata": {"annotations": template_annotations}}},
            },
        }
    ]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from modekeeper.safety.explain import ExplainLog
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import pytorch_lightning as pl
|
|
10
|
+
except Exception:
|
|
11
|
+
try:
|
|
12
|
+
import lightning.pytorch as pl
|
|
13
|
+
except Exception:
|
|
14
|
+
pl = None
|
|
15
|
+
|
|
16
|
+
LIGHTNING_AVAILABLE = pl is not None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def build_lightning_callback(out_dir: Path) -> object | None:
    """Build a Lightning callback that reports training timing to an explain log.

    Returns None when no Lightning package could be imported (``pl`` is None).
    Otherwise events are emitted through an ``ExplainLog`` created for
    ``out_dir / "explain.jsonl"``. All timestamps come from ``time.monotonic()``,
    so they are only meaningful relative to each other.
    """
    if pl is None:
        return None

    explain = ExplainLog(out_dir / "explain.jsonl")

    class ExplainCallback(pl.Callback):
        """Emits one explain event per training hook."""

        def __init__(self) -> None:
            super().__init__()
            self._fit_start = None
            self._batch_start = None

        def on_train_start(self, trainer, pl_module) -> None:
            started = time.monotonic()
            # Only the first train start is remembered as the fit start.
            if self._fit_start is None:
                self._fit_start = started
            explain.emit("pl_train_start", {"t": started})

        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx) -> None:
            started = time.monotonic()
            self._batch_start = started
            explain.emit("pl_train_batch_start", {"t": started, "batch_idx": batch_idx})

        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:
            finished = time.monotonic()
            event = {"t": finished, "batch_idx": batch_idx, "loss": _extract_loss(outputs)}
            batch_started = self._batch_start
            if batch_started is not None:
                event["batch_duration_s"] = finished - batch_started
            explain.emit("pl_train_batch_end", event)

        def on_fit_end(self, trainer, pl_module) -> None:
            finished = time.monotonic()
            event = {"t": finished}
            fit_started = self._fit_start
            if fit_started is not None:
                # Measured from the first on_train_start, not from fit() entry.
                event["fit_duration_s"] = finished - fit_started
            explain.emit("pl_fit_end", event)

    return ExplainCallback()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _extract_loss(outputs: object) -> float | None:
|
|
61
|
+
if outputs is None:
|
|
62
|
+
return None
|
|
63
|
+
if isinstance(outputs, (int, float)):
|
|
64
|
+
return float(outputs)
|
|
65
|
+
if isinstance(outputs, dict):
|
|
66
|
+
if "loss" in outputs:
|
|
67
|
+
return _extract_loss(outputs["loss"])
|
|
68
|
+
return None
|
|
69
|
+
if isinstance(outputs, (list, tuple)) and outputs:
|
|
70
|
+
return _extract_loss(outputs[0])
|
|
71
|
+
item = getattr(outputs, "item", None)
|
|
72
|
+
if callable(item):
|
|
73
|
+
try:
|
|
74
|
+
return float(item())
|
|
75
|
+
except Exception:
|
|
76
|
+
return None
|
|
77
|
+
return None
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
SCHEMA_VERSION = "decision_trace_event.v0"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class DecisionTraceWriter:
    """Appends decision-trace events to a JSON Lines file at ``path``."""

    path: Path

    def emit(self, event: dict[str, Any]) -> None:
        """Append *event* as one compact JSON line.

        Parent directories are created on demand. Keys are sorted and
        non-ASCII text is written as-is (UTF-8) rather than escaped.
        """
        destination = self.path
        destination.parent.mkdir(parents=True, exist_ok=True)
        with destination.open("a", encoding="utf-8") as handle:
            encoded = json.dumps(
                event,
                sort_keys=True,
                separators=(",", ":"),
                ensure_ascii=False,
            )
            handle.write(f"{encoded}\n")
|
|
27
|
+
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
CATALOG_SCHEMA_VERSION = "chord_catalog.v1"
|
|
8
|
+
VALIDATE_SCHEMA_VERSION = "chords_validate.v0"
|
|
9
|
+
_REQUIRED_CHORD_KEYS = (
|
|
10
|
+
"id",
|
|
11
|
+
"intent",
|
|
12
|
+
"risk_tier",
|
|
13
|
+
"required_signals",
|
|
14
|
+
"invariants",
|
|
15
|
+
"knobs_touched",
|
|
16
|
+
)
|
|
17
|
+
_OPTIONAL_CHORD_KEYS = ("cooldown_ms", "budget")
|
|
18
|
+
_ALLOWED_CHORD_KEYS = frozenset((*_REQUIRED_CHORD_KEYS, *_OPTIONAL_CHORD_KEYS))
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _is_list_of_str(value: object) -> bool:
|
|
22
|
+
return isinstance(value, list) and all(isinstance(item, str) for item in value)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def validate_catalog_dict(catalog: dict, source: str) -> list[str]:
|
|
26
|
+
errors: list[str] = []
|
|
27
|
+
if not isinstance(catalog, dict):
|
|
28
|
+
return [f"{source}: top-level JSON must be an object"]
|
|
29
|
+
|
|
30
|
+
top_unknown_keys = sorted(set(catalog.keys()) - {"schema_version", "chords"})
|
|
31
|
+
for key in top_unknown_keys:
|
|
32
|
+
errors.append(f"{source}: unknown top-level field '{key}'")
|
|
33
|
+
|
|
34
|
+
schema_version = catalog.get("schema_version")
|
|
35
|
+
if schema_version != CATALOG_SCHEMA_VERSION:
|
|
36
|
+
errors.append(f"{source}: schema_version must be '{CATALOG_SCHEMA_VERSION}'")
|
|
37
|
+
|
|
38
|
+
chords = catalog.get("chords")
|
|
39
|
+
if not isinstance(chords, list):
|
|
40
|
+
errors.append(f"{source}: chords must be an array")
|
|
41
|
+
return errors
|
|
42
|
+
|
|
43
|
+
seen_ids: set[str] = set()
|
|
44
|
+
for index, item in enumerate(chords):
|
|
45
|
+
path = f"{source}: chords[{index}]"
|
|
46
|
+
if not isinstance(item, dict):
|
|
47
|
+
errors.append(f"{path} must be an object")
|
|
48
|
+
continue
|
|
49
|
+
|
|
50
|
+
unknown_keys = sorted(set(item.keys()) - _ALLOWED_CHORD_KEYS)
|
|
51
|
+
for key in unknown_keys:
|
|
52
|
+
errors.append(f"{path}: unknown field '{key}'")
|
|
53
|
+
|
|
54
|
+
for key in _REQUIRED_CHORD_KEYS:
|
|
55
|
+
if key not in item:
|
|
56
|
+
errors.append(f"{path}: missing required field '{key}'")
|
|
57
|
+
|
|
58
|
+
chord_id = item.get("id")
|
|
59
|
+
if isinstance(chord_id, str):
|
|
60
|
+
if chord_id in seen_ids:
|
|
61
|
+
errors.append(f"{path}: duplicate chord id '{chord_id}'")
|
|
62
|
+
else:
|
|
63
|
+
seen_ids.add(chord_id)
|
|
64
|
+
else:
|
|
65
|
+
errors.append(f"{path}: id must be string")
|
|
66
|
+
|
|
67
|
+
intent = item.get("intent")
|
|
68
|
+
if not isinstance(intent, str):
|
|
69
|
+
errors.append(f"{path}: intent must be string")
|
|
70
|
+
|
|
71
|
+
risk_tier = item.get("risk_tier")
|
|
72
|
+
if not isinstance(risk_tier, str):
|
|
73
|
+
errors.append(f"{path}: risk_tier must be string")
|
|
74
|
+
|
|
75
|
+
required_signals = item.get("required_signals")
|
|
76
|
+
if not _is_list_of_str(required_signals):
|
|
77
|
+
errors.append(f"{path}: required_signals must be array of strings")
|
|
78
|
+
|
|
79
|
+
invariants = item.get("invariants")
|
|
80
|
+
if not _is_list_of_str(invariants):
|
|
81
|
+
errors.append(f"{path}: invariants must be array of strings")
|
|
82
|
+
|
|
83
|
+
knobs_touched = item.get("knobs_touched")
|
|
84
|
+
if not _is_list_of_str(knobs_touched):
|
|
85
|
+
errors.append(f"{path}: knobs_touched must be array of strings")
|
|
86
|
+
|
|
87
|
+
if "cooldown_ms" in item and not isinstance(item.get("cooldown_ms"), int):
|
|
88
|
+
errors.append(f"{path}: cooldown_ms must be int")
|
|
89
|
+
|
|
90
|
+
if "budget" in item and not isinstance(item.get("budget"), dict):
|
|
91
|
+
errors.append(f"{path}: budget must be object")
|
|
92
|
+
|
|
93
|
+
return errors
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def validate_catalog_file(path: Path) -> dict:
    """Read and validate a chord catalog file, returning a report dict.

    The report always has the keys ``schema_version``, ``ok``, ``errors``,
    ``chord_count`` and ``chord_ids`` (sorted unique string ids). Failures to
    read or decode the file are reported as errors instead of being raised,
    so a directory path, a permission problem or non-UTF-8 content produce a
    report with ``ok`` set to False.
    """
    source = str(path)
    errors: list[str] = []
    payload: object = {}
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        errors.append(f"{source}: file not found")
    except OSError as exc:
        # e.g. IsADirectoryError / PermissionError: still a validation failure.
        errors.append(f"{source}: cannot read file: {exc}")
    except UnicodeDecodeError as exc:
        errors.append(f"{source}: invalid UTF-8: {exc}")
    except json.JSONDecodeError as exc:
        errors.append(f"{source}: invalid JSON: {exc}")

    chord_count = 0
    chord_ids: list[str] = []
    if not errors:
        if isinstance(payload, dict):
            chords = payload.get("chords")
            if isinstance(chords, list):
                # Summary counts every entry, but only well-formed string ids are listed.
                chord_count = len(chords)
                chord_ids = sorted(
                    {
                        item.get("id")
                        for item in chords
                        if isinstance(item, dict) and isinstance(item.get("id"), str)
                    }
                )
            errors.extend(validate_catalog_dict(payload, source=source))
        else:
            errors.append(f"{source}: top-level JSON must be an object")

    return {
        "schema_version": VALIDATE_SCHEMA_VERSION,
        "ok": not errors,
        "errors": errors,
        "chord_count": chord_count,
        "chord_ids": chord_ids,
    }
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def load_catalog_file(path: Path) -> dict:
    """Load a chord catalog from *path* and return it once it validates.

    Raises:
        ValueError: if the top-level JSON value is not an object, or if
            validation reports errors (joined with ``"; "``). Invalid JSON
            surfaces as ``json.JSONDecodeError``, itself a ``ValueError``.
        OSError: if the file cannot be read.
    """
    raw_text = path.read_text(encoding="utf-8")
    catalog = json.loads(raw_text)
    if not isinstance(catalog, dict):
        raise ValueError(f"{path}: top-level JSON must be an object")

    problems = validate_catalog_dict(catalog, source=str(path))
    if problems:
        raise ValueError("; ".join(problems))
    return catalog
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@lru_cache(maxsize=1)
def load_default_catalog() -> dict:
    """Return the bundled ``catalog_v1.json`` that sits next to this module.

    The result is cached, so every call returns the same dict object;
    callers that need to modify it should work on a copy.
    """
    bundled_catalog = Path(__file__).parent / "catalog_v1.json"
    return load_catalog_file(bundled_catalog)
|