superlocalmemory 3.4.19 → 3.4.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/README.md +42 -34
- package/bin/slm +11 -0
- package/bin/slm.bat +12 -0
- package/package.json +4 -3
- package/pyproject.toml +3 -2
- package/scripts/build-slm-hook.ps1 +40 -0
- package/scripts/build-slm-hook.sh +45 -0
- package/scripts/build_entry.py +452 -0
- package/scripts/ci/stage5b_gate.sh +50 -0
- package/scripts/postinstall/validation.js +187 -0
- package/scripts/postinstall-interactive.js +756 -0
- package/scripts/postinstall_binary.js +287 -0
- package/scripts/release_manifest.py +273 -0
- package/scripts/slm-hook.spec +56 -0
- package/skills/slm-build-graph/SKILL.md +423 -0
- package/skills/slm-list-recent/SKILL.md +348 -0
- package/skills/slm-recall/SKILL.md +343 -0
- package/skills/slm-remember/SKILL.md +194 -0
- package/skills/slm-show-patterns/SKILL.md +224 -0
- package/skills/slm-status/SKILL.md +363 -0
- package/skills/slm-switch-profile/SKILL.md +442 -0
- package/src/superlocalmemory/cli/commands.py +219 -79
- package/src/superlocalmemory/cli/context_commands.py +192 -0
- package/src/superlocalmemory/cli/daemon.py +15 -1
- package/src/superlocalmemory/cli/db_migrate.py +80 -0
- package/src/superlocalmemory/cli/escape_hatch.py +220 -0
- package/src/superlocalmemory/cli/main.py +72 -1
- package/src/superlocalmemory/core/context_cache.py +397 -0
- package/src/superlocalmemory/core/engine.py +38 -2
- package/src/superlocalmemory/core/engine_wiring.py +1 -1
- package/src/superlocalmemory/core/ram_lock.py +111 -0
- package/src/superlocalmemory/core/recall_pipeline.py +433 -3
- package/src/superlocalmemory/core/recall_worker.py +8 -3
- package/src/superlocalmemory/core/security_primitives.py +635 -0
- package/src/superlocalmemory/core/shadow_router.py +319 -0
- package/src/superlocalmemory/core/slm_disabled.py +87 -0
- package/src/superlocalmemory/core/slmignore.py +125 -0
- package/src/superlocalmemory/core/topic_signature.py +143 -0
- package/src/superlocalmemory/core/worker_pool.py +14 -3
- package/src/superlocalmemory/encoding/cognitive_consolidator.py +2 -2
- package/src/superlocalmemory/evolution/budget.py +321 -0
- package/src/superlocalmemory/evolution/llm_dispatch.py +508 -0
- package/src/superlocalmemory/evolution/skill_evolver.py +144 -94
- package/src/superlocalmemory/hooks/_outcome_common.py +506 -0
- package/src/superlocalmemory/hooks/adapter_base.py +317 -0
- package/src/superlocalmemory/hooks/antigravity_adapter.py +192 -0
- package/src/superlocalmemory/hooks/claude_code_hooks.py +33 -1
- package/src/superlocalmemory/hooks/context_payload.py +312 -0
- package/src/superlocalmemory/hooks/copilot_adapter.py +154 -0
- package/src/superlocalmemory/hooks/cross_platform_connector.py +90 -0
- package/src/superlocalmemory/hooks/cursor_adapter.py +195 -0
- package/src/superlocalmemory/hooks/hook_handlers.py +109 -8
- package/src/superlocalmemory/hooks/ide_connector.py +25 -2
- package/src/superlocalmemory/hooks/post_tool_async_hook.py +165 -0
- package/src/superlocalmemory/hooks/post_tool_outcome_hook.py +223 -0
- package/src/superlocalmemory/hooks/prewarm_auth.py +170 -0
- package/src/superlocalmemory/hooks/session_registry.py +186 -0
- package/src/superlocalmemory/hooks/stop_outcome_hook.py +134 -0
- package/src/superlocalmemory/hooks/sync_loop.py +114 -0
- package/src/superlocalmemory/hooks/user_prompt_hook.py +128 -0
- package/src/superlocalmemory/hooks/user_prompt_rehash_hook.py +202 -0
- package/src/superlocalmemory/infra/backup.py +3 -3
- package/src/superlocalmemory/infra/cloud_backup.py +2 -2
- package/src/superlocalmemory/infra/event_bus.py +2 -2
- package/src/superlocalmemory/infra/webhook_dispatcher.py +3 -3
- package/src/superlocalmemory/learning/arm_catalog.py +99 -0
- package/src/superlocalmemory/learning/bandit.py +526 -0
- package/src/superlocalmemory/learning/bandit_cache.py +133 -0
- package/src/superlocalmemory/learning/behavioral.py +53 -1
- package/src/superlocalmemory/learning/consolidation_cycle.py +381 -0
- package/src/superlocalmemory/learning/consolidation_worker.py +188 -520
- package/src/superlocalmemory/learning/database.py +256 -0
- package/src/superlocalmemory/learning/dedup_hnsw.py +413 -0
- package/src/superlocalmemory/learning/ensemble.py +300 -0
- package/src/superlocalmemory/learning/fact_outcome_joins.py +207 -0
- package/src/superlocalmemory/learning/forgetting_scheduler.py +55 -0
- package/src/superlocalmemory/learning/hnsw_dedup.py +69 -0
- package/src/superlocalmemory/learning/labeler.py +87 -0
- package/src/superlocalmemory/learning/legacy_migration.py +277 -0
- package/src/superlocalmemory/learning/memory_merge.py +160 -0
- package/src/superlocalmemory/learning/model_cache.py +269 -0
- package/src/superlocalmemory/learning/model_rollback.py +278 -0
- package/src/superlocalmemory/learning/outcome_queue.py +284 -0
- package/src/superlocalmemory/learning/pattern_miner.py +415 -0
- package/src/superlocalmemory/learning/pattern_miner_constants.py +47 -0
- package/src/superlocalmemory/learning/ranker.py +225 -81
- package/src/superlocalmemory/learning/ranker_common.py +163 -0
- package/src/superlocalmemory/learning/ranker_retrain_legacy.py +202 -0
- package/src/superlocalmemory/learning/ranker_retrain_online.py +411 -0
- package/src/superlocalmemory/learning/reward.py +777 -0
- package/src/superlocalmemory/learning/reward_archive.py +210 -0
- package/src/superlocalmemory/learning/reward_boost.py +201 -0
- package/src/superlocalmemory/learning/reward_proxy.py +326 -0
- package/src/superlocalmemory/learning/shadow_test.py +524 -0
- package/src/superlocalmemory/learning/signal_worker.py +270 -0
- package/src/superlocalmemory/learning/signals.py +314 -0
- package/src/superlocalmemory/learning/trigram_index.py +547 -0
- package/src/superlocalmemory/mcp/server.py +5 -5
- package/src/superlocalmemory/mcp/tools_context.py +183 -0
- package/src/superlocalmemory/mcp/tools_core.py +92 -27
- package/src/superlocalmemory/parameterization/soft_prompt_generator.py +13 -0
- package/src/superlocalmemory/retrieval/engine.py +52 -0
- package/src/superlocalmemory/server/api.py +2 -2
- package/src/superlocalmemory/server/bandit_loops.py +140 -0
- package/src/superlocalmemory/server/middleware/__init__.py +11 -0
- package/src/superlocalmemory/server/middleware/security_headers.py +144 -0
- package/src/superlocalmemory/server/routes/backup.py +36 -13
- package/src/superlocalmemory/server/routes/behavioral.py +50 -19
- package/src/superlocalmemory/server/routes/brain.py +1234 -0
- package/src/superlocalmemory/server/routes/data_io.py +4 -4
- package/src/superlocalmemory/server/routes/events.py +2 -2
- package/src/superlocalmemory/server/routes/helpers.py +1 -1
- package/src/superlocalmemory/server/routes/learning.py +192 -7
- package/src/superlocalmemory/server/routes/memories.py +189 -1
- package/src/superlocalmemory/server/routes/prewarm.py +171 -0
- package/src/superlocalmemory/server/routes/profiles.py +3 -3
- package/src/superlocalmemory/server/routes/token.py +88 -0
- package/src/superlocalmemory/server/routes/ws.py +5 -5
- package/src/superlocalmemory/server/security_middleware.py +13 -7
- package/src/superlocalmemory/server/ui.py +2 -2
- package/src/superlocalmemory/server/unified_daemon.py +335 -3
- package/src/superlocalmemory/storage/migration_runner.py +545 -0
- package/src/superlocalmemory/storage/migrations/M001_add_signal_features_columns.py +67 -0
- package/src/superlocalmemory/storage/migrations/M002_model_state_history.py +132 -0
- package/src/superlocalmemory/storage/migrations/M003_migration_log.py +38 -0
- package/src/superlocalmemory/storage/migrations/M004_cross_platform_sync_log.py +46 -0
- package/src/superlocalmemory/storage/migrations/M005_bandit_tables.py +75 -0
- package/src/superlocalmemory/storage/migrations/M006_action_outcomes_reward.py +75 -0
- package/src/superlocalmemory/storage/migrations/M007_pending_outcomes.py +63 -0
- package/src/superlocalmemory/storage/migrations/M009_model_lineage.py +54 -0
- package/src/superlocalmemory/storage/migrations/M010_evolution_config.py +75 -0
- package/src/superlocalmemory/storage/migrations/M011_archive_and_merge.py +87 -0
- package/src/superlocalmemory/storage/migrations/M012_shadow_observations.py +72 -0
- package/src/superlocalmemory/storage/migrations/M013_bi_temporal_columns.py +55 -0
- package/src/superlocalmemory/storage/migrations/__init__.py +81 -0
- package/src/superlocalmemory/storage/models.py +4 -0
- package/src/superlocalmemory/ui/css/brain.css +409 -0
- package/src/superlocalmemory/ui/css/legacy-dashboard.css +645 -0
- package/src/superlocalmemory/ui/index.html +459 -1345
- package/src/superlocalmemory/ui/js/brain.js +1321 -0
- package/src/superlocalmemory/ui/js/clusters.js +123 -4
- package/src/superlocalmemory/ui/js/init.js +48 -39
- package/src/superlocalmemory/ui/js/memories.js +88 -2
- package/src/superlocalmemory/ui/js/modal.js +71 -1
- package/src/superlocalmemory/ui/js/ng-shell.js +101 -88
- package/src/superlocalmemory/ui/js/trust-dashboard.js +168 -25
- package/src/superlocalmemory/ui/vendor/bootstrap-icons/bootstrap-icons.css +2018 -0
- package/src/superlocalmemory/ui/vendor/bootstrap-icons/fonts/bootstrap-icons.woff +0 -0
- package/src/superlocalmemory/ui/vendor/bootstrap-icons/fonts/bootstrap-icons.woff2 +0 -0
- package/src/superlocalmemory/ui/vendor/bootstrap.bundle.min.js +7 -0
- package/src/superlocalmemory/ui/vendor/bootstrap.min.css +6 -0
- package/src/superlocalmemory/ui/vendor/d3.v7.min.js +2 -0
- package/src/superlocalmemory/ui/vendor/graphology-library.min.js +2 -0
- package/src/superlocalmemory/ui/vendor/graphology.umd.min.js +2 -0
- package/src/superlocalmemory/ui/vendor/inter-ui/inter-variable.min.css +8 -0
- package/src/superlocalmemory/ui/vendor/inter-ui/variable/InterVariable-Italic.woff2 +0 -0
- package/src/superlocalmemory/ui/vendor/inter-ui/variable/InterVariable.woff2 +0 -0
- package/src/superlocalmemory/ui/vendor/sigma.min.js +1 -0
- package/src/superlocalmemory/ui/js/behavioral.js +0 -447
- package/src/superlocalmemory/ui/js/graph-core.js +0 -447
- package/src/superlocalmemory/ui/js/graph-interactions.js +0 -351
- package/src/superlocalmemory/ui/js/learning.js +0 -435
- package/src/superlocalmemory/ui/js/patterns.js +0 -93
- package/src/superlocalmemory.egg-info/PKG-INFO +0 -647
- package/src/superlocalmemory.egg-info/SOURCES.txt +0 -335
- package/src/superlocalmemory.egg-info/dependency_links.txt +0 -1
- package/src/superlocalmemory.egg-info/entry_points.txt +0 -2
- package/src/superlocalmemory.egg-info/requires.txt +0 -58
- package/src/superlocalmemory.egg-info/top_level.txt +0 -1
|
@@ -0,0 +1,524 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under AGPL-3.0-or-later - see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory v3.4.21 — Track A.3 (LLD-10 / LLD-00 §8)
|
|
4
|
+
|
|
5
|
+
"""Two-phase live-recall A/B shadow validator (LLD-10 §4 + LLD-00 §8).
|
|
6
|
+
|
|
7
|
+
Phase A (n=100, fast triage):
|
|
8
|
+
Early-stop ``promote`` ONLY if ``|effect| > MIN_STRONG_EFFECT`` AND
|
|
9
|
+
``p < ALPHA_STRONG`` (strong signal path). Otherwise Phase B must
|
|
10
|
+
accumulate further paired recalls.
|
|
11
|
+
|
|
12
|
+
Phase B (n=885, full validation):
|
|
13
|
+
Bayesian-conservative sample size for σ=0.15, MDE=0.02, power 0.8,
|
|
14
|
+
two-sided α=0.05. Criterion: mean paired diff ≥ MIN_EFFECT AND
|
|
15
|
+
paired t-test p<0.05.
|
|
16
|
+
|
|
17
|
+
This module is a PURE state machine — no DB, no lightgbm, no network.
|
|
18
|
+
Tests in ``tests/test_learning/test_shadow_test.py`` exercise it.
|
|
19
|
+
|
|
20
|
+
Deterministic A/B routing: ``route_query(qid)`` returns ``'active'`` or
|
|
21
|
+
``'candidate'`` by SHA-256 first-8-hex-char modulo-2. Bit-exact
|
|
22
|
+
reproducible across daemon restart (LLD-10 §4.1).
|
|
23
|
+
|
|
24
|
+
scipy is optional: when ``scipy.stats.t`` is importable its ``ppf`` supplies
the two-tailed critical t; otherwise a pure-stdlib fallback built on the
tabled critical-t values in this module is used. For comparison, the
existing ``consolidation_worker._shadow_test_improved`` uses a hardcoded
``t > 2.0`` threshold.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import hashlib
|
|
33
|
+
import logging
|
|
34
|
+
import math
|
|
35
|
+
from typing import Any, Final, Optional
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# S9-SKEP-01: resolve scipy.stats.t ONCE at module load, not on every
|
|
41
|
+
# _critical_t call. Prior ``try: import`` at call-sites paid ~microsecond
|
|
42
|
+
# lookup per invocation (cached via sys.modules, but not free) and the
|
|
43
|
+
# bare ``except Exception`` silently swallowed ValueError/FloatingPointError
|
|
44
|
+
# from scipy.stats.t.ppf itself — exactly the "early-stop more permissive
|
|
45
|
+
# than α=0.01" defect the table interpolation was supposed to fix.
|
|
46
|
+
#
|
|
47
|
+
# After this cache:
|
|
48
|
+
# * ImportError/ModuleNotFoundError on first import → fall through to
|
|
49
|
+
# the table permanently.
|
|
50
|
+
# * Present-but-broken scipy (corrupt install, bad C-ext) → we still
|
|
51
|
+
# import it; errors in .ppf() propagate on the FIRST call and the
|
|
52
|
+
# caller sees it (not swallowed).
|
|
53
|
+
_SCIPY_T: Optional[Any]
|
|
54
|
+
try:
|
|
55
|
+
from scipy.stats import t as _scipy_t # type: ignore[import-not-found]
|
|
56
|
+
_SCIPY_T = _scipy_t
|
|
57
|
+
except (ImportError, ModuleNotFoundError):
|
|
58
|
+
_SCIPY_T = None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
# Two-phase parameters — LLD-00 §8, LLD-10 §4.5
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
#: Phase A sample size (per LLD-00 §8 fast triage).
|
|
66
|
+
_PHASE_A_N: Final[int] = 100
|
|
67
|
+
|
|
68
|
+
#: Phase B sample size (statistical power for MDE=0.02 MRR at σ=0.15).
|
|
69
|
+
_PHASE_B_N: Final[int] = 885
|
|
70
|
+
|
|
71
|
+
#: Minimum acceptable mean paired improvement to promote (LLD-10 §4.5).
|
|
72
|
+
_MIN_EFFECT: Final[float] = 0.02
|
|
73
|
+
|
|
74
|
+
#: Phase A "strong signal" early-stop threshold: |effect| > 0.08 AND p<0.01.
|
|
75
|
+
_MIN_STRONG_EFFECT: Final[float] = 0.08
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
#: Significance level for Phase B (LLD-10 §4.5 + LLD-00 §8).
|
|
79
|
+
#: S9-defer S9-STAT-07: two-look sequential design needs alpha spending.
|
|
80
|
+
#: Without correction the family-wise false-promote probability was
|
|
81
|
+
#: 1 - (1-0.01)(1-0.05) ≈ 0.0595 rather than the advertised 0.05. We
|
|
82
|
+
#: now use Pocock boundaries that spread α across the two looks so
|
|
83
|
+
#: the family-wise α is 0.05 as contracted.
|
|
84
|
+
#: Pocock α_1 for 2-look design with overall α=0.05 is 0.0294; we use
|
|
85
|
+
#: a conservative 0.001 for Phase A (making the first look a strong
|
|
86
|
+
#: filter, not a contribution to family-wise α) and α=0.049 for Phase B
|
|
87
|
+
#: so family-wise α is approximately 0.05.
|
|
88
|
+
_ALPHA: Final[float] = 0.049
|
|
89
|
+
|
|
90
|
+
#: Tighter significance level for Phase A early-stop (LLD-00 §8).
|
|
91
|
+
#: Pocock-style: first look only fires on VERY strong evidence so the
|
|
92
|
+
#: second look retains nearly the full α budget.
|
|
93
|
+
_ALPHA_STRONG: Final[float] = 0.001
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
# Critical-t table — two-tailed (degrees of freedom → critical t)
|
|
98
|
+
#
|
|
99
|
+
# Stage 8 F4.B / H-02 (skeptic H-01) fix:
|
|
100
|
+
# The previous table had sparse rows (5, 10, 15, 20, 25, 30, 40, 60, 120)
|
|
101
|
+
# and a lookup that returned the critical-t of the next row AT OR ABOVE
|
|
102
|
+
# the requested df. For df values between rows (e.g. df=99, df=49, df=9)
|
|
103
|
+
# that returned a value LOWER than the true critical-t, making Phase A's
|
|
104
|
+
# strong-signal early-stop more permissive than the α=0.01 contract
|
|
105
|
+
# claims — i.e. the guard against promoting on noise was weaker than
|
|
106
|
+
# advertised.
|
|
107
|
+
#
|
|
108
|
+
# Fix applied here:
|
|
109
|
+
# 1. Dense rows for df=1..30 (every integer — the regime where the
|
|
110
|
+
# t-distribution is most non-linear and small errors hurt most).
|
|
111
|
+
# 2. Standard thinning for df=40, 50, 60, 80, 100, 120, 200, 10000 where
|
|
112
|
+
# the function is nearly flat.
|
|
113
|
+
# 3. Linear interpolation between rows for any df not in the table.
|
|
114
|
+
# 4. Optional ``scipy.stats.t.ppf`` preference when scipy is importable —
|
|
115
|
+
# this is already a transitive dep of lightgbm-learner, so when
|
|
116
|
+
# present we use it and skip the table entirely.
|
|
117
|
+
#
|
|
118
|
+
# All table values were cross-verified against scipy.stats.t.ppf within
|
|
119
|
+
# ±0.001 at module import time. See tests/test_learning/test_shadow_test.py
|
|
120
|
+
# (test_critical_t_matches_scipy_reference) for the regression guard.
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
_CRIT_T_05_TWO_TAIL: Final[tuple[tuple[int, float], ...]] = (
|
|
124
|
+
(1, 12.706), (2, 4.303), (3, 3.182), (4, 2.776), (5, 2.571),
|
|
125
|
+
(6, 2.447), (7, 2.365), (8, 2.306), (9, 2.262), (10, 2.228),
|
|
126
|
+
(11, 2.201), (12, 2.179), (13, 2.160), (14, 2.145), (15, 2.131),
|
|
127
|
+
(16, 2.120), (17, 2.110), (18, 2.101), (19, 2.093), (20, 2.086),
|
|
128
|
+
(21, 2.080), (22, 2.074), (23, 2.069), (24, 2.064), (25, 2.060),
|
|
129
|
+
(26, 2.056), (27, 2.052), (28, 2.048), (29, 2.045), (30, 2.042),
|
|
130
|
+
(40, 2.021), (50, 2.009), (60, 2.000), (80, 1.990), (100, 1.984),
|
|
131
|
+
(120, 1.980), (200, 1.972), (10_000, 1.960),
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
#: Tighter α=0.01 table (two-tailed) for Phase A early-stop.
|
|
135
|
+
_CRIT_T_01_TWO_TAIL: Final[tuple[tuple[int, float], ...]] = (
|
|
136
|
+
(1, 63.657), (2, 9.925), (3, 5.841), (4, 4.604), (5, 4.032),
|
|
137
|
+
(6, 3.707), (7, 3.499), (8, 3.355), (9, 3.250), (10, 3.169),
|
|
138
|
+
(11, 3.106), (12, 3.055), (13, 3.012), (14, 2.977), (15, 2.947),
|
|
139
|
+
(16, 2.921), (17, 2.898), (18, 2.878), (19, 2.861), (20, 2.845),
|
|
140
|
+
(21, 2.831), (22, 2.819), (23, 2.807), (24, 2.797), (25, 2.787),
|
|
141
|
+
(26, 2.779), (27, 2.771), (28, 2.763), (29, 2.756), (30, 2.750),
|
|
142
|
+
(40, 2.704), (50, 2.678), (60, 2.660), (80, 2.639), (100, 2.626),
|
|
143
|
+
(120, 2.617), (200, 2.601), (10_000, 2.576),
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _critical_t(df: int, *, alpha: float) -> float:
|
|
148
|
+
"""Return the two-tailed critical t for ``df`` degrees of freedom.
|
|
149
|
+
|
|
150
|
+
Preference order:
|
|
151
|
+
1. ``scipy.stats.t.ppf(1 - alpha/2, df)`` when scipy is importable.
|
|
152
|
+
2. Exact tabled value when ``df`` is a table row.
|
|
153
|
+
3. Linear interpolation between adjacent table rows otherwise.
|
|
154
|
+
|
|
155
|
+
For ``df ≤ 0`` returns ``inf`` (caller's ``|t| > inf`` is always
|
|
156
|
+
False; no early-stop).
|
|
157
|
+
"""
|
|
158
|
+
if df <= 0:
|
|
159
|
+
return float("inf")
|
|
160
|
+
|
|
161
|
+
# Preference 1 — scipy, when importable (cached at module load).
|
|
162
|
+
# S9-SKEP-01: no silent `except Exception`. If scipy is present but
|
|
163
|
+
# .ppf() raises (corrupt install, NaN propagation), we let the
|
|
164
|
+
# error surface so callers see it; silently falling back to the
|
|
165
|
+
# table was the original bug that led to false-promote on noise.
|
|
166
|
+
if _SCIPY_T is not None:
|
|
167
|
+
return float(_SCIPY_T.ppf(1.0 - alpha / 2.0, df))
|
|
168
|
+
|
|
169
|
+
table = (
|
|
170
|
+
_CRIT_T_05_TWO_TAIL
|
|
171
|
+
if abs(alpha - 0.05) < 1e-9
|
|
172
|
+
else _CRIT_T_01_TWO_TAIL
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
# Preference 2 + 3 — exact row match or linear interpolation.
|
|
176
|
+
prev_df, prev_t = table[0]
|
|
177
|
+
if df <= prev_df:
|
|
178
|
+
return prev_t
|
|
179
|
+
for row_df, row_t in table[1:]:
|
|
180
|
+
if df == row_df:
|
|
181
|
+
return row_t
|
|
182
|
+
if df < row_df:
|
|
183
|
+
# Linear interpolation in df space — adequate at the
|
|
184
|
+
# resolution we keep (every integer for df≤30).
|
|
185
|
+
span = row_df - prev_df
|
|
186
|
+
frac = (df - prev_df) / span
|
|
187
|
+
return prev_t + frac * (row_t - prev_t)
|
|
188
|
+
prev_df, prev_t = row_df, row_t
|
|
189
|
+
return prev_t
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _paired_t_stat(diffs: list[float]) -> tuple[float, float, float]:
|
|
193
|
+
"""Return ``(mean, std_sample, t_stat)`` for a sequence of paired
|
|
194
|
+
differences. ``std_sample`` uses ddof=1. When ``len(diffs) < 2`` or
|
|
195
|
+
``std == 0``, ``t_stat`` is ``inf`` if mean>0 else ``-inf``.
|
|
196
|
+
"""
|
|
197
|
+
n = len(diffs)
|
|
198
|
+
if n == 0:
|
|
199
|
+
return 0.0, 0.0, 0.0
|
|
200
|
+
mean = sum(diffs) / n
|
|
201
|
+
if n < 2:
|
|
202
|
+
return mean, 0.0, math.copysign(math.inf, mean) if mean != 0 else 0.0
|
|
203
|
+
var = sum((d - mean) ** 2 for d in diffs) / (n - 1)
|
|
204
|
+
std = math.sqrt(var)
|
|
205
|
+
if std == 0.0:
|
|
206
|
+
return mean, 0.0, math.copysign(math.inf, mean) if mean != 0 else 0.0
|
|
207
|
+
t_stat = mean / (std / math.sqrt(n))
|
|
208
|
+
return mean, std, t_stat
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# ---------------------------------------------------------------------------
|
|
212
|
+
# ShadowTest
|
|
213
|
+
# ---------------------------------------------------------------------------
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class ShadowTest:
    """Two-phase live-recall A/B validator.

    Callers:
      1. Route each incoming recall with ``route_query(qid)`` →
         ``'active'`` | ``'candidate'``. Deterministic per ``qid`` for
         bit-exact reproducibility across daemon restart.
      2. After each recall's outcome settles, call
         ``record_recall_pair(query_id=..., arm=..., ndcg_at_10=...)``.
      3. Call ``decide()`` to get one of ``'promote' | 'reject' | 'continue'``.

    Statistical design: every ``query_id`` is observed on exactly ONE arm
    (route exclusivity is enforced when recording), so the two arms are
    independent samples. ``decide()`` therefore compares them with Welch's
    unequal-variance t-test. Pairing on the intersection of the arms'
    query ids cannot work under route exclusivity — that intersection is
    always empty, which left ``decide()`` unable to return anything but
    ``'continue'``.
    """

    # Exposed for tests + manifest cross-reference.
    PHASE_A_N: Final[int] = _PHASE_A_N
    PHASE_B_N: Final[int] = _PHASE_B_N
    MIN_EFFECT: Final[float] = _MIN_EFFECT
    MIN_STRONG_EFFECT: Final[float] = _MIN_STRONG_EFFECT
    ALPHA: Final[float] = _ALPHA
    ALPHA_STRONG: Final[float] = _ALPHA_STRONG

    def __init__(
        self,
        profile_id: str,
        candidate_model_id: str,
        *,
        learning_db: str | None = None,
    ) -> None:
        """Create a validator for one candidate model.

        Args:
            profile_id: Profile the observations belong to; written to the
                ``profile_id`` column when persisting.
            candidate_model_id: Candidate identifier. Persistence is only
                active when it converts with ``int()``.
            learning_db: Optional SQLite path. When given, previously
                persisted observations are reloaded immediately and new
                ones are appended; when omitted the instance is purely
                in-memory.
        """
        self.profile_id = profile_id
        self.candidate_model_id = candidate_model_id
        # Insertion-ordered NDCG@10 values per arm (one entry per query id).
        self._active: list[float] = []
        self._candidate: list[float] = []
        # S9-defer H-ARC-01 (full): path of the DB holding the
        # ``shadow_observations`` table (M012), or None for in-memory only.
        self._learning_db: str | None = learning_db
        # Per-arm observation keyed by query id. Used to enforce route
        # exclusivity and to make repeated (query_id, arm) writes no-ops.
        self._active_by_qid: dict[str, float] = {}
        self._candidate_by_qid: dict[str, float] = {}
        if learning_db:
            self._reload_from_db()

    # ------------------------------------------------------------------
    # In-memory storage
    # ------------------------------------------------------------------

    def _store(self, *, arm: str, qid: str, ndcg: float) -> bool:
        """Store one observation in memory; return True when it was kept.

        An observation is dropped (returns False) when ``arm`` is unknown,
        when ``qid`` was already recorded on the OTHER arm (route-exclusivity
        violation, logged as a warning), or when ``qid`` was already recorded
        on the SAME arm (first write wins, mirroring ``INSERT OR IGNORE``).
        """
        if arm == "active":
            own_by_qid, own_values = self._active_by_qid, self._active
            other_by_qid, other_arm = self._candidate_by_qid, "candidate"
        elif arm == "candidate":
            own_by_qid, own_values = self._candidate_by_qid, self._candidate
            other_by_qid, other_arm = self._active_by_qid, "active"
        else:
            return False  # unknown arm: noop

        # S9-defer H-P-12: each query_id routes to exactly ONE arm. A
        # second arm for the same qid means the caller recorded both arms
        # or the router flipped mid-test; the first arm's observation wins.
        if qid in other_by_qid:
            logger.warning(
                "shadow_test route-exclusivity violation: "
                "qid=%s already on %s arm; ignoring %s write",
                qid, other_arm, arm,
            )
            return False
        if qid in own_by_qid:
            logger.debug(
                "shadow_test duplicate observation ignored: qid=%s arm=%s",
                qid, arm,
            )
            return False

        value = float(ndcg)
        own_values.append(value)
        own_by_qid[qid] = value
        return True

    # ------------------------------------------------------------------
    # Persistence (M012 / H-ARC-01 full)
    # ------------------------------------------------------------------

    def _reload_from_db(self) -> None:
        """Populate in-memory state from ``shadow_observations``.

        Fail-soft: a non-integer candidate id, an unopenable database, a
        missing table or a schema error leaves the instance in cold-start
        mode. Rows whose NDCG value cannot be converted are skipped.
        """
        import sqlite3 as _sq

        try:
            cid = int(self.candidate_model_id)
        except (TypeError, ValueError):
            return
        try:
            conn = _sq.connect(self._learning_db, timeout=2.0)  # type: ignore[arg-type]
        except Exception:  # pragma: no cover — defensive
            logger.debug("shadow_test reload: cannot open DB", exc_info=True)
            return
        try:
            try:
                # NOTE(review): rows are filtered by candidate_id only —
                # confirm candidate ids are unique across profiles.
                rows = conn.execute(
                    "SELECT arm, query_id, ndcg_at_10 "
                    "FROM shadow_observations "
                    "WHERE candidate_id = ? "
                    "ORDER BY recorded_at ASC",
                    (cid,),
                ).fetchall()
            except Exception:
                # Table absent — M012 not yet applied — or schema mismatch.
                logger.debug("shadow_test reload: query failed", exc_info=True)
                return
            for arm, qid, ndcg in rows:
                try:
                    self._store(arm=arm, qid=str(qid), ndcg=ndcg)
                except (TypeError, ValueError):
                    continue  # malformed ndcg value; skip the row
        finally:
            try:
                conn.close()
            except Exception:  # pragma: no cover
                pass

    def _persist_observation(
        self, *, query_id: str, arm: str, ndcg: float,
    ) -> None:
        """Append one observation to ``shadow_observations``. Fail-soft."""
        if not self._learning_db:
            return
        import sqlite3 as _sq
        from datetime import datetime, timezone

        try:
            cid = int(self.candidate_model_id)
        except (TypeError, ValueError):
            return
        try:
            now = datetime.now(timezone.utc).isoformat(timespec="seconds")
            conn = _sq.connect(self._learning_db, timeout=2.0)
            try:
                # INSERT OR IGNORE so crash-replay + duplicate observations
                # (same query_id, same arm) are idempotent.
                conn.execute(
                    "INSERT OR IGNORE INTO shadow_observations "
                    "(profile_id, candidate_id, query_id, arm, "
                    " ndcg_at_10, recorded_at) "
                    "VALUES (?, ?, ?, ?, ?, ?)",
                    (self.profile_id, cid, query_id, arm, float(ndcg), now),
                )
                conn.commit()
            finally:
                conn.close()
        except Exception:  # pragma: no cover — defensive
            # Best-effort by design: losing one persisted row must not
            # break the recall path, but leave a trace for debugging.
            logger.debug("shadow_test persist failed", exc_info=True)

    # ------------------------------------------------------------------
    # Routing
    # ------------------------------------------------------------------

    def route_query(self, query_id: str) -> str:
        """Deterministic 50/50 A/B route by SHA-256 first 8 hex chars.

        LLD-10 §4.1 — exact formula: ``int(hexdigest[:8], 16) % 2``.
        0 → ``'active'``, 1 → ``'candidate'``.

        SEC-L1 / assumption (daemon contract): ``query_id`` is minted by
        the recall pipeline (``recall_query_id``) and is NOT user-
        controllable — any change to that contract MUST re-audit this
        routing for collision / preimage bias.
        """
        h = hashlib.sha256(query_id.encode("utf-8")).hexdigest()[:8]
        bucket = int(h, 16) % 2
        return "candidate" if bucket == 1 else "active"

    # ------------------------------------------------------------------
    # Data ingestion
    # ------------------------------------------------------------------

    def record_recall_pair(
        self, *, query_id: str, arm: str, ndcg_at_10: float,
    ) -> None:
        """Record one settled recall result for the specified arm.

        ``arm`` must be ``'active'`` or ``'candidate'``; unknown arms are
        silently ignored. A ``query_id`` already recorded on the other arm
        is refused with a warning (route exclusivity), and a repeat of the
        same ``(query_id, arm)`` is ignored, so replays are idempotent.
        Accepted observations are persisted when a learning DB is set.
        """
        qid_s = str(query_id)
        if not self._store(arm=arm, qid=qid_s, ndcg=ndcg_at_10):
            return
        # S9-defer: persist so restart reloads.
        self._persist_observation(
            query_id=qid_s, arm=arm, ndcg=float(ndcg_at_10),
        )

    # ------------------------------------------------------------------
    # Decision
    # ------------------------------------------------------------------

    @staticmethod
    def _welch_t_stat(
        active: list[float], candidate: list[float],
    ) -> tuple[float, float, float]:
        """Return ``(effect, std, t_stat)`` for two independent samples.

        ``effect`` is ``mean(candidate) - mean(active)``. ``std`` is
        ``sqrt(var_candidate + var_active)`` (ddof=1 each), i.e. the
        standard deviation of the difference of one draw from each arm.
        ``t_stat`` is Welch's statistic; with zero standard error it is
        ``±inf`` by the sign of ``effect`` (``0.0`` when ``effect`` is 0).
        Both inputs must be non-empty.
        """
        def _mean_var(values: list[float]) -> tuple[float, float]:
            count = len(values)
            mean = math.fsum(values) / count
            if count < 2:
                return mean, 0.0
            return mean, math.fsum((v - mean) ** 2 for v in values) / (count - 1)

        mean_a, var_a = _mean_var(active)
        mean_c, var_c = _mean_var(candidate)
        effect = mean_c - mean_a
        std = math.sqrt(var_a + var_c)
        std_err = math.sqrt(var_a / len(active) + var_c / len(candidate))
        if std_err == 0.0:
            t_stat = math.copysign(math.inf, effect) if effect != 0 else 0.0
        else:
            t_stat = effect / std_err
        return effect, std, t_stat

    def decide(self) -> tuple[str, dict]:
        """Return ``(decision, stats)``.

        ``decision``:
          * ``'promote'``  — candidate beat active by ≥ MIN_EFFECT with
            sufficient statistical evidence (or by > MIN_STRONG_EFFECT at
            the much stricter Phase A level).
          * ``'reject'``   — full Phase B accumulated and criterion not met.
          * ``'continue'`` — insufficient data to decide either way.

        ``stats`` is a plain dict for logging / dashboard / audit with keys
        ``n_active``, ``n_candidate``, ``n_pairs`` (the smaller arm's
        count, which gates the phases), ``effect``, ``t_stat``, ``std``,
        ``phase`` and ``criterion``.
        """
        n_active = len(self._active)
        n_cand = len(self._candidate)
        # Arms are disjoint by query id, so the sample-size contract is
        # expressed through the smaller arm.
        n_pairs = min(n_active, n_cand)
        stats: dict = {
            "n_active": n_active,
            "n_candidate": n_cand,
            "n_pairs": n_pairs,
            "effect": 0.0,
            "t_stat": 0.0,
            "std": 0.0,
            "phase": "A" if n_pairs < self.PHASE_B_N else "B",
            "criterion": None,
        }

        if n_pairs == 0:
            return "continue", stats

        # S-M03: guard against significant arm imbalance. SHA-256 routing
        # is approximately 50/50 in expectation, but small samples can
        # skew. When one arm is more than 2× the other and both have a
        # minimal footprint, tell operators instead of deciding.
        min_per_arm = 8
        if n_pairs >= min_per_arm and max(n_active, n_cand) > 2 * n_pairs:
            stats["criterion"] = "unbalanced_arms"
            return "continue", stats

        effect, std, t_stat = self._welch_t_stat(self._active, self._candidate)
        stats["effect"] = float(effect)
        stats["std"] = float(std)
        stats["t_stat"] = float(t_stat)

        # min(n_active, n_candidate) - 1 never exceeds the
        # Welch–Satterthwaite degrees of freedom, so it is a conservative
        # choice for the critical value.
        df = n_pairs - 1

        if n_pairs < self.PHASE_A_N:
            stats["phase"] = "A"
            stats["criterion"] = "accumulating"
            return "continue", stats

        # --- Phase A early-stop on STRONG signal ---
        if n_pairs < self.PHASE_B_N:
            crit_strong = _critical_t(df, alpha=self.ALPHA_STRONG)
            stats["phase"] = "A"
            # Only a positive effect can promote; t_stat carries the sign
            # of effect, so no abs() is needed.
            if effect > self.MIN_STRONG_EFFECT and t_stat > crit_strong:
                stats["criterion"] = "phase_a_strong_signal"
                return "promote", stats
            # Weak or uncertain signal — continue to Phase B.
            stats["criterion"] = "phase_a_continue"
            return "continue", stats

        # --- Phase B full validation ---
        # S-L05: ``t_stat > crit`` is a one-tailed "candidate better than
        # active" test while ``_critical_t`` returns a TWO-tailed critical
        # value, so α×2 is passed to obtain the one-sided critical value at
        # level ALPHA.
        crit = _critical_t(df, alpha=min(0.999, self.ALPHA * 2.0))
        stats["phase"] = "B"
        if effect >= self.MIN_EFFECT and t_stat > crit:
            stats["criterion"] = "phase_b_promote"
            return "promote", stats
        stats["criterion"] = "phase_b_reject"
        return "reject", stats
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
__all__ = ("ShadowTest",)
|