superlocalmemory 3.4.19 → 3.4.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. package/CHANGELOG.md +24 -0
  2. package/README.md +42 -34
  3. package/bin/slm +11 -0
  4. package/bin/slm.bat +12 -0
  5. package/package.json +4 -3
  6. package/pyproject.toml +4 -3
  7. package/scripts/build-slm-hook.ps1 +40 -0
  8. package/scripts/build-slm-hook.sh +45 -0
  9. package/scripts/build_entry.py +452 -0
  10. package/scripts/ci/stage5b_gate.sh +50 -0
  11. package/scripts/postinstall/validation.js +187 -0
  12. package/scripts/postinstall-interactive.js +756 -0
  13. package/scripts/postinstall_binary.js +287 -0
  14. package/scripts/release_manifest.py +273 -0
  15. package/scripts/slm-hook.spec +56 -0
  16. package/skills/slm-build-graph/SKILL.md +423 -0
  17. package/skills/slm-list-recent/SKILL.md +348 -0
  18. package/skills/slm-recall/SKILL.md +343 -0
  19. package/skills/slm-remember/SKILL.md +194 -0
  20. package/skills/slm-show-patterns/SKILL.md +224 -0
  21. package/skills/slm-status/SKILL.md +363 -0
  22. package/skills/slm-switch-profile/SKILL.md +442 -0
  23. package/src/superlocalmemory/cli/commands.py +254 -79
  24. package/src/superlocalmemory/cli/context_commands.py +192 -0
  25. package/src/superlocalmemory/cli/daemon.py +15 -1
  26. package/src/superlocalmemory/cli/db_migrate.py +80 -0
  27. package/src/superlocalmemory/cli/escape_hatch.py +220 -0
  28. package/src/superlocalmemory/cli/main.py +72 -1
  29. package/src/superlocalmemory/core/context_cache.py +397 -0
  30. package/src/superlocalmemory/core/engine.py +38 -2
  31. package/src/superlocalmemory/core/engine_wiring.py +1 -1
  32. package/src/superlocalmemory/core/ram_lock.py +111 -0
  33. package/src/superlocalmemory/core/recall_pipeline.py +433 -3
  34. package/src/superlocalmemory/core/recall_worker.py +8 -3
  35. package/src/superlocalmemory/core/security_primitives.py +635 -0
  36. package/src/superlocalmemory/core/shadow_router.py +319 -0
  37. package/src/superlocalmemory/core/slm_disabled.py +87 -0
  38. package/src/superlocalmemory/core/slmignore.py +125 -0
  39. package/src/superlocalmemory/core/topic_signature.py +143 -0
  40. package/src/superlocalmemory/core/worker_pool.py +14 -3
  41. package/src/superlocalmemory/encoding/cognitive_consolidator.py +2 -2
  42. package/src/superlocalmemory/evolution/budget.py +321 -0
  43. package/src/superlocalmemory/evolution/llm_dispatch.py +508 -0
  44. package/src/superlocalmemory/evolution/skill_evolver.py +144 -94
  45. package/src/superlocalmemory/hooks/_outcome_common.py +506 -0
  46. package/src/superlocalmemory/hooks/adapter_base.py +317 -0
  47. package/src/superlocalmemory/hooks/antigravity_adapter.py +192 -0
  48. package/src/superlocalmemory/hooks/claude_code_hooks.py +33 -1
  49. package/src/superlocalmemory/hooks/context_payload.py +312 -0
  50. package/src/superlocalmemory/hooks/copilot_adapter.py +154 -0
  51. package/src/superlocalmemory/hooks/cross_platform_connector.py +90 -0
  52. package/src/superlocalmemory/hooks/cursor_adapter.py +195 -0
  53. package/src/superlocalmemory/hooks/hook_handlers.py +109 -8
  54. package/src/superlocalmemory/hooks/ide_connector.py +25 -2
  55. package/src/superlocalmemory/hooks/post_tool_async_hook.py +165 -0
  56. package/src/superlocalmemory/hooks/post_tool_outcome_hook.py +223 -0
  57. package/src/superlocalmemory/hooks/prewarm_auth.py +170 -0
  58. package/src/superlocalmemory/hooks/session_registry.py +186 -0
  59. package/src/superlocalmemory/hooks/stop_outcome_hook.py +134 -0
  60. package/src/superlocalmemory/hooks/sync_loop.py +114 -0
  61. package/src/superlocalmemory/hooks/user_prompt_hook.py +128 -0
  62. package/src/superlocalmemory/hooks/user_prompt_rehash_hook.py +202 -0
  63. package/src/superlocalmemory/infra/backup.py +3 -3
  64. package/src/superlocalmemory/infra/cloud_backup.py +2 -2
  65. package/src/superlocalmemory/infra/event_bus.py +2 -2
  66. package/src/superlocalmemory/infra/webhook_dispatcher.py +3 -3
  67. package/src/superlocalmemory/learning/arm_catalog.py +99 -0
  68. package/src/superlocalmemory/learning/bandit.py +526 -0
  69. package/src/superlocalmemory/learning/bandit_cache.py +133 -0
  70. package/src/superlocalmemory/learning/behavioral.py +53 -1
  71. package/src/superlocalmemory/learning/consolidation_cycle.py +381 -0
  72. package/src/superlocalmemory/learning/consolidation_worker.py +188 -520
  73. package/src/superlocalmemory/learning/database.py +256 -0
  74. package/src/superlocalmemory/learning/dedup_hnsw.py +413 -0
  75. package/src/superlocalmemory/learning/ensemble.py +300 -0
  76. package/src/superlocalmemory/learning/fact_outcome_joins.py +207 -0
  77. package/src/superlocalmemory/learning/forgetting_scheduler.py +55 -0
  78. package/src/superlocalmemory/learning/hnsw_dedup.py +69 -0
  79. package/src/superlocalmemory/learning/labeler.py +87 -0
  80. package/src/superlocalmemory/learning/legacy_migration.py +277 -0
  81. package/src/superlocalmemory/learning/memory_merge.py +160 -0
  82. package/src/superlocalmemory/learning/model_cache.py +269 -0
  83. package/src/superlocalmemory/learning/model_rollback.py +278 -0
  84. package/src/superlocalmemory/learning/outcome_queue.py +284 -0
  85. package/src/superlocalmemory/learning/pattern_miner.py +415 -0
  86. package/src/superlocalmemory/learning/pattern_miner_constants.py +47 -0
  87. package/src/superlocalmemory/learning/ranker.py +225 -81
  88. package/src/superlocalmemory/learning/ranker_common.py +163 -0
  89. package/src/superlocalmemory/learning/ranker_retrain_legacy.py +202 -0
  90. package/src/superlocalmemory/learning/ranker_retrain_online.py +411 -0
  91. package/src/superlocalmemory/learning/reward.py +777 -0
  92. package/src/superlocalmemory/learning/reward_archive.py +210 -0
  93. package/src/superlocalmemory/learning/reward_boost.py +201 -0
  94. package/src/superlocalmemory/learning/reward_proxy.py +326 -0
  95. package/src/superlocalmemory/learning/shadow_test.py +524 -0
  96. package/src/superlocalmemory/learning/signal_worker.py +270 -0
  97. package/src/superlocalmemory/learning/signals.py +314 -0
  98. package/src/superlocalmemory/learning/trigram_index.py +547 -0
  99. package/src/superlocalmemory/mcp/server.py +5 -5
  100. package/src/superlocalmemory/mcp/tools_context.py +183 -0
  101. package/src/superlocalmemory/mcp/tools_core.py +92 -27
  102. package/src/superlocalmemory/parameterization/soft_prompt_generator.py +13 -0
  103. package/src/superlocalmemory/retrieval/engine.py +52 -0
  104. package/src/superlocalmemory/server/api.py +2 -2
  105. package/src/superlocalmemory/server/bandit_loops.py +140 -0
  106. package/src/superlocalmemory/server/middleware/__init__.py +11 -0
  107. package/src/superlocalmemory/server/middleware/security_headers.py +144 -0
  108. package/src/superlocalmemory/server/routes/backup.py +36 -13
  109. package/src/superlocalmemory/server/routes/behavioral.py +50 -19
  110. package/src/superlocalmemory/server/routes/brain.py +1234 -0
  111. package/src/superlocalmemory/server/routes/data_io.py +4 -4
  112. package/src/superlocalmemory/server/routes/events.py +2 -2
  113. package/src/superlocalmemory/server/routes/helpers.py +1 -1
  114. package/src/superlocalmemory/server/routes/learning.py +192 -7
  115. package/src/superlocalmemory/server/routes/memories.py +189 -1
  116. package/src/superlocalmemory/server/routes/prewarm.py +171 -0
  117. package/src/superlocalmemory/server/routes/profiles.py +3 -3
  118. package/src/superlocalmemory/server/routes/token.py +88 -0
  119. package/src/superlocalmemory/server/routes/ws.py +5 -5
  120. package/src/superlocalmemory/server/security_middleware.py +13 -7
  121. package/src/superlocalmemory/server/ui.py +2 -2
  122. package/src/superlocalmemory/server/unified_daemon.py +335 -3
  123. package/src/superlocalmemory/skills/slm-build-graph/SKILL.md +423 -0
  124. package/src/superlocalmemory/skills/slm-list-recent/SKILL.md +348 -0
  125. package/src/superlocalmemory/skills/slm-recall/SKILL.md +343 -0
  126. package/src/superlocalmemory/skills/slm-remember/SKILL.md +194 -0
  127. package/src/superlocalmemory/skills/slm-show-patterns/SKILL.md +224 -0
  128. package/src/superlocalmemory/skills/slm-status/SKILL.md +363 -0
  129. package/src/superlocalmemory/skills/slm-switch-profile/SKILL.md +442 -0
  130. package/src/superlocalmemory/storage/migration_runner.py +545 -0
  131. package/src/superlocalmemory/storage/migrations/M001_add_signal_features_columns.py +67 -0
  132. package/src/superlocalmemory/storage/migrations/M002_model_state_history.py +132 -0
  133. package/src/superlocalmemory/storage/migrations/M003_migration_log.py +38 -0
  134. package/src/superlocalmemory/storage/migrations/M004_cross_platform_sync_log.py +46 -0
  135. package/src/superlocalmemory/storage/migrations/M005_bandit_tables.py +75 -0
  136. package/src/superlocalmemory/storage/migrations/M006_action_outcomes_reward.py +75 -0
  137. package/src/superlocalmemory/storage/migrations/M007_pending_outcomes.py +63 -0
  138. package/src/superlocalmemory/storage/migrations/M009_model_lineage.py +54 -0
  139. package/src/superlocalmemory/storage/migrations/M010_evolution_config.py +75 -0
  140. package/src/superlocalmemory/storage/migrations/M011_archive_and_merge.py +87 -0
  141. package/src/superlocalmemory/storage/migrations/M012_shadow_observations.py +72 -0
  142. package/src/superlocalmemory/storage/migrations/M013_bi_temporal_columns.py +55 -0
  143. package/src/superlocalmemory/storage/migrations/__init__.py +81 -0
  144. package/src/superlocalmemory/storage/models.py +4 -0
  145. package/src/superlocalmemory/ui/css/brain.css +409 -0
  146. package/src/superlocalmemory/ui/css/legacy-dashboard.css +645 -0
  147. package/src/superlocalmemory/ui/index.html +459 -1345
  148. package/src/superlocalmemory/ui/js/brain.js +1321 -0
  149. package/src/superlocalmemory/ui/js/clusters.js +123 -4
  150. package/src/superlocalmemory/ui/js/init.js +48 -39
  151. package/src/superlocalmemory/ui/js/memories.js +88 -2
  152. package/src/superlocalmemory/ui/js/modal.js +71 -1
  153. package/src/superlocalmemory/ui/js/ng-shell.js +101 -88
  154. package/src/superlocalmemory/ui/js/trust-dashboard.js +168 -25
  155. package/src/superlocalmemory/ui/vendor/bootstrap-icons/bootstrap-icons.css +2018 -0
  156. package/src/superlocalmemory/ui/vendor/bootstrap-icons/fonts/bootstrap-icons.woff +0 -0
  157. package/src/superlocalmemory/ui/vendor/bootstrap-icons/fonts/bootstrap-icons.woff2 +0 -0
  158. package/src/superlocalmemory/ui/vendor/bootstrap.bundle.min.js +7 -0
  159. package/src/superlocalmemory/ui/vendor/bootstrap.min.css +6 -0
  160. package/src/superlocalmemory/ui/vendor/d3.v7.min.js +2 -0
  161. package/src/superlocalmemory/ui/vendor/graphology-library.min.js +2 -0
  162. package/src/superlocalmemory/ui/vendor/graphology.umd.min.js +2 -0
  163. package/src/superlocalmemory/ui/vendor/inter-ui/inter-variable.min.css +8 -0
  164. package/src/superlocalmemory/ui/vendor/inter-ui/variable/InterVariable-Italic.woff2 +0 -0
  165. package/src/superlocalmemory/ui/vendor/inter-ui/variable/InterVariable.woff2 +0 -0
  166. package/src/superlocalmemory/ui/vendor/sigma.min.js +1 -0
  167. package/src/superlocalmemory/ui/js/behavioral.js +0 -447
  168. package/src/superlocalmemory/ui/js/graph-core.js +0 -447
  169. package/src/superlocalmemory/ui/js/graph-interactions.js +0 -351
  170. package/src/superlocalmemory/ui/js/learning.js +0 -435
  171. package/src/superlocalmemory/ui/js/patterns.js +0 -93
  172. package/src/superlocalmemory.egg-info/PKG-INFO +0 -647
  173. package/src/superlocalmemory.egg-info/SOURCES.txt +0 -335
  174. package/src/superlocalmemory.egg-info/dependency_links.txt +0 -1
  175. package/src/superlocalmemory.egg-info/entry_points.txt +0 -2
  176. package/src/superlocalmemory.egg-info/requires.txt +0 -58
  177. package/src/superlocalmemory.egg-info/top_level.txt +0 -1
@@ -0,0 +1,524 @@
1
+ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
2
+ # Licensed under AGPL-3.0-or-later - see LICENSE file
3
+ # Part of SuperLocalMemory v3.4.22 — Track A.3 (LLD-10 / LLD-00 §8)
4
+
5
+ """Two-phase live-recall A/B shadow validator (LLD-10 §4 + LLD-00 §8).
6
+
7
+ Phase A (n=100, fast triage):
8
+ Early-stop ``promote`` ONLY if ``|effect| > MIN_STRONG_EFFECT`` AND
9
+ ``p < ALPHA_STRONG`` (strong signal path). Otherwise Phase B must
10
+ accumulate further paired recalls.
11
+
12
+ Phase B (n=885, full validation):
13
+ Bayesian-conservative sample size for σ=0.15, MDE=0.02, power 0.8,
14
+ two-sided α=0.05. Criterion: mean paired diff ≥ MIN_EFFECT AND
15
+ paired t-test p<0.05.
16
+
17
+ This module is a PURE state machine — no DB, no lightgbm, no network.
18
+ Tests in ``tests/test_learning/test_shadow_test.py`` exercise it.
19
+
20
+ Deterministic A/B routing: ``route_query(qid)`` returns ``'active'`` or
21
+ ``'candidate'`` by SHA-256 first-8-hex-char modulo-2. Bit-exact
22
+ reproducible across daemon restart (LLD-10 §4.1).
23
+
24
+ scipy is optional: when ``scipy.stats.t`` is importable, ``_critical_t``
25
+ uses its ``ppf``; otherwise it falls back to the scipy-free critical-t
26
+ tables defined below (linear interpolation between rows). No
27
+ hardcoded ``z≈1.96`` / ``t > 2.0`` shortcut is applied in this module.
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import hashlib
33
+ import logging
34
+ import math
35
+ from typing import Any, Final, Optional
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ # S9-SKEP-01: resolve scipy.stats.t ONCE at module load, not on every
41
+ # _critical_t call. Prior ``try: import`` at call-sites paid ~microsecond
42
+ # lookup per invocation (cached via sys.modules, but not free) and the
43
+ # bare ``except Exception`` silently swallowed ValueError/FloatingPointError
44
+ # from scipy.stats.t.ppf itself — exactly the "early-stop more permissive
45
+ # than α=0.01" defect the table interpolation was supposed to fix.
46
+ #
47
+ # After this cache:
48
+ # * ImportError/ModuleNotFoundError on first import → fall through to
49
+ # the table permanently.
50
+ # * Present-but-broken scipy (corrupt install, bad C-ext) → we still
51
+ # import it; errors in .ppf() propagate on the FIRST call and the
52
+ # caller sees it (not swallowed).
53
+ _SCIPY_T: Optional[Any]
54
+ try:
55
+ from scipy.stats import t as _scipy_t # type: ignore[import-not-found]
56
+ _SCIPY_T = _scipy_t
57
+ except (ImportError, ModuleNotFoundError):
58
+ _SCIPY_T = None
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Two-phase parameters — LLD-00 §8, LLD-10 §4.5
63
+ # ---------------------------------------------------------------------------
64
+
65
#: Phase A sample size (per LLD-00 §8 fast triage).
_PHASE_A_N: Final[int] = 100

#: Phase B sample size (statistical power for MDE=0.02 MRR at σ=0.15).
_PHASE_B_N: Final[int] = 885

#: Minimum acceptable mean paired improvement to promote (LLD-10 §4.5).
_MIN_EFFECT: Final[float] = 0.02

#: Phase A "strong signal" early-stop threshold: the mean paired effect
#: must exceed 0.08 AND the t-test must pass at ``_ALPHA_STRONG``
#: (currently 0.001).
_MIN_STRONG_EFFECT: Final[float] = 0.08


#: Significance level for Phase B (LLD-10 §4.5 + LLD-00 §8).
#: S9-defer S9-STAT-07: a two-look sequential design needs alpha spending.
#: With uncorrected looks at α=0.01 and α=0.05 the family-wise
#: false-promote probability was 1 - (1-0.01)(1-0.05) ≈ 0.0595 rather
#: than the advertised 0.05.
#: The budget is now split unevenly across the two looks: 0.001 for
#: Phase A and 0.049 for Phase B, so by the union bound the family-wise
#: α is at most 0.001 + 0.049 = 0.05.
#: For comparison, the equal-per-look Pocock boundary for a 2-look design
#: at overall α=0.05 is 0.0294; the split used here is deliberately more
#: lopsided so the first look acts as a strong filter rather than a
#: significant contribution to the family-wise α.
_ALPHA: Final[float] = 0.049

#: Tighter significance level for Phase A early-stop (LLD-00 §8).
#: The first look only fires on VERY strong evidence so the second look
#: retains nearly the full α budget.
_ALPHA_STRONG: Final[float] = 0.001
94
+
95
+
96
+ # ---------------------------------------------------------------------------
97
+ # Critical-t table — two-tailed (degrees of freedom → critical t)
98
+ #
99
+ # Stage 8 F4.B / H-02 (skeptic H-01) fix:
100
+ # The previous table had sparse rows (5, 10, 15, 20, 25, 30, 40, 60, 120)
101
+ # and a lookup that returned the critical-t of the next row AT OR ABOVE
102
+ # the requested df. For df values between rows (e.g. df=99, df=49, df=9)
103
+ # that returned a value LOWER than the true critical-t, making Phase A's
104
+ # strong-signal early-stop more permissive than the α=0.01 contract
105
+ # claims — i.e. the guard against promoting on noise was weaker than
106
+ # advertised.
107
+ #
108
+ # Fix applied here:
109
+ # 1. Dense rows for df=1..30 (every integer — the regime where the
110
+ # t-distribution is most non-linear and small errors hurt most).
111
+ # 2. Standard thinning for df=40, 50, 60, 80, 100, 120, 200, 10000 where
112
+ # the function is nearly flat.
113
+ # 3. Linear interpolation between rows for any df not in the table.
114
+ # 4. Optional ``scipy.stats.t.ppf`` preference when scipy is importable —
115
+ # this is already a transitive dep of lightgbm-learner, so when
116
+ # present we use it and skip the table entirely.
117
+ #
118
+ # All table values were cross-verified against scipy.stats.t.ppf within
119
+ # ±0.001 (no check runs at module import time). See tests/test_learning/test_shadow_test.py
120
+ # (test_critical_t_matches_scipy_reference) for the regression guard.
121
+ # ---------------------------------------------------------------------------
122
+
123
# Two-tailed α=0.05 table. Each entry is ``(df, t)`` where ``t`` is the
# value with P(|T_df| > t) = 0.05. Rows are sorted by ascending df; the
# final df=10_000 row carries the normal-limit value 1.960 and acts as
# the cap for any larger df. Consulted by ``_critical_t`` only when scipy
# is unavailable.
_CRIT_T_05_TWO_TAIL: Final[tuple[tuple[int, float], ...]] = (
    (1, 12.706), (2, 4.303), (3, 3.182), (4, 2.776), (5, 2.571),
    (6, 2.447), (7, 2.365), (8, 2.306), (9, 2.262), (10, 2.228),
    (11, 2.201), (12, 2.179), (13, 2.160), (14, 2.145), (15, 2.131),
    (16, 2.120), (17, 2.110), (18, 2.101), (19, 2.093), (20, 2.086),
    (21, 2.080), (22, 2.074), (23, 2.069), (24, 2.064), (25, 2.060),
    (26, 2.056), (27, 2.052), (28, 2.048), (29, 2.045), (30, 2.042),
    (40, 2.021), (50, 2.009), (60, 2.000), (80, 1.990), (100, 1.984),
    (120, 1.980), (200, 1.972), (10_000, 1.960),
)

#: Tighter α=0.01 table (two-tailed) for Phase A early-stop.
#: Same ``(df, t)`` layout and df rows as the α=0.05 table; the final
#: row carries the normal-limit value 2.576.
_CRIT_T_01_TWO_TAIL: Final[tuple[tuple[int, float], ...]] = (
    (1, 63.657), (2, 9.925), (3, 5.841), (4, 4.604), (5, 4.032),
    (6, 3.707), (7, 3.499), (8, 3.355), (9, 3.250), (10, 3.169),
    (11, 3.106), (12, 3.055), (13, 3.012), (14, 2.977), (15, 2.947),
    (16, 2.921), (17, 2.898), (18, 2.878), (19, 2.861), (20, 2.845),
    (21, 2.831), (22, 2.819), (23, 2.807), (24, 2.797), (25, 2.787),
    (26, 2.779), (27, 2.771), (28, 2.763), (29, 2.756), (30, 2.750),
    (40, 2.704), (50, 2.678), (60, 2.660), (80, 2.639), (100, 2.626),
    (120, 2.617), (200, 2.601), (10_000, 2.576),
)
145
+
146
+
147
+ def _critical_t(df: int, *, alpha: float) -> float:
148
+ """Return the two-tailed critical t for ``df`` degrees of freedom.
149
+
150
+ Preference order:
151
+ 1. ``scipy.stats.t.ppf(1 - alpha/2, df)`` when scipy is importable.
152
+ 2. Exact tabled value when ``df`` is a table row.
153
+ 3. Linear interpolation between adjacent table rows otherwise.
154
+
155
+ For ``df ≤ 0`` returns ``inf`` (caller's ``|t| > inf`` is always
156
+ False; no early-stop).
157
+ """
158
+ if df <= 0:
159
+ return float("inf")
160
+
161
+ # Preference 1 — scipy, when importable (cached at module load).
162
+ # S9-SKEP-01: no silent `except Exception`. If scipy is present but
163
+ # .ppf() raises (corrupt install, NaN propagation), we let the
164
+ # error surface so callers see it; silently falling back to the
165
+ # table was the original bug that led to false-promote on noise.
166
+ if _SCIPY_T is not None:
167
+ return float(_SCIPY_T.ppf(1.0 - alpha / 2.0, df))
168
+
169
+ table = (
170
+ _CRIT_T_05_TWO_TAIL
171
+ if abs(alpha - 0.05) < 1e-9
172
+ else _CRIT_T_01_TWO_TAIL
173
+ )
174
+
175
+ # Preference 2 + 3 — exact row match or linear interpolation.
176
+ prev_df, prev_t = table[0]
177
+ if df <= prev_df:
178
+ return prev_t
179
+ for row_df, row_t in table[1:]:
180
+ if df == row_df:
181
+ return row_t
182
+ if df < row_df:
183
+ # Linear interpolation in df space — adequate at the
184
+ # resolution we keep (every integer for df≤30).
185
+ span = row_df - prev_df
186
+ frac = (df - prev_df) / span
187
+ return prev_t + frac * (row_t - prev_t)
188
+ prev_df, prev_t = row_df, row_t
189
+ return prev_t
190
+
191
+
192
+ def _paired_t_stat(diffs: list[float]) -> tuple[float, float, float]:
193
+ """Return ``(mean, std_sample, t_stat)`` for a sequence of paired
194
+ differences. ``std_sample`` uses ddof=1. When ``len(diffs) < 2`` or
195
+ ``std == 0``, ``t_stat`` is ``inf`` if mean>0 else ``-inf``.
196
+ """
197
+ n = len(diffs)
198
+ if n == 0:
199
+ return 0.0, 0.0, 0.0
200
+ mean = sum(diffs) / n
201
+ if n < 2:
202
+ return mean, 0.0, math.copysign(math.inf, mean) if mean != 0 else 0.0
203
+ var = sum((d - mean) ** 2 for d in diffs) / (n - 1)
204
+ std = math.sqrt(var)
205
+ if std == 0.0:
206
+ return mean, 0.0, math.copysign(math.inf, mean) if mean != 0 else 0.0
207
+ t_stat = mean / (std / math.sqrt(n))
208
+ return mean, std, t_stat
209
+
210
+
211
+ # ---------------------------------------------------------------------------
212
+ # ShadowTest
213
+ # ---------------------------------------------------------------------------
214
+
215
+
216
class ShadowTest:
    """Two-phase live-recall A/B validator.

    Callers:
      1. Route each incoming recall with ``route_query(qid)`` →
         ``'active'`` | ``'candidate'``. Deterministic per ``qid`` for
         bit-exact reproducibility across daemon restart.
      2. After each recall's outcome settles, call
         ``record_recall_pair(query_id=..., arm=..., ndcg_at_10=...)``.
      3. Call ``decide()`` to get one of ``'promote' | 'reject' | 'continue'``.

    NOTE(review): ``record_recall_pair`` refuses to store one ``query_id``
    on both arms, while ``decide`` only analyses ``query_id``s present on
    BOTH arms. With in-memory ingestion alone the paired set is therefore
    always empty and ``decide`` can only return ``'continue'``; pairs can
    exist only if ``shadow_observations`` rows loaded at start-up contain
    both arms for a query. ``decide`` reports this state as
    ``criterion='no_common_query_ids'``. Which of the two contracts should
    yield is a design decision that is not visible from this module —
    confirm against LLD-10.
    """

    # Exposed for tests + manifest cross-reference.
    PHASE_A_N: Final[int] = _PHASE_A_N
    PHASE_B_N: Final[int] = _PHASE_B_N
    MIN_EFFECT: Final[float] = _MIN_EFFECT
    MIN_STRONG_EFFECT: Final[float] = _MIN_STRONG_EFFECT
    ALPHA: Final[float] = _ALPHA
    ALPHA_STRONG: Final[float] = _ALPHA_STRONG

    def __init__(
        self,
        profile_id: str,
        candidate_model_id: str,
        *,
        learning_db: str | None = None,
    ) -> None:
        """Create a validator for one candidate model.

        Args:
            profile_id: stored alongside every persisted observation.
            candidate_model_id: candidate identifier; persistence and
                reload are skipped unless it converts with ``int()``.
            learning_db: optional SQLite path. When truthy, earlier
                observations for this candidate are reloaded from
                ``shadow_observations`` and new ones are appended to it.
                When omitted the instance is purely in-memory.
        """
        self.profile_id = profile_id
        self.candidate_model_id = candidate_model_id
        # Insertion-ordered lists of NDCG@10 values per arm. Their
        # lengths feed the arm-balance guard in ``decide``.
        self._active: list[float] = []
        self._candidate: list[float] = []
        # S9-defer H-ARC-01 (full): optional persistence target (M012).
        self._learning_db: str | None = learning_db
        # S9-defer S9-STAT-08: observations keyed by query_id per arm.
        # ``decide`` pairs on the intersection of the two key sets.
        self._active_by_qid: dict[str, float] = {}
        self._candidate_by_qid: dict[str, float] = {}
        if learning_db:
            self._reload_from_db()

    # ------------------------------------------------------------------
    # Persistence (M012 / H-ARC-01 full)
    # ------------------------------------------------------------------

    def _reload_from_db(self) -> None:
        """Populate in-memory state from ``shadow_observations`` on
        daemon restart.

        Fail-soft: a non-integer candidate id, an unopenable database, a
        failing query (e.g. the table is absent) or an unusable row
        leaves the instance with whatever was loaded so far and never
        raises. Rows whose ``ndcg_at_10`` is NULL, non-numeric or
        non-finite are skipped so one bad row cannot abort construction
        or poison the statistics.

        NOTE(review): the query filters on ``candidate_id`` only, not on
        ``profile_id`` — presumably candidate ids are unique across
        profiles; confirm against the M012 schema.
        """
        try:
            import sqlite3 as _sq
            cid = int(self.candidate_model_id)
        except Exception:
            logger.debug(
                "shadow_test reload skipped: sqlite3 unavailable or "
                "non-integer candidate id",
                exc_info=True,
            )
            return
        try:
            conn = _sq.connect(self._learning_db, timeout=2.0)  # type: ignore[arg-type]
        except Exception:  # pragma: no cover — defensive
            logger.debug("shadow_test reload: cannot open DB", exc_info=True)
            return
        try:
            try:
                rows = conn.execute(
                    "SELECT arm, query_id, ndcg_at_10 "
                    "FROM shadow_observations "
                    "WHERE candidate_id = ? "
                    "ORDER BY recorded_at ASC",
                    (cid,),
                ).fetchall()
            except Exception:
                # Typically: table absent — M012 not yet applied.
                logger.debug("shadow_test reload: query failed", exc_info=True)
                return
            for arm, qid, ndcg in rows:
                try:
                    value = float(ndcg)
                except (TypeError, ValueError):
                    continue
                if not math.isfinite(value):
                    continue
                if arm == "active":
                    self._active.append(value)
                    self._active_by_qid[str(qid)] = value
                elif arm == "candidate":
                    self._candidate.append(value)
                    self._candidate_by_qid[str(qid)] = value
        finally:
            try:
                conn.close()
            except Exception:  # pragma: no cover
                pass

    def _persist_observation(
        self, *, query_id: str, arm: str, ndcg: float,
    ) -> None:
        """Append one observation to ``shadow_observations``. Fail-soft.

        Does nothing when no ``learning_db`` was configured or when
        ``candidate_model_id`` is not an integer. Any database error is
        logged at debug level and swallowed: persistence is best-effort
        and must not break recall handling.
        """
        if not self._learning_db:
            return
        try:
            import sqlite3 as _sq
            cid = int(self.candidate_model_id)
        except Exception:
            logger.debug(
                "shadow_test persist skipped: sqlite3 unavailable or "
                "non-integer candidate id",
                exc_info=True,
            )
            return
        try:
            from datetime import datetime, timezone
            now = datetime.now(timezone.utc).isoformat(timespec="seconds")
            conn = _sq.connect(self._learning_db, timeout=2.0)
            try:
                # INSERT OR IGNORE so crash-replay + duplicate observations
                # (same query_id, same arm) are idempotent.
                # NOTE(review): that relies on a uniqueness constraint in
                # the M012 schema, which is not visible here.
                conn.execute(
                    "INSERT OR IGNORE INTO shadow_observations "
                    "(profile_id, candidate_id, query_id, arm, "
                    " ndcg_at_10, recorded_at) "
                    "VALUES (?, ?, ?, ?, ?, ?)",
                    (self.profile_id, cid, query_id, arm, float(ndcg), now),
                )
                conn.commit()
            finally:
                conn.close()
        except Exception:  # pragma: no cover — defensive
            logger.debug("shadow_test persist failed", exc_info=True)

    # ------------------------------------------------------------------
    # Routing
    # ------------------------------------------------------------------

    def route_query(self, query_id: str) -> str:
        """Deterministic 50/50 A/B route by SHA-256 first 8 hex chars.

        LLD-10 §4.1 — exact formula: ``int(hexdigest[:8], 16) % 2``.
        0 → ``'active'``, 1 → ``'candidate'``.

        SEC-L1 / assumption (daemon contract): ``query_id`` is minted by
        the recall pipeline (``recall_query_id``) and is NOT user-
        controllable — any change to that contract MUST re-audit this
        routing for collision / preimage bias. The current 32-bit hash
        prefix is adequate because pairing validity (Phase A/B t-test)
        degrades gracefully under skew (n_pairs shrinks) rather than
        producing a one-sided false promotion.
        """
        h = hashlib.sha256(query_id.encode("utf-8")).hexdigest()[:8]
        bucket = int(h, 16) % 2
        return "candidate" if bucket == 1 else "active"

    # ------------------------------------------------------------------
    # Data ingestion
    # ------------------------------------------------------------------

    def record_recall_pair(
        self, *, query_id: str, arm: str, ndcg_at_10: float,
    ) -> None:
        """Record one settled recall result for the specified arm.

        ``arm`` must be ``'active'`` or ``'candidate'``. Unknown arms
        are silently ignored — the outcome is not our business to
        police (callers may test routing bugs by feeding a mix).

        Two kinds of write are refused with a warning:
          * S9-defer H-P-12 route exclusivity: the ``query_id`` is
            already stored on the OTHER arm. The first arm's observation
            wins.
          * a non-finite ``ndcg_at_10`` (NaN / ±inf): a single such value
            would turn the mean and t-statistic into NaN, and every
            comparison in ``decide`` would then evaluate False.

        Repeating a write for the same ``query_id`` on the SAME arm
        overwrites the keyed value but still appends to the per-arm
        list, so ``n_active`` / ``n_candidate`` count calls, not distinct
        queries.
        """
        qid_s = str(query_id)
        if arm == "active":
            samples, own, other, other_arm = (
                self._active, self._active_by_qid,
                self._candidate_by_qid, "candidate",
            )
        elif arm == "candidate":
            samples, own, other, other_arm = (
                self._candidate, self._candidate_by_qid,
                self._active_by_qid, "active",
            )
        else:
            return  # unknown arm: noop

        if qid_s in other:
            logger.warning(
                "shadow_test route-exclusivity violation: "
                "qid=%s already on %s arm; ignoring %s write",
                qid_s, other_arm, arm,
            )
            return

        value = float(ndcg_at_10)
        if not math.isfinite(value):
            logger.warning(
                "shadow_test: non-finite ndcg_at_10=%r for qid=%s on "
                "%s arm; observation ignored",
                ndcg_at_10, qid_s, arm,
            )
            return

        samples.append(value)
        own[qid_s] = value
        # S9-defer: persist so restart reloads.
        self._persist_observation(query_id=qid_s, arm=arm, ndcg=value)

    # ------------------------------------------------------------------
    # Decision
    # ------------------------------------------------------------------

    def decide(self) -> tuple[str, dict]:
        """Return ``(decision, stats)``.

        ``decision``:
          * ``'promote'`` — candidate beat active: either the Phase A
            strong-signal early stop or the Phase B criterion was met.
          * ``'reject'`` — full Phase B accumulated and criterion not met.
          * ``'continue'`` — insufficient data to decide either way.

        ``stats`` is a plain dict for logging / dashboard / audit with
        keys ``n_active``, ``n_candidate``, ``n_pairs``, ``effect``,
        ``t_stat``, ``std``, ``phase`` and ``criterion``.
        """
        n_active = len(self._active)
        n_cand = len(self._candidate)
        # S9-STAT-08: pair by query_id (intersection of the arm key
        # sets), NOT by arrival index, so every diff compares the two
        # arms on the same query. ``n_pairs`` — the size of that
        # intersection — is the only quantity gated against PHASE_A_N /
        # PHASE_B_N.
        paired_qids = self._active_by_qid.keys() & self._candidate_by_qid.keys()
        n_pairs = len(paired_qids)
        stats: dict = {
            "n_active": n_active,
            "n_candidate": n_cand,
            "n_pairs": n_pairs,
            "effect": 0.0,
            "t_stat": 0.0,
            "std": 0.0,
            "phase": "A" if n_pairs < self.PHASE_B_N else "B",
            "criterion": None,
        }

        if n_pairs == 0:
            if n_active and n_cand:
                # Both arms hold data but share no query_id. Because
                # ``record_recall_pair`` enforces route exclusivity this
                # is the normal state for in-memory ingestion; flag it so
                # operators can see why the test never progresses.
                stats["criterion"] = "no_common_query_ids"
            return "continue", stats

        # S-M03: guard against significant arm imbalance. SHA-256 routing
        # is approximately 50/50 in expectation, but on small samples the
        # buckets can skew. When one arm holds more than 2× the other's
        # observations AND both arms have a minimal footprint, tell
        # operators the data is unbalanced before any promote/reject
        # decision is attempted.
        _MIN_PER_ARM = 8
        if (
            n_active >= _MIN_PER_ARM
            and n_cand >= _MIN_PER_ARM
            and max(n_active, n_cand) > 2 * min(n_active, n_cand)
        ):
            stats["criterion"] = "unbalanced_arms"
            return "continue", stats

        # candidate_ndcg - active_ndcg per shared qid; sorted so the
        # summation order (and hence the float result) is reproducible.
        diffs = [
            self._candidate_by_qid[qid] - self._active_by_qid[qid]
            for qid in sorted(paired_qids)
        ]
        mean, std, t_stat = _paired_t_stat(diffs)
        stats["effect"] = float(mean)
        stats["std"] = float(std)
        stats["t_stat"] = float(t_stat)

        # --- Phase A early-stop on STRONG signal ---
        if n_pairs >= self.PHASE_A_N and n_pairs < self.PHASE_B_N:
            crit_strong = _critical_t(n_pairs - 1, alpha=self.ALPHA_STRONG)
            if (
                abs(mean) > self.MIN_STRONG_EFFECT
                and abs(t_stat) > crit_strong
                and mean > 0
            ):
                stats["phase"] = "A"
                stats["criterion"] = "phase_a_strong_signal"
                return "promote", stats
            # Weak, uncertain or negative signal — continue to Phase B;
            # Phase A never rejects.
            stats["phase"] = "A"
            stats["criterion"] = "phase_a_continue"
            return "continue", stats

        # --- Phase B full validation ---
        if n_pairs >= self.PHASE_B_N:
            # S-L05: ``t_stat > crit`` is a one-tailed "candidate better
            # than active" test, while ``_critical_t`` returns a
            # TWO-tailed critical value. Passing 2×ALPHA converts it: the
            # two-tailed critical at 2α equals the one-tailed critical
            # at α. The ``mean >= MIN_EFFECT`` gate additionally demands
            # a practically relevant effect size.
            crit = _critical_t(n_pairs - 1, alpha=min(0.999, self.ALPHA * 2.0))
            stats["phase"] = "B"
            if mean >= self.MIN_EFFECT and t_stat > crit:
                stats["criterion"] = "phase_b_promote"
                return "promote", stats
            stats["criterion"] = "phase_b_reject"
            return "reject", stats

        # n_pairs < PHASE_A_N → continue accumulating.
        stats["phase"] = "A"
        stats["criterion"] = "accumulating"
        return "continue", stats
522
+
523
+
524
# Public API: only the validator class is exported; the statistical
# helpers and constants above are underscore-private.
__all__ = ("ShadowTest",)