driftvane 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- driftvane-0.1.0/.gitignore +21 -0
- driftvane-0.1.0/LICENSE +21 -0
- driftvane-0.1.0/PKG-INFO +135 -0
- driftvane-0.1.0/README.md +99 -0
- driftvane-0.1.0/pyproject.toml +69 -0
- driftvane-0.1.0/src/driftvane/__init__.py +25 -0
- driftvane-0.1.0/src/driftvane/detector.py +42 -0
- driftvane-0.1.0/src/driftvane/detectors/__init__.py +6 -0
- driftvane-0.1.0/src/driftvane/detectors/embedding.py +112 -0
- driftvane-0.1.0/src/driftvane/detectors/latency.py +90 -0
- driftvane-0.1.0/src/driftvane/detectors/response.py +128 -0
- driftvane-0.1.0/src/driftvane/detectors/retrieval.py +113 -0
- driftvane-0.1.0/src/driftvane/report.py +99 -0
- driftvane-0.1.0/tests/__init__.py +0 -0
- driftvane-0.1.0/tests/test_embedding.py +73 -0
- driftvane-0.1.0/tests/test_latency.py +63 -0
- driftvane-0.1.0/tests/test_report.py +61 -0
- driftvane-0.1.0/tests/test_response.py +67 -0
- driftvane-0.1.0/tests/test_retrieval.py +60 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.so
|
|
5
|
+
.Python
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
*.egg-info/
|
|
9
|
+
*.egg
|
|
10
|
+
.pytest_cache/
|
|
11
|
+
.ruff_cache/
|
|
12
|
+
.coverage
|
|
13
|
+
htmlcov/
|
|
14
|
+
.tox/
|
|
15
|
+
.mypy_cache/
|
|
16
|
+
.venv/
|
|
17
|
+
venv/
|
|
18
|
+
env/
|
|
19
|
+
.idea/
|
|
20
|
+
.vscode/
|
|
21
|
+
.DS_Store
|
driftvane-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mukunda Rao Katta
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
driftvane-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: driftvane
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Compose drift detectors (embedding, retrieval, response, latency) into one report. Library-only, no server, no UI.
|
|
5
|
+
Project-URL: Homepage, https://github.com/MukundaKatta/driftvane
|
|
6
|
+
Project-URL: Issues, https://github.com/MukundaKatta/driftvane/issues
|
|
7
|
+
Project-URL: Source, https://github.com/MukundaKatta/driftvane
|
|
8
|
+
Author-email: Mukunda Rao Katta <mukunda.vjcs6@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: agents,ai,drift,embedding-drift,evals,llm,mlops,monitoring,rag,retrieval-drift
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
|
+
Requires-Python: >=3.10
|
|
26
|
+
Requires-Dist: numpy>=1.24
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pandas>=2.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
31
|
+
Provides-Extra: external-response
|
|
32
|
+
Requires-Dist: context-drift-detector-py>=0.1; extra == 'external-response'
|
|
33
|
+
Provides-Extra: pandas
|
|
34
|
+
Requires-Dist: pandas>=2.0; extra == 'pandas'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# driftvane
|
|
38
|
+
|
|
39
|
+
[](https://github.com/MukundaKatta/driftvane/actions/workflows/ci.yml)
|
|
40
|
+
[](https://pypi.org/project/driftvane/)
|
|
41
|
+
[](LICENSE)
|
|
42
|
+
|
|
43
|
+
**Compose drift detectors for RAG and agent systems.**
|
|
44
|
+
|
|
45
|
+
Most drift libraries are either tabular-only (Evidently, DataDrift) or are
|
|
46
|
+
platforms that want you to ship telemetry to their backend (Phoenix, Arize).
|
|
47
|
+
`driftvane` is a small Python library that lets you wire up multiple drift
|
|
48
|
+
signals — embedding, retrieval, response, latency — into one report. No
|
|
49
|
+
server, no UI, no telemetry. Plug it into a Lambda or Glue job, get a
|
|
50
|
+
`pandas.DataFrame` or a JSON dict back.
|
|
51
|
+
|
|
52
|
+
## Install
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install driftvane
|
|
56
|
+
# optional
|
|
57
|
+
pip install "driftvane[pandas]" # to_pandas()
|
|
58
|
+
pip install "driftvane[external-response]" # delegate response scoring to context-drift-detector-py
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Quickstart
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import numpy as np
|
|
65
|
+
from driftvane import (
|
|
66
|
+
DriftReport,
|
|
67
|
+
EmbeddingDrift,
|
|
68
|
+
RetrievalDrift,
|
|
69
|
+
ResponseDrift,
|
|
70
|
+
LatencyDrift,
|
|
71
|
+
)
|
|
72
|
+
from driftvane.detectors.response import Triple
|
|
73
|
+
|
|
74
|
+
ref_emb = np.load("reference_query_embeddings.npy") # (n, 768)
|
|
75
|
+
cur_emb = np.load("current_query_embeddings.npy")
|
|
76
|
+
|
|
77
|
+
report = DriftReport.from_signals([
|
|
78
|
+
EmbeddingDrift(threshold=0.1).compute(ref_emb, cur_emb),
|
|
79
|
+
RetrievalDrift(k=10, threshold=0.3).compute(ref_top_k, cur_top_k),
|
|
80
|
+
ResponseDrift(threshold=0.15).compute(ref_triples, cur_triples),
|
|
81
|
+
LatencyDrift(p_threshold=0.01).compute(ref_latencies, cur_latencies),
|
|
82
|
+
])
|
|
83
|
+
|
|
84
|
+
if report.any_drifted():
|
|
85
|
+
print(report.to_pandas())
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Or fail a CI job when retrieval moves too much:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
import sys

from driftvane import DriftAlert
|
|
92
|
+
|
|
93
|
+
try:
|
|
94
|
+
report.alert_if({"retrieval_jaccard_at_10": 0.2})
|
|
95
|
+
except DriftAlert as e:
|
|
96
|
+
sys.exit(f"drift gate failed: {e}")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Detectors
|
|
100
|
+
|
|
101
|
+
| Detector | Input | Statistic | Notes |
|
|
102
|
+
|---|---|---|---|
|
|
103
|
+
| `EmbeddingDrift` | two `(n, d)` arrays | MMD with RBF kernel, median-heuristic sigma | numpy-only, O(n²) — subsample for n > a few thousand |
|
|
104
|
+
| `RetrievalDrift` | paired top-k id lists | 1 − mean Jaccard@k; reports RBO too | aligned queries required |
|
|
105
|
+
| `ResponseDrift` | `(intent, context, answer)` triples | shift in mean answer-to-context grounding | uses `context-drift-detector-py` if installed |
|
|
106
|
+
| `LatencyDrift` | two 1-D arrays of floats | Kolmogorov–Smirnov D + asymptotic p-value | scipy-free |
|
|
107
|
+
|
|
108
|
+
Each detector returns a `DriftSignal(name, value, threshold, drifted, metadata)`.
|
|
109
|
+
`DriftReport` collects them.
|
|
110
|
+
|
|
111
|
+
## What it does NOT do
|
|
112
|
+
|
|
113
|
+
- No server. No UI. No telemetry shipping.
|
|
114
|
+
- No tabular feature drift — use [DataDrift](https://github.com/MukundaKatta/DataDrift)
|
|
115
|
+
for KS/PSI on classical features.
|
|
116
|
+
- No live trace ingestion or OTel collection — point this at parquet/numpy
|
|
117
|
+
arrays you already have.
|
|
118
|
+
- No causal root-cause analysis. It tells you *that* drift is there, not why.
|
|
119
|
+
- No model retraining triggers — emit your own when `report.any_drifted()`.
|
|
120
|
+
|
|
121
|
+
## Why not Phoenix / Arize / Evidently / Ragas?
|
|
122
|
+
|
|
123
|
+
| | driftvane | Phoenix | Arize | Evidently | Ragas |
|
|
124
|
+
|---|---|---|---|---|---|
|
|
125
|
+
| Library-only (no server) | ✓ | ✗ | ✗ | partial | ✓ |
|
|
126
|
+
| RAG-shaped detectors | ✓ | ✓ | ✓ | ✗ | ✓ |
|
|
127
|
+
| Embedding MMD out of the box | ✓ | partial | ✓ | ✗ | ✗ |
|
|
128
|
+
| Retrieval rank-shift | ✓ | ✗ | partial | ✗ | ✗ |
|
|
129
|
+
| Run inside a 5s Lambda | ✓ | ✗ | ✗ | ✓ | partial |
|
|
130
|
+
| numpy-only core deps | ✓ | ✗ | ✗ | ✗ | ✗ |
|
|
131
|
+
|
|
132
|
+
## Status
|
|
133
|
+
|
|
134
|
+
v0.1 — alpha. The four detectors above work and have tests. Public API may
|
|
135
|
+
change before v1.0. Issues and PRs welcome.
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# driftvane
|
|
2
|
+
|
|
3
|
+
[](https://github.com/MukundaKatta/driftvane/actions/workflows/ci.yml)
|
|
4
|
+
[](https://pypi.org/project/driftvane/)
|
|
5
|
+
[](LICENSE)
|
|
6
|
+
|
|
7
|
+
**Compose drift detectors for RAG and agent systems.**
|
|
8
|
+
|
|
9
|
+
Most drift libraries are either tabular-only (Evidently, DataDrift) or are
|
|
10
|
+
platforms that want you to ship telemetry to their backend (Phoenix, Arize).
|
|
11
|
+
`driftvane` is a small Python library that lets you wire up multiple drift
|
|
12
|
+
signals — embedding, retrieval, response, latency — into one report. No
|
|
13
|
+
server, no UI, no telemetry. Plug it into a Lambda or Glue job, get a
|
|
14
|
+
`pandas.DataFrame` or a JSON dict back.
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install driftvane
|
|
20
|
+
# optional
|
|
21
|
+
pip install "driftvane[pandas]" # to_pandas()
|
|
22
|
+
pip install "driftvane[external-response]" # delegate response scoring to context-drift-detector-py
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quickstart
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import numpy as np
|
|
29
|
+
from driftvane import (
|
|
30
|
+
DriftReport,
|
|
31
|
+
EmbeddingDrift,
|
|
32
|
+
RetrievalDrift,
|
|
33
|
+
ResponseDrift,
|
|
34
|
+
LatencyDrift,
|
|
35
|
+
)
|
|
36
|
+
from driftvane.detectors.response import Triple
|
|
37
|
+
|
|
38
|
+
ref_emb = np.load("reference_query_embeddings.npy") # (n, 768)
|
|
39
|
+
cur_emb = np.load("current_query_embeddings.npy")
|
|
40
|
+
|
|
41
|
+
report = DriftReport.from_signals([
|
|
42
|
+
EmbeddingDrift(threshold=0.1).compute(ref_emb, cur_emb),
|
|
43
|
+
RetrievalDrift(k=10, threshold=0.3).compute(ref_top_k, cur_top_k),
|
|
44
|
+
ResponseDrift(threshold=0.15).compute(ref_triples, cur_triples),
|
|
45
|
+
LatencyDrift(p_threshold=0.01).compute(ref_latencies, cur_latencies),
|
|
46
|
+
])
|
|
47
|
+
|
|
48
|
+
if report.any_drifted():
|
|
49
|
+
print(report.to_pandas())
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Or fail a CI job when retrieval moves too much:
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import sys

from driftvane import DriftAlert
|
|
56
|
+
|
|
57
|
+
try:
|
|
58
|
+
report.alert_if({"retrieval_jaccard_at_10": 0.2})
|
|
59
|
+
except DriftAlert as e:
|
|
60
|
+
sys.exit(f"drift gate failed: {e}")
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Detectors
|
|
64
|
+
|
|
65
|
+
| Detector | Input | Statistic | Notes |
|
|
66
|
+
|---|---|---|---|
|
|
67
|
+
| `EmbeddingDrift` | two `(n, d)` arrays | MMD with RBF kernel, median-heuristic sigma | numpy-only, O(n²) — subsample for n > a few thousand |
|
|
68
|
+
| `RetrievalDrift` | paired top-k id lists | 1 − mean Jaccard@k; reports RBO too | aligned queries required |
|
|
69
|
+
| `ResponseDrift` | `(intent, context, answer)` triples | shift in mean answer-to-context grounding | uses `context-drift-detector-py` if installed |
|
|
70
|
+
| `LatencyDrift` | two 1-D arrays of floats | Kolmogorov–Smirnov D + asymptotic p-value | scipy-free |
|
|
71
|
+
|
|
72
|
+
Each detector returns a `DriftSignal(name, value, threshold, drifted, metadata)`.
|
|
73
|
+
`DriftReport` collects them.
|
|
74
|
+
|
|
75
|
+
## What it does NOT do
|
|
76
|
+
|
|
77
|
+
- No server. No UI. No telemetry shipping.
|
|
78
|
+
- No tabular feature drift — use [DataDrift](https://github.com/MukundaKatta/DataDrift)
|
|
79
|
+
for KS/PSI on classical features.
|
|
80
|
+
- No live trace ingestion or OTel collection — point this at parquet/numpy
|
|
81
|
+
arrays you already have.
|
|
82
|
+
- No causal root-cause analysis. It tells you *that* drift is there, not why.
|
|
83
|
+
- No model retraining triggers — emit your own when `report.any_drifted()`.
|
|
84
|
+
|
|
85
|
+
## Why not Phoenix / Arize / Evidently / Ragas?
|
|
86
|
+
|
|
87
|
+
| | driftvane | Phoenix | Arize | Evidently | Ragas |
|
|
88
|
+
|---|---|---|---|---|---|
|
|
89
|
+
| Library-only (no server) | ✓ | ✗ | ✗ | partial | ✓ |
|
|
90
|
+
| RAG-shaped detectors | ✓ | ✓ | ✓ | ✗ | ✓ |
|
|
91
|
+
| Embedding MMD out of the box | ✓ | partial | ✓ | ✗ | ✗ |
|
|
92
|
+
| Retrieval rank-shift | ✓ | ✗ | partial | ✗ | ✗ |
|
|
93
|
+
| Run inside a 5s Lambda | ✓ | ✗ | ✗ | ✓ | partial |
|
|
94
|
+
| numpy-only core deps | ✓ | ✗ | ✗ | ✗ | ✗ |
|
|
95
|
+
|
|
96
|
+
## Status
|
|
97
|
+
|
|
98
|
+
v0.1 — alpha. The four detectors above work and have tests. Public API may
|
|
99
|
+
change before v1.0. Issues and PRs welcome.
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.24"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "driftvane"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Compose drift detectors (embedding, retrieval, response, latency) into one report. Library-only, no server, no UI."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Mukunda Rao Katta", email = "mukunda.vjcs6@gmail.com" }]
|
|
13
|
+
keywords = [
|
|
14
|
+
"ai",
|
|
15
|
+
"llm",
|
|
16
|
+
"rag",
|
|
17
|
+
"agents",
|
|
18
|
+
"drift",
|
|
19
|
+
"embedding-drift",
|
|
20
|
+
"retrieval-drift",
|
|
21
|
+
"monitoring",
|
|
22
|
+
"evals",
|
|
23
|
+
"mlops",
|
|
24
|
+
]
|
|
25
|
+
classifiers = [
|
|
26
|
+
"Development Status :: 3 - Alpha",
|
|
27
|
+
"Intended Audience :: Developers",
|
|
28
|
+
"License :: OSI Approved :: MIT License",
|
|
29
|
+
"Operating System :: OS Independent",
|
|
30
|
+
"Programming Language :: Python :: 3",
|
|
31
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
32
|
+
"Programming Language :: Python :: 3.10",
|
|
33
|
+
"Programming Language :: Python :: 3.11",
|
|
34
|
+
"Programming Language :: Python :: 3.12",
|
|
35
|
+
"Programming Language :: Python :: 3.13",
|
|
36
|
+
"Programming Language :: Python :: 3.14",
|
|
37
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
38
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
39
|
+
]
|
|
40
|
+
dependencies = [
|
|
41
|
+
"numpy>=1.24",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[project.optional-dependencies]
|
|
45
|
+
pandas = ["pandas>=2.0"]
|
|
46
|
+
external-response = ["context-drift-detector-py>=0.1"]
|
|
47
|
+
dev = ["pytest>=8.0", "pandas>=2.0", "ruff>=0.4"]
|
|
48
|
+
|
|
49
|
+
[project.urls]
|
|
50
|
+
Homepage = "https://github.com/MukundaKatta/driftvane"
|
|
51
|
+
Issues = "https://github.com/MukundaKatta/driftvane/issues"
|
|
52
|
+
Source = "https://github.com/MukundaKatta/driftvane"
|
|
53
|
+
|
|
54
|
+
[tool.hatch.build.targets.wheel]
|
|
55
|
+
packages = ["src/driftvane"]
|
|
56
|
+
|
|
57
|
+
[tool.hatch.build.targets.sdist]
|
|
58
|
+
include = ["/src", "/tests", "/README.md", "/LICENSE"]
|
|
59
|
+
|
|
60
|
+
[tool.pytest.ini_options]
|
|
61
|
+
testpaths = ["tests"]
|
|
62
|
+
pythonpath = ["src"]
|
|
63
|
+
|
|
64
|
+
[tool.ruff]
|
|
65
|
+
target-version = "py310"
|
|
66
|
+
line-length = 100
|
|
67
|
+
|
|
68
|
+
[tool.ruff.lint]
|
|
69
|
+
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""driftvane — compose drift detectors for RAG and agent systems.
|
|
2
|
+
|
|
3
|
+
A small library that lets you wire up multiple drift signals (embedding,
|
|
4
|
+
retrieval, response, latency) into one DriftReport. No server, no UI.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from driftvane.detector import DriftAlert, DriftSignal
|
|
8
|
+
from driftvane.detectors.embedding import EmbeddingDrift
|
|
9
|
+
from driftvane.detectors.latency import LatencyDrift
|
|
10
|
+
from driftvane.detectors.response import ResponseDrift
|
|
11
|
+
from driftvane.detectors.retrieval import RetrievalDrift
|
|
12
|
+
from driftvane.report import DriftReport
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.0"
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"DriftAlert",
|
|
18
|
+
"DriftReport",
|
|
19
|
+
"DriftSignal",
|
|
20
|
+
"EmbeddingDrift",
|
|
21
|
+
"LatencyDrift",
|
|
22
|
+
"ResponseDrift",
|
|
23
|
+
"RetrievalDrift",
|
|
24
|
+
"__version__",
|
|
25
|
+
]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Core types: DriftSignal, DriftAlert."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
class DriftSignal:
    """A single detector's verdict on one reference/current comparison.

    Attributes:
        name: stable identifier, e.g. "embedding_mmd", "retrieval_jaccard_at_10".
        value: the raw drift statistic.
        threshold: configured threshold; None means "report only, don't flag".
        drifted: True when the value exceeded the threshold.
        metadata: detector-specific extras (sample sizes, kernel sigma, ...).
    """

    name: str
    value: float
    threshold: float | None = None
    drifted: bool = False
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the signal to a plain JSON-friendly dict."""
        fields_in_order = ("name", "value", "threshold", "drifted", "metadata")
        return {key: getattr(self, key) for key in fields_in_order}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class DriftAlert(Exception):
    """Raised by DriftReport.alert_if when one or more thresholds are breached.

    Attributes:
        breaches: the DriftSignal objects whose value exceeded their threshold.
    """

    def __init__(self, breaches: list[DriftSignal]):
        self.breaches = breaches
        parts = [f"{sig.name}={sig.value:.4f}>{sig.threshold}" for sig in breaches]
        super().__init__(f"drift detected: {', '.join(parts)}")
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from driftvane.detectors.embedding import EmbeddingDrift
|
|
2
|
+
from driftvane.detectors.latency import LatencyDrift
|
|
3
|
+
from driftvane.detectors.response import ResponseDrift
|
|
4
|
+
from driftvane.detectors.retrieval import RetrievalDrift
|
|
5
|
+
|
|
6
|
+
__all__ = ["EmbeddingDrift", "LatencyDrift", "ResponseDrift", "RetrievalDrift"]
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""EmbeddingDrift — Maximum Mean Discrepancy with RBF kernel.
|
|
2
|
+
|
|
3
|
+
MMD is a kernel two-sample test. It tests whether two batches of embeddings
|
|
4
|
+
were drawn from the same distribution. MMD^2 is zero when the distributions
|
|
5
|
+
match and grows with the distance between them.
|
|
6
|
+
|
|
7
|
+
We compute the squared MMD with the RBF (Gaussian) kernel:
|
|
8
|
+
k(x, y) = exp(-||x - y||^2 / (2 * sigma^2))
|
|
9
|
+
MMD^2 = E[k(X, X')] + E[k(Y, Y')] - 2 E[k(X, Y)]
|
|
10
|
+
|
|
11
|
+
When sigma is None we use the median heuristic on the merged sample, which
|
|
12
|
+
is the standard default and removes the main hyperparameter footgun.
|
|
13
|
+
|
|
14
|
+
Cost is O(n^2) memory and time, so call this with batches up to a few
|
|
15
|
+
thousand vectors. For larger sets, subsample first.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
from driftvane.detector import DriftSignal
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _pairwise_sq_dists(a: np.ndarray, b: np.ndarray) -> np.ndarray:
|
|
26
|
+
"""Squared Euclidean distance matrix, shape (len(a), len(b))."""
|
|
27
|
+
a2 = np.sum(a * a, axis=1)[:, None]
|
|
28
|
+
b2 = np.sum(b * b, axis=1)[None, :]
|
|
29
|
+
return np.maximum(a2 + b2 - 2.0 * a @ b.T, 0.0)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _median_heuristic_sigma(x: np.ndarray, y: np.ndarray) -> float:
    """Kernel bandwidth via the median heuristic on the pooled sample.

    Returns sqrt(median(||zi - zj||^2) / 2), floored at 1e-8 so the kernel
    gamma never divides by zero. The pooled sample is subsampled to 1000
    rows (with a fixed seed, so repeated calls are deterministic) to keep
    the pairwise matrix cheap on large inputs.
    """
    pooled = np.concatenate([x, y], axis=0)
    if len(pooled) > 1000:
        # deterministic subsample: seeded generator keeps results reproducible
        rng = np.random.default_rng(0)
        pooled = pooled[rng.choice(len(pooled), size=1000, replace=False)]
    dists = _pairwise_sq_dists(pooled, pooled)
    upper = dists[np.triu_indices_from(dists, k=1)]
    median_sq = float(np.median(upper))
    # sigma is the bandwidth (not sigma^2); floor avoids division by zero
    return max(np.sqrt(median_sq / 2.0), 1e-8)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def mmd_rbf(x: np.ndarray, y: np.ndarray, sigma: float | None = None) -> tuple[float, float]:
    """Squared maximum mean discrepancy between two batches, RBF kernel.

    Uses the biased (V-statistic) estimator
        MMD^2 = mean(Kxx) + mean(Kyy) - 2 * mean(Kxy)
    with k(a, b) = exp(-||a - b||^2 / (2 sigma^2)). When sigma is None the
    median heuristic chooses it from the pooled sample.

    Returns (mmd_squared, sigma_used); the statistic is clamped at 0
    because numerical noise can push it marginally negative.
    """
    bandwidth = _median_heuristic_sigma(x, y) if sigma is None else sigma
    gamma = 1.0 / (2.0 * bandwidth * bandwidth)

    def kernel_mean(a: np.ndarray, b: np.ndarray) -> float:
        # mean RBF kernel value over all pairs from a and b
        return float(np.exp(-gamma * _pairwise_sq_dists(a, b)).mean())

    mmd2 = kernel_mean(x, x) + kernel_mean(y, y) - 2.0 * kernel_mean(x, y)
    return max(mmd2, 0.0), bandwidth
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class EmbeddingDrift:
    """Detect distribution shift between two batches of embedding vectors.

    Computes squared MMD with an RBF kernel between the reference and the
    current batch; the signal is flagged as drifted when the statistic
    exceeds `threshold`.

        ed = EmbeddingDrift(threshold=0.1)
        signal = ed.compute(reference=ref_emb, current=cur_emb)
    """

    def __init__(
        self,
        method: str = "mmd",
        sigma: float | None = None,
        threshold: float | None = None,
        name: str = "embedding_mmd",
    ) -> None:
        # only one statistic is implemented; fail loudly on anything else
        if method != "mmd":
            raise ValueError(f"unknown method: {method!r}; only 'mmd' is supported")
        self.method = method
        self.sigma = sigma
        self.threshold = threshold
        self.name = name

    def compute(self, reference: np.ndarray, current: np.ndarray) -> DriftSignal:
        """Compare two (n_samples, n_dims) batches and return a DriftSignal.

        Raises ValueError for non-2-D input, mismatched dimensions, or
        fewer than 2 samples on either side.
        """
        ref = np.asarray(reference, dtype=np.float64)
        cur = np.asarray(current, dtype=np.float64)
        if not (ref.ndim == 2 and cur.ndim == 2):
            raise ValueError("reference and current must be 2-D (n_samples, n_dims)")
        if ref.shape[1] != cur.shape[1]:
            raise ValueError(
                f"dim mismatch: reference has {ref.shape[1]}, current has {cur.shape[1]}"
            )
        if min(len(ref), len(cur)) < 2:
            raise ValueError("need at least 2 samples in each set")

        value, sigma_used = mmd_rbf(ref, cur, sigma=self.sigma)
        return DriftSignal(
            name=self.name,
            value=value,
            threshold=self.threshold,
            drifted=self.threshold is not None and value > self.threshold,
            metadata={
                "n_ref": int(ref.shape[0]),
                "n_cur": int(cur.shape[0]),
                "dim": int(ref.shape[1]),
                "sigma": float(sigma_used),
                "method": self.method,
            },
        )
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""LatencyDrift — Kolmogorov-Smirnov two-sample test on latency arrays.
|
|
2
|
+
|
|
3
|
+
KS compares the empirical CDFs of two samples. The statistic is the maximum
|
|
4
|
+
absolute difference between the CDFs and is bounded in [0, 1]. It is robust
|
|
5
|
+
to scale and doesn't assume any particular distribution, which matches how
|
|
6
|
+
real LLM latency tails behave.
|
|
7
|
+
|
|
8
|
+
We compute the KS statistic from sorted arrays without scipy so the install
stays light. The approximate two-sided p-value comes from the Kolmogorov
asymptotic series p = 2 * sum_{k>=1} (-1)^(k-1) exp(-2 k^2 lam^2), with
lam = (en + 0.12 + 0.11/en) * D and en = sqrt(n1*n2/(n1+n2)) (Stephens'
small-sample correction).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import math
|
|
16
|
+
from collections.abc import Sequence
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
from driftvane.detector import DriftSignal
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def ks_2samp(x: Sequence[float], y: Sequence[float]) -> tuple[float, float]:
    """Two-sample Kolmogorov-Smirnov test, numpy-only (no scipy).

    Returns (D, p) where D is the maximum absolute gap between the two
    empirical CDFs and p is the asymptotic two-sided p-value computed from
    the Kolmogorov series with Stephens' small-sample correction.

    Raises ValueError when either sample is empty.
    """
    xs = np.sort(np.asarray(x, dtype=np.float64))
    ys = np.sort(np.asarray(y, dtype=np.float64))
    n1, n2 = len(xs), len(ys)
    if min(n1, n2) == 0:
        raise ValueError("both arrays must be non-empty")

    # evaluate both empirical CDFs at every observed value
    grid = np.concatenate([xs, ys])
    gap = np.abs(
        np.searchsorted(xs, grid, side="right") / n1
        - np.searchsorted(ys, grid, side="right") / n2
    )
    d = float(np.max(gap))

    if d == 0.0:
        # the Kolmogorov series degenerates at d=0; the null trivially holds
        return 0.0, 1.0

    en = math.sqrt(n1 * n2 / (n1 + n2))
    lam = (en + 0.12 + 0.11 / en) * d  # Stephens' correction
    # truncated Kolmogorov series: p = 2 * sum_k (-1)^(k-1) exp(-2 lam^2 k^2)
    p = 2.0 * sum(
        ((-1) ** (k - 1)) * math.exp(-2.0 * lam * lam * k * k) for k in range(1, 101)
    )
    return d, min(max(p, 0.0), 1.0)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class LatencyDrift:
    """Detect distribution shift in latency (or any 1-D numeric sample).

    Threshold on the KS statistic:

        ld = LatencyDrift(threshold=0.2)
        signal = ld.compute(reference=ref_lat, current=cur_lat)

    or on the p-value:

        ld = LatencyDrift(p_threshold=0.01)

    Setting both thresholds is ambiguous and raises ValueError.
    """

    def __init__(
        self,
        threshold: float | None = None,
        p_threshold: float | None = None,
        name: str = "latency_ks",
    ) -> None:
        if threshold is not None and p_threshold is not None:
            raise ValueError("set either threshold or p_threshold, not both")
        self.threshold = threshold
        self.p_threshold = p_threshold
        self.name = name

    def compute(self, reference: Sequence[float], current: Sequence[float]) -> DriftSignal:
        """Run the two-sample KS test and wrap the verdict in a DriftSignal."""
        stat, p_value = ks_2samp(reference, current)
        # p_threshold wins when configured; otherwise flag on the statistic
        if self.p_threshold is not None:
            flagged = p_value < self.p_threshold
        else:
            flagged = self.threshold is not None and stat > self.threshold

        return DriftSignal(
            name=self.name,
            value=stat,
            threshold=self.threshold,
            drifted=flagged,
            metadata={
                "n_ref": len(reference),
                "n_cur": len(current),
                "ks_p_value": p_value,
                "p_threshold": self.p_threshold,
                "median_ref": float(np.median(reference)),
                "median_cur": float(np.median(current)),
            },
        )
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""ResponseDrift — answer-vs-context grounding drift across batches.
|
|
2
|
+
|
|
3
|
+
For each (intent, context, answer) triple, compute Jaccard overlap of token
|
|
4
|
+
sets between the answer and the context. Then compare the *distribution* of
|
|
5
|
+
those scores between reference and current batches.
|
|
6
|
+
|
|
7
|
+
The drift value is the absolute difference of the mean grounding scores. A
|
|
8
|
+
shrinking mean answer-to-context overlap is the signal you want to catch:
|
|
9
|
+
the model is wandering off the retrieved context.
|
|
10
|
+
|
|
11
|
+
If `context-drift-detector-py` is installed we delegate per-triple scoring
|
|
12
|
+
to it for compatibility with that library's signal definitions; otherwise
|
|
13
|
+
we use the inline tokenizer below. Either way, the aggregation is ours.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import re
|
|
19
|
+
from collections.abc import Iterable
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
|
|
22
|
+
from driftvane.detector import DriftSignal
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
class Triple:
    """One (intent, context, answer) observation for ResponseDrift.

    intent: the user query / intent text; forwarded to the external scorer
        when context-drift-detector-py is installed.
    context: retrieved context — a single string or a list of passages.
    answer: the model's answer, scored for token overlap with the context.
    """

    # frozen=True makes instances hashable and safe to share across batches
    intent: str
    context: str | list[str]
    answer: str
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
_WORD_RE = re.compile(r"[a-z0-9]+")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _tokens(text: str) -> set[str]:
|
|
36
|
+
return set(_WORD_RE.findall(text.lower()))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _flatten_context(ctx: str | Iterable[str]) -> str:
|
|
40
|
+
if isinstance(ctx, str):
|
|
41
|
+
return ctx
|
|
42
|
+
return " ".join(ctx)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _grounding_score(triple: Triple) -> float:
    """Fraction of answer tokens that also occur in the context.

    Recall-style: 1.0 means every answer token is present in the context
    (fully grounded). An empty answer is vacuously grounded and scores 1.0.
    """
    answer_tokens = _tokens(triple.answer)
    if not answer_tokens:
        return 1.0
    context_tokens = _tokens(_flatten_context(triple.context))
    overlap = answer_tokens & context_tokens
    return len(overlap) / len(answer_tokens)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _try_load_external_scorer():
|
|
55
|
+
try:
|
|
56
|
+
from context_drift_detector import detect # type: ignore
|
|
57
|
+
except ImportError:
|
|
58
|
+
return None
|
|
59
|
+
|
|
60
|
+
def _score(triple: Triple) -> float:
|
|
61
|
+
ctx = triple.context if isinstance(triple.context, list) else [triple.context]
|
|
62
|
+
result = detect(triple.intent, ctx, triple.answer)
|
|
63
|
+
# context-drift-detector-py exposes signals dict with answer_to_context
|
|
64
|
+
return float(result.signals.get("answer_to_context", _grounding_score(triple)))
|
|
65
|
+
|
|
66
|
+
return _score
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class ResponseDrift:
    """Detect drift in how well answers stay grounded in retrieved context.

        rsp = ResponseDrift(threshold=0.15)
        signal = rsp.compute(
            reference=[Triple("...", "...", "..."), ...],
            current=[Triple("...", "...", "..."), ...],
        )

    Pass `use_external=False` to force the inline tokenizer even when
    context-drift-detector-py is installed.
    """

    def __init__(
        self,
        threshold: float | None = None,
        name: str = "response_grounding_shift",
        use_external: bool = True,
    ) -> None:
        self.threshold = threshold
        self.name = name
        self.use_external = use_external
        # resolved once at construction: external scorer when available
        # and requested, otherwise None (inline scorer used in compute)
        self._scorer = _try_load_external_scorer() if use_external else None

    def compute(
        self,
        reference: Iterable[Triple | dict],
        current: Iterable[Triple | dict],
    ) -> DriftSignal:
        """Score both batches and report the absolute shift in mean grounding.

        Accepts Triple instances or dicts with matching keys. Raises
        ValueError when either batch is empty.
        """

        def as_triples(batch: Iterable[Triple | dict]) -> list[Triple]:
            # normalize dict rows into Triple instances
            return [item if isinstance(item, Triple) else Triple(**item) for item in batch]

        ref_batch = as_triples(reference)
        cur_batch = as_triples(current)
        if not ref_batch or not cur_batch:
            raise ValueError("need at least 1 triple in each batch")

        score = self._scorer or _grounding_score
        mean_ref = sum(score(t) for t in ref_batch) / len(ref_batch)
        mean_cur = sum(score(t) for t in cur_batch) / len(cur_batch)

        # signed shift distinguishes worsening from improving grounding;
        # the reported drift value is its magnitude
        signed_shift = mean_cur - mean_ref
        drift_value = abs(signed_shift)

        return DriftSignal(
            name=self.name,
            value=drift_value,
            threshold=self.threshold,
            drifted=self.threshold is not None and drift_value > self.threshold,
            metadata={
                "n_ref": len(ref_batch),
                "n_cur": len(cur_batch),
                "mean_ref_grounding": mean_ref,
                "mean_cur_grounding": mean_cur,
                "signed_shift": signed_shift,
                "scorer": "external" if self._scorer else "inline_jaccard",
            },
        )
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""RetrievalDrift — measure shift in retriever output for the same queries.
|
|
2
|
+
|
|
3
|
+
Inputs are paired top-k document-id lists: for each query, the reference
|
|
4
|
+
retriever produced one ranked list and the current retriever produced another.
|
|
5
|
+
Drift = how much the top-k sets and rank order have moved.
|
|
6
|
+
|
|
7
|
+
Two metrics:
|
|
8
|
+
* mean_jaccard_at_k: average Jaccard overlap of the top-k sets (1.0 = identical)
|
|
9
|
+
* mean_rbo: rank-biased overlap, weights early positions more (1.0 = identical)
|
|
10
|
+
|
|
11
|
+
The reported drift value is 1 - mean_jaccard_at_k so that "more drift = larger
|
|
12
|
+
value" matches the convention in the other detectors.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from collections.abc import Sequence
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from driftvane.detector import DriftSignal
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _jaccard(a: set[Any], b: set[Any]) -> float:
|
|
24
|
+
if not a and not b:
|
|
25
|
+
return 1.0
|
|
26
|
+
return len(a & b) / len(a | b)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _rbo(ref: Sequence[Any], cur: Sequence[Any], p: float = 0.9) -> float:
|
|
30
|
+
"""Rank-biased overlap. Weighted overlap of the two prefix sets at each depth.
|
|
31
|
+
|
|
32
|
+
p controls how top-heavy the weighting is; p=0.9 puts ~86% of weight on the
|
|
33
|
+
top 10. See Webber, Moffat, Zobel 2010.
|
|
34
|
+
"""
|
|
35
|
+
depth = max(len(ref), len(cur))
|
|
36
|
+
if depth == 0:
|
|
37
|
+
return 1.0
|
|
38
|
+
seen_ref: set[Any] = set()
|
|
39
|
+
seen_cur: set[Any] = set()
|
|
40
|
+
weighted_sum = 0.0
|
|
41
|
+
weight_total = 0.0
|
|
42
|
+
for i in range(depth):
|
|
43
|
+
if i < len(ref):
|
|
44
|
+
seen_ref.add(ref[i])
|
|
45
|
+
if i < len(cur):
|
|
46
|
+
seen_cur.add(cur[i])
|
|
47
|
+
agreement = len(seen_ref & seen_cur) / (i + 1)
|
|
48
|
+
w = p**i
|
|
49
|
+
weighted_sum += agreement * w
|
|
50
|
+
weight_total += w
|
|
51
|
+
return weighted_sum / weight_total if weight_total > 0 else 1.0
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class RetrievalDrift:
    """Detect retrieval drift across paired query→top-k results.

        rd = RetrievalDrift(k=10, threshold=0.3)
        signal = rd.compute(
            reference=[["doc_1", "doc_2", ...], ...],
            current=[["doc_1", "doc_3", ...], ...],
        )
    """

    def __init__(
        self,
        k: int = 10,
        threshold: float | None = None,
        name: str | None = None,
    ) -> None:
        if k < 1:
            raise ValueError("k must be >= 1")
        self.k = k
        self.threshold = threshold
        # Default name encodes the truncation depth for easier report reading.
        self.name = name or f"retrieval_jaccard_at_{k}"

    def compute(
        self,
        reference: Sequence[Sequence[Any]],
        current: Sequence[Sequence[Any]],
    ) -> DriftSignal:
        """Compare paired top-k lists; drift value is 1 - mean Jaccard@k."""
        if len(reference) != len(current):
            raise ValueError(
                f"reference and current must have the same number of queries; "
                f"got {len(reference)} vs {len(current)}"
            )
        if not reference:
            raise ValueError("need at least 1 query")

        overlap_scores: list[float] = []
        rank_scores: list[float] = []
        for ref_ranked, cur_ranked in zip(reference, current, strict=True):
            ref_head = list(ref_ranked[: self.k])
            cur_head = list(cur_ranked[: self.k])
            overlap_scores.append(_jaccard(set(ref_head), set(cur_head)))
            rank_scores.append(_rbo(ref_head, cur_head))

        mean_jaccard = sum(overlap_scores) / len(overlap_scores)
        mean_rbo = sum(rank_scores) / len(rank_scores)
        # Convention across detectors: larger value = more drift.
        drift_value = 1.0 - mean_jaccard
        drifted = self.threshold is not None and drift_value > self.threshold

        return DriftSignal(
            name=self.name,
            value=drift_value,
            threshold=self.threshold,
            drifted=drifted,
            metadata={
                "n_queries": len(reference),
                "k": self.k,
                "mean_jaccard_at_k": mean_jaccard,
                "mean_rbo": mean_rbo,
            },
        )
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""DriftReport — collect signals from multiple detectors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from driftvane.detector import DriftAlert, DriftSignal
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DriftReport:
    """A bag of DriftSignals with output helpers.

    Build it incrementally:

        report = DriftReport()
        report.add(EmbeddingDrift().compute(ref_emb, cur_emb))
        report.add(LatencyDrift().compute(ref_lat, cur_lat))

    Or in one shot:

        report = DriftReport.from_signals([
            EmbeddingDrift().compute(ref_emb, cur_emb),
            LatencyDrift().compute(ref_lat, cur_lat),
        ])
    """

    def __init__(self) -> None:
        self._signals: list[DriftSignal] = []

    @classmethod
    def from_signals(cls, signals: list[DriftSignal]) -> DriftReport:
        """Alternate constructor: build a report from an existing signal list."""
        report = cls()
        for signal in signals:
            report.add(signal)
        return report

    def add(self, signal: DriftSignal) -> DriftReport:
        """Append a signal and return self so calls can be chained."""
        self._signals.append(signal)
        return self

    @property
    def signals(self) -> list[DriftSignal]:
        # Return a copy so callers cannot mutate the internal list.
        return list(self._signals)

    def get(self, name: str) -> DriftSignal | None:
        """Return the first signal with this name, or None if absent."""
        return next((s for s in self._signals if s.name == name), None)

    def any_drifted(self) -> bool:
        """True if at least one collected signal was flagged as drifted."""
        return any(signal.drifted for signal in self._signals)

    def to_dict(self) -> dict[str, Any]:
        """JSON-friendly representation of the whole report."""
        return {
            "signals": [signal.to_dict() for signal in self._signals],
            "any_drifted": self.any_drifted(),
        }

    def to_pandas(self):
        # Imported lazily so pandas isn't required for non-DataFrame users.
        import pandas as pd

        if not self._signals:
            return pd.DataFrame(columns=["name", "value", "threshold", "drifted"])

        rows = []
        for signal in self._signals:
            row = {
                "name": signal.name,
                "value": signal.value,
                "threshold": signal.threshold,
                "drifted": signal.drifted,
            }
            # Metadata entries become meta_* columns alongside the core fields.
            row.update({f"meta_{key}": val for key, val in signal.metadata.items()})
            rows.append(row)
        return pd.DataFrame(rows)

    def alert_if(self, thresholds: dict[str, float]) -> None:
        """Raise DriftAlert if any of the given signals exceeds its threshold.

        Overrides the threshold each signal was computed with. Use this when the
        report is being evaluated against a different policy than the detector
        was constructed with (e.g. CI vs. prod).
        """
        breaches = [
            DriftSignal(
                name=signal.name,
                value=signal.value,
                threshold=thresholds[signal.name],
                drifted=True,
                metadata=signal.metadata,
            )
            for signal in self._signals
            if signal.name in thresholds and signal.value > thresholds[signal.name]
        ]
        if breaches:
            raise DriftAlert(breaches)
|
|
File without changes
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from driftvane import EmbeddingDrift
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _gen(n: int, dim: int, mean: float = 0.0, scale: float = 1.0, seed: int = 0):
    """Draw an (n, dim) Gaussian sample with a deterministic seed."""
    return np.random.default_rng(seed).normal(loc=mean, scale=scale, size=(n, dim))


def test_no_drift_for_same_distribution():
    ref = _gen(200, 16, seed=1)
    cur = _gen(200, 16, seed=2)
    sig = EmbeddingDrift().compute(ref, cur)
    # MMD between two N(0,1) samples of the same shape should be small
    assert sig.value < 0.05
    assert sig.metadata["n_ref"] == 200
    assert sig.metadata["dim"] == 16


def test_drift_for_shifted_mean():
    sig = EmbeddingDrift().compute(
        _gen(200, 16, mean=0.0, seed=1),
        _gen(200, 16, mean=2.0, seed=2),
    )
    assert sig.value > 0.1


def test_threshold_flags_drift():
    sig = EmbeddingDrift(threshold=0.1).compute(
        _gen(200, 16, mean=0.0, seed=1),
        _gen(200, 16, mean=2.0, seed=2),
    )
    assert sig.drifted is True


def test_threshold_passes_when_under():
    sig = EmbeddingDrift(threshold=0.5).compute(
        _gen(200, 16, seed=1),
        _gen(200, 16, seed=2),
    )
    assert sig.drifted is False


def test_dim_mismatch_raises():
    with pytest.raises(ValueError, match="dim mismatch"):
        EmbeddingDrift().compute(_gen(50, 8), _gen(50, 16))


def test_one_sample_raises():
    with pytest.raises(ValueError, match="at least 2 samples"):
        EmbeddingDrift().compute(_gen(1, 8), _gen(50, 8))


def test_unknown_method_raises():
    with pytest.raises(ValueError, match="unknown method"):
        EmbeddingDrift(method="kld")


def test_explicit_sigma_used():
    sig = EmbeddingDrift(sigma=2.5).compute(_gen(50, 8), _gen(50, 8))
    assert sig.metadata["sigma"] == 2.5


def test_value_is_non_negative():
    # Identical seeds give identical batches; MMD must still be >= 0.
    ref = _gen(200, 16, seed=1)
    cur = _gen(200, 16, seed=1)
    assert EmbeddingDrift().compute(ref, cur).value >= 0.0
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from driftvane import LatencyDrift
|
|
5
|
+
from driftvane.detectors.latency import ks_2samp
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_identical_samples_have_zero_ks():
    sample = np.random.default_rng(0).normal(size=500)
    sig = LatencyDrift().compute(sample, sample)
    assert sig.value == 0.0
    assert sig.metadata["ks_p_value"] == pytest.approx(1.0, abs=0.05)


def test_shifted_distribution_has_high_ks():
    rng = np.random.default_rng(0)
    baseline = rng.normal(loc=0.0, size=500)
    shifted = rng.normal(loc=2.0, size=500)
    sig = LatencyDrift().compute(baseline, shifted)
    assert sig.value > 0.5
    assert sig.metadata["ks_p_value"] < 0.001


def test_threshold_flags_on_d_statistic():
    rng = np.random.default_rng(0)
    baseline = rng.normal(loc=0.0, size=200)
    shifted = rng.normal(loc=2.0, size=200)
    assert LatencyDrift(threshold=0.3).compute(baseline, shifted).drifted is True


def test_p_threshold_flags():
    rng = np.random.default_rng(0)
    baseline = rng.normal(loc=0.0, size=200)
    shifted = rng.normal(loc=2.0, size=200)
    assert LatencyDrift(p_threshold=0.01).compute(baseline, shifted).drifted is True


def test_cant_set_both_thresholds():
    with pytest.raises(ValueError, match="not both"):
        LatencyDrift(threshold=0.1, p_threshold=0.01)


def test_empty_raises():
    with pytest.raises(ValueError, match="non-empty"):
        ks_2samp([], [1.0, 2.0])


def test_ks_returns_in_unit_interval():
    rng = np.random.default_rng(0)
    d, p = ks_2samp(rng.normal(size=100), rng.normal(size=100))
    assert 0.0 <= d <= 1.0
    assert 0.0 <= p <= 1.0


def test_median_metadata_present():
    sig = LatencyDrift().compute([1.0, 2.0, 3.0, 4.0], [10.0, 20.0, 30.0, 40.0])
    assert sig.metadata["median_ref"] == pytest.approx(2.5)
    assert sig.metadata["median_cur"] == pytest.approx(25.0)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from driftvane import DriftAlert, DriftReport, DriftSignal
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _sig(
    name: str,
    value: float,
    threshold: float | None = None,
    drifted: bool = False,
) -> DriftSignal:
    """Shorthand factory for building DriftSignals in tests."""
    return DriftSignal(name=name, value=value, threshold=threshold, drifted=drifted)


def test_empty_report_is_not_drifted():
    report = DriftReport()
    assert report.signals == []
    assert report.any_drifted() is False
    assert report.to_dict() == {"signals": [], "any_drifted": False}


def test_add_chains():
    report = DriftReport()
    returned = report.add(_sig("a", 0.1)).add(_sig("b", 0.2))
    assert returned is report
    assert [s.name for s in report.signals] == ["a", "b"]


def test_from_signals():
    report = DriftReport.from_signals(
        [_sig("a", 0.1), _sig("b", 0.2, threshold=0.1, drifted=True)]
    )
    assert report.any_drifted() is True
    assert report.get("a").value == 0.1
    assert report.get("missing") is None


def test_to_pandas_includes_metadata_columns():
    report = DriftReport().add(
        DriftSignal(name="x", value=0.5, metadata={"sigma": 1.5, "n": 100})
    )
    frame = report.to_pandas()
    assert list(frame.columns) == [
        "name",
        "value",
        "threshold",
        "drifted",
        "meta_sigma",
        "meta_n",
    ]
    assert frame.iloc[0]["meta_sigma"] == 1.5


def test_alert_if_raises_on_breach():
    report = DriftReport().add(_sig("emb", 0.3)).add(_sig("lat", 0.05))
    with pytest.raises(DriftAlert) as exc:
        report.alert_if({"emb": 0.2, "lat": 0.1})
    assert len(exc.value.breaches) == 1
    assert exc.value.breaches[0].name == "emb"


def test_alert_if_no_breach():
    # Under threshold: alert_if must return silently.
    DriftReport().add(_sig("emb", 0.05)).alert_if({"emb": 0.2})


def test_alert_if_ignores_unlisted_signals():
    # "emb" has no entry in the policy, so even an extreme value is ignored.
    DriftReport().add(_sig("emb", 999.0)).alert_if({"latency": 0.1})
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from driftvane import ResponseDrift
|
|
4
|
+
from driftvane.detectors.response import Triple
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_grounded_answers_have_no_drift():
    batch = [
        Triple(
            intent="What is the capital of France?",
            context="Paris is the capital of France.",
            answer="Paris is the capital of France.",
        ),
        Triple(
            intent="What is 2+2?",
            context="Two plus two equals four.",
            answer="Two plus two equals four.",
        ),
    ]
    sig = ResponseDrift(use_external=False).compute(batch, batch)
    assert sig.value == pytest.approx(0.0)


def test_ungrounded_current_drifts_negative():
    ref = [
        Triple(
            intent="capital of France",
            context="Paris is the capital of France.",
            answer="Paris is the capital of France.",
        )
    ]
    cur = [
        Triple(
            intent="capital of France",
            context="Paris is the capital of France.",
            answer="Wombats live in Australia.",
        )
    ]
    sig = ResponseDrift(use_external=False).compute(ref, cur)
    assert sig.metadata["signed_shift"] < 0
    assert sig.value > 0


def test_threshold_flags():
    ref = [Triple("q", "the answer is forty two", "the answer is forty two")]
    cur = [Triple("q", "the answer is forty two", "completely unrelated text")]
    sig = ResponseDrift(threshold=0.3, use_external=False).compute(ref, cur)
    assert sig.drifted is True


def test_accepts_dicts_and_list_context():
    ref = [{"intent": "q", "context": ["fact a", "fact b"], "answer": "fact a"}]
    cur = [{"intent": "q", "context": ["fact a"], "answer": "fact b fact a"}]
    sig = ResponseDrift(use_external=False).compute(ref, cur)
    assert sig.metadata["n_ref"] == 1
    assert sig.metadata["n_cur"] == 1


def test_empty_raises():
    with pytest.raises(ValueError, match="at least 1 triple"):
        ResponseDrift(use_external=False).compute([], [])


def test_inline_scorer_used_when_external_disabled():
    batch = [Triple("q", "ctx", "ctx")]
    sig = ResponseDrift(use_external=False).compute(batch, batch)
    assert sig.metadata["scorer"] == "inline_jaccard"
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from driftvane import RetrievalDrift
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_identical_rankings_have_zero_drift():
    ref = [["a", "b", "c", "d"], ["x", "y", "z"]]
    cur = [["a", "b", "c", "d"], ["x", "y", "z"]]
    sig = RetrievalDrift(k=4).compute(ref, cur)
    assert sig.value == 0.0
    assert sig.metadata["mean_jaccard_at_k"] == 1.0
    assert sig.metadata["mean_rbo"] == 1.0


def test_disjoint_rankings_have_max_drift():
    sig = RetrievalDrift(k=3).compute([["a", "b", "c"]], [["x", "y", "z"]])
    assert sig.value == 1.0
    assert sig.metadata["mean_jaccard_at_k"] == 0.0


def test_partial_overlap():
    sig = RetrievalDrift(k=4).compute([["a", "b", "c", "d"]], [["a", "b", "x", "y"]])
    # Jaccard of {a,b,c,d} vs {a,b,x,y} = 2/6 = 1/3
    assert sig.metadata["mean_jaccard_at_k"] == pytest.approx(1 / 3)


def test_threshold_flags():
    sig = RetrievalDrift(k=3, threshold=0.5).compute([["a", "b", "c"]], [["a", "x", "y"]])
    # 1 - (1/5) = 0.8 > 0.5
    assert sig.drifted is True


def test_mismatched_lengths_raises():
    with pytest.raises(ValueError, match="same number of queries"):
        RetrievalDrift().compute([["a"]], [["a"], ["b"]])


def test_empty_input_raises():
    with pytest.raises(ValueError, match="at least 1 query"):
        RetrievalDrift().compute([], [])


def test_k_must_be_positive():
    with pytest.raises(ValueError, match="k must be"):
        RetrievalDrift(k=0)


def test_rbo_weights_top_positions_more():
    # Reversing keeps the set identical (Jaccard sees no drift) while
    # scrambling the ranks (RBO must drop below 1.0).
    forward = ["a", "b", "c", "d", "e"]
    sig = RetrievalDrift(k=5).compute([forward], [forward[::-1]])
    assert sig.metadata["mean_jaccard_at_k"] == 1.0
    assert sig.metadata["mean_rbo"] < 1.0
|