meridian-regression 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meridian_regression-0.1.0/.claude/settings.local.json +9 -0
- meridian_regression-0.1.0/.gitignore +33 -0
- meridian_regression-0.1.0/PKG-INFO +200 -0
- meridian_regression-0.1.0/README.md +173 -0
- meridian_regression-0.1.0/datasets/example.json +50 -0
- meridian_regression-0.1.0/datasets/golden_50_template.json +55 -0
- meridian_regression-0.1.0/meridian/__init__.py +26 -0
- meridian_regression-0.1.0/meridian/adapters/__init__.py +0 -0
- meridian_regression-0.1.0/meridian/adapters/base.py +17 -0
- meridian_regression-0.1.0/meridian/embedder.py +51 -0
- meridian_regression-0.1.0/meridian/models.py +73 -0
- meridian_regression-0.1.0/meridian/reporter.py +120 -0
- meridian_regression-0.1.0/meridian/runner.py +79 -0
- meridian_regression-0.1.0/meridian/sampler.py +99 -0
- meridian_regression-0.1.0/meridian/scorer.py +89 -0
- meridian_regression-0.1.0/paper/meridian.tex +543 -0
- meridian_regression-0.1.0/paper/references.bib +80 -0
- meridian_regression-0.1.0/pyproject.toml +47 -0
- meridian_regression-0.1.0/scripts/generate_golden_dataset.py +101 -0
- meridian_regression-0.1.0/tests/__init__.py +0 -0
- meridian_regression-0.1.0/tests/test_embedder.py +105 -0
- meridian_regression-0.1.0/tests/test_models.py +161 -0
- meridian_regression-0.1.0/tests/test_reporter.py +205 -0
- meridian_regression-0.1.0/tests/test_sampler.py +173 -0
- meridian_regression-0.1.0/tests/test_scorer.py +224 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"PowerShell(New-Item *)",
|
|
5
|
+
"PowerShell(pip install *)",
|
|
6
|
+
"PowerShell(cd \"C:\\\\Users\\\\VijayMandavilli\\\\OneDrive - Cognida Pvt Limited\\\\Desktop\\\\Tinkering\\\\meridian\"; $env:DEEPSEEK_API_KEY = \"<REDACTED>\"; $env:PYTHONIOENCODING = \"utf-8\"; python scripts/generate_golden_dataset.py 2>&1)"
|
|
7
|
+
]
|
|
8
|
+
}
|
|
9
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*.egg-info/
|
|
4
|
+
dist/
|
|
5
|
+
build/
|
|
6
|
+
.eggs/
|
|
7
|
+
.venv/
|
|
8
|
+
venv/
|
|
9
|
+
*.egg
|
|
10
|
+
|
|
11
|
+
# test cache
|
|
12
|
+
.pytest_cache/
|
|
13
|
+
.coverage
|
|
14
|
+
htmlcov/
|
|
15
|
+
|
|
16
|
+
# sentence-transformers model cache (large, re-downloadable)
|
|
17
|
+
# .cache/
|
|
18
|
+
|
|
19
|
+
# generated reports (reproducible from golden dataset)
|
|
20
|
+
reports/
|
|
21
|
+
|
|
22
|
+
# populated golden dataset — contains API outputs, regenerate with scripts/generate_golden_dataset.py
|
|
23
|
+
datasets/golden_50.json
|
|
24
|
+
|
|
25
|
+
# paper build artefacts
|
|
26
|
+
paper/*.aux
|
|
27
|
+
paper/*.bbl
|
|
28
|
+
paper/*.blg
|
|
29
|
+
paper/*.log
|
|
30
|
+
paper/*.out
|
|
31
|
+
paper/*.pdf
|
|
32
|
+
paper/*.synctex.gz
|
|
33
|
+
paper/*.toc
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: meridian-regression
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local, embedding-based LLM model equivalence scoring for migration validation
|
|
5
|
+
Project-URL: Repository, https://github.com/mandavillivijay/meridian
|
|
6
|
+
Author-email: Vijay Mandavilli <mvijayfromvizag@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: embeddings,equivalence,llm,model-migration,nlp,regression-testing
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Software Development :: Testing
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Requires-Dist: numpy>=1.24
|
|
21
|
+
Requires-Dist: pydantic>=2.0
|
|
22
|
+
Requires-Dist: sentence-transformers>=2.7
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# MERIDIAN
|
|
29
|
+
|
|
30
|
+
**Model Equivalence and Regression via Intent Drift In AI Networks**
|
|
31
|
+
|
|
32
|
+
A lightweight Python library for validating LLM model equivalence when a vendor deprecates a model and you need to migrate to a replacement.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## The Problem
|
|
37
|
+
|
|
38
|
+
When OpenAI deprecates `gpt-4-0613` or Anthropic retires `claude-2`, enterprise teams have no established, reusable methodology to validate that the replacement produces semantically equivalent outputs for their specific workload. Traditional software testing checks exact outputs — useless for non-deterministic LLM responses. Existing benchmarks (MMLU, HELM) measure absolute capability, not relative equivalence between two specific models on your use case.
|
|
39
|
+
|
|
40
|
+
## How MERIDIAN Is Different
|
|
41
|
+
|
|
42
|
+
Recent work ([arXiv:2604.27082](https://arxiv.org/abs/2604.27082), [arXiv:2507.05573](https://arxiv.org/abs/2507.05573), [arXiv:2604.27789](https://arxiv.org/abs/2604.27789)) describes migration validation processes using LLM-as-judge evaluation or human review. MERIDIAN takes a different approach:
|
|
43
|
+
|
|
44
|
+
| | Existing approaches | MERIDIAN |
|
|
45
|
+
|---|---|---|
|
|
46
|
+
| **Scoring method** | LLM-as-judge or human eval | Sentence-transformer cosine similarity |
|
|
47
|
+
| **Cloud dependency** | Requires API calls to score | Runs entirely locally |
|
|
48
|
+
| **Cost** | Per-token API cost to evaluate | Free after model download |
|
|
49
|
+
| **Reproducibility** | Non-deterministic (LLM judge) | Deterministic |
|
|
50
|
+
| **Framing** | Evaluation problem | Regression testing problem |
|
|
51
|
+
| **Format** | Research process descriptions | Reusable open-source library |
|
|
52
|
+
|
|
53
|
+
**Core insight:** embed old and new model outputs using a sentence-transformer, compute cosine similarity, and flag pairs below a drift threshold. Same technique as [canvas-heal](https://pypi.org/project/canvas-heal/) (UI locator healing), different problem surface.
|
|
54
|
+
|
|
55
|
+
## Three-Tier Gate
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
Cosine Similarity
|
|
59
|
+
─────────────────────────────────────────────────────────
|
|
60
|
+
0.0 ──────────── 0.75 ──────────── 0.92 ──────────── 1.0
|
|
61
|
+
      DRIFTED            REVIEW          EQUIVALENT
|
|
62
|
+
       (flag)         (human eye)       (auto-pass)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Thresholds are configurable. Defaults (0.92 / 0.75) are starting points — calibrate them against a small human-labeled set for your domain. See the accompanying paper for a calibration procedure derived from the deepseek-chat (V3) → deepseek-reasoner (R1) empirical study.
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install meridian-regression
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or from source:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
git clone https://github.com/mandavillivijay/meridian
|
|
77
|
+
cd meridian
|
|
78
|
+
pip install -e ".[dev]"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Quickstart
|
|
82
|
+
|
|
83
|
+
### 1. Build your golden dataset
|
|
84
|
+
|
|
85
|
+
Create a JSON file with outputs from both models for each prompt:
|
|
86
|
+
|
|
87
|
+
```json
|
|
88
|
+
[
|
|
89
|
+
{
|
|
90
|
+
"prompt": "What is the capital of France?",
|
|
91
|
+
"intent": "factual",
|
|
92
|
+
"old_output": "The capital of France is Paris.",
|
|
93
|
+
"new_output": "Paris is the capital city of France."
|
|
94
|
+
}
|
|
95
|
+
]
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Intent categories: `factual`, `generative`, `classification`, `structured_output`.
|
|
99
|
+
|
|
100
|
+
Run your old model and your new model on the same prompts and save the outputs. MERIDIAN doesn't call any APIs — you bring the outputs.
|
|
101
|
+
|
|
102
|
+
### 2. Run the pipeline
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from meridian.runner import run
|
|
106
|
+
|
|
107
|
+
report = run("datasets/my_golden_set.json")
|
|
108
|
+
print(report.summary)
|
|
109
|
+
# "94.0% of outputs are semantically equivalent, 4.0% show minor drift
|
|
110
|
+
# requiring human review, 2.0% show significant drift (regression flagged)."
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### 3. Use the report
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
print(f"Equivalent: {report.equivalent_pct}%")
|
|
117
|
+
print(f"Wilson 95% CI: [{report.wilson_lower:.3f}, {report.wilson_upper:.3f}]")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
JSON and markdown reports are written to `reports/` automatically.
|
|
121
|
+
|
|
122
|
+
## Advanced Usage
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from meridian.runner import run
|
|
126
|
+
|
|
127
|
+
report = run(
|
|
128
|
+
"datasets/my_golden_set.json",
|
|
129
|
+
sample_n=50, # stratified sample of 50 prompts
|
|
130
|
+
seed=42, # reproducible sampling
|
|
131
|
+
equivalent_threshold=0.90,
|
|
132
|
+
review_threshold=0.70,
|
|
133
|
+
report_stem="sonnet_migration_v2",
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Using modules directly
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from meridian.sampler import load, stratified_sample
|
|
141
|
+
from meridian.scorer import DriftScorer
|
|
142
|
+
from meridian.reporter import Reporter
|
|
143
|
+
|
|
144
|
+
records = load("datasets/my_golden_set.json")
|
|
145
|
+
records = stratified_sample(records, n=50, seed=42)
|
|
146
|
+
|
|
147
|
+
scorer = DriftScorer()
|
|
148
|
+
results = scorer.score_all(records)
|
|
149
|
+
|
|
150
|
+
reporter = Reporter()
|
|
151
|
+
report = reporter.build(results)
|
|
152
|
+
reporter.write(report, stem="my_run")
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Bringing your own adapter
|
|
156
|
+
|
|
157
|
+
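If you want to populate outputs programmatically rather than from a JSON file, implement the `ModelAdapter` protocol. It is a structural (`typing.Protocol`) interface, so your class only needs matching `complete` and `name` methods — no inheritance is required: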
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from meridian.adapters.base import ModelAdapter
|
|
161
|
+
|
|
162
|
+
class MyAdapter:
|
|
163
|
+
def complete(self, prompt: str) -> str:
|
|
164
|
+
# call your model here
|
|
165
|
+
...
|
|
166
|
+
def name(self) -> str:
|
|
167
|
+
return "my-model-v2"
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Project Structure
|
|
171
|
+
|
|
172
|
+
```
|
|
173
|
+
meridian/
|
|
174
|
+
├── meridian/
|
|
175
|
+
│ ├── models.py # Pydantic data models
|
|
176
|
+
│ ├── embedder.py # Sentence-transformer wrapper (singleton)
|
|
177
|
+
│ ├── scorer.py # Three-tier drift gate
|
|
178
|
+
│ ├── reporter.py # Aggregate verdict + JSON/markdown output
|
|
179
|
+
│ ├── sampler.py # Dataset loading + stratified sampling
|
|
180
|
+
│ ├── runner.py # End-to-end pipeline entry point
|
|
181
|
+
│ └── adapters/
|
|
182
|
+
│ └── base.py # ModelAdapter Protocol (extension point)
|
|
183
|
+
├── datasets/ # Example golden datasets
|
|
184
|
+
├── reports/ # Generated reports
|
|
185
|
+
└── tests/ # pytest suite (106 tests)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## Running Tests
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
pytest
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Author
|
|
195
|
+
|
|
196
|
+
Vijay Mandavilli — Quality Engineering Lead, Cognida AI, Hyderabad, India
|
|
197
|
+
|
|
198
|
+
## License
|
|
199
|
+
|
|
200
|
+
MIT
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# MERIDIAN
|
|
2
|
+
|
|
3
|
+
**Model Equivalence and Regression via Intent Drift In AI Networks**
|
|
4
|
+
|
|
5
|
+
A lightweight Python library for validating LLM model equivalence when a vendor deprecates a model and you need to migrate to a replacement.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## The Problem
|
|
10
|
+
|
|
11
|
+
When OpenAI deprecates `gpt-4-0613` or Anthropic retires `claude-2`, enterprise teams have no established, reusable methodology to validate that the replacement produces semantically equivalent outputs for their specific workload. Traditional software testing checks exact outputs — useless for non-deterministic LLM responses. Existing benchmarks (MMLU, HELM) measure absolute capability, not relative equivalence between two specific models on your use case.
|
|
12
|
+
|
|
13
|
+
## How MERIDIAN Is Different
|
|
14
|
+
|
|
15
|
+
Recent work ([arXiv:2604.27082](https://arxiv.org/abs/2604.27082), [arXiv:2507.05573](https://arxiv.org/abs/2507.05573), [arXiv:2604.27789](https://arxiv.org/abs/2604.27789)) describes migration validation processes using LLM-as-judge evaluation or human review. MERIDIAN takes a different approach:
|
|
16
|
+
|
|
17
|
+
| | Existing approaches | MERIDIAN |
|
|
18
|
+
|---|---|---|
|
|
19
|
+
| **Scoring method** | LLM-as-judge or human eval | Sentence-transformer cosine similarity |
|
|
20
|
+
| **Cloud dependency** | Requires API calls to score | Runs entirely locally |
|
|
21
|
+
| **Cost** | Per-token API cost to evaluate | Free after model download |
|
|
22
|
+
| **Reproducibility** | Non-deterministic (LLM judge) | Deterministic |
|
|
23
|
+
| **Framing** | Evaluation problem | Regression testing problem |
|
|
24
|
+
| **Format** | Research process descriptions | Reusable open-source library |
|
|
25
|
+
|
|
26
|
+
**Core insight:** embed old and new model outputs using a sentence-transformer, compute cosine similarity, and flag pairs below a drift threshold. Same technique as [canvas-heal](https://pypi.org/project/canvas-heal/) (UI locator healing), different problem surface.
|
|
27
|
+
|
|
28
|
+
## Three-Tier Gate
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
Cosine Similarity
|
|
32
|
+
─────────────────────────────────────────────────────────
|
|
33
|
+
0.0 ──────────── 0.75 ──────────── 0.92 ──────────── 1.0
|
|
34
|
+
      DRIFTED            REVIEW          EQUIVALENT
|
|
35
|
+
       (flag)         (human eye)       (auto-pass)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Thresholds are configurable. Defaults (0.92 / 0.75) are starting points — calibrate them against a small human-labeled set for your domain. See the accompanying paper for a calibration procedure derived from the deepseek-chat (V3) → deepseek-reasoner (R1) empirical study.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install meridian-regression
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Or from source:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
git clone https://github.com/mandavillivijay/meridian
|
|
50
|
+
cd meridian
|
|
51
|
+
pip install -e ".[dev]"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Quickstart
|
|
55
|
+
|
|
56
|
+
### 1. Build your golden dataset
|
|
57
|
+
|
|
58
|
+
Create a JSON file with outputs from both models for each prompt:
|
|
59
|
+
|
|
60
|
+
```json
|
|
61
|
+
[
|
|
62
|
+
{
|
|
63
|
+
"prompt": "What is the capital of France?",
|
|
64
|
+
"intent": "factual",
|
|
65
|
+
"old_output": "The capital of France is Paris.",
|
|
66
|
+
"new_output": "Paris is the capital city of France."
|
|
67
|
+
}
|
|
68
|
+
]
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Intent categories: `factual`, `generative`, `classification`, `structured_output`.
|
|
72
|
+
|
|
73
|
+
Run your old model and your new model on the same prompts and save the outputs. MERIDIAN doesn't call any APIs — you bring the outputs.
|
|
74
|
+
|
|
75
|
+
### 2. Run the pipeline
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from meridian.runner import run
|
|
79
|
+
|
|
80
|
+
report = run("datasets/my_golden_set.json")
|
|
81
|
+
print(report.summary)
|
|
82
|
+
# "94.0% of outputs are semantically equivalent, 4.0% show minor drift
|
|
83
|
+
# requiring human review, 2.0% show significant drift (regression flagged)."
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 3. Use the report
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
print(f"Equivalent: {report.equivalent_pct}%")
|
|
90
|
+
print(f"Wilson 95% CI: [{report.wilson_lower:.3f}, {report.wilson_upper:.3f}]")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
JSON and markdown reports are written to `reports/` automatically.
|
|
94
|
+
|
|
95
|
+
## Advanced Usage
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from meridian.runner import run
|
|
99
|
+
|
|
100
|
+
report = run(
|
|
101
|
+
"datasets/my_golden_set.json",
|
|
102
|
+
sample_n=50, # stratified sample of 50 prompts
|
|
103
|
+
seed=42, # reproducible sampling
|
|
104
|
+
equivalent_threshold=0.90,
|
|
105
|
+
review_threshold=0.70,
|
|
106
|
+
report_stem="sonnet_migration_v2",
|
|
107
|
+
)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Using modules directly
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from meridian.sampler import load, stratified_sample
|
|
114
|
+
from meridian.scorer import DriftScorer
|
|
115
|
+
from meridian.reporter import Reporter
|
|
116
|
+
|
|
117
|
+
records = load("datasets/my_golden_set.json")
|
|
118
|
+
records = stratified_sample(records, n=50, seed=42)
|
|
119
|
+
|
|
120
|
+
scorer = DriftScorer()
|
|
121
|
+
results = scorer.score_all(records)
|
|
122
|
+
|
|
123
|
+
reporter = Reporter()
|
|
124
|
+
report = reporter.build(results)
|
|
125
|
+
reporter.write(report, stem="my_run")
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Bringing your own adapter
|
|
129
|
+
|
|
130
|
+
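If you want to populate outputs programmatically rather than from a JSON file, implement the `ModelAdapter` protocol. It is a structural (`typing.Protocol`) interface, so your class only needs matching `complete` and `name` methods — no inheritance is required: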
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from meridian.adapters.base import ModelAdapter
|
|
134
|
+
|
|
135
|
+
class MyAdapter:
|
|
136
|
+
def complete(self, prompt: str) -> str:
|
|
137
|
+
# call your model here
|
|
138
|
+
...
|
|
139
|
+
def name(self) -> str:
|
|
140
|
+
return "my-model-v2"
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Project Structure
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
meridian/
|
|
147
|
+
├── meridian/
|
|
148
|
+
│ ├── models.py # Pydantic data models
|
|
149
|
+
│ ├── embedder.py # Sentence-transformer wrapper (singleton)
|
|
150
|
+
│ ├── scorer.py # Three-tier drift gate
|
|
151
|
+
│ ├── reporter.py # Aggregate verdict + JSON/markdown output
|
|
152
|
+
│ ├── sampler.py # Dataset loading + stratified sampling
|
|
153
|
+
│ ├── runner.py # End-to-end pipeline entry point
|
|
154
|
+
│ └── adapters/
|
|
155
|
+
│ └── base.py # ModelAdapter Protocol (extension point)
|
|
156
|
+
├── datasets/ # Example golden datasets
|
|
157
|
+
├── reports/ # Generated reports
|
|
158
|
+
└── tests/ # pytest suite (106 tests)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Running Tests
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
pytest
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Author
|
|
168
|
+
|
|
169
|
+
Vijay Mandavilli — Quality Engineering Lead, Cognida AI, Hyderabad, India
|
|
170
|
+
|
|
171
|
+
## License
|
|
172
|
+
|
|
173
|
+
MIT
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"prompt": "What is the capital of France?",
|
|
4
|
+
"intent": "factual",
|
|
5
|
+
"old_output": "The capital of France is Paris.",
|
|
6
|
+
"new_output": "Paris is the capital city of France."
|
|
7
|
+
},
|
|
8
|
+
{
|
|
9
|
+
"prompt": "What year did World War II end?",
|
|
10
|
+
"intent": "factual",
|
|
11
|
+
"old_output": "World War II ended in 1945.",
|
|
12
|
+
"new_output": "The Second World War ended in 1945."
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"prompt": "Write a short poem about the ocean.",
|
|
16
|
+
"intent": "generative",
|
|
17
|
+
"old_output": "Waves crash upon the shore,\nSalt and sea forever more.\nDepths unknown beneath the blue,\nOcean vast and ever true.",
|
|
18
|
+
"new_output": "The ocean breathes in endless tides,\nA world of wonder where life resides.\nCrashing waves on ancient stone,\nThe sea reminds us we're not alone."
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"prompt": "Write a one-sentence tagline for a coffee brand.",
|
|
22
|
+
"intent": "generative",
|
|
23
|
+
"old_output": "Wake up to the world's finest brew.",
|
|
24
|
+
"new_output": "Start every morning with something extraordinary."
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"prompt": "Is the following review positive or negative? 'The service was slow but the food was amazing.'",
|
|
28
|
+
"intent": "classification",
|
|
29
|
+
"old_output": "Mixed — the review contains both a negative sentiment (slow service) and a positive sentiment (amazing food).",
|
|
30
|
+
"new_output": "This is a mixed review: negative about service speed, positive about food quality."
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"prompt": "Classify the following text as spam or not spam: 'Congratulations! You have won a $1000 gift card. Click here to claim.'",
|
|
34
|
+
"intent": "classification",
|
|
35
|
+
"old_output": "Spam",
|
|
36
|
+
"new_output": "This is spam."
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"prompt": "Extract the name, date, and amount from this invoice text: 'Invoice for John Smith dated 2024-03-15, total $450.00'",
|
|
40
|
+
"intent": "structured_output",
|
|
41
|
+
"old_output": "{\"name\": \"John Smith\", \"date\": \"2024-03-15\", \"amount\": \"$450.00\"}",
|
|
42
|
+
"new_output": "{\"name\": \"John Smith\", \"date\": \"2024-03-15\", \"amount\": 450.00}"
|
|
43
|
+
},
|
|
44
|
+
{
|
|
45
|
+
"prompt": "Return a JSON object with keys 'celsius' and 'fahrenheit' for 100 degrees Celsius.",
|
|
46
|
+
"intent": "structured_output",
|
|
47
|
+
"old_output": "{\"celsius\": 100, \"fahrenheit\": 212}",
|
|
48
|
+
"new_output": "{\"celsius\": 100, \"fahrenheit\": 212.0}"
|
|
49
|
+
}
|
|
50
|
+
]
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[
|
|
2
|
+
{"prompt": "What is the capital of Japan?", "intent": "factual"},
|
|
3
|
+
{"prompt": "In what year did the Berlin Wall fall?", "intent": "factual"},
|
|
4
|
+
{"prompt": "What is the chemical symbol for gold?", "intent": "factual"},
|
|
5
|
+
{"prompt": "Who wrote the novel '1984'?", "intent": "factual"},
|
|
6
|
+
{"prompt": "What is the speed of light in a vacuum, in metres per second?", "intent": "factual"},
|
|
7
|
+
{"prompt": "How many bones are in the adult human body?", "intent": "factual"},
|
|
8
|
+
{"prompt": "What is the largest planet in our solar system?", "intent": "factual"},
|
|
9
|
+
{"prompt": "What is the Pythagorean theorem?", "intent": "factual"},
|
|
10
|
+
{"prompt": "Who painted the Mona Lisa?", "intent": "factual"},
|
|
11
|
+
{"prompt": "What does DNA stand for?", "intent": "factual"},
|
|
12
|
+
{"prompt": "What is the square root of 144?", "intent": "factual"},
|
|
13
|
+
{"prompt": "Which country has the largest land area in the world?", "intent": "factual"},
|
|
14
|
+
{"prompt": "What is the boiling point of water at standard atmospheric pressure in Celsius?", "intent": "factual"},
|
|
15
|
+
|
|
16
|
+
{"prompt": "Write a two-sentence product description for a noise-cancelling coffee thermos.", "intent": "generative"},
|
|
17
|
+
{"prompt": "Write a short poem (4 lines) about the feeling of finishing a long project.", "intent": "generative"},
|
|
18
|
+
{"prompt": "Draft a one-paragraph 'About Us' section for a small bakery in a coastal town.", "intent": "generative"},
|
|
19
|
+
{"prompt": "Write three bullet points summarising why sleep is important for cognitive performance.", "intent": "generative"},
|
|
20
|
+
{"prompt": "Write a one-sentence tagline for a fintech app that helps freelancers track invoices.", "intent": "generative"},
|
|
21
|
+
{"prompt": "Write a polite two-sentence email declining a meeting invitation due to a scheduling conflict.", "intent": "generative"},
|
|
22
|
+
{"prompt": "Summarise the concept of compound interest in two sentences aimed at a teenager.", "intent": "generative"},
|
|
23
|
+
{"prompt": "Write a motivational opening line for a data science conference keynote speech.", "intent": "generative"},
|
|
24
|
+
{"prompt": "Describe the taste of a mango to someone who has never eaten one, in two sentences.", "intent": "generative"},
|
|
25
|
+
{"prompt": "Write a one-paragraph explanation of what machine learning is for a non-technical business audience.", "intent": "generative"},
|
|
26
|
+
{"prompt": "Write a three-sentence plot summary for a thriller novel set in a remote Antarctic research station.", "intent": "generative"},
|
|
27
|
+
{"prompt": "Write a friendly out-of-office auto-reply message for a one-week vacation.", "intent": "generative"},
|
|
28
|
+
|
|
29
|
+
{"prompt": "Classify the sentiment of this review as positive, negative, or mixed: 'The hotel room was spotless and the view was stunning, but the checkout process took forever.'", "intent": "classification"},
|
|
30
|
+
{"prompt": "Is the following email spam or not spam? 'Hi, your package #4821 is ready for collection at the depot. Please bring your ID.'", "intent": "classification"},
|
|
31
|
+
{"prompt": "Classify the topic of this sentence into one of: technology, sports, politics, or entertainment. 'The new smartphone model features a 200-megapixel camera and a foldable display.'", "intent": "classification"},
|
|
32
|
+
{"prompt": "Is this customer message a complaint, a question, or a compliment? 'I have been waiting three weeks for my refund and nobody is responding to my emails.'", "intent": "classification"},
|
|
33
|
+
{"prompt": "Classify the urgency of this support ticket as high, medium, or low: 'Our production database is returning errors and the entire platform is down for all users.'", "intent": "classification"},
|
|
34
|
+
{"prompt": "Classify the sentiment of this tweet as positive, negative, or neutral: 'Just landed in Seoul. The airport is massive but the signage is surprisingly clear.'", "intent": "classification"},
|
|
35
|
+
{"prompt": "Is the following statement a fact or an opinion? 'Electric vehicles produce zero direct emissions during operation.'", "intent": "classification"},
|
|
36
|
+
{"prompt": "Classify this job title into one of: engineering, marketing, finance, or operations. 'Growth Hacker'", "intent": "classification"},
|
|
37
|
+
{"prompt": "Is the following code comment helpful or unhelpful? '# increment i by 1'", "intent": "classification"},
|
|
38
|
+
{"prompt": "Classify the following question as open-ended or closed-ended: 'What do you think the biggest challenge in AI safety is?'", "intent": "classification"},
|
|
39
|
+
{"prompt": "Classify the reading level of this sentence as elementary, intermediate, or advanced: 'The mitochondria are the organelles responsible for cellular respiration and ATP synthesis.'", "intent": "classification"},
|
|
40
|
+
{"prompt": "Is this news headline clickbait or not clickbait? 'Scientists discover new species of deep-sea fish near hydrothermal vents in the Pacific.'", "intent": "classification"},
|
|
41
|
+
|
|
42
|
+
{"prompt": "Extract the following fields as a JSON object from this text: name, company, and email. Text: 'Please reach out to Sarah Chen at Vertex Labs — her email is schen@vertexlabs.io'", "intent": "structured_output"},
|
|
43
|
+
{"prompt": "Return a JSON object with keys 'fahrenheit' and 'celsius' for a temperature of 98.6 degrees Fahrenheit.", "intent": "structured_output"},
|
|
44
|
+
{"prompt": "Parse the following address into a JSON object with keys: street, city, state, zip. Address: '742 Evergreen Terrace, Springfield, IL 62701'", "intent": "structured_output"},
|
|
45
|
+
{"prompt": "Extract all action items from this meeting note as a JSON array of strings. Note: 'Alice will update the roadmap by Friday. Bob needs to send the budget proposal. The team should review the new designs before Thursday.'", "intent": "structured_output"},
|
|
46
|
+
{"prompt": "Convert this sentence into a JSON object with keys 'subject', 'verb', 'object': 'The engineer deployed the service.'", "intent": "structured_output"},
|
|
47
|
+
{"prompt": "Return a JSON array listing the ingredients mentioned in this recipe snippet: 'Combine two cups of flour, one egg, half a cup of sugar, and a teaspoon of vanilla extract in a bowl.'", "intent": "structured_output"},
|
|
48
|
+
{"prompt": "Extract the event details from this text as a JSON object with keys: event_name, date, location. Text: 'Join us for the Annual Tech Summit on 15 September 2025 at the Grand Hyatt, San Francisco.'", "intent": "structured_output"},
|
|
49
|
+
{"prompt": "Return a JSON object with keys 'principal', 'rate', 'years', 'final_amount' for this scenario: $5000 invested at 6% annual interest for 10 years with annual compounding.", "intent": "structured_output"},
|
|
50
|
+
{"prompt": "Parse this log line into a JSON object with keys: timestamp, level, message. Log: '[2024-11-03 14:22:01] ERROR: Database connection timeout after 30s'", "intent": "structured_output"},
|
|
51
|
+
{"prompt": "Return a JSON object representing this person's details: 'Dr. Priya Nair, aged 41, works as a cardiologist at Apollo Hospital in Chennai.'", "intent": "structured_output"},
|
|
52
|
+
{"prompt": "Extract all dates mentioned in this paragraph as a JSON array in ISO 8601 format. Text: 'The contract was signed on March 3rd 2023, with a review scheduled for June 15 2023 and final delivery on 1 December 2023.'", "intent": "structured_output"},
|
|
53
|
+
{"prompt": "Return a JSON object summarising this product review with keys: rating (1-5), pros (array), cons (array). Review: 'Great battery life and the camera is exceptional. However, it runs hot during gaming and the charger is sold separately. I'd give it a 4 out of 5.'", "intent": "structured_output"},
|
|
54
|
+
{"prompt": "Convert this table row into a JSON object with keys: employee_id, name, department, salary. Row: '1042 | Arjun Mehta | Engineering | 95000'", "intent": "structured_output"}
|
|
55
|
+
]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from meridian.runner import run
|
|
2
|
+
from meridian.scorer import DriftScorer
|
|
3
|
+
from meridian.reporter import Reporter
|
|
4
|
+
from meridian.sampler import load, stratified_sample
|
|
5
|
+
from meridian.embedder import Embedder
|
|
6
|
+
from meridian.models import (
|
|
7
|
+
PromptRecord,
|
|
8
|
+
DriftResult,
|
|
9
|
+
EquivalenceReport,
|
|
10
|
+
IntentCategory,
|
|
11
|
+
TierVerdict,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# Names re-exported at package level (same order as the imports above).
__all__ = [
    # end-to-end pipeline entry point (meridian.runner)
    "run",
    # individual pipeline stages
    "DriftScorer",
    "Reporter",
    "load",
    "stratified_sample",
    "Embedder",
    # data models (meridian.models)
    "PromptRecord",
    "DriftResult",
    "EquivalenceReport",
    "IntentCategory",
    "TierVerdict",
]
|
|
File without changes
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ModelAdapter Protocol — extension point for users who want to call a live model
|
|
3
|
+
during dataset construction rather than loading pre-populated JSON.
|
|
4
|
+
|
|
5
|
+
MERIDIAN itself does not depend on any LLM SDK. If you want to wire up a model,
|
|
6
|
+
implement this Protocol and pass it to DatasetSampler.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Protocol, runtime_checkable
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@runtime_checkable
class ModelAdapter(Protocol):
    """Structural interface for an object that can produce model completions.

    Any class providing ``complete`` and ``name`` methods satisfies this
    protocol; inheriting from it is not required. Because the protocol is
    ``runtime_checkable``, ``isinstance(obj, ModelAdapter)`` checks that both
    methods are present (it does not check their signatures).
    """

    def complete(self, prompt: str) -> str:
        """Return the model's text output for *prompt*."""

    def name(self) -> str:
        """Return an identifier for the underlying model."""
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Local sentence-transformer embedder.
|
|
3
|
+
|
|
4
|
+
Runs entirely on-device — no API calls, no cloud dependency. This is a deliberate
|
|
5
|
+
design choice: unlike LLM-as-judge migration validators, MERIDIAN's scoring step
|
|
6
|
+
is free, fast, and deterministic.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Union
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
from sentence_transformers import SentenceTransformer
|
|
15
|
+
|
|
16
|
+
_DEFAULT_MODEL = "all-MiniLM-L6-v2"

# Module-level cache: one loaded model per model name per process.
# Plain dict, no locking — entries are added only after the model has loaded.
_instances: dict[str, "Embedder"] = {}


class Embedder:
    """Singleton-per-model-name wrapper around SentenceTransformer.

    ``Embedder(name)`` loads the sentence-transformer model the first time a
    given *name* is requested and returns the same cached object on every
    later call with that name.
    """

    def __new__(cls, model_name: str = _DEFAULT_MODEL) -> "Embedder":
        """Return the cached embedder for *model_name*, creating it if needed."""
        cached = _instances.get(model_name)
        if cached is None:
            cached = super().__new__(cls)
            cached._model_name = model_name
            cached._model = SentenceTransformer(model_name)
            # Register only after a successful load so a failed construction
            # never leaves a half-initialised instance in the cache.
            _instances[model_name] = cached
        return cached

    def embed(self, text: Union[str, list[str]]) -> np.ndarray:
        """Return L2-normalised embeddings as a float32 numpy array.

        Args:
            text: A single string, or a sequence of strings to embed together.

        Returns:
            Single string → shape (dim,). Sequence of strings → shape (n, dim).
            An empty sequence yields an empty array of shape (0, dim).
        """
        single = isinstance(text, str)
        # Materialise non-list sequences (tuples, arrays) so the emptiness
        # check below is well-defined for every input type.
        inputs = [text] if single else list(text)
        if not inputs:
            return np.empty((0, self.embedding_dim), dtype=np.float32)
        vectors = self._model.encode(
            inputs, normalize_embeddings=True, convert_to_numpy=True
        )
        # Enforce the documented dtype regardless of what the underlying
        # model emits; this is a no-op when the output is already float32.
        vectors = np.asarray(vectors, dtype=np.float32)
        return vectors[0] if single else vectors

    @property
    def model_name(self) -> str:
        """Name the underlying model was loaded with."""
        return self._model_name

    @property
    def embedding_dim(self) -> int:
        """Embedding dimension as reported by the underlying model."""
        return self._model.get_sentence_embedding_dimension()
|