meridian-regression 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "PowerShell(New-Item *)",
5
+ "PowerShell(pip install *)",
6
+ "PowerShell(cd \"C:\\\\Users\\\\VijayMandavilli\\\\OneDrive - Cognida Pvt Limited\\\\Desktop\\\\Tinkering\\\\meridian\"; $env:DEEPSEEK_API_KEY = \"<REDACTED>\"; $env:PYTHONIOENCODING = \"utf-8\"; python scripts/generate_golden_dataset.py 2>&1)"
7
+ ]
8
+ }
9
+ }
@@ -0,0 +1,33 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .eggs/
7
+ .venv/
8
+ venv/
9
+ *.egg
10
+
11
+ # test cache
12
+ .pytest_cache/
13
+ .coverage
14
+ htmlcov/
15
+
16
+ # sentence-transformers model cache (large, re-downloadable)
17
+ # .cache/
18
+
19
+ # generated reports (reproducible from golden dataset)
20
+ reports/
21
+
22
+ # populated golden dataset — contains API outputs, regenerate with scripts/generate_golden_dataset.py
23
+ datasets/golden_50.json
24
+
25
+ # paper build artefacts
26
+ paper/*.aux
27
+ paper/*.bbl
28
+ paper/*.blg
29
+ paper/*.log
30
+ paper/*.out
31
+ paper/*.pdf
32
+ paper/*.synctex.gz
33
+ paper/*.toc
@@ -0,0 +1,200 @@
1
+ Metadata-Version: 2.4
2
+ Name: meridian-regression
3
+ Version: 0.1.0
4
+ Summary: Local, embedding-based LLM model equivalence scoring for migration validation
5
+ Project-URL: Repository, https://github.com/mandavillivijay/meridian
6
+ Author-email: Vijay Mandavilli <mvijayfromvizag@gmail.com>
7
+ License: MIT
8
+ Keywords: embeddings,equivalence,llm,model-migration,nlp,regression-testing
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Software Development :: Testing
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: numpy>=1.24
21
+ Requires-Dist: pydantic>=2.0
22
+ Requires-Dist: sentence-transformers>=2.7
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest-cov; extra == 'dev'
25
+ Requires-Dist: pytest>=8.0; extra == 'dev'
26
+ Description-Content-Type: text/markdown
27
+
28
+ # MERIDIAN
29
+
30
+ **Model Equivalence and Regression via Intent Drift In AI Networks**
31
+
32
+ A lightweight Python library for validating LLM model equivalence when a vendor deprecates a model and you need to migrate to a replacement.
33
+
34
+ ---
35
+
36
+ ## The Problem
37
+
38
+ When OpenAI deprecates `gpt-4-0613` or Anthropic retires `claude-2`, enterprise teams have no established, reusable methodology to validate that the replacement produces semantically equivalent outputs for their specific workload. Traditional software testing checks exact outputs — useless for non-deterministic LLM responses. Existing benchmarks (MMLU, HELM) measure absolute capability, not relative equivalence between two specific models on your use case.
39
+
40
+ ## How MERIDIAN Is Different
41
+
42
+ Recent work ([arXiv:2604.27082](https://arxiv.org/abs/2604.27082), [arXiv:2507.05573](https://arxiv.org/abs/2507.05573), [arXiv:2604.27789](https://arxiv.org/abs/2604.27789)) describes migration validation processes using LLM-as-judge evaluation or human review. MERIDIAN takes a different approach:
43
+
44
+ | | Existing approaches | MERIDIAN |
45
+ |---|---|---|
46
+ | **Scoring method** | LLM-as-judge or human eval | Sentence-transformer cosine similarity |
47
+ | **Cloud dependency** | Requires API calls to score | Runs entirely locally |
48
+ | **Cost** | Per-token API cost to evaluate | Free after model download |
49
+ | **Reproducibility** | Non-deterministic (LLM judge) | Deterministic |
50
+ | **Framing** | Evaluation problem | Regression testing problem |
51
+ | **Format** | Research process descriptions | Reusable open-source library |
52
+
53
+ **Core insight:** embed old and new model outputs using a sentence-transformer, compute cosine similarity, and flag pairs below a drift threshold. Same technique as [canvas-heal](https://pypi.org/project/canvas-heal/) (UI locator healing), different problem surface.
54
+
55
+ ## Three-Tier Gate
56
+
57
+ ```
58
+ Cosine Similarity
59
+ ─────────────────────────────────────────────────────────
60
+ 0.0 ──────────── 0.75 ──────────── 0.92 ──────────── 1.0
61
+       DRIFTED            REVIEW          EQUIVALENT
61
+        (flag)         (human eye)       (auto-pass)
63
+ ```
64
+
65
+ Thresholds are configurable. The defaults (`equivalent_threshold=0.92`, `review_threshold=0.75`) are starting points — calibrate them against a small human-labeled set for your domain. See the accompanying paper for a calibration procedure derived from the deepseek-chat (V3) → deepseek-reasoner (R1) empirical study.
66
+
67
+ ## Installation
68
+
69
+ ```bash
70
+ pip install meridian-regression
71
+ ```
72
+
73
+ Or from source:
74
+
75
+ ```bash
76
+ git clone https://github.com/mandavillivijay/meridian
77
+ cd meridian
78
+ pip install -e ".[dev]"
79
+ ```
80
+
81
+ ## Quickstart
82
+
83
+ ### 1. Build your golden dataset
84
+
85
+ Create a JSON file with outputs from both models for each prompt:
86
+
87
+ ```json
88
+ [
89
+ {
90
+ "prompt": "What is the capital of France?",
91
+ "intent": "factual",
92
+ "old_output": "The capital of France is Paris.",
93
+ "new_output": "Paris is the capital city of France."
94
+ }
95
+ ]
96
+ ```
97
+
98
+ Intent categories: `factual`, `generative`, `classification`, `structured_output`.
99
+
100
+ Run your old and new models on the same prompts and save both sets of outputs. MERIDIAN doesn't call any APIs — you bring the outputs.
101
+
102
+ ### 2. Run the pipeline
103
+
104
+ ```python
105
+ from meridian.runner import run
106
+
107
+ report = run("datasets/my_golden_set.json")
108
+ print(report.summary)
109
+ # "94.0% of outputs are semantically equivalent, 4.0% show minor drift
110
+ # requiring human review, 2.0% show significant drift (regression flagged)."
111
+ ```
112
+
113
+ ### 3. Use the report
114
+
115
+ ```python
116
+ print(f"Equivalent: {report.equivalent_pct}%")
117
+ print(f"Wilson 95% CI: [{report.wilson_lower:.3f}, {report.wilson_upper:.3f}]")
118
+ ```
119
+
120
+ JSON and markdown reports are written to `reports/` automatically.
121
+
122
+ ## Advanced Usage
123
+
124
+ ```python
125
+ from meridian.runner import run
126
+
127
+ report = run(
128
+ "datasets/my_golden_set.json",
129
+ sample_n=50, # stratified sample of 50 prompts
130
+ seed=42, # reproducible sampling
131
+ equivalent_threshold=0.90,
132
+ review_threshold=0.70,
133
+ report_stem="sonnet_migration_v2",
134
+ )
135
+ ```
136
+
137
+ ### Using modules directly
138
+
139
+ ```python
140
+ from meridian.sampler import load, stratified_sample
141
+ from meridian.scorer import DriftScorer
142
+ from meridian.reporter import Reporter
143
+
144
+ records = load("datasets/my_golden_set.json")
145
+ records = stratified_sample(records, n=50, seed=42)
146
+
147
+ scorer = DriftScorer()
148
+ results = scorer.score_all(records)
149
+
150
+ reporter = Reporter()
151
+ report = reporter.build(results)
152
+ reporter.write(report, stem="my_run")
153
+ ```
154
+
155
+ ### Bringing your own adapter
156
+
157
+ If you want to populate outputs programmatically rather than from a JSON file, implement the `ModelAdapter` protocol:
158
+
159
+ ```python
160
+ from meridian.adapters.base import ModelAdapter
161
+
162
+ class MyAdapter:
163
+ def complete(self, prompt: str) -> str:
164
+ # call your model here
165
+ ...
166
+ def name(self) -> str:
167
+ return "my-model-v2"
168
+ ```
169
+
170
+ ## Project Structure
171
+
172
+ ```
173
+ meridian/
174
+ ├── meridian/
175
+ │ ├── models.py # Pydantic data models
176
+ │ ├── embedder.py # Sentence-transformer wrapper (singleton)
177
+ │ ├── scorer.py # Three-tier drift gate
178
+ │ ├── reporter.py # Aggregate verdict + JSON/markdown output
179
+ │ ├── sampler.py # Dataset loading + stratified sampling
180
+ │ ├── runner.py # End-to-end pipeline entry point
181
+ │ └── adapters/
182
+ │ └── base.py # ModelAdapter Protocol (extension point)
183
+ ├── datasets/ # Example golden datasets
184
+ ├── reports/ # Generated reports
185
+ └── tests/ # pytest suite (106 tests)
186
+ ```
187
+
188
+ ## Running Tests
189
+
190
+ ```bash
191
+ pytest
192
+ ```
193
+
194
+ ## Author
195
+
196
+ Vijay Mandavilli — Quality Engineering Lead, Cognida AI, Hyderabad, India
197
+
198
+ ## License
199
+
200
+ MIT
@@ -0,0 +1,173 @@
1
+ # MERIDIAN
2
+
3
+ **Model Equivalence and Regression via Intent Drift In AI Networks**
4
+
5
+ A lightweight Python library for validating LLM model equivalence when a vendor deprecates a model and you need to migrate to a replacement.
6
+
7
+ ---
8
+
9
+ ## The Problem
10
+
11
+ When OpenAI deprecates `gpt-4-0613` or Anthropic retires `claude-2`, enterprise teams have no established, reusable methodology to validate that the replacement produces semantically equivalent outputs for their specific workload. Traditional software testing checks exact outputs — useless for non-deterministic LLM responses. Existing benchmarks (MMLU, HELM) measure absolute capability, not relative equivalence between two specific models on your use case.
12
+
13
+ ## How MERIDIAN Is Different
14
+
15
+ Recent work ([arXiv:2604.27082](https://arxiv.org/abs/2604.27082), [arXiv:2507.05573](https://arxiv.org/abs/2507.05573), [arXiv:2604.27789](https://arxiv.org/abs/2604.27789)) describes migration validation processes using LLM-as-judge evaluation or human review. MERIDIAN takes a different approach:
16
+
17
+ | | Existing approaches | MERIDIAN |
18
+ |---|---|---|
19
+ | **Scoring method** | LLM-as-judge or human eval | Sentence-transformer cosine similarity |
20
+ | **Cloud dependency** | Requires API calls to score | Runs entirely locally |
21
+ | **Cost** | Per-token API cost to evaluate | Free after model download |
22
+ | **Reproducibility** | Non-deterministic (LLM judge) | Deterministic |
23
+ | **Framing** | Evaluation problem | Regression testing problem |
24
+ | **Format** | Research process descriptions | Reusable open-source library |
25
+
26
+ **Core insight:** embed old and new model outputs using a sentence-transformer, compute cosine similarity, and flag pairs below a drift threshold. Same technique as [canvas-heal](https://pypi.org/project/canvas-heal/) (UI locator healing), different problem surface.
27
+
28
+ ## Three-Tier Gate
29
+
30
+ ```
31
+ Cosine Similarity
32
+ ─────────────────────────────────────────────────────────
33
+ 0.0 ──────────── 0.75 ──────────── 0.92 ──────────── 1.0
34
+       DRIFTED            REVIEW          EQUIVALENT
35
+        (flag)         (human eye)       (auto-pass)
36
+ ```
37
+
38
+ Thresholds are configurable. The defaults (`equivalent_threshold=0.92`, `review_threshold=0.75`) are starting points — calibrate them against a small human-labeled set for your domain. See the accompanying paper for a calibration procedure derived from the deepseek-chat (V3) → deepseek-reasoner (R1) empirical study.
39
+
40
+ ## Installation
41
+
42
+ ```bash
43
+ pip install meridian-regression
44
+ ```
45
+
46
+ Or from source:
47
+
48
+ ```bash
49
+ git clone https://github.com/mandavillivijay/meridian
50
+ cd meridian
51
+ pip install -e ".[dev]"
52
+ ```
53
+
54
+ ## Quickstart
55
+
56
+ ### 1. Build your golden dataset
57
+
58
+ Create a JSON file with outputs from both models for each prompt:
59
+
60
+ ```json
61
+ [
62
+ {
63
+ "prompt": "What is the capital of France?",
64
+ "intent": "factual",
65
+ "old_output": "The capital of France is Paris.",
66
+ "new_output": "Paris is the capital city of France."
67
+ }
68
+ ]
69
+ ```
70
+
71
+ Intent categories: `factual`, `generative`, `classification`, `structured_output`.
72
+
73
+ Run your old and new models on the same prompts and save both sets of outputs. MERIDIAN doesn't call any APIs — you bring the outputs.
74
+
75
+ ### 2. Run the pipeline
76
+
77
+ ```python
78
+ from meridian.runner import run
79
+
80
+ report = run("datasets/my_golden_set.json")
81
+ print(report.summary)
82
+ # "94.0% of outputs are semantically equivalent, 4.0% show minor drift
83
+ # requiring human review, 2.0% show significant drift (regression flagged)."
84
+ ```
85
+
86
+ ### 3. Use the report
87
+
88
+ ```python
89
+ print(f"Equivalent: {report.equivalent_pct}%")
90
+ print(f"Wilson 95% CI: [{report.wilson_lower:.3f}, {report.wilson_upper:.3f}]")
91
+ ```
92
+
93
+ JSON and markdown reports are written to `reports/` automatically.
94
+
95
+ ## Advanced Usage
96
+
97
+ ```python
98
+ from meridian.runner import run
99
+
100
+ report = run(
101
+ "datasets/my_golden_set.json",
102
+ sample_n=50, # stratified sample of 50 prompts
103
+ seed=42, # reproducible sampling
104
+ equivalent_threshold=0.90,
105
+ review_threshold=0.70,
106
+ report_stem="sonnet_migration_v2",
107
+ )
108
+ ```
109
+
110
+ ### Using modules directly
111
+
112
+ ```python
113
+ from meridian.sampler import load, stratified_sample
114
+ from meridian.scorer import DriftScorer
115
+ from meridian.reporter import Reporter
116
+
117
+ records = load("datasets/my_golden_set.json")
118
+ records = stratified_sample(records, n=50, seed=42)
119
+
120
+ scorer = DriftScorer()
121
+ results = scorer.score_all(records)
122
+
123
+ reporter = Reporter()
124
+ report = reporter.build(results)
125
+ reporter.write(report, stem="my_run")
126
+ ```
127
+
128
+ ### Bringing your own adapter
129
+
130
+ If you want to populate outputs programmatically rather than from a JSON file, implement the `ModelAdapter` protocol:
131
+
132
+ ```python
133
+ from meridian.adapters.base import ModelAdapter
134
+
135
+ class MyAdapter:
136
+ def complete(self, prompt: str) -> str:
137
+ # call your model here
138
+ ...
139
+ def name(self) -> str:
140
+ return "my-model-v2"
141
+ ```
142
+
143
+ ## Project Structure
144
+
145
+ ```
146
+ meridian/
147
+ ├── meridian/
148
+ │ ├── models.py # Pydantic data models
149
+ │ ├── embedder.py # Sentence-transformer wrapper (singleton)
150
+ │ ├── scorer.py # Three-tier drift gate
151
+ │ ├── reporter.py # Aggregate verdict + JSON/markdown output
152
+ │ ├── sampler.py # Dataset loading + stratified sampling
153
+ │ ├── runner.py # End-to-end pipeline entry point
154
+ │ └── adapters/
155
+ │ └── base.py # ModelAdapter Protocol (extension point)
156
+ ├── datasets/ # Example golden datasets
157
+ ├── reports/ # Generated reports
158
+ └── tests/ # pytest suite (106 tests)
159
+ ```
160
+
161
+ ## Running Tests
162
+
163
+ ```bash
164
+ pytest
165
+ ```
166
+
167
+ ## Author
168
+
169
+ Vijay Mandavilli — Quality Engineering Lead, Cognida AI, Hyderabad, India
170
+
171
+ ## License
172
+
173
+ MIT
@@ -0,0 +1,50 @@
1
+ [
2
+ {
3
+ "prompt": "What is the capital of France?",
4
+ "intent": "factual",
5
+ "old_output": "The capital of France is Paris.",
6
+ "new_output": "Paris is the capital city of France."
7
+ },
8
+ {
9
+ "prompt": "What year did World War II end?",
10
+ "intent": "factual",
11
+ "old_output": "World War II ended in 1945.",
12
+ "new_output": "The Second World War ended in 1945."
13
+ },
14
+ {
15
+ "prompt": "Write a short poem about the ocean.",
16
+ "intent": "generative",
17
+ "old_output": "Waves crash upon the shore,\nSalt and sea forever more.\nDepths unknown beneath the blue,\nOcean vast and ever true.",
18
+ "new_output": "The ocean breathes in endless tides,\nA world of wonder where life resides.\nCrashing waves on ancient stone,\nThe sea reminds us we're not alone."
19
+ },
20
+ {
21
+ "prompt": "Write a one-sentence tagline for a coffee brand.",
22
+ "intent": "generative",
23
+ "old_output": "Wake up to the world's finest brew.",
24
+ "new_output": "Start every morning with something extraordinary."
25
+ },
26
+ {
27
+ "prompt": "Is the following review positive or negative? 'The service was slow but the food was amazing.'",
28
+ "intent": "classification",
29
+ "old_output": "Mixed — the review contains both a negative sentiment (slow service) and a positive sentiment (amazing food).",
30
+ "new_output": "This is a mixed review: negative about service speed, positive about food quality."
31
+ },
32
+ {
33
+ "prompt": "Classify the following text as spam or not spam: 'Congratulations! You have won a $1000 gift card. Click here to claim.'",
34
+ "intent": "classification",
35
+ "old_output": "Spam",
36
+ "new_output": "This is spam."
37
+ },
38
+ {
39
+ "prompt": "Extract the name, date, and amount from this invoice text: 'Invoice for John Smith dated 2024-03-15, total $450.00'",
40
+ "intent": "structured_output",
41
+ "old_output": "{\"name\": \"John Smith\", \"date\": \"2024-03-15\", \"amount\": \"$450.00\"}",
42
+ "new_output": "{\"name\": \"John Smith\", \"date\": \"2024-03-15\", \"amount\": 450.00}"
43
+ },
44
+ {
45
+ "prompt": "Return a JSON object with keys 'celsius' and 'fahrenheit' for 100 degrees Celsius.",
46
+ "intent": "structured_output",
47
+ "old_output": "{\"celsius\": 100, \"fahrenheit\": 212}",
48
+ "new_output": "{\"celsius\": 100, \"fahrenheit\": 212.0}"
49
+ }
50
+ ]
@@ -0,0 +1,55 @@
1
+ [
2
+ {"prompt": "What is the capital of Japan?", "intent": "factual"},
3
+ {"prompt": "In what year did the Berlin Wall fall?", "intent": "factual"},
4
+ {"prompt": "What is the chemical symbol for gold?", "intent": "factual"},
5
+ {"prompt": "Who wrote the novel '1984'?", "intent": "factual"},
6
+ {"prompt": "What is the speed of light in a vacuum, in metres per second?", "intent": "factual"},
7
+ {"prompt": "How many bones are in the adult human body?", "intent": "factual"},
8
+ {"prompt": "What is the largest planet in our solar system?", "intent": "factual"},
9
+ {"prompt": "What is the Pythagorean theorem?", "intent": "factual"},
10
+ {"prompt": "Who painted the Mona Lisa?", "intent": "factual"},
11
+ {"prompt": "What does DNA stand for?", "intent": "factual"},
12
+ {"prompt": "What is the square root of 144?", "intent": "factual"},
13
+ {"prompt": "Which country has the largest land area in the world?", "intent": "factual"},
14
+ {"prompt": "What is the boiling point of water at standard atmospheric pressure in Celsius?", "intent": "factual"},
15
+
16
+ {"prompt": "Write a two-sentence product description for a noise-cancelling coffee thermos.", "intent": "generative"},
17
+ {"prompt": "Write a short poem (4 lines) about the feeling of finishing a long project.", "intent": "generative"},
18
+ {"prompt": "Draft a one-paragraph 'About Us' section for a small bakery in a coastal town.", "intent": "generative"},
19
+ {"prompt": "Write three bullet points summarising why sleep is important for cognitive performance.", "intent": "generative"},
20
+ {"prompt": "Write a one-sentence tagline for a fintech app that helps freelancers track invoices.", "intent": "generative"},
21
+ {"prompt": "Write a polite two-sentence email declining a meeting invitation due to a scheduling conflict.", "intent": "generative"},
22
+ {"prompt": "Summarise the concept of compound interest in two sentences aimed at a teenager.", "intent": "generative"},
23
+ {"prompt": "Write a motivational opening line for a data science conference keynote speech.", "intent": "generative"},
24
+ {"prompt": "Describe the taste of a mango to someone who has never eaten one, in two sentences.", "intent": "generative"},
25
+ {"prompt": "Write a one-paragraph explanation of what machine learning is for a non-technical business audience.", "intent": "generative"},
26
+ {"prompt": "Write a three-sentence plot summary for a thriller novel set in a remote Antarctic research station.", "intent": "generative"},
27
+ {"prompt": "Write a friendly out-of-office auto-reply message for a one-week vacation.", "intent": "generative"},
28
+
29
+ {"prompt": "Classify the sentiment of this review as positive, negative, or mixed: 'The hotel room was spotless and the view was stunning, but the checkout process took forever.'", "intent": "classification"},
30
+ {"prompt": "Is the following email spam or not spam? 'Hi, your package #4821 is ready for collection at the depot. Please bring your ID.'", "intent": "classification"},
31
+ {"prompt": "Classify the topic of this sentence into one of: technology, sports, politics, or entertainment. 'The new smartphone model features a 200-megapixel camera and a foldable display.'", "intent": "classification"},
32
+ {"prompt": "Is this customer message a complaint, a question, or a compliment? 'I have been waiting three weeks for my refund and nobody is responding to my emails.'", "intent": "classification"},
33
+ {"prompt": "Classify the urgency of this support ticket as high, medium, or low: 'Our production database is returning errors and the entire platform is down for all users.'", "intent": "classification"},
34
+ {"prompt": "Classify the sentiment of this tweet as positive, negative, or neutral: 'Just landed in Seoul. The airport is massive but the signage is surprisingly clear.'", "intent": "classification"},
35
+ {"prompt": "Is the following statement a fact or an opinion? 'Electric vehicles produce zero direct emissions during operation.'", "intent": "classification"},
36
+ {"prompt": "Classify this job title into one of: engineering, marketing, finance, or operations. 'Growth Hacker'", "intent": "classification"},
37
+ {"prompt": "Is the following code comment helpful or unhelpful? '# increment i by 1'", "intent": "classification"},
38
+ {"prompt": "Classify the following question as open-ended or closed-ended: 'What do you think the biggest challenge in AI safety is?'", "intent": "classification"},
39
+ {"prompt": "Classify the reading level of this sentence as elementary, intermediate, or advanced: 'The mitochondria are the organelles responsible for cellular respiration and ATP synthesis.'", "intent": "classification"},
40
+ {"prompt": "Is this news headline clickbait or not clickbait? 'Scientists discover new species of deep-sea fish near hydrothermal vents in the Pacific.'", "intent": "classification"},
41
+
42
+ {"prompt": "Extract the following fields as a JSON object from this text: name, company, and email. Text: 'Please reach out to Sarah Chen at Vertex Labs — her email is schen@vertexlabs.io'", "intent": "structured_output"},
43
+ {"prompt": "Return a JSON object with keys 'fahrenheit' and 'celsius' for a temperature of 98.6 degrees Fahrenheit.", "intent": "structured_output"},
44
+ {"prompt": "Parse the following address into a JSON object with keys: street, city, state, zip. Address: '742 Evergreen Terrace, Springfield, IL 62701'", "intent": "structured_output"},
45
+ {"prompt": "Extract all action items from this meeting note as a JSON array of strings. Note: 'Alice will update the roadmap by Friday. Bob needs to send the budget proposal. The team should review the new designs before Thursday.'", "intent": "structured_output"},
46
+ {"prompt": "Convert this sentence into a JSON object with keys 'subject', 'verb', 'object': 'The engineer deployed the service.'", "intent": "structured_output"},
47
+ {"prompt": "Return a JSON array listing the ingredients mentioned in this recipe snippet: 'Combine two cups of flour, one egg, half a cup of sugar, and a teaspoon of vanilla extract in a bowl.'", "intent": "structured_output"},
48
+ {"prompt": "Extract the event details from this text as a JSON object with keys: event_name, date, location. Text: 'Join us for the Annual Tech Summit on 15 September 2025 at the Grand Hyatt, San Francisco.'", "intent": "structured_output"},
49
+ {"prompt": "Return a JSON object with keys 'principal', 'rate', 'years', 'final_amount' for this scenario: $5000 invested at 6% annual interest for 10 years with annual compounding.", "intent": "structured_output"},
50
+ {"prompt": "Parse this log line into a JSON object with keys: timestamp, level, message. Log: '[2024-11-03 14:22:01] ERROR: Database connection timeout after 30s'", "intent": "structured_output"},
51
+ {"prompt": "Return a JSON object representing this person's details: 'Dr. Priya Nair, aged 41, works as a cardiologist at Apollo Hospital in Chennai.'", "intent": "structured_output"},
52
+ {"prompt": "Extract all dates mentioned in this paragraph as a JSON array in ISO 8601 format. Text: 'The contract was signed on March 3rd 2023, with a review scheduled for June 15 2023 and final delivery on 1 December 2023.'", "intent": "structured_output"},
53
+ {"prompt": "Return a JSON object summarising this product review with keys: rating (1-5), pros (array), cons (array). Review: 'Great battery life and the camera is exceptional. However, it runs hot during gaming and the charger is sold separately. I'd give it a 4 out of 5.'", "intent": "structured_output"},
54
+ {"prompt": "Convert this table row into a JSON object with keys: employee_id, name, department, salary. Row: '1042 | Arjun Mehta | Engineering | 95000'", "intent": "structured_output"}
55
+ ]
@@ -0,0 +1,26 @@
1
+ from meridian.runner import run
2
+ from meridian.scorer import DriftScorer
3
+ from meridian.reporter import Reporter
4
+ from meridian.sampler import load, stratified_sample
5
+ from meridian.embedder import Embedder
6
+ from meridian.models import (
7
+ PromptRecord,
8
+ DriftResult,
9
+ EquivalenceReport,
10
+ IntentCategory,
11
+ TierVerdict,
12
+ )
13
+
14
+ __all__ = [
15
+ "run",
16
+ "DriftScorer",
17
+ "Reporter",
18
+ "load",
19
+ "stratified_sample",
20
+ "Embedder",
21
+ "PromptRecord",
22
+ "DriftResult",
23
+ "EquivalenceReport",
24
+ "IntentCategory",
25
+ "TierVerdict",
26
+ ]
@@ -0,0 +1,17 @@
1
+ """
2
+ ModelAdapter Protocol — extension point for users who want to call a live model
3
+ during dataset construction rather than loading pre-populated JSON.
4
+
5
+ MERIDIAN itself does not depend on any LLM SDK. If you want to wire up a model,
6
+ implement this Protocol and pass it to DatasetSampler.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Protocol, runtime_checkable
12
+
13
+
14
@runtime_checkable
class ModelAdapter(Protocol):
    """Structural interface for objects that can produce model completions.

    Any object exposing ``complete`` and ``name`` methods satisfies the
    protocol; inheriting from this class is not required. Because the class
    is ``runtime_checkable``, ``isinstance(obj, ModelAdapter)`` checks that
    both methods are present (their signatures are not verified at runtime).
    """

    def complete(self, prompt: str) -> str:
        """Return the model's output text for *prompt*."""
        ...

    def name(self) -> str:
        """Return an identifier for the underlying model."""
        ...
@@ -0,0 +1,51 @@
1
+ """
2
+ Local sentence-transformer embedder.
3
+
4
+ Runs entirely on-device — no API calls, no cloud dependency. This is a deliberate
5
+ design choice: unlike LLM-as-judge migration validators, MERIDIAN's scoring step
6
+ is free, fast, and deterministic.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Union
12
+
13
+ import numpy as np
14
+ from sentence_transformers import SentenceTransformer
15
+
16
_DEFAULT_MODEL = "all-MiniLM-L6-v2"

# Process-wide registry of constructed embedders, keyed by model name, so a
# given model is loaded at most once per process.
_instances: dict[str, "Embedder"] = {}


class Embedder:
    """Wrap a ``SentenceTransformer``, sharing one instance per model name.

    Calling ``Embedder(name)`` repeatedly with the same name returns the same
    object; the underlying model is only loaded on the first call.
    """

    def __new__(cls, model_name: str = _DEFAULT_MODEL) -> "Embedder":
        """Return the cached embedder for *model_name*, creating it on first use."""
        cached = _instances.get(model_name)
        if cached is None:
            # Attributes are assigned here (the class defines no __init__), so
            # repeated construction never reloads the model.
            cached = super().__new__(cls)
            cached._model_name = model_name
            cached._model = SentenceTransformer(model_name)
            _instances[model_name] = cached
        return cached

    def embed(self, text: Union[str, list[str]]) -> np.ndarray:
        """Encode *text* into L2-normalised embedding vector(s).

        A single string yields a 1-D array of shape ``(dim,)``; a list of
        strings yields a 2-D array of shape ``(n, dim)``. An empty list yields
        an empty float32 array of shape ``(0, dim)`` without calling the model.
        NOTE(review): for non-empty input the dtype is whatever ``encode``
        returns — presumably float32; confirm for the models in use.
        """
        if isinstance(text, str):
            # Encode as a one-item batch, then unwrap the single row.
            batch = self._model.encode(
                [text], normalize_embeddings=True, convert_to_numpy=True
            )
            return batch[0]
        if not text:
            return np.empty((0, self.embedding_dim), dtype=np.float32)
        return self._model.encode(
            text, normalize_embeddings=True, convert_to_numpy=True
        )

    @property
    def model_name(self) -> str:
        """Name of the model this embedder was created with."""
        return self._model_name

    @property
    def embedding_dim(self) -> int:
        """Embedding dimension as reported by the wrapped model."""
        return self._model.get_sentence_embedding_dimension()