inferencebench-embeddings 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inferencebench_embeddings/__init__.py +15 -0
- inferencebench_embeddings/benchmarks/beir-mini.yaml +13 -0
- inferencebench_embeddings/benchmarks/long-doc.yaml +13 -0
- inferencebench_embeddings/benchmarks/msmarco-style.yaml +13 -0
- inferencebench_embeddings/benchmarks/query-expansion.yaml +13 -0
- inferencebench_embeddings/datasets/beir-mini-corpus.jsonl +20 -0
- inferencebench_embeddings/datasets/beir-mini-queries.jsonl +5 -0
- inferencebench_embeddings/datasets/long-doc-corpus.jsonl +10 -0
- inferencebench_embeddings/datasets/long-doc-queries.jsonl +3 -0
- inferencebench_embeddings/datasets/msmarco-style-corpus.jsonl +25 -0
- inferencebench_embeddings/datasets/msmarco-style-queries.jsonl +5 -0
- inferencebench_embeddings/datasets/query-expansion-corpus.jsonl +20 -0
- inferencebench_embeddings/datasets/query-expansion-queries.jsonl +5 -0
- inferencebench_embeddings/plugin.py +414 -0
- inferencebench_embeddings/py.typed +0 -0
- inferencebench_embeddings/schemas.py +94 -0
- inferencebench_embeddings/scoring.py +98 -0
- inferencebench_embeddings-0.0.2.dist-info/METADATA +42 -0
- inferencebench_embeddings-0.0.2.dist-info/RECORD +21 -0
- inferencebench_embeddings-0.0.2.dist-info/WHEEL +4 -0
- inferencebench_embeddings-0.0.2.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""InferenceBench embeddings-retrieval plugin."""
|
|
2
|
+
|
|
3
|
+
from inferencebench_embeddings.plugin import (
|
|
4
|
+
EXPECTED_METRICS,
|
|
5
|
+
EmbeddingsRetrievalPlugin,
|
|
6
|
+
)
|
|
7
|
+
from inferencebench_embeddings.schemas import BenchmarkSpec, EngineKind, RunContext
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"EXPECTED_METRICS",
|
|
11
|
+
"BenchmarkSpec",
|
|
12
|
+
"EmbeddingsRetrievalPlugin",
|
|
13
|
+
"EngineKind",
|
|
14
|
+
"RunContext",
|
|
15
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
benchmark_id: embeddings.retrieval.beir-mini
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: BEIR-style small corpus, recall@5.
|
|
4
|
+
modality: embeddings
|
|
5
|
+
kind: retrieval
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-beir-mini
|
|
8
|
+
path: beir-mini-queries.jsonl
|
|
9
|
+
corpus_path: beir-mini-corpus.jsonl
|
|
10
|
+
slo_template: embeddings.retrieval.standard
|
|
11
|
+
warmup:
|
|
12
|
+
discard_runs: 0
|
|
13
|
+
metric: recall_at_5
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
benchmark_id: embeddings.retrieval.long-doc
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: Long-document corpus, nDCG@10.
|
|
4
|
+
modality: embeddings
|
|
5
|
+
kind: retrieval
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-long-doc
|
|
8
|
+
path: long-doc-queries.jsonl
|
|
9
|
+
corpus_path: long-doc-corpus.jsonl
|
|
10
|
+
slo_template: embeddings.retrieval.standard
|
|
11
|
+
warmup:
|
|
12
|
+
discard_runs: 0
|
|
13
|
+
metric: ndcg_at_10
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
benchmark_id: embeddings.retrieval.msmarco-style
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: MS MARCO-style passage ranking, MRR@10.
|
|
4
|
+
modality: embeddings
|
|
5
|
+
kind: retrieval
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-msmarco-style
|
|
8
|
+
path: msmarco-style-queries.jsonl
|
|
9
|
+
corpus_path: msmarco-style-corpus.jsonl
|
|
10
|
+
slo_template: embeddings.retrieval.standard
|
|
11
|
+
warmup:
|
|
12
|
+
discard_runs: 0
|
|
13
|
+
metric: mrr_at_10
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
benchmark_id: embeddings.retrieval.query-expansion
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: Short queries against a paraphrase-rich corpus.
|
|
4
|
+
modality: embeddings
|
|
5
|
+
kind: retrieval
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-query-expansion
|
|
8
|
+
path: query-expansion-queries.jsonl
|
|
9
|
+
corpus_path: query-expansion-corpus.jsonl
|
|
10
|
+
slo_template: embeddings.retrieval.standard
|
|
11
|
+
warmup:
|
|
12
|
+
discard_runs: 0
|
|
13
|
+
metric: recall_at_5
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{"doc_id": "doc-1", "text": "Paris is the capital and most populous city of France."}
|
|
2
|
+
{"doc_id": "doc-2", "text": "Tokyo is the capital of Japan and one of the most populous metropolitan areas."}
|
|
3
|
+
{"doc_id": "doc-3", "text": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris."}
|
|
4
|
+
{"doc_id": "doc-4", "text": "Mount Fuji is the highest mountain in Japan and a symbol of the country."}
|
|
5
|
+
{"doc_id": "doc-5", "text": "The Pacific Ocean is the largest and deepest ocean on Earth."}
|
|
6
|
+
{"doc_id": "doc-6", "text": "William Shakespeare wrote many plays including Hamlet and Macbeth."}
|
|
7
|
+
{"doc_id": "doc-7", "text": "Hamlet is one of Shakespeare's most famous tragedies, set in Denmark."}
|
|
8
|
+
{"doc_id": "doc-8", "text": "Mars is the fourth planet from the Sun, often called the Red Planet."}
|
|
9
|
+
{"doc_id": "doc-9", "text": "Gold is a chemical element with the symbol Au and atomic number 79."}
|
|
10
|
+
{"doc_id": "doc-10", "text": "Silver is a chemical element with the symbol Ag and is widely used in jewellery."}
|
|
11
|
+
{"doc_id": "doc-11", "text": "Leonardo da Vinci painted the Mona Lisa, now displayed in the Louvre."}
|
|
12
|
+
{"doc_id": "doc-12", "text": "The Louvre is the world's largest art museum, located in Paris, France."}
|
|
13
|
+
{"doc_id": "doc-13", "text": "Blue whales are the largest mammals on Earth and can weigh over 100 tons."}
|
|
14
|
+
{"doc_id": "doc-14", "text": "Photosynthesis is the process by which plants absorb carbon dioxide and produce oxygen."}
|
|
15
|
+
{"doc_id": "doc-15", "text": "The Moon landing in 1969 was a major milestone in human space exploration."}
|
|
16
|
+
{"doc_id": "doc-16", "text": "Apollo 11 was the spaceflight that first landed humans on the Moon."}
|
|
17
|
+
{"doc_id": "doc-17", "text": "The Atlantic Ocean separates the Americas from Europe and Africa."}
|
|
18
|
+
{"doc_id": "doc-18", "text": "Albert Einstein developed the theory of relativity and won the Nobel Prize in Physics."}
|
|
19
|
+
{"doc_id": "doc-19", "text": "The speed of light in vacuum is approximately 299,792 kilometres per second."}
|
|
20
|
+
{"doc_id": "doc-20", "text": "DNA is a molecule composed of two strands that coil around each other to form a double helix."}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"query": "What is the capital of France?", "relevant_doc_ids": ["doc-1", "doc-3", "doc-12"]}
|
|
2
|
+
{"query": "Tell me about Japan's tallest mountain.", "relevant_doc_ids": ["doc-2", "doc-4"]}
|
|
3
|
+
{"query": "Who wrote Hamlet?", "relevant_doc_ids": ["doc-6", "doc-7"]}
|
|
4
|
+
{"query": "What is the chemical symbol for gold?", "relevant_doc_ids": ["doc-9"]}
|
|
5
|
+
{"query": "When did humans first land on the Moon?", "relevant_doc_ids": ["doc-15", "doc-16"]}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{"doc_id": "ld-1", "text": "Transformer architectures have revolutionised natural language processing since their introduction in 2017. The original paper attention is all you need showed that self-attention layers can replace recurrent and convolutional structures while training in parallel across all positions, which dramatically reduced wall-clock training time and unlocked the scaling laws that subsequent work has explored exhaustively."}
|
|
2
|
+
{"doc_id": "ld-2", "text": "Renewable energy adoption has accelerated globally over the past decade, with solar photovoltaic capacity growing more than tenfold and onshore wind seeing similar gains. The economics of grid-scale battery storage have also shifted dramatically, making it feasible to firm intermittent generation in many markets without resorting to natural gas peakers."}
|
|
3
|
+
{"doc_id": "ld-3", "text": "Modern garbage collectors in managed runtimes use generational hypotheses to avoid scanning the entire heap on every cycle. Young objects die quickly, so the nursery is collected frequently with a copying algorithm, while the older generation uses a mark-and-sweep or mark-and-compact pass less often. Region-based collectors like G1 extend this with finer-grained partitioning."}
|
|
4
|
+
{"doc_id": "ld-4", "text": "The history of vaccines stretches back over two centuries, beginning with Edward Jenner's observation that milkmaids exposed to cowpox seemed immune to smallpox. The twentieth century saw the development of inactivated and live-attenuated vaccines for polio, measles, and dozens of other diseases, while the twenty-first century has been defined by mRNA platforms that proved themselves during the COVID-19 pandemic."}
|
|
5
|
+
{"doc_id": "ld-5", "text": "Distributed consensus algorithms like Paxos and Raft solve the problem of agreeing on a sequence of values across a set of failure-prone nodes. Raft was designed explicitly for understandability and breaks consensus into leader election, log replication, and safety, while Paxos predates Raft and is the basis for many production systems including Google's Chubby lock service."}
|
|
6
|
+
{"doc_id": "ld-6", "text": "Quantum computing exploits superposition and entanglement to perform certain calculations exponentially faster than classical machines. Shor's algorithm for factoring integers and Grover's algorithm for unstructured search are the canonical examples, but practical quantum advantage on commercially relevant problems remains an open research question as engineering teams work to reduce gate error rates."}
|
|
7
|
+
{"doc_id": "ld-7", "text": "Coral reefs are some of the most biodiverse ecosystems on the planet, supporting roughly a quarter of all marine species despite covering less than one percent of the ocean floor. Rising sea temperatures and ocean acidification have triggered widespread bleaching events that threaten the long-term viability of these systems, with the Great Barrier Reef among the most studied examples."}
|
|
8
|
+
{"doc_id": "ld-8", "text": "Compiler optimisation involves a sequence of program transformations that preserve observable behaviour while improving some target metric such as execution time, code size, or energy usage. Modern compilers like LLVM and GCC implement hundreds of passes including inlining, constant propagation, loop unrolling, vectorisation, and aggressive dead-code elimination."}
|
|
9
|
+
{"doc_id": "ld-9", "text": "The protein folding problem asks how a linear sequence of amino acids reliably collapses into a specific three-dimensional structure. AlphaFold demonstrated that deep learning models trained on the Protein Data Bank can predict structures with accuracy comparable to experimental methods for many protein families, accelerating drug discovery and basic biology research."}
|
|
10
|
+
{"doc_id": "ld-10", "text": "Modern web browsers are among the most complex pieces of consumer software ever shipped. They include a multi-process architecture, a JIT-compiling JavaScript engine, a GPU-accelerated compositor, a layout engine that implements thousands of pages of CSS specification, an extension API, sandboxed rendering, and increasingly an integration point for AI assistants."}
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
{"query": "How do transformer architectures change NLP training?", "relevant_doc_ids": ["ld-1"]}
|
|
2
|
+
{"query": "What is the role of consensus algorithms in distributed systems?", "relevant_doc_ids": ["ld-5"]}
|
|
3
|
+
{"query": "How has deep learning advanced biology and drug discovery?", "relevant_doc_ids": ["ld-9", "ld-1"]}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{"doc_id": "mm-1", "text": "Coffee is a brewed beverage prepared from roasted coffee beans. It originated in Ethiopia and is now consumed worldwide. Espresso, drip, and pour-over are common preparation methods."}
|
|
2
|
+
{"doc_id": "mm-2", "text": "The Pacific Ocean is the largest ocean on Earth, covering one third of the planet's surface. It is bordered by Asia, the Americas, and Antarctica. Its average depth is roughly four kilometres."}
|
|
3
|
+
{"doc_id": "mm-3", "text": "Photosynthesis is the biological process by which plants convert sunlight, water, and carbon dioxide into glucose and oxygen. Chlorophyll in the leaves absorbs light energy. The process powers most life on Earth."}
|
|
4
|
+
{"doc_id": "mm-4", "text": "Football is the most popular sport in the world by viewership. The FIFA World Cup is held every four years. Major leagues operate in Europe, South America, and increasingly Asia and North America."}
|
|
5
|
+
{"doc_id": "mm-5", "text": "TCP is a connection-oriented protocol that guarantees delivery and ordering through acknowledgements. UDP is a connectionless protocol that trades reliability for lower latency. Choose TCP for files, UDP for real-time streams."}
|
|
6
|
+
{"doc_id": "mm-6", "text": "The Renaissance was a cultural movement that began in Italy in the fourteenth century. It produced major advances in art, science, and politics. Leonardo da Vinci and Michelangelo are emblematic figures of the period."}
|
|
7
|
+
{"doc_id": "mm-7", "text": "Python is widely used in data science thanks to libraries like NumPy, pandas, scikit-learn, and PyTorch. Beginners often start with introductory courses on Kaggle or Coursera. Practice with real datasets accelerates learning."}
|
|
8
|
+
{"doc_id": "mm-8", "text": "The smartphone revolutionised personal computing in the late 2000s. iOS and Android dominate the operating-system market. App stores transformed the software distribution model."}
|
|
9
|
+
{"doc_id": "mm-9", "text": "Plate tectonics describes how Earth's lithosphere is divided into plates that move over the mantle. Earthquakes and volcanoes mostly occur at plate boundaries. The theory was widely accepted in the 1960s."}
|
|
10
|
+
{"doc_id": "mm-10", "text": "The Mediterranean diet emphasises vegetables, fruits, whole grains, olive oil, and fish. It has been linked to lower rates of cardiovascular disease. The diet originated in countries like Greece and Italy."}
|
|
11
|
+
{"doc_id": "mm-11", "text": "Vitamin D deficiency can cause fatigue, bone pain, muscle weakness, and mood changes. It is common in regions with limited sunlight exposure. Supplementation and dietary sources like fatty fish help correct it."}
|
|
12
|
+
{"doc_id": "mm-12", "text": "The Industrial Revolution began in Britain in the eighteenth century. Steam power, mechanised textile production, and railways transformed economies. It also drove urbanisation and large-scale environmental change."}
|
|
13
|
+
{"doc_id": "mm-13", "text": "Quantum mechanics describes physical systems at atomic scales. Superposition, entanglement, and uncertainty are core principles. The field underpins technologies from lasers to MRI scanners."}
|
|
14
|
+
{"doc_id": "mm-14", "text": "Plants exchange gases through small pores called stomata, typically on the underside of leaves. During photosynthesis they take in carbon dioxide and release oxygen. The stomata also regulate water loss via transpiration."}
|
|
15
|
+
{"doc_id": "mm-15", "text": "The euro is the common currency of nineteen European Union member states. It was introduced in 1999 for accounting and 2002 in physical form. The European Central Bank manages monetary policy for the eurozone."}
|
|
16
|
+
{"doc_id": "mm-16", "text": "Bicycles are an efficient form of human-powered transport, widely used for commuting in cities like Amsterdam and Copenhagen. Modern designs include road, mountain, and electric variants. Cycling reduces carbon emissions and improves health."}
|
|
17
|
+
{"doc_id": "mm-17", "text": "The DNA double helix was discovered in 1953 by Watson and Crick, building on work by Franklin and Wilkins. The structure consists of two strands held by base pairs. It encodes the genetic information of all known organisms."}
|
|
18
|
+
{"doc_id": "mm-18", "text": "The Great Wall of China was built across many dynasties, with most surviving sections dating from the Ming period. It stretches for thousands of kilometres across northern China. The wall served as defence, signalling, and trade control."}
|
|
19
|
+
{"doc_id": "mm-19", "text": "Bread is one of the oldest prepared foods, with evidence of baking going back over fourteen thousand years. Modern bread relies on wheat flour, water, salt, and yeast. Sourdough uses wild yeast and lactic-acid bacteria."}
|
|
20
|
+
{"doc_id": "mm-20", "text": "The Amazon rainforest is the largest tropical forest on Earth, spanning nine countries in South America. It hosts an enormous share of global biodiversity. Deforestation threatens carbon storage and species survival."}
|
|
21
|
+
{"doc_id": "mm-21", "text": "Cloud computing delivers compute, storage, and networking as on-demand services over the internet. Major providers include AWS, Azure, and Google Cloud. The model has reshaped how software is built and deployed."}
|
|
22
|
+
{"doc_id": "mm-22", "text": "Sunlight exposure is the body's main source of vitamin D, with diet contributing only a small fraction. Office workers and people in high-latitude regions are at higher risk of deficiency. Routine blood tests can confirm low levels."}
|
|
23
|
+
{"doc_id": "mm-23", "text": "The Eiffel Tower in Paris was completed in 1889 for the Universal Exposition. It is made of wrought iron and stands 330 metres tall. The tower is among the most-visited paid monuments in the world."}
|
|
24
|
+
{"doc_id": "mm-24", "text": "Machine learning has driven advances in computer vision, language understanding, and recommendation systems. Deep neural networks scale impressively with data and compute. Training large models requires specialised hardware accelerators."}
|
|
25
|
+
{"doc_id": "mm-25", "text": "Construction of the Great Wall continued under successive Chinese dynasties starting with the Qin in the third century BCE. Materials varied by region, from tamped earth to bricks. Watchtowers and garrisons were spaced along its length."}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"query": "how does photosynthesis work in plants", "relevant_doc_ids": ["mm-3", "mm-14"]}
|
|
2
|
+
{"query": "best way to learn python for data science", "relevant_doc_ids": ["mm-7"]}
|
|
3
|
+
{"query": "symptoms of vitamin d deficiency", "relevant_doc_ids": ["mm-11", "mm-22"]}
|
|
4
|
+
{"query": "what is the difference between tcp and udp", "relevant_doc_ids": ["mm-5"]}
|
|
5
|
+
{"query": "history of the great wall of china", "relevant_doc_ids": ["mm-18", "mm-25"]}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{"doc_id": "qe-1", "text": "A vehicle with four wheels used for personal transport on roads."}
|
|
2
|
+
{"doc_id": "qe-2", "text": "An automobile, typically powered by an internal combustion engine or electric motor."}
|
|
3
|
+
{"doc_id": "qe-3", "text": "A long, articulated road vehicle used to carry many passengers between stops."}
|
|
4
|
+
{"doc_id": "qe-4", "text": "A two-wheeled, pedal-driven personal transport device."}
|
|
5
|
+
{"doc_id": "qe-5", "text": "A medical practitioner who diagnoses and treats illness."}
|
|
6
|
+
{"doc_id": "qe-6", "text": "A physician trained to provide medical care and prescribe medication."}
|
|
7
|
+
{"doc_id": "qe-7", "text": "A trained nurse who assists with patient care in hospitals or clinics."}
|
|
8
|
+
{"doc_id": "qe-8", "text": "An academic who has earned a doctorate in a non-medical field."}
|
|
9
|
+
{"doc_id": "qe-9", "text": "A film, typically released in cinemas, that tells a story through moving images."}
|
|
10
|
+
{"doc_id": "qe-10", "text": "A motion picture, screened in theatres or streamed online."}
|
|
11
|
+
{"doc_id": "qe-11", "text": "A short video clip recorded on a smartphone and shared online."}
|
|
12
|
+
{"doc_id": "qe-12", "text": "A live theatrical performance staged in front of an audience."}
|
|
13
|
+
{"doc_id": "qe-13", "text": "A building used as a private residence by an individual or family."}
|
|
14
|
+
{"doc_id": "qe-14", "text": "A dwelling, typically detached, where people live and sleep."}
|
|
15
|
+
{"doc_id": "qe-15", "text": "A small wooden structure used to store gardening tools."}
|
|
16
|
+
{"doc_id": "qe-16", "text": "A high-rise apartment building containing many separate flats."}
|
|
17
|
+
{"doc_id": "qe-17", "text": "The art of arranging sounds in time to produce a composition through melody, harmony, and rhythm."}
|
|
18
|
+
{"doc_id": "qe-18", "text": "Songs and instrumental pieces, performed live or recorded, that people listen to for enjoyment."}
|
|
19
|
+
{"doc_id": "qe-19", "text": "Ambient noise from machinery and traffic in a busy urban environment."}
|
|
20
|
+
{"doc_id": "qe-20", "text": "A spoken-word podcast covering interviews with public figures."}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"query": "car", "relevant_doc_ids": ["qe-1", "qe-2"]}
|
|
2
|
+
{"query": "doctor", "relevant_doc_ids": ["qe-5", "qe-6"]}
|
|
3
|
+
{"query": "movie", "relevant_doc_ids": ["qe-9", "qe-10"]}
|
|
4
|
+
{"query": "house", "relevant_doc_ids": ["qe-13", "qe-14"]}
|
|
5
|
+
{"query": "music", "relevant_doc_ids": ["qe-17", "qe-18"]}
|
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
"""EmbeddingsRetrievalPlugin — entry point for ``embeddings.retrieval`` benchmarks.
|
|
2
|
+
|
|
3
|
+
Phase-2-quality skeleton: produces a real signed envelope by deterministically
|
|
4
|
+
ranking the corpus per query via ``sha256(query + NUL + doc_id)`` sort, then scoring
|
|
5
|
+
the top-k against the fixture's relevant set with recall@5 / mrr@10 / nDCG@10.
|
|
6
|
+
|
|
7
|
+
Future revisions wire a real embedding model call into :func:`_rank_corpus`;
|
|
8
|
+
the rest of the pipeline (signing, aggregation, sample dump) is production-shaped.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import hashlib
|
|
14
|
+
import json
|
|
15
|
+
import math
|
|
16
|
+
import os
|
|
17
|
+
import time
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
import yaml
|
|
21
|
+
|
|
22
|
+
from inferencebench.envelope import (
|
|
23
|
+
DatasetSpec as EnvDatasetSpec,
|
|
24
|
+
)
|
|
25
|
+
from inferencebench.envelope import (
|
|
26
|
+
EngineConfig,
|
|
27
|
+
Envelope,
|
|
28
|
+
EnvelopeBuilder,
|
|
29
|
+
ModelConfig,
|
|
30
|
+
Quantization,
|
|
31
|
+
SigningMode,
|
|
32
|
+
sign_envelope,
|
|
33
|
+
)
|
|
34
|
+
from inferencebench.harness import (
|
|
35
|
+
Sample,
|
|
36
|
+
collect_hardware_fingerprint,
|
|
37
|
+
collect_software_provenance,
|
|
38
|
+
)
|
|
39
|
+
from inferencebench.harness.metrics import EnergyReport, Percentiles, TelemetryWindow
|
|
40
|
+
from inferencebench_embeddings.schemas import BenchmarkSpec, EngineKind, RunContext
|
|
41
|
+
from inferencebench_embeddings.scoring import METRICS
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _json_num(v: float) -> str:
|
|
45
|
+
"""JSON-safe numeric encoder: NaN/inf become null."""
|
|
46
|
+
if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
|
|
47
|
+
return "null"
|
|
48
|
+
return repr(v)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Engine kinds that ``validate`` flags when no ``base_url`` is supplied
# (self-hosted TEI servers).
_SELF_HOSTED_ENGINES = frozenset([EngineKind.TEI])
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _fixtures_cache_root() -> Path:
|
|
56
|
+
"""Resolve the bench-fixtures cache root for ``fixtures://`` dataset URIs."""
|
|
57
|
+
override = os.environ.get("BENCH_FIXTURES_ROOT")
|
|
58
|
+
if override:
|
|
59
|
+
return Path(override)
|
|
60
|
+
return Path.home() / ".cache" / "inferencebench" / "fixtures"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _compute_fixture_hash(queries: list[dict[str, object]], corpus: list[dict[str, str]]) -> str:
|
|
64
|
+
"""SHA-256 over the canonical-JSON-encoded queries + corpus."""
|
|
65
|
+
canonical = json.dumps(
|
|
66
|
+
{"queries": queries, "corpus": corpus},
|
|
67
|
+
sort_keys=True,
|
|
68
|
+
separators=(",", ":"),
|
|
69
|
+
default=str,
|
|
70
|
+
)
|
|
71
|
+
return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _rank_corpus(query: str, corpus_ids: list[str]) -> list[str]:
|
|
75
|
+
"""Deterministically rank corpus doc-ids for a query.
|
|
76
|
+
|
|
77
|
+
Sorts by ``sha256(query + doc_id).hexdigest()`` so the ranking is
|
|
78
|
+
reproducible across machines and Python versions but uncorrelated with
|
|
79
|
+
actual relevance — exactly what we want for a contract-validation
|
|
80
|
+
skeleton that should produce a non-degenerate metric in [0, 1].
|
|
81
|
+
"""
|
|
82
|
+
|
|
83
|
+
def _key(doc_id: str) -> str:
|
|
84
|
+
h = hashlib.sha256()
|
|
85
|
+
h.update(query.encode("utf-8"))
|
|
86
|
+
h.update(b"\x00")
|
|
87
|
+
h.update(doc_id.encode("utf-8"))
|
|
88
|
+
return h.hexdigest()
|
|
89
|
+
|
|
90
|
+
return sorted(corpus_ids, key=_key)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Metrics this plugin is expected to emit. Consumed by ``bench coverage``.
EXPECTED_METRICS: tuple[str, ...] = (
    # Retrieval-quality aggregates.
    "recall_at_5_mean", "recall_at_5_p50", "recall_at_5_p95",
    "mrr_at_10_mean",
    "ndcg_at_10_mean",
    # Run bookkeeping.
    "ok_rate", "n_queries", "corpus_size",
)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class EmbeddingsRetrievalPlugin:
|
|
107
|
+
"""Plugin entry point. Registered via ``inferencebench.plugins`` entrypoint group."""
|
|
108
|
+
|
|
109
|
+
suite_id = "embeddings.retrieval"
|
|
110
|
+
version = "0.0.0"
|
|
111
|
+
description = (
|
|
112
|
+
"Embeddings retrieval benchmarks (deterministic hash ranking on bundled "
|
|
113
|
+
"corpora; real embedding-model invocation deferred)."
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
# ----------------------------------------------------------- benchmarks #
|
|
117
|
+
def list_benchmarks(self) -> list[BenchmarkSpec]:
    """Load every ``*.yaml`` spec in the benchmarks directory, sorted by path.

    Returns an empty list when the benchmarks directory does not exist.
    """
    root = self._benchmarks_dir()
    if root.exists():
        return [self._load_yaml(spec_file) for spec_file in sorted(root.glob("*.yaml"))]
    return []
|
|
125
|
+
|
|
126
|
+
def get_benchmark(self, benchmark_id: str) -> BenchmarkSpec:
    """Return the first listed spec whose ``benchmark_id`` matches.

    Raises:
        KeyError: no spec returned by ``list_benchmarks`` has that id.
    """
    found = next(
        (candidate for candidate in self.list_benchmarks() if candidate.benchmark_id == benchmark_id),
        None,
    )
    if found is None:
        msg = f"benchmark_id not found: {benchmark_id}"
        raise KeyError(msg)
    return found
|
|
132
|
+
|
|
133
|
+
# ------------------------------------------------------------- validate #
|
|
134
|
+
def validate(self, spec: BenchmarkSpec, context: RunContext) -> list[str]:
    """Collect human-readable problems with *spec* / *context* without raising.

    Checks, in order: a non-empty ``model_id``; a ``base_url`` for engine
    kinds listed in ``_SELF_HOSTED_ENGINES``; and the existence on disk of
    the queries and corpus fixtures. Returns the messages found (an empty
    list when nothing is wrong).
    """
    problems: list[str] = []

    if not context.model_id:
        problems.append("model_id is empty")

    needs_base_url = context.engine_kind in _SELF_HOSTED_ENGINES
    if needs_base_url and not context.base_url:
        problems.append(
            f"{context.engine_kind.value} needs base_url (e.g. http://localhost:8080)"
        )

    queries_file = self._queries_path(spec)
    if not queries_file.exists():
        problems.append(f"queries fixture not found: {spec.dataset.path}")

    corpus_file = self._corpus_path(spec)
    if not corpus_file.exists():
        problems.append(f"corpus fixture not found: {spec.dataset.corpus_path}")

    return problems
|
|
147
|
+
|
|
148
|
+
# ------------------------------------------------------------------ run #
|
|
149
|
+
def run(self, spec: BenchmarkSpec, context: RunContext) -> Envelope:
    """Score every fixture query for *spec* and return the signed envelope.

    Each query's corpus ranking comes from ``_rank_corpus`` and is scored
    with the ``(k, scorer)`` pair stored in ``METRICS`` under
    ``spec.metric``. Signing is selected by ``context.extra["signing_mode"]``
    (default ``"dev"``); any other value selects keyless signing.

    Raises:
        ValueError: dev signing was selected but
            ``context.extra['dev_key_path']`` is missing or empty.
    """
    query_records = self._load_queries(spec)
    documents = self._load_corpus(spec)
    dataset_hash = _compute_fixture_hash(query_records, documents)
    cutoff, score_fn = METRICS[spec.metric]

    doc_ids = [document["doc_id"] for document in documents]

    per_query_samples: list[Sample] = []
    per_query_scores: list[float] = []
    telemetry = TelemetryWindow()
    with telemetry:
        for position, record in enumerate(query_records):
            text = str(record["query"])
            labelled = record.get("relevant_doc_ids") or []
            # The record is typed ``dict[str, object]``; this assert narrows
            # the value to a list for the type checker.
            assert isinstance(labelled, list)
            relevant_ids = [str(item) for item in labelled]
            arrival_ms = time.perf_counter() * 1000.0
            # The timed window covers ranking and scoring only.
            started = time.perf_counter()
            ranked = _rank_corpus(text, doc_ids)
            value = float(score_fn(ranked, relevant_ids, cutoff))
            elapsed_ms = (time.perf_counter() - started) * 1000.0
            per_query_scores.append(value)
            diagnostics = {
                "score": value,
                "k": cutoff,
                "n_relevant": len(relevant_ids),
                "topk": ranked[:cutoff],
            }
            per_query_samples.append(
                Sample(
                    request_idx=position,
                    arrival_ms=arrival_ms,
                    start_ms=arrival_ms,
                    # ttft/tpot are recorded as NaN for every sample.
                    ttft_ms=float("nan"),
                    total_ms=elapsed_ms,
                    tpot_ms=float("nan"),
                    tokens_in=len(text.split()),
                    tokens_out=cutoff,
                    cost_usd=0.0,
                    finish_reason="stop",
                    ok=True,
                    extra=diagnostics,
                )
            )

    # Best-effort diagnostic dump — never blocks the run on I/O errors.
    self._dump_samples(context, per_query_samples)

    unsigned = self._build_envelope(
        spec,
        context,
        samples=per_query_samples,
        scores=per_query_scores,
        corpus_size=len(documents),
        dataset_hash=dataset_hash,
        energy=telemetry.summarise(per_query_samples),
    )

    mode = context.extra.get("signing_mode", "dev")
    key_path = context.extra.get("dev_key_path")
    if mode != "dev":
        return sign_envelope(unsigned, mode=SigningMode.KEYLESS)
    if not key_path:
        msg = "dev signing requires context.extra['dev_key_path']"
        raise ValueError(msg)
    return sign_envelope(
        unsigned,
        mode=SigningMode.DEV,
        dev_key_path=Path(str(key_path)),
    )
|
|
221
|
+
|
|
222
|
+
# ------------------------------------------------------------ samples #
|
|
223
|
+
def _dump_samples(self, context: RunContext, samples: list[Sample]) -> None:
|
|
224
|
+
"""Write per-query samples (incl. score) to ``<output_dir>/samples-<ts>.jsonl``.
|
|
225
|
+
|
|
226
|
+
Mirrors the llm-quality plugin's diagnostic dump — failures here
|
|
227
|
+
never block the run.
|
|
228
|
+
"""
|
|
229
|
+
try:
|
|
230
|
+
out_dir = Path(context.output_dir)
|
|
231
|
+
out_dir.mkdir(parents=True, exist_ok=True)
|
|
232
|
+
ts = int(time.time())
|
|
233
|
+
path = out_dir / f"samples-{ts}.jsonl"
|
|
234
|
+
with path.open("w", encoding="utf-8") as fp:
|
|
235
|
+
for s in samples:
|
|
236
|
+
score = s.extra.get("score") if s.extra else None
|
|
237
|
+
score_part = (
|
|
238
|
+
',"score":' + _json_num(float(score))
|
|
239
|
+
if isinstance(score, (int, float))
|
|
240
|
+
else ""
|
|
241
|
+
)
|
|
242
|
+
fp.write(
|
|
243
|
+
'{"request_idx":'
|
|
244
|
+
+ str(s.request_idx)
|
|
245
|
+
+ ',"ok":'
|
|
246
|
+
+ ("true" if s.ok else "false")
|
|
247
|
+
+ ',"total_ms":'
|
|
248
|
+
+ _json_num(s.total_ms)
|
|
249
|
+
+ ',"tokens_in":'
|
|
250
|
+
+ str(s.tokens_in)
|
|
251
|
+
+ ',"tokens_out":'
|
|
252
|
+
+ str(s.tokens_out)
|
|
253
|
+
+ score_part
|
|
254
|
+
+ ',"finish_reason":"'
|
|
255
|
+
+ (s.finish_reason or "")
|
|
256
|
+
+ '"'
|
|
257
|
+
+ "}\n"
|
|
258
|
+
)
|
|
259
|
+
except OSError:
|
|
260
|
+
pass # diagnostics-only — never block the run
|
|
261
|
+
|
|
262
|
+
# ---------------------------------------------------------- file paths #
|
|
263
|
+
def _benchmarks_dir(self) -> Path:
|
|
264
|
+
return Path(__file__).parent / "benchmarks"
|
|
265
|
+
|
|
266
|
+
def _datasets_dir(self) -> Path:
|
|
267
|
+
return Path(__file__).parent / "datasets"
|
|
268
|
+
|
|
269
|
+
def _queries_path(self, spec: BenchmarkSpec) -> Path:
    """Resolve where the queries JSONL for ``spec`` lives.

    A ``fixtures://<name>`` value maps to ``<name>.jsonl`` under the fixtures
    cache root; any other value is joined onto the bundled datasets directory.
    """
    scheme = "fixtures://"
    location = spec.dataset.path
    if not location.startswith(scheme):
        return self._datasets_dir() / location
    fixture_name = location.removeprefix(scheme)
    return _fixtures_cache_root() / f"{fixture_name}.jsonl"
|
|
274
|
+
|
|
275
|
+
def _corpus_path(self, spec: BenchmarkSpec) -> Path:
    """Resolve where the corpus JSONL for ``spec`` lives.

    A ``fixtures://<name>`` value maps to ``<name>.jsonl`` under the fixtures
    cache root; any other value is joined onto the bundled datasets directory.
    """
    scheme = "fixtures://"
    location = spec.dataset.corpus_path
    if not location.startswith(scheme):
        return self._datasets_dir() / location
    fixture_name = location.removeprefix(scheme)
    return _fixtures_cache_root() / f"{fixture_name}.jsonl"
|
|
280
|
+
|
|
281
|
+
def _load_yaml(self, path: Path) -> BenchmarkSpec:
    """Parse the YAML file at ``path`` and validate it as a ``BenchmarkSpec``.

    An empty (or otherwise falsy) document is validated as an empty mapping,
    so the error comes from schema validation rather than from the parser.
    """
    text = path.read_text(encoding="utf-8")
    document = yaml.safe_load(text)
    if not document:
        document = {}
    return BenchmarkSpec.model_validate(document)
|
|
284
|
+
|
|
285
|
+
def _load_queries(self, spec: BenchmarkSpec) -> list[dict[str, object]]:
    """Load the query records for ``spec`` from its JSONL fixture.

    Blank lines are ignored. Rows that are not JSON objects, that lack a
    ``query`` or ``relevant_doc_ids`` key, or whose ``relevant_doc_ids`` is
    not a JSON array are skipped.

    Returns:
        One dict per accepted row with ``query`` (str) and
        ``relevant_doc_ids`` (list of str), in file order.

    Raises:
        FileNotFoundError: The resolved fixture path does not exist.
        ValueError: No usable row was found. ``json.JSONDecodeError`` (a
            ``ValueError`` subclass) propagates for lines that are not JSON.
    """
    path = self._queries_path(spec)
    if not path.exists():
        msg = f"queries fixture not found: {path}"
        raise FileNotFoundError(msg)
    items: list[dict[str, object]] = []
    with path.open("r", encoding="utf-8") as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            if not isinstance(obj, dict):
                continue
            if "query" not in obj or "relevant_doc_ids" not in obj:
                continue
            doc_ids = obj["relevant_doc_ids"]
            # Iterating a bare string would yield one bogus id per character,
            # and null or a number is not iterable at all — treat any
            # non-array value as a malformed row.
            if not isinstance(doc_ids, list):
                continue
            items.append(
                {
                    "query": str(obj["query"]),
                    "relevant_doc_ids": [str(x) for x in doc_ids],
                }
            )
    if not items:
        msg = f"queries fixture is empty: {path}"
        raise ValueError(msg)
    return items
|
|
311
|
+
|
|
312
|
+
def _load_corpus(self, spec: BenchmarkSpec) -> list[dict[str, str]]:
    """Load the corpus documents for ``spec`` from its JSONL fixture.

    Blank lines, rows that are not JSON objects, and rows lacking ``doc_id``
    or ``text`` are skipped. Accepted rows are returned in file order with
    both values coerced to ``str``.

    Raises:
        FileNotFoundError: The resolved fixture path does not exist.
        ValueError: No usable row was found.
    """
    corpus_file = self._corpus_path(spec)
    if not corpus_file.exists():
        msg = f"corpus fixture not found: {corpus_file}"
        raise FileNotFoundError(msg)

    docs: list[dict[str, str]] = []
    with corpus_file.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                record = json.loads(stripped)
                usable = (
                    isinstance(record, dict)
                    and "doc_id" in record
                    and "text" in record
                )
                if usable:
                    docs.append(
                        {
                            "doc_id": str(record["doc_id"]),
                            "text": str(record["text"]),
                        }
                    )

    if docs:
        return docs
    msg = f"corpus fixture is empty: {corpus_file}"
    raise ValueError(msg)
|
|
333
|
+
|
|
334
|
+
# ---------------------------------------------------------- envelope #
|
|
335
|
+
def _build_envelope(
    self,
    spec: BenchmarkSpec,
    context: RunContext,
    *,
    samples: list[Sample],
    scores: list[float],
    corpus_size: int,
    dataset_hash: str,
    energy: EnergyReport | None = None,
) -> Envelope:
    """Assemble the result envelope for one benchmark run.

    Collects the hardware fingerprint and software provenance, derives the
    metrics dict from ``samples`` / ``scores`` / ``energy``, and feeds
    everything into an ``EnvelopeBuilder``.

    Metrics written:
        * ``n_queries``, ``n_ok``, ``ok_rate``, ``corpus_size`` — always.
        * ``<spec.metric>_mean`` / ``_p50`` / ``_p95`` — only when ``scores``
          is non-empty.
        * ``total_p50_ms`` — only when at least one ok sample has a finite
          ``total_ms``.
        * ``power_avg_w``, ``power_peak_w``, ``energy_joules_total``,
          ``joules_per_token`` — only when ``energy`` is given and the
          corresponding reading passes its guard.

    Args:
        spec: Benchmark spec; supplies suite id/version, dataset id, metric
            name and SLO template.
        context: Run context; supplies model/engine identity and the
            optional quantization format.
        samples: Per-query samples; ``ok`` and ``total_ms`` are read.
        scores: Per-query scores for ``spec.metric``.
        corpus_size: Number of corpus documents, recorded as a metric.
        dataset_hash: Hash recorded alongside the dataset id.
        energy: Optional telemetry summary.

    Returns:
        The envelope produced by ``EnvelopeBuilder.build()``.
    """
    hw = collect_hardware_fingerprint()
    sw = collect_software_provenance()

    metrics: dict[str, float | int | str | None] = {}

    ok_samples = [s for s in samples if s.ok]
    n_ok = len(ok_samples)
    # Counts are stored as floats, like every other metric value here.
    metrics["n_queries"] = float(len(samples))
    metrics["n_ok"] = float(n_ok)
    # Guard the division: an empty sample list reports an ok_rate of 0.0.
    metrics["ok_rate"] = float(n_ok) / float(len(samples)) if samples else 0.0
    metrics["corpus_size"] = float(corpus_size)

    # Headline retrieval metric — keyed by the spec's metric so downstream
    # `bench diff` knows higher-is-better (see _HIGHER_IS_BETTER in
    # cli/commands/diff.py).
    if scores:
        mean_score = sum(scores) / len(scores)
        metric_prefix = spec.metric  # "recall_at_5" | "mrr_at_10" | "ndcg_at_10"
        metrics[f"{metric_prefix}_mean"] = mean_score
        if len(scores) >= 2:
            pcts = Percentiles(scores, percentiles=(50.0, 95.0))
            metrics[f"{metric_prefix}_p50"] = pcts.p50
            metrics[f"{metric_prefix}_p95"] = pcts.p95
        else:
            # Single score: report it as both percentiles instead of calling
            # Percentiles — presumably it needs >= 2 values; TODO confirm.
            metrics[f"{metric_prefix}_p50"] = mean_score
            metrics[f"{metric_prefix}_p95"] = mean_score

    # Latency is summarised over successful samples with a finite duration.
    total_vals = [s.total_ms for s in ok_samples if math.isfinite(s.total_ms)]
    if total_vals:
        metrics["total_p50_ms"] = Percentiles(total_vals).p50

    # Energy / power summary from telemetry (None on plugins that haven't
    # threaded a TelemetryWindow through yet). Mirrors llm-inference.
    if energy is not None:
        if energy.gpu_power_avg_w > 0:
            metrics["power_avg_w"] = energy.gpu_power_avg_w
            metrics["power_peak_w"] = energy.gpu_power_peak_w
        if energy.total_energy_joules > 0:
            metrics["energy_joules_total"] = energy.total_energy_joules
        # NaN is the only float that compares unequal to itself.
        if energy.joules_per_token == energy.joules_per_token:  # not NaN
            metrics["joules_per_token"] = energy.joules_per_token

    builder = EnvelopeBuilder(
        suite_id=spec.benchmark_id,
        suite_version=spec.suite_version,
        model=ModelConfig(
            id=context.model_id,
            revision=context.model_revision,
            provider=context.engine_kind.value,
            # All-zero 64-character value — NOTE(review): looks like a
            # placeholder rather than a real endpoint hash; confirm.
            endpoint_hash="0" * 64,
        ),
        engine=EngineConfig(
            name=context.engine_kind.value,
            version=context.engine_version or "unknown",
            # All-zero 64-character value — NOTE(review): same placeholder
            # pattern as endpoint_hash; confirm.
            config_hash="0" * 64,
        ),
        hardware_fingerprint=hw,
        software_provenance=sw,
        dataset=EnvDatasetSpec(id=spec.dataset.id, hash=dataset_hash),
        seed=0,
        # An empty quantization_format means "not quantized / unspecified".
        quantization=(
            Quantization(format=context.quantization_format)
            if context.quantization_format
            else None
        ),
        metrics=metrics,
        slo_template=spec.slo_template,
    )
    return builder.build()
|
|
File without changes
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Pydantic schemas for embeddings-retrieval benchmark specs + run context."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Annotated, Literal
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EngineKind(StrEnum):
    """Engines this plugin can drive.

    Production-grade paths for embeddings: HuggingFace's Text Embeddings
    Inference (TEI) for self-hosted, plus the two big provider-hosted
    options (OpenAI text-embedding-3, Cohere embed-english).
    """

    # Member values are the lowercase identifiers; as a StrEnum each member
    # is also usable directly as a ``str``.
    TEI = "tei"
    OPENAI = "openai"
    COHERE = "cohere"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DatasetConfig(BaseModel):
    """Dataset under evaluation.

    A dataset is a pair of JSONL fixtures: ``path`` names the queries file
    (one query record per line, each carrying its relevant doc-id set) and
    ``corpus_path`` names the sibling corpus file containing
    ``{"doc_id", "text"}`` rows.
    """

    # Unknown keys in the YAML are rejected instead of silently ignored.
    model_config = ConfigDict(extra="forbid")
    # Dataset identifier; must be non-empty.
    id: Annotated[str, Field(min_length=1)]
    # Location of the queries JSONL; must be non-empty.
    path: Annotated[
        str,
        Field(
            min_length=1,
            description=("Path to the queries JSONL relative to the plugin's datasets/ directory."),
        ),
    ]
    # Location of the corpus JSONL; must be non-empty.
    corpus_path: Annotated[
        str,
        Field(
            min_length=1,
            description=("Path to the corpus JSONL relative to the plugin's datasets/ directory."),
        ),
    ]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class WarmupConfig(BaseModel):
    """Warmup parameters.

    Retrieval runs are per-query and order-independent; default is zero
    discarded runs. Surfaced for future JIT-warmup of embedding servers.
    """

    # Unknown keys are rejected instead of silently ignored.
    model_config = ConfigDict(extra="forbid")
    # Number of initial runs to discard; validated as >= 0, defaults to 0.
    discard_runs: Annotated[int, Field(ge=0)] = 0
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class BenchmarkSpec(BaseModel):
    """One retrieval benchmark — fixture + metric + metadata."""

    # Unknown keys in the benchmark YAML are rejected.
    model_config = ConfigDict(extra="forbid")
    # Benchmark identifier; must be non-empty.
    benchmark_id: Annotated[str, Field(min_length=1)]
    # ``MAJOR.MINOR.PATCH`` with an optional ``-suffix`` made of word
    # characters and dots (e.g. ``1.0.0`` or ``1.0.0-rc.1``).
    suite_version: Annotated[str, Field(pattern=r"^\d+\.\d+\.\d+(-[\w.]+)?$")]
    description: str = ""
    # Fixed discriminators: only "embeddings" / "retrieval" validate.
    modality: Literal["embeddings"] = "embeddings"
    kind: Literal["retrieval"] = "retrieval"
    # Queries + corpus fixture pair (required).
    dataset: DatasetConfig
    slo_template: str = "embeddings.retrieval.standard"
    # default_factory gives every spec its own WarmupConfig instance.
    warmup: WarmupConfig = Field(default_factory=WarmupConfig)
    # Headline scoring metric; restricted to these three names.
    metric: Literal["recall_at_5", "mrr_at_10", "ndcg_at_10"] = "recall_at_5"
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class RunContext(BaseModel):
    """Per-invocation context (where to send requests, where to write results).

    Mirrors the llm-quality plugin shape so cross-plugin tooling can reuse
    the same context object.
    """

    # Unknown keys are rejected; arbitrary (non-pydantic) field types are allowed.
    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
    # Model identifier; must be non-empty.
    model_id: Annotated[str, Field(min_length=1)]
    # 7-40 characters; the default "unknown00" is a 9-character stand-in
    # that satisfies the length bounds when no revision is known.
    model_revision: Annotated[str, Field(min_length=7, max_length=40)] = "unknown00"
    # Which engine to drive (required).
    engine_kind: EngineKind
    engine_version: str = ""
    base_url: str = ""
    # NOTE(review): stored as a plain ``str``, so it presumably shows up in
    # repr()/model dumps — confirm this is never logged or persisted.
    api_key: str = ""
    quantization_format: str = ""
    hardware_class: str = ""
    # Directory for run outputs (required).
    output_dir: Path
    # Free-form scalar extras; each context gets its own dict.
    extra: dict[str, str | int | float | bool] = Field(default_factory=dict)
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Deterministic IR scoring strategies for the embeddings-retrieval plugin.
|
|
2
|
+
|
|
3
|
+
Three pure functions, each ``(ranking, relevant, k) -> float`` in ``[0.0, 1.0]``.
|
|
4
|
+
All higher-is-better — 1.0 is the best achievable score for the metric, and
|
|
5
|
+
0.0 means none of the relevant docs appear in the top k.
|
|
6
|
+
|
|
7
|
+
No real embedding model is invoked; these are standard IR metrics on
|
|
8
|
+
already-produced rank lists.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import math
|
|
14
|
+
from collections.abc import Callable, Iterable, Sequence
|
|
15
|
+
|
|
16
|
+
MetricFn = Callable[[Sequence[str], Iterable[str], int], float]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _topk(ranking: Sequence[str], k: int) -> list[str]:
    """Return the first ``k`` entries of ``ranking`` as a new list.

    A non-positive ``k`` yields an empty list; a ``k`` larger than the
    ranking simply returns the whole ranking.
    """
    return list(ranking[:k]) if k > 0 else []
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def recall_at_k(
    ranking: Sequence[str],
    relevant: Iterable[str],
    k: int,
) -> float:
    """Share of the relevant docs found among the first ``k`` ranked docs.

    Computes ``|relevant ∩ top-k| / |relevant|``. An empty relevant set
    scores 0.0 (there is nothing to recall), and the result can never
    exceed 1.0 because hits are counted as a set intersection.
    """
    wanted = frozenset(relevant)
    if len(wanted) == 0:
        return 0.0
    # Non-positive k means an empty window; oversized k takes everything.
    window = ranking[:k] if k > 0 else ()
    found = wanted.intersection(window)
    return len(found) / len(wanted)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def mrr_at_k(
    ranking: Sequence[str],
    relevant: Iterable[str],
    k: int,
) -> float:
    """Per-query reciprocal rank: ``1 / rank`` of the first relevant doc in top-k.

    Returns 0.0 when the relevant set is empty or no relevant doc shows up
    within the first ``k`` positions. This is the single-query term; the
    "mean" in MRR is taken by the caller across queries.
    """
    targets = set(relevant)
    if not targets or k <= 0:
        return 0.0
    first_hit = next(
        (pos for pos, doc in enumerate(ranking[:k], start=1) if doc in targets),
        None,
    )
    return 0.0 if first_hit is None else 1.0 / first_hit
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def ndcg_at_k(
    ranking: Sequence[str],
    relevant: Iterable[str],
    k: int,
) -> float:
    """Normalised discounted cumulative gain @ k with binary relevance.

    Standard binary-relevance nDCG: ``DCG@k = sum_{i=1..k} rel_i / log2(i+1)``
    where ``rel_i`` is 1 if the i-th ranked doc is relevant else 0. The
    ideal DCG is the same sum when the top ``min(k, |relevant|)`` slots are
    all relevant. nDCG = DCG / IDCG.

    A relevant doc that appears more than once in ``ranking`` earns gain
    only at its first position; later repeats still occupy their rank but
    contribute nothing. This keeps the result inside ``[0.0, 1.0]``.

    Returns:
        0.0 for an empty relevant set or a non-positive ``k``; otherwise
        DCG / IDCG.
    """
    relevant_set = set(relevant)
    if not relevant_set:
        return 0.0
    # Non-positive k means an empty window; oversized k takes everything.
    top = ranking[:k] if k > 0 else ()
    dcg = 0.0
    credited: set[str] = set()
    for i, doc_id in enumerate(top, start=1):
        if doc_id in relevant_set and doc_id not in credited:
            credited.add(doc_id)
            dcg += 1.0 / math.log2(i + 1)
    ideal_hits = min(k, len(relevant_set))
    idcg = sum(1.0 / math.log2(i + 1) for i in range(1, ideal_hits + 1))
    if idcg == 0.0:
        return 0.0
    return dcg / idcg
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Map metric-name -> ``(k, scorer)``. Scorer signature: (ranking, relevant, k).
# The cutoff ``k`` stored with each scorer is the number in the metric name
# (recall@5, MRR@10, nDCG@10).
METRICS: dict[str, tuple[int, MetricFn]] = {
    "recall_at_5": (5, recall_at_k),
    "mrr_at_10": (10, mrr_at_k),
    "ndcg_at_10": (10, ndcg_at_k),
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: inferencebench-embeddings
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Embeddings retrieval plugin for InferenceBench Suite (deterministic ranking skeleton; real embedding-model invocation deferred)
|
|
5
|
+
Project-URL: Homepage, https://github.com/yobitelcomm/bench
|
|
6
|
+
Author-email: Yobitel Communications <bench@yobitel.com>
|
|
7
|
+
License: Apache-2.0
|
|
8
|
+
Keywords: ai,benchmark,embeddings,ml,ndcg,recall,retrieval
|
|
9
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Requires-Dist: inferencebench-envelope
|
|
18
|
+
Requires-Dist: inferencebench-harness
|
|
19
|
+
Requires-Dist: pydantic~=2.9
|
|
20
|
+
Requires-Dist: pyyaml~=6.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# inferencebench-embeddings
|
|
24
|
+
|
|
25
|
+
Embeddings retrieval plugin for the InferenceBench Suite.
|
|
26
|
+
|
|
27
|
+
Phase-2-quality skeleton: produces signed envelopes via deterministic
|
|
28
|
+
hash-based rankings, with placeholders for real embedding-model invocation
|
|
29
|
+
that future revisions wire to TEI / OpenAI / Cohere.
|
|
30
|
+
|
|
31
|
+
Suite ID: `embeddings.retrieval`
|
|
32
|
+
|
|
33
|
+
Bundled benchmarks:
|
|
34
|
+
|
|
35
|
+
- `embeddings.retrieval.beir-mini` — 5 queries × 20-doc corpus, recall@5.
|
|
36
|
+
- `embeddings.retrieval.long-doc` — 3 queries with longer documents, nDCG@10.
|
|
37
|
+
|
|
38
|
+
The skeleton does NOT actually embed any text. For each query it ranks the
|
|
39
|
+
corpus by `sha256(query + doc_id)`, then scores the top-k against the
|
|
40
|
+
fixture's relevant set. This produces a real, well-defined retrieval metric
|
|
41
|
+
in [0, 1] without external dependencies — future revisions replace the
|
|
42
|
+
hash rank with a real vector search.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
inferencebench_embeddings/__init__.py,sha256=kzGstyjov6oEF9lOw3YzTfeIQgRMTeqs6E3sJZlGx1I,366
|
|
2
|
+
inferencebench_embeddings/plugin.py,sha256=sIJzyNm5y5rHTwxVCWQZ6Ng1X82gdLBBgc5ELYNJ3po,16004
|
|
3
|
+
inferencebench_embeddings/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
inferencebench_embeddings/schemas.py,sha256=k4tSBcwhZP5sBVHZyngpppfdvLVLDjcffHPkY2DfAqY,2978
|
|
5
|
+
inferencebench_embeddings/scoring.py,sha256=wNxG-p7Vu3CAQ7kkM1iMINVMyvB5PeKSWfJubLz6RCw,3114
|
|
6
|
+
inferencebench_embeddings/benchmarks/beir-mini.yaml,sha256=V1QVIVxiUsL_39bFbwykx6xAC8F3YbSAv5hS0Xnl0Jw,344
|
|
7
|
+
inferencebench_embeddings/benchmarks/long-doc.yaml,sha256=0psmhNkaNQMTTlxZLrFTlaoXJy4HQR1Wj5q0vNjtoyc,335
|
|
8
|
+
inferencebench_embeddings/benchmarks/msmarco-style.yaml,sha256=4yCD6VhFM1rVt3Lkf4GEyQu1aI-38vHOr_6hTdwMOuw,363
|
|
9
|
+
inferencebench_embeddings/benchmarks/query-expansion.yaml,sha256=dRsF-iQ3wDjzpAHC2oKjujI5RD72_Ip5uema8B1bleQ,381
|
|
10
|
+
inferencebench_embeddings/datasets/beir-mini-corpus.jsonl,sha256=cuW1wXndbhFt32hnqbIf_hplFOLxz-k1JyV4eH42WAk,2105
|
|
11
|
+
inferencebench_embeddings/datasets/beir-mini-queries.jsonl,sha256=6nF5YAp6hxc5xXZw0JPkX1BG4Egkqy1EkUF-YM-aXFA,435
|
|
12
|
+
inferencebench_embeddings/datasets/long-doc-corpus.jsonl,sha256=zFB-GoJxVdt4hBfBLQF0xrh9HZm7U98GIyh7ZlkOZI4,4144
|
|
13
|
+
inferencebench_embeddings/datasets/long-doc-queries.jsonl,sha256=zammwAV-PMxPIMbAHY4KQQY_AEK68uxhKOQ_NJj0t7c,315
|
|
14
|
+
inferencebench_embeddings/datasets/msmarco-style-corpus.jsonl,sha256=WkKvrs3Oniz15_dUI-BdcLx4Or1EEQlTrKR68_9yiik,6120
|
|
15
|
+
inferencebench_embeddings/datasets/msmarco-style-queries.jsonl,sha256=_xnZKNxF9GT55qhpoRPC3mJ-76e7sUkXFTD2UnpfiUg,436
|
|
16
|
+
inferencebench_embeddings/datasets/query-expansion-corpus.jsonl,sha256=8cZoY5WXJ7pC3PUicaCSclsN1VNmbwqobzx26OmKmT0,1994
|
|
17
|
+
inferencebench_embeddings/datasets/query-expansion-queries.jsonl,sha256=KIp0BV6ZSDQxKUQuFQYv_mZjFOMcpCwbGtJZt2IF85E,289
|
|
18
|
+
inferencebench_embeddings-0.0.2.dist-info/METADATA,sha256=EQJrhBzV12NAzdV42ZSVlF1vHErbX5qlWJlic6C613E,1806
|
|
19
|
+
inferencebench_embeddings-0.0.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
20
|
+
inferencebench_embeddings-0.0.2.dist-info/entry_points.txt,sha256=DYTJRaKRl0zDqR8kSExRfxN2vi7cKznhXSpEzl5ecEI,107
|
|
21
|
+
inferencebench_embeddings-0.0.2.dist-info/RECORD,,
|