inferencebench-embeddings 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. inferencebench_embeddings-0.0.2/.gitignore +137 -0
  2. inferencebench_embeddings-0.0.2/PKG-INFO +42 -0
  3. inferencebench_embeddings-0.0.2/README.md +20 -0
  4. inferencebench_embeddings-0.0.2/pyproject.toml +43 -0
  5. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/__init__.py +15 -0
  6. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/benchmarks/beir-mini.yaml +13 -0
  7. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/benchmarks/long-doc.yaml +13 -0
  8. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/benchmarks/msmarco-style.yaml +13 -0
  9. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/benchmarks/query-expansion.yaml +13 -0
  10. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/datasets/beir-mini-corpus.jsonl +20 -0
  11. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/datasets/beir-mini-queries.jsonl +5 -0
  12. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/datasets/long-doc-corpus.jsonl +10 -0
  13. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/datasets/long-doc-queries.jsonl +3 -0
  14. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/datasets/msmarco-style-corpus.jsonl +25 -0
  15. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/datasets/msmarco-style-queries.jsonl +5 -0
  16. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/datasets/query-expansion-corpus.jsonl +20 -0
  17. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/datasets/query-expansion-queries.jsonl +5 -0
  18. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/plugin.py +414 -0
  19. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/py.typed +0 -0
  20. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/schemas.py +94 -0
  21. inferencebench_embeddings-0.0.2/src/inferencebench_embeddings/scoring.py +98 -0
  22. inferencebench_embeddings-0.0.2/tests/conftest.py +38 -0
  23. inferencebench_embeddings-0.0.2/tests/test_embeddings_plugin.py +167 -0
  24. inferencebench_embeddings-0.0.2/tests/test_embeddings_scoring.py +98 -0
@@ -0,0 +1,137 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+
25
+ # uv / virtualenv
26
+ .venv/
27
+ venv/
28
+ env/
29
+ ENV/
30
+ uv.lock.tmp
31
+ .python-version
32
+
33
+ # Testing / coverage
34
+ .tox/
35
+ .nox/
36
+ .coverage
37
+ .coverage.*
38
+ .cache
39
+ nosetests.xml
40
+ coverage.xml
41
+ *.cover
42
+ *.py,cover
43
+ .hypothesis/
44
+ .pytest_cache/
45
+ cover/
46
+ htmlcov/
47
+
48
+ # Type checking
49
+ .mypy_cache/
50
+ .dmypy.json
51
+ dmypy.json
52
+ .pyre/
53
+ .pytype/
54
+
55
+ # Ruff
56
+ .ruff_cache/
57
+
58
+ # IDE / editor
59
+ .idea/
60
+ .vscode/
61
+ *.swp
62
+ *.swo
63
+ *~
64
+ .DS_Store
65
+
66
+ # OS
67
+ Thumbs.db
68
+ desktop.ini
69
+
70
+ # Secrets / env
71
+ .env
72
+ .env.*
73
+ !.env.example
74
+ .envrc
75
+
76
+ # Bench-specific local caches
77
+ ~/.cache/inferencebench/
78
+ .cache/inferencebench/
79
+ .inferencebench/
80
+
81
+ # Sigstore dev keys (never commit private keys)
82
+ cosign.key
83
+ cosign-*.key
84
+ cosign-*.pub
85
+ .bench/*.key
86
+ # Local benchmark working dirs (kept local; published outputs land under validation-runs/)
87
+ envelopes-voice/
88
+ envelopes-*/
89
+ *.pem
90
+ !tests/fixtures/**/*.pem
91
+
92
+ # Real-GPU validation artifacts (kept locally, never pushed)
93
+ # Use slash-star (not trailing slash) so individual subpaths can be re-included below.
94
+ validation-runs/*
95
+ # ...except the canonical published marathon corpus — small, public, used by docs + CI
96
+ !validation-runs/2026-05-18-multi-vendor-marathon
97
+ validation-runs/2026-05-18-multi-vendor-marathon/*
98
+ !validation-runs/2026-05-18-multi-vendor-marathon/marathon
99
+ validation-runs/2026-05-18-multi-vendor-marathon/marathon/*
100
+ !validation-runs/2026-05-18-multi-vendor-marathon/marathon/all
101
+ !validation-runs/2026-05-18-multi-vendor-marathon/marathon/all/*.json
102
+ # Voice ASR validation envelopes (small, public, used by leaderboard build)
103
+ !validation-runs/2026-05-25-voice-rtx4000ada
104
+ !validation-runs/2026-05-25-voice-rtx4000ada/*.json
105
+ !validation-runs/2026-05-29-voice-testbm-h100
106
+ !validation-runs/2026-05-29-voice-testbm-h100/*.json
107
+
108
+ # Model weights / datasets (use Git LFS or S3)
109
+ *.bin
110
+ *.safetensors
111
+ *.pt
112
+ *.pth
113
+ *.gguf
114
+ *.onnx
115
+ *.parquet
116
+ !tests/fixtures/**/*.parquet
117
+
118
+ # Logs
119
+ *.log
120
+ logs/
121
+
122
+ # Documentation build
123
+ docs/_build/
124
+ site/
125
+
126
+ # Internal-only files (Claude Code context + planning) — kept locally, not pushed
127
+ /CLAUDE.md
128
+ /INDEX.md
129
+ /PROJECT_PLAN.md
130
+ /CONVENTIONS.md
131
+ /HUMAN_REVIEW_GATES.md
132
+ **/CLAUDE.md
133
+ memory/
134
+ skills/
135
+ agents/
136
+ .claude/
137
+ TICKETS/
@@ -0,0 +1,42 @@
1
+ Metadata-Version: 2.4
2
+ Name: inferencebench-embeddings
3
+ Version: 0.0.2
4
+ Summary: Embeddings retrieval plugin for InferenceBench Suite (deterministic ranking skeleton; real embedding-model invocation deferred)
5
+ Project-URL: Homepage, https://github.com/yobitelcomm/bench
6
+ Author-email: Yobitel Communications <bench@yobitel.com>
7
+ License: Apache-2.0
8
+ Keywords: ai,benchmark,embeddings,ml,ndcg,recall,retrieval
9
+ Classifier: Development Status :: 2 - Pre-Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.12
17
+ Requires-Dist: inferencebench-envelope
18
+ Requires-Dist: inferencebench-harness
19
+ Requires-Dist: pydantic~=2.9
20
+ Requires-Dist: pyyaml~=6.0
21
+ Description-Content-Type: text/markdown
22
+
23
+ # inferencebench-embeddings
24
+
25
+ Embeddings retrieval plugin for the InferenceBench Suite.
26
+
27
+ Phase-2-quality skeleton: produces signed envelopes via deterministic
28
+ hash-based rankings, with placeholders for real embedding-model invocation
29
+ that future revisions will wire to TEI / OpenAI / Cohere.
30
+
31
+ Suite ID: `embeddings.retrieval`
32
+
33
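+ Bundled benchmarks (four benchmark specs ship with the package; `embeddings.retrieval.msmarco-style` (MRR@10) and `embeddings.retrieval.query-expansion` (recall@5) are bundled alongside the two described below):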
34
+
35
+ - `embeddings.retrieval.beir-mini` — 5 queries × 20-doc corpus, recall@5.
36
+ - `embeddings.retrieval.long-doc` — 3 queries with longer documents, nDCG@10.
37
+
38
+ The skeleton does NOT actually embed any text. For each query it ranks the
39
+ corpus by `sha256(query + doc_id)`, then scores the top-k against the
40
+ fixture's relevant set. This produces a real, well-defined retrieval metric
41
+ in [0, 1] without external dependencies — future revisions replace the
42
+ hash rank with a real vector search.
@@ -0,0 +1,20 @@
1
+ # inferencebench-embeddings
2
+
3
+ Embeddings retrieval plugin for the InferenceBench Suite.
4
+
5
+ Phase-2-quality skeleton: produces signed envelopes via deterministic
6
+ hash-based rankings, with placeholders for real embedding-model invocation
7
+ that future revisions will wire to TEI / OpenAI / Cohere.
8
+
9
+ Suite ID: `embeddings.retrieval`
10
+
11
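+ Bundled benchmarks (four benchmark specs ship with the package; `embeddings.retrieval.msmarco-style` (MRR@10) and `embeddings.retrieval.query-expansion` (recall@5) are bundled alongside the two described below):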
12
+
13
+ - `embeddings.retrieval.beir-mini` — 5 queries × 20-doc corpus, recall@5.
14
+ - `embeddings.retrieval.long-doc` — 3 queries with longer documents, nDCG@10.
15
+
16
+ The skeleton does NOT actually embed any text. For each query it ranks the
17
+ corpus by `sha256(query + doc_id)`, then scores the top-k against the
18
+ fixture's relevant set. This produces a real, well-defined retrieval metric
19
+ in [0, 1] without external dependencies — future revisions replace the
20
+ hash rank with a real vector search.
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "inferencebench-embeddings"
7
+ version = "0.0.2"
8
+ description = "Embeddings retrieval plugin for InferenceBench Suite (deterministic ranking skeleton; real embedding-model invocation deferred)"
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = { text = "Apache-2.0" }
12
+ authors = [
13
+ { name = "Yobitel Communications", email = "bench@yobitel.com" },
14
+ ]
15
+ keywords = ["benchmark", "embeddings", "retrieval", "recall", "ndcg", "ai", "ml"]
16
+ classifiers = [
17
+ "Development Status :: 2 - Pre-Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: Apache Software License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
24
+ ]
25
+ dependencies = [
26
+ "inferencebench-envelope",
27
+ "inferencebench-harness",
28
+ "pydantic~=2.9",
29
+ "pyyaml~=6.0",
30
+ ]
31
+
32
+ [project.entry-points."inferencebench.plugins"]
33
+ "embeddings.retrieval" = "inferencebench_embeddings.plugin:EmbeddingsRetrievalPlugin"
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/yobitelcomm/bench"
37
+
38
+ [tool.hatch.build.targets.wheel]
39
+ packages = ["src/inferencebench_embeddings"]
40
+
41
+ [tool.uv.sources]
42
+ inferencebench-envelope = { workspace = true }
43
+ inferencebench-harness = { workspace = true }
@@ -0,0 +1,15 @@
1
+ """InferenceBench embeddings-retrieval plugin."""
2
+
3
+ from inferencebench_embeddings.plugin import (
4
+ EXPECTED_METRICS,
5
+ EmbeddingsRetrievalPlugin,
6
+ )
7
+ from inferencebench_embeddings.schemas import BenchmarkSpec, EngineKind, RunContext
8
+
9
+ __all__ = [
10
+ "EXPECTED_METRICS",
11
+ "BenchmarkSpec",
12
+ "EmbeddingsRetrievalPlugin",
13
+ "EngineKind",
14
+ "RunContext",
15
+ ]
@@ -0,0 +1,13 @@
1
+ benchmark_id: embeddings.retrieval.beir-mini
2
+ suite_version: 1.0.0
3
+ description: BEIR-style small corpus, recall@5.
4
+ modality: embeddings
5
+ kind: retrieval
6
+ dataset:
7
+ id: builtin-beir-mini
8
+ path: beir-mini-queries.jsonl
9
+ corpus_path: beir-mini-corpus.jsonl
10
+ slo_template: embeddings.retrieval.standard
11
+ warmup:
12
+ discard_runs: 0
13
+ metric: recall_at_5
@@ -0,0 +1,13 @@
1
+ benchmark_id: embeddings.retrieval.long-doc
2
+ suite_version: 1.0.0
3
+ description: Long-document corpus, nDCG@10.
4
+ modality: embeddings
5
+ kind: retrieval
6
+ dataset:
7
+ id: builtin-long-doc
8
+ path: long-doc-queries.jsonl
9
+ corpus_path: long-doc-corpus.jsonl
10
+ slo_template: embeddings.retrieval.standard
11
+ warmup:
12
+ discard_runs: 0
13
+ metric: ndcg_at_10
@@ -0,0 +1,13 @@
1
+ benchmark_id: embeddings.retrieval.msmarco-style
2
+ suite_version: 1.0.0
3
+ description: MS MARCO-style passage ranking, MRR@10.
4
+ modality: embeddings
5
+ kind: retrieval
6
+ dataset:
7
+ id: builtin-msmarco-style
8
+ path: msmarco-style-queries.jsonl
9
+ corpus_path: msmarco-style-corpus.jsonl
10
+ slo_template: embeddings.retrieval.standard
11
+ warmup:
12
+ discard_runs: 0
13
+ metric: mrr_at_10
@@ -0,0 +1,13 @@
1
+ benchmark_id: embeddings.retrieval.query-expansion
2
+ suite_version: 1.0.0
3
+ description: Short queries against a paraphrase-rich corpus, recall@5.
4
+ modality: embeddings
5
+ kind: retrieval
6
+ dataset:
7
+ id: builtin-query-expansion
8
+ path: query-expansion-queries.jsonl
9
+ corpus_path: query-expansion-corpus.jsonl
10
+ slo_template: embeddings.retrieval.standard
11
+ warmup:
12
+ discard_runs: 0
13
+ metric: recall_at_5
@@ -0,0 +1,20 @@
1
+ {"doc_id": "doc-1", "text": "Paris is the capital and most populous city of France."}
2
+ {"doc_id": "doc-2", "text": "Tokyo is the capital of Japan and one of the most populous metropolitan areas."}
3
+ {"doc_id": "doc-3", "text": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris."}
4
+ {"doc_id": "doc-4", "text": "Mount Fuji is the highest mountain in Japan and a symbol of the country."}
5
+ {"doc_id": "doc-5", "text": "The Pacific Ocean is the largest and deepest ocean on Earth."}
6
+ {"doc_id": "doc-6", "text": "William Shakespeare wrote many plays including Hamlet and Macbeth."}
7
+ {"doc_id": "doc-7", "text": "Hamlet is one of Shakespeare's most famous tragedies, set in Denmark."}
8
+ {"doc_id": "doc-8", "text": "Mars is the fourth planet from the Sun, often called the Red Planet."}
9
+ {"doc_id": "doc-9", "text": "Gold is a chemical element with the symbol Au and atomic number 79."}
10
+ {"doc_id": "doc-10", "text": "Silver is a chemical element with the symbol Ag and is widely used in jewellery."}
11
+ {"doc_id": "doc-11", "text": "Leonardo da Vinci painted the Mona Lisa, now displayed in the Louvre."}
12
+ {"doc_id": "doc-12", "text": "The Louvre is the world's largest art museum, located in Paris, France."}
13
+ {"doc_id": "doc-13", "text": "Blue whales are the largest mammals on Earth and can weigh over 100 tons."}
14
+ {"doc_id": "doc-14", "text": "Photosynthesis is the process by which plants absorb carbon dioxide and produce oxygen."}
15
+ {"doc_id": "doc-15", "text": "The Moon landing in 1969 was a major milestone in human space exploration."}
16
+ {"doc_id": "doc-16", "text": "Apollo 11 was the spaceflight that first landed humans on the Moon."}
17
+ {"doc_id": "doc-17", "text": "The Atlantic Ocean separates the Americas from Europe and Africa."}
18
+ {"doc_id": "doc-18", "text": "Albert Einstein developed the theory of relativity and won the Nobel Prize in Physics."}
19
+ {"doc_id": "doc-19", "text": "The speed of light in vacuum is approximately 299,792 kilometres per second."}
20
+ {"doc_id": "doc-20", "text": "DNA is a molecule composed of two strands that coil around each other to form a double helix."}
@@ -0,0 +1,5 @@
1
+ {"query": "What is the capital of France?", "relevant_doc_ids": ["doc-1", "doc-3", "doc-12"]}
2
+ {"query": "Tell me about Japan's tallest mountain.", "relevant_doc_ids": ["doc-2", "doc-4"]}
3
+ {"query": "Who wrote Hamlet?", "relevant_doc_ids": ["doc-6", "doc-7"]}
4
+ {"query": "What is the chemical symbol for gold?", "relevant_doc_ids": ["doc-9"]}
5
+ {"query": "When did humans first land on the Moon?", "relevant_doc_ids": ["doc-15", "doc-16"]}
@@ -0,0 +1,10 @@
1
+ {"doc_id": "ld-1", "text": "Transformer architectures have revolutionised natural language processing since their introduction in 2017. The original paper, Attention Is All You Need, showed that self-attention layers can replace recurrent and convolutional structures while training in parallel across all positions, which dramatically reduced wall-clock training time and unlocked the scaling laws that subsequent work has explored exhaustively."}
2
+ {"doc_id": "ld-2", "text": "Renewable energy adoption has accelerated globally over the past decade, with solar photovoltaic capacity growing more than tenfold and onshore wind seeing similar gains. The economics of grid-scale battery storage have also shifted dramatically, making it feasible to firm intermittent generation in many markets without resorting to natural gas peakers."}
3
+ {"doc_id": "ld-3", "text": "Modern garbage collectors in managed runtimes use generational hypotheses to avoid scanning the entire heap on every cycle. Young objects die quickly, so the nursery is collected frequently with a copying algorithm, while the older generation uses a mark-and-sweep or mark-and-compact pass less often. Region-based collectors like G1 extend this with finer-grained partitioning."}
4
+ {"doc_id": "ld-4", "text": "The history of vaccines stretches back over two centuries, beginning with Edward Jenner's observation that milkmaids exposed to cowpox seemed immune to smallpox. The twentieth century saw the development of inactivated and live-attenuated vaccines for polio, measles, and dozens of other diseases, while the twenty-first century has been defined by mRNA platforms that proved themselves during the COVID-19 pandemic."}
5
+ {"doc_id": "ld-5", "text": "Distributed consensus algorithms like Paxos and Raft solve the problem of agreeing on a sequence of values across a set of failure-prone nodes. Raft was designed explicitly for understandability and breaks consensus into leader election, log replication, and safety, while Paxos predates Raft and is the basis for many production systems including Google's Chubby lock service."}
6
+ {"doc_id": "ld-6", "text": "Quantum computing exploits superposition and entanglement to perform certain calculations exponentially faster than classical machines. Shor's algorithm for factoring integers and Grover's algorithm for unstructured search are the canonical examples, but practical quantum advantage on commercially relevant problems remains an open research question as engineering teams work to reduce gate error rates."}
7
+ {"doc_id": "ld-7", "text": "Coral reefs are some of the most biodiverse ecosystems on the planet, supporting roughly a quarter of all marine species despite covering less than one percent of the ocean floor. Rising sea temperatures and ocean acidification have triggered widespread bleaching events that threaten the long-term viability of these systems, with the Great Barrier Reef among the most studied examples."}
8
+ {"doc_id": "ld-8", "text": "Compiler optimisation involves a sequence of program transformations that preserve observable behaviour while improving some target metric such as execution time, code size, or energy usage. Modern compilers like LLVM and GCC implement hundreds of passes including inlining, constant propagation, loop unrolling, vectorisation, and aggressive dead-code elimination."}
9
+ {"doc_id": "ld-9", "text": "The protein folding problem asks how a linear sequence of amino acids reliably collapses into a specific three-dimensional structure. AlphaFold demonstrated that deep learning models trained on the Protein Data Bank can predict structures with accuracy comparable to experimental methods for many protein families, accelerating drug discovery and basic biology research."}
10
+ {"doc_id": "ld-10", "text": "Modern web browsers are among the most complex pieces of consumer software ever shipped. They include a multi-process architecture, a JIT-compiling JavaScript engine, a GPU-accelerated compositor, a layout engine that implements thousands of pages of CSS specification, an extension API, sandboxed rendering, and increasingly an integration point for AI assistants."}
@@ -0,0 +1,3 @@
1
+ {"query": "How do transformer architectures change NLP training?", "relevant_doc_ids": ["ld-1"]}
2
+ {"query": "What is the role of consensus algorithms in distributed systems?", "relevant_doc_ids": ["ld-5"]}
3
+ {"query": "How has deep learning advanced biology and drug discovery?", "relevant_doc_ids": ["ld-9", "ld-1"]}
@@ -0,0 +1,25 @@
1
+ {"doc_id": "mm-1", "text": "Coffee is a brewed beverage prepared from roasted coffee beans. It originated in Ethiopia and is now consumed worldwide. Espresso, drip, and pour-over are common preparation methods."}
2
+ {"doc_id": "mm-2", "text": "The Pacific Ocean is the largest ocean on Earth, covering one third of the planet's surface. It is bordered by Asia, the Americas, and Antarctica. Its average depth is roughly four kilometres."}
3
+ {"doc_id": "mm-3", "text": "Photosynthesis is the biological process by which plants convert sunlight, water, and carbon dioxide into glucose and oxygen. Chlorophyll in the leaves absorbs light energy. The process powers most life on Earth."}
4
+ {"doc_id": "mm-4", "text": "Football is the most popular sport in the world by viewership. The FIFA World Cup is held every four years. Major leagues operate in Europe, South America, and increasingly Asia and North America."}
5
+ {"doc_id": "mm-5", "text": "TCP is a connection-oriented protocol that guarantees delivery and ordering through acknowledgements. UDP is a connectionless protocol that trades reliability for lower latency. Choose TCP for files, UDP for real-time streams."}
6
+ {"doc_id": "mm-6", "text": "The Renaissance was a cultural movement that began in Italy in the fourteenth century. It produced major advances in art, science, and politics. Leonardo da Vinci and Michelangelo are emblematic figures of the period."}
7
+ {"doc_id": "mm-7", "text": "Python is widely used in data science thanks to libraries like NumPy, pandas, scikit-learn, and PyTorch. Beginners often start with introductory courses on Kaggle or Coursera. Practice with real datasets accelerates learning."}
8
+ {"doc_id": "mm-8", "text": "The smartphone revolutionised personal computing in the late 2000s. iOS and Android dominate the operating-system market. App stores transformed the software distribution model."}
9
+ {"doc_id": "mm-9", "text": "Plate tectonics describes how Earth's lithosphere is divided into plates that move over the mantle. Earthquakes and volcanoes mostly occur at plate boundaries. The theory was widely accepted in the 1960s."}
10
+ {"doc_id": "mm-10", "text": "The Mediterranean diet emphasises vegetables, fruits, whole grains, olive oil, and fish. It has been linked to lower rates of cardiovascular disease. The diet originated in countries like Greece and Italy."}
11
+ {"doc_id": "mm-11", "text": "Vitamin D deficiency can cause fatigue, bone pain, muscle weakness, and mood changes. It is common in regions with limited sunlight exposure. Supplementation and dietary sources like fatty fish help correct it."}
12
+ {"doc_id": "mm-12", "text": "The Industrial Revolution began in Britain in the eighteenth century. Steam power, mechanised textile production, and railways transformed economies. It also drove urbanisation and large-scale environmental change."}
13
+ {"doc_id": "mm-13", "text": "Quantum mechanics describes physical systems at atomic scales. Superposition, entanglement, and uncertainty are core principles. The field underpins technologies from lasers to MRI scanners."}
14
+ {"doc_id": "mm-14", "text": "Plants exchange gases through small pores called stomata, typically on the underside of leaves. During photosynthesis they take in carbon dioxide and release oxygen. The stomata also regulate water loss via transpiration."}
15
+ {"doc_id": "mm-15", "text": "The euro is the common currency of nineteen European Union member states. It was introduced in 1999 for accounting and 2002 in physical form. The European Central Bank manages monetary policy for the eurozone."}
16
+ {"doc_id": "mm-16", "text": "Bicycles are an efficient form of human-powered transport, widely used for commuting in cities like Amsterdam and Copenhagen. Modern designs include road, mountain, and electric variants. Cycling reduces carbon emissions and improves health."}
17
+ {"doc_id": "mm-17", "text": "The DNA double helix was discovered in 1953 by Watson and Crick, building on work by Franklin and Wilkins. The structure consists of two strands held by base pairs. It encodes the genetic information of all known organisms."}
18
+ {"doc_id": "mm-18", "text": "The Great Wall of China was built across many dynasties, with most surviving sections dating from the Ming period. It stretches for thousands of kilometres across northern China. The wall served as defence, signalling, and trade control."}
19
+ {"doc_id": "mm-19", "text": "Bread is one of the oldest prepared foods, with evidence of baking going back over fourteen thousand years. Modern bread relies on wheat flour, water, salt, and yeast. Sourdough uses wild yeast and lactic-acid bacteria."}
20
+ {"doc_id": "mm-20", "text": "The Amazon rainforest is the largest tropical forest on Earth, spanning nine countries in South America. It hosts an enormous share of global biodiversity. Deforestation threatens carbon storage and species survival."}
21
+ {"doc_id": "mm-21", "text": "Cloud computing delivers compute, storage, and networking as on-demand services over the internet. Major providers include AWS, Azure, and Google Cloud. The model has reshaped how software is built and deployed."}
22
+ {"doc_id": "mm-22", "text": "Sunlight exposure is the body's main source of vitamin D, with diet contributing only a small fraction. Office workers and people in high-latitude regions are at higher risk of deficiency. Routine blood tests can confirm low levels."}
23
+ {"doc_id": "mm-23", "text": "The Eiffel Tower in Paris was completed in 1889 for the Universal Exposition. It is made of wrought iron and stands 330 metres tall. The tower is among the most-visited paid monuments in the world."}
24
+ {"doc_id": "mm-24", "text": "Machine learning has driven advances in computer vision, language understanding, and recommendation systems. Deep neural networks scale impressively with data and compute. Training large models requires specialised hardware accelerators."}
25
+ {"doc_id": "mm-25", "text": "Construction of the Great Wall continued under successive Chinese dynasties starting with the Qin in the third century BCE. Materials varied by region, from tamped earth to bricks. Watchtowers and garrisons were spaced along its length."}
@@ -0,0 +1,5 @@
1
+ {"query": "how does photosynthesis work in plants", "relevant_doc_ids": ["mm-3", "mm-14"]}
2
+ {"query": "best way to learn python for data science", "relevant_doc_ids": ["mm-7"]}
3
+ {"query": "symptoms of vitamin d deficiency", "relevant_doc_ids": ["mm-11", "mm-22"]}
4
+ {"query": "what is the difference between tcp and udp", "relevant_doc_ids": ["mm-5"]}
5
+ {"query": "history of the great wall of china", "relevant_doc_ids": ["mm-18", "mm-25"]}
@@ -0,0 +1,20 @@
1
+ {"doc_id": "qe-1", "text": "A vehicle with four wheels used for personal transport on roads."}
2
+ {"doc_id": "qe-2", "text": "An automobile, typically powered by an internal combustion engine or electric motor."}
3
+ {"doc_id": "qe-3", "text": "A long, articulated road vehicle used to carry many passengers between stops."}
4
+ {"doc_id": "qe-4", "text": "A two-wheeled, pedal-driven personal transport device."}
5
+ {"doc_id": "qe-5", "text": "A medical practitioner who diagnoses and treats illness."}
6
+ {"doc_id": "qe-6", "text": "A physician trained to provide medical care and prescribe medication."}
7
+ {"doc_id": "qe-7", "text": "A trained nurse who assists with patient care in hospitals or clinics."}
8
+ {"doc_id": "qe-8", "text": "An academic who has earned a doctorate in a non-medical field."}
9
+ {"doc_id": "qe-9", "text": "A film, typically released in cinemas, that tells a story through moving images."}
10
+ {"doc_id": "qe-10", "text": "A motion picture, screened in theatres or streamed online."}
11
+ {"doc_id": "qe-11", "text": "A short video clip recorded on a smartphone and shared online."}
12
+ {"doc_id": "qe-12", "text": "A live theatrical performance staged in front of an audience."}
13
+ {"doc_id": "qe-13", "text": "A building used as a private residence by an individual or family."}
14
+ {"doc_id": "qe-14", "text": "A dwelling, typically detached, where people live and sleep."}
15
+ {"doc_id": "qe-15", "text": "A small wooden structure used to store gardening tools."}
16
+ {"doc_id": "qe-16", "text": "A high-rise apartment building containing many separate flats."}
17
+ {"doc_id": "qe-17", "text": "The art of arranging sounds in time to produce a composition through melody, harmony, and rhythm."}
18
+ {"doc_id": "qe-18", "text": "Songs and instrumental pieces, performed live or recorded, that people listen to for enjoyment."}
19
+ {"doc_id": "qe-19", "text": "Ambient noise from machinery and traffic in a busy urban environment."}
20
+ {"doc_id": "qe-20", "text": "A spoken-word podcast covering interviews with public figures."}
@@ -0,0 +1,5 @@
1
+ {"query": "car", "relevant_doc_ids": ["qe-1", "qe-2"]}
2
+ {"query": "doctor", "relevant_doc_ids": ["qe-5", "qe-6"]}
3
+ {"query": "movie", "relevant_doc_ids": ["qe-9", "qe-10"]}
4
+ {"query": "house", "relevant_doc_ids": ["qe-13", "qe-14"]}
5
+ {"query": "music", "relevant_doc_ids": ["qe-17", "qe-18"]}