retrieval-observatory 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. retrieval_observatory-0.1.2/LICENSE +21 -0
  2. retrieval_observatory-0.1.2/MANIFEST.in +6 -0
  3. retrieval_observatory-0.1.2/PKG-INFO +655 -0
  4. retrieval_observatory-0.1.2/README.md +583 -0
  5. retrieval_observatory-0.1.2/pyproject.toml +77 -0
  6. retrieval_observatory-0.1.2/retrieval_observatory/__init__.py +4 -0
  7. retrieval_observatory-0.1.2/retrieval_observatory/adapters/__init__.py +0 -0
  8. retrieval_observatory-0.1.2/retrieval_observatory/adapters/bm25_adapter.py +105 -0
  9. retrieval_observatory-0.1.2/retrieval_observatory/adapters/cohere_adapter.py +62 -0
  10. retrieval_observatory-0.1.2/retrieval_observatory/adapters/hf_adapter.py +75 -0
  11. retrieval_observatory-0.1.2/retrieval_observatory/adapters/hf_biencoder_adapter.py +140 -0
  12. retrieval_observatory-0.1.2/retrieval_observatory/adapters/http_adapter.py +118 -0
  13. retrieval_observatory-0.1.2/retrieval_observatory/adapters/langchain_adapter.py +47 -0
  14. retrieval_observatory-0.1.2/retrieval_observatory/adapters/llamaindex_adapter.py +44 -0
  15. retrieval_observatory-0.1.2/retrieval_observatory/adapters/pgvector_adapter.py +75 -0
  16. retrieval_observatory-0.1.2/retrieval_observatory/adapters/rrf_adapter.py +93 -0
  17. retrieval_observatory-0.1.2/retrieval_observatory/classifier/__init__.py +1 -0
  18. retrieval_observatory-0.1.2/retrieval_observatory/classifier/data.py +103 -0
  19. retrieval_observatory-0.1.2/retrieval_observatory/classifier/features.py +105 -0
  20. retrieval_observatory-0.1.2/retrieval_observatory/classifier/labels.py +40 -0
  21. retrieval_observatory-0.1.2/retrieval_observatory/classifier/model.py +283 -0
  22. retrieval_observatory-0.1.2/retrieval_observatory/cli.py +1165 -0
  23. retrieval_observatory-0.1.2/retrieval_observatory/config/__init__.py +0 -0
  24. retrieval_observatory-0.1.2/retrieval_observatory/config/cost.py +39 -0
  25. retrieval_observatory-0.1.2/retrieval_observatory/config/schema.py +166 -0
  26. retrieval_observatory-0.1.2/retrieval_observatory/config/validator.py +50 -0
  27. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/__init__.py +0 -0
  28. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/api.py +994 -0
  29. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/registry.py +93 -0
  30. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/dist/assets/index-BRJkz-jv.js +114 -0
  31. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/dist/assets/index-Dy8dsKST.css +1 -0
  32. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/dist/index.html +13 -0
  33. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/index.html +12 -0
  34. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/package-lock.json +3042 -0
  35. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/package.json +26 -0
  36. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/postcss.config.js +6 -0
  37. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/App.tsx +141 -0
  38. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/api.ts +270 -0
  39. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/ChartFrame.tsx +18 -0
  40. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/ChartModal.tsx +48 -0
  41. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/ChartZoomControls.tsx +92 -0
  42. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/ChartZoomSurface.tsx +79 -0
  43. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/ClassifierCalibration.tsx +127 -0
  44. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/ComparePanel.tsx +108 -0
  45. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/DashboardGuide.tsx +49 -0
  46. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/DataQualityWarnings.tsx +31 -0
  47. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/DbTabs.tsx +42 -0
  48. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/ErrorBoundary.tsx +48 -0
  49. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/ExperimentOverview.tsx +98 -0
  50. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/LatencyChart.tsx +256 -0
  51. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/MetricTooltip.tsx +52 -0
  52. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/MetricsTable.tsx +542 -0
  53. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/QueryExplorer.tsx +103 -0
  54. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/RecallCurve.tsx +190 -0
  55. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/RecallFunnel.tsx +293 -0
  56. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/RunDetail.tsx +233 -0
  57. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/RunsSidebar.tsx +70 -0
  58. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/SegmentBreakdown.tsx +251 -0
  59. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/StageCombinationMatrix.tsx +79 -0
  60. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/StagePipelineFlow.tsx +177 -0
  61. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/TradeoffScatter.tsx +423 -0
  62. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/components/VerdictCard.tsx +270 -0
  63. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/hooks/useChartZoom.ts +136 -0
  64. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/index.css +3 -0
  65. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/main.tsx +10 -0
  66. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/utils/chartColors.ts +66 -0
  67. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/utils/format.ts +29 -0
  68. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/utils/formatMetricKey.ts +85 -0
  69. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/utils/metricGlossary.ts +57 -0
  70. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/src/utils/pipelineStages.ts +223 -0
  71. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/tailwind.config.js +6 -0
  72. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/tsconfig.json +20 -0
  73. retrieval_observatory-0.1.2/retrieval_observatory/dashboard/ui/vite.config.ts +18 -0
  74. retrieval_observatory-0.1.2/retrieval_observatory/datasets/__init__.py +0 -0
  75. retrieval_observatory-0.1.2/retrieval_observatory/datasets/beir.py +108 -0
  76. retrieval_observatory-0.1.2/retrieval_observatory/datasets/custom.py +164 -0
  77. retrieval_observatory-0.1.2/retrieval_observatory/datasets/llm_judge.py +225 -0
  78. retrieval_observatory-0.1.2/retrieval_observatory/datasets/timeqa.py +56 -0
  79. retrieval_observatory-0.1.2/retrieval_observatory/datasets/validation.py +205 -0
  80. retrieval_observatory-0.1.2/retrieval_observatory/examples/beir_demo.yaml +31 -0
  81. retrieval_observatory-0.1.2/retrieval_observatory/examples/quickstart_scifact.yaml +34 -0
  82. retrieval_observatory-0.1.2/retrieval_observatory/metrics/__init__.py +0 -0
  83. retrieval_observatory-0.1.2/retrieval_observatory/metrics/comparison.py +59 -0
  84. retrieval_observatory-0.1.2/retrieval_observatory/metrics/diagnostics.py +143 -0
  85. retrieval_observatory-0.1.2/retrieval_observatory/metrics/engine.py +377 -0
  86. retrieval_observatory-0.1.2/retrieval_observatory/metrics/latency.py +27 -0
  87. retrieval_observatory-0.1.2/retrieval_observatory/metrics/pareto.py +141 -0
  88. retrieval_observatory-0.1.2/retrieval_observatory/metrics/ranking.py +116 -0
  89. retrieval_observatory-0.1.2/retrieval_observatory/metrics/recall.py +106 -0
  90. retrieval_observatory-0.1.2/retrieval_observatory/metrics/significance.py +69 -0
  91. retrieval_observatory-0.1.2/retrieval_observatory/pipeline/__init__.py +0 -0
  92. retrieval_observatory-0.1.2/retrieval_observatory/pipeline/factory.py +275 -0
  93. retrieval_observatory-0.1.2/retrieval_observatory/pipeline/multi.py +119 -0
  94. retrieval_observatory-0.1.2/retrieval_observatory/pipeline/single.py +71 -0
  95. retrieval_observatory-0.1.2/retrieval_observatory/runner/__init__.py +0 -0
  96. retrieval_observatory-0.1.2/retrieval_observatory/runner/benchmark.py +148 -0
  97. retrieval_observatory-0.1.2/retrieval_observatory/runner/cache.py +199 -0
  98. retrieval_observatory-0.1.2/retrieval_observatory/runner/manifest.py +52 -0
  99. retrieval_observatory-0.1.2/retrieval_observatory/runner/scheduler.py +20 -0
  100. retrieval_observatory-0.1.2/retrieval_observatory/store/__init__.py +0 -0
  101. retrieval_observatory-0.1.2/retrieval_observatory/store/base.py +86 -0
  102. retrieval_observatory-0.1.2/retrieval_observatory/store/postgres.py +515 -0
  103. retrieval_observatory-0.1.2/retrieval_observatory/store/sqlite.py +506 -0
  104. retrieval_observatory-0.1.2/retrieval_observatory/types.py +74 -0
  105. retrieval_observatory-0.1.2/retrieval_observatory.egg-info/PKG-INFO +655 -0
  106. retrieval_observatory-0.1.2/retrieval_observatory.egg-info/SOURCES.txt +109 -0
  107. retrieval_observatory-0.1.2/retrieval_observatory.egg-info/dependency_links.txt +1 -0
  108. retrieval_observatory-0.1.2/retrieval_observatory.egg-info/entry_points.txt +2 -0
  109. retrieval_observatory-0.1.2/retrieval_observatory.egg-info/requires.txt +63 -0
  110. retrieval_observatory-0.1.2/retrieval_observatory.egg-info/top_level.txt +1 -0
  111. retrieval_observatory-0.1.2/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ameya Kiwalkar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ graft retrieval_observatory
2
+ prune retrieval_observatory/dashboard/ui/node_modules
3
+ global-exclude *.db
4
+ global-exclude __pycache__
5
+ global-exclude .retobs
6
+ recursive-exclude results *
@@ -0,0 +1,655 @@
1
+ Metadata-Version: 2.4
2
+ Name: retrieval-observatory
3
+ Version: 0.1.2
4
+ Summary: Framework-agnostic benchmarking for hybrid RAG retrieval pipelines
5
+ Author-email: Ameya Kiwalkar <akiwalkar@berkeley.edu>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/AmeyaKI/retrieval-observatory
8
+ Project-URL: Documentation, https://github.com/AmeyaKI/retrieval-observatory#readme
9
+ Project-URL: Repository, https://github.com/AmeyaKI/retrieval-observatory
10
+ Project-URL: Issues, https://github.com/AmeyaKI/retrieval-observatory/issues
11
+ Project-URL: Changelog, https://github.com/AmeyaKI/retrieval-observatory/releases
12
+ Keywords: rag,retrieval,benchmark,evaluation,nlp
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: pydantic>=2.0
22
+ Requires-Dist: httpx>=0.27
23
+ Requires-Dist: aiosqlite>=0.20
24
+ Requires-Dist: rich>=13.0
25
+ Requires-Dist: typer>=0.12
26
+ Requires-Dist: pyyaml>=6.0
27
+ Requires-Dist: numpy>=1.26
28
+ Provides-Extra: beir
29
+ Requires-Dist: datasets>=2.0; extra == "beir"
30
+ Requires-Dist: beir>=2.0; extra == "beir"
31
+ Provides-Extra: cohere
32
+ Requires-Dist: cohere>=5.0; extra == "cohere"
33
+ Provides-Extra: hf
34
+ Requires-Dist: sentence-transformers>=3.0; extra == "hf"
35
+ Requires-Dist: torch; extra == "hf"
36
+ Provides-Extra: dense
37
+ Requires-Dist: sentence-transformers>=3.0; extra == "dense"
38
+ Requires-Dist: faiss-cpu>=1.7; extra == "dense"
39
+ Requires-Dist: torch; extra == "dense"
40
+ Provides-Extra: langchain
41
+ Requires-Dist: langchain-core>=0.2; extra == "langchain"
42
+ Provides-Extra: llamaindex
43
+ Requires-Dist: llama-index-core>=0.10; extra == "llamaindex"
44
+ Provides-Extra: pgvector
45
+ Requires-Dist: asyncpg>=0.29; extra == "pgvector"
46
+ Requires-Dist: pgvector>=0.3; extra == "pgvector"
47
+ Provides-Extra: postgres
48
+ Requires-Dist: asyncpg>=0.29; extra == "postgres"
49
+ Provides-Extra: dashboard
50
+ Requires-Dist: fastapi>=0.111; extra == "dashboard"
51
+ Requires-Dist: uvicorn>=0.29; extra == "dashboard"
52
+ Requires-Dist: python-multipart>=0.0.9; extra == "dashboard"
53
+ Provides-Extra: llm-judge
54
+ Requires-Dist: google-generativeai>=0.8; extra == "llm-judge"
55
+ Requires-Dist: anthropic>=0.28; extra == "llm-judge"
56
+ Requires-Dist: openai>=1.0; extra == "llm-judge"
57
+ Provides-Extra: demo
58
+ Requires-Dist: beir>=2.0; extra == "demo"
59
+ Requires-Dist: datasets>=2.0; extra == "demo"
60
+ Requires-Dist: rank-bm25>=0.2.2; extra == "demo"
61
+ Provides-Extra: classifier
62
+ Requires-Dist: scikit-learn>=1.4; extra == "classifier"
63
+ Requires-Dist: joblib>=1.3; extra == "classifier"
64
+ Provides-Extra: dev
65
+ Requires-Dist: pytest>=8.0; extra == "dev"
66
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
67
+ Requires-Dist: coverage; extra == "dev"
68
+ Requires-Dist: respx>=0.21; extra == "dev"
69
+ Requires-Dist: scikit-learn>=1.4; extra == "dev"
70
+ Requires-Dist: joblib>=1.3; extra == "dev"
71
+ Dynamic: license-file
72
+
73
+ # retrieval-observatory (retobs)
74
+
75
+ [![PyPI version](https://badge.fury.io/py/retrieval-observatory.svg)](https://pypi.org/project/retrieval-observatory/)
76
+
77
+ Most RAG evaluation tools score end-to-end answer quality and stop there. They don't tell you **which stage helped**, **what it cost in latency**, or **which queries will fail before you run retrieval**. retobs is an open-source multi-stage retrieval benchmark and local dashboard that measures per-stage contribution, failure diagnosis, latency–quality tradeoffs, and query difficulty — so you can decide whether to add that reranker (or switch to dense) with evidence, not intuition.
78
+
79
+ **Headline result:** On BEIR/FiQA, dense retrieval (`all-MiniLM-L6-v2`) outperforms BM25 by **+132% NDCG@10** (0.369 vs 0.159) at **~130× lower latency** than cross-encoder reranking. On SciFact and FiQA, dense-only is the **sole Pareto-optimal** pipeline. On NFCorpus, dense/rerank/RRF NDCG CIs overlap — no single winner on quality alone.
80
+
81
+ Quality–Latency Tradeoff — NFCorpus Pareto frontier
82
+
83
+ ---
84
+
85
+ ## Install
86
+
87
+ ```bash
88
+ pip install "retrieval-observatory[demo,dashboard,dense]"
89
+ ```
90
+
91
+ For development from source:
92
+
93
+ ```bash
94
+ git clone https://github.com/AmeyaKI/retrieval-observatory.git && cd retrieval-observatory
95
+ python -m venv .venv && source .venv/bin/activate
96
+ pip install -e ".[demo,dashboard,dense]"
97
+ ```
98
+
99
+ ---
100
+
101
+ ## Quickstart (~5 minutes)
102
+
103
+ Run BM25 on 50 SciFact queries, then open the dashboard.
104
+
105
+ **PyPI install** (bundled example config):
106
+
107
+ ```bash
108
+ CFG="$(python -c 'from retrieval_observatory import EXAMPLES_DIR; print(EXAMPLES_DIR / "quickstart_scifact.yaml")')"
109
+ retobs validate --config "$CFG"
110
+ retobs run --config "$CFG"
111
+ retobs serve --db .retobs/quickstart_scifact.db
112
+ ```
113
+
114
+ **From a git clone** (repo `examples/` tree):
115
+
116
+ ```bash
117
+ retobs validate --config examples/quickstart_scifact.yaml
118
+ retobs run --config examples/quickstart_scifact.yaml
119
+ retobs serve --db .retobs/quickstart_scifact.db
120
+ ```
121
+
122
+ Open `http://localhost:8000` — explore metrics, latency, and query-level diagnostics.
123
+
124
+ ### Full examples and BEIR publish configs
125
+
126
+ The PyPI wheel includes quickstart YAMLs only. For the full `examples/` demos (HTTP quickstart, temporal demo, dashboard demo with JSONL data) and multi-dataset BEIR sweeps, clone the repo:
127
+
128
+ ```bash
129
+ git clone https://github.com/AmeyaKI/retrieval-observatory.git
130
+ cd retrieval-observatory
131
+ ./scripts/run_beir_publish.sh full-sweep # uses configs/beir_publish/
132
+ ```
133
+
134
+ ---
135
+
136
+ ## Benchmark Results
137
+
138
+ Cross-dataset summary (full BEIR test splits, 4 independent pipelines). See [results/BENCHMARK_ANALYSIS.md](results/BENCHMARK_ANALYSIS.md) for motivation, Pareto analysis, classifier calibration, and limitations.
139
+
140
+
141
+ | Dataset | bm25 NDCG@10 | dense_only | rrf_hybrid | bm25__rerank | Pareto optimal |
142
+ | --------------- | ------------ | ---------- | ---------- | ------------ | ---------------- |
143
+ | NFCorpus (323q) | 0.264 | **0.310** | 0.304 | 0.310 | bm25, dense_only |
144
+ | SciFact (300q) | 0.544 | **0.640** | 0.623 | 0.628 | dense_only |
145
+ | FiQA (648q) | 0.159 | **0.369** | 0.290 | 0.260 | dense_only |
146
+
147
+
148
+ Four pipelines: `bm25`, `dense_only`, `rrf_hybrid`, `bm25__rerank`. Stage attribution uses the bm25 → bm25__rerank prefix pair only. JSON exports and regeneration: [results/RESULTS_OVERVIEW.md](results/RESULTS_OVERVIEW.md).
149
+
150
+ ---
151
+
152
+ ## What retobs tells you
153
+
154
+ ```
155
+ Stage Contribution: bm25 → bm25__rerank
156
+ ┌───────────────┬──────────┬──────────┬────────────────┬────────────────┐
157
+ │ Metric        │ Before   │ After    │ Δ              │ Significant?   │
158
+ ├───────────────┼──────────┼──────────┼────────────────┼────────────────┤
159
+ │ recall@10     │ 0.1190   │ 0.1380   │ +0.0190 (+16%) │ q=0.041 ✓      │
160
+ │ ndcg@10       │ 0.2640   │ 0.3100   │ +0.0460 (+17%) │ q=0.012 ✓      │
161
+ │ Latency P50   │ 2ms      │ 4,057ms  │ +4,055ms       │ —              │
162
+ └───────────────┴──────────┴──────────┴────────────────┴────────────────┘
163
+ ```
164
+
165
+ 1. **Stage attribution** — What did each stage add in quality, cost, and latency? Significance is tested on paired queries with Benjamini–Hochberg (BH) correction for multiple comparisons.
166
+ 2. **Failure diagnosis** — Candidate misses, lexical mismatches, reranker drops — labeled per query.
167
+ 3. **Latency–quality tradeoff** — Pareto frontier and budget slider; see whether reranking is worth it at your latency budget.
168
+
169
+ Core promise:
170
+
171
+ - Comparable **Recall@K, NDCG@K, MRR, MAP, latency percentiles, and estimated cost per 1k queries** across pipelines.
172
+ - Multi-stage pipelines with independent stage analysis and temporal recall for time-sensitive datasets.
173
+
174
+ ---
175
+
176
+ ## How It's Different
177
+
178
+
179
+ | Tool | What it measures |
180
+ | --------------- | ---------------------------------------------------------------------------------- |
181
+ | BEIR | End-to-end pipeline accuracy on fixed datasets |
182
+ | RAGAs / TruLens | Answer quality given retrieved context |
183
+ | **retobs** | **Per-stage contribution: what did each stage add in quality, cost, and latency?** |
184
+
185
+
186
+ retobs is not a leaderboard and not an answer evaluator. It's a diagnostic layer between "I have a retrieval pipeline" and "I understand how to improve it."
187
+
188
+ ---
189
+
190
+ ## Install (development)
191
+
192
+ ```bash
193
+ python -m venv .venv
194
+ source .venv/bin/activate
195
+
196
+ # Full local development setup
197
+ pip install -e ".[demo,dashboard,dense,dev,llm-judge]"
198
+ ```
199
+
200
+ For a smaller install:
201
+
202
+ ```bash
203
+ pip install -e ".[demo,dashboard]"
204
+ ```
205
+
206
+ ---
207
+
208
+ ## Stage Attribution in 60 Seconds
209
+
210
+ Add `ablations: true` to your combinations config and retobs automatically runs the prefix pipeline too:
211
+
212
+ ```yaml
213
+ stages:
214
+   bm25:
215
+     type: adapter.bm25
216
+     config: {k: 100}
217
+   rerank:
218
+     type: adapter.hf_crossencoder
219
+     config:
220
+       model: cross-encoder/ms-marco-MiniLM-L-6-v2
221
+       k: 10
222
+
223
+ combinations:
224
+   include:
225
+     - [bm25, rerank]
226
+   ablations: true  # automatically also runs [bm25] alone — no extra config needed
227
+ ```
228
+
229
+ `retobs run` then prints the stage contribution table showing exactly what the reranker added.
230
+
231
+ For a 3-stage pipeline, `ablations: true` generates **all valid ordered subsequences** — not just prefixes:
232
+
233
+ ```yaml
234
+ combinations:
235
+   include:
236
+     - [bm25, fast_rerank, precise_rerank]
237
+   ablations: true
238
+ # Generates: bm25 | bm25__fast_rerank | bm25__precise_rerank | bm25__fast_rerank__precise_rerank
239
+ # Answers: does skipping fast_rerank and going direct to precise_rerank beat the cascade?
240
+ ```
241
+
242
+ To test only whether a specific stage pays for itself, name it explicitly:
243
+
244
+ ```yaml
245
+ combinations:
246
+   include:
247
+     - [bm25, fast_rerank, precise_rerank]
248
+   ablations: [fast_rerank]  # generates only: without fast_rerank vs with fast_rerank
249
+ ```
250
+
251
+ Optionally set a latency budget to get a one-line verdict in CI:
252
+
253
+ ```bash
254
+ retobs run --config my_experiment.yaml --latency-budget-ms 1000
255
+ ```
256
+
257
+ ---
258
+
259
+ ## Query Difficulty Classifier
260
+
261
+ Predict whether a query will be hard for retrieval **before** running your pipeline, using only query text. Labels come from post-hoc diagnostics (mean Recall across pipelines on a specific corpus), so models are **dataset-specific**.
262
+
263
+ ```bash
264
+ # Install classifier dependencies
265
+ pip install -e ".[classifier]"
266
+
267
+ # After one or more benchmark runs on the same dataset:
268
+ retobs classifier train --dataset beir/nfcorpus
269
+
270
+ # Inspect cross-val accuracy, Brier score, and feature importances:
271
+ retobs classifier report --dataset beir/nfcorpus
272
+
273
+ # Score a single query:
274
+ retobs classifier predict --model .retobs/models/query_difficulty_beir_nfcorpus.joblib \
275
+ --query "What mitochondrial mechanisms were studied since 2019?"
276
+
277
+ # Next benchmark run auto-applies predictions when a matching model exists
278
+ retobs run --config my_experiment.yaml
279
+ ```
280
+
281
+ The dashboard shows **Classifier Calibration**: mean Recall@10 (with bootstrap CIs) grouped by predicted difficulty. If predicted-hard queries have lower Recall@10 than predicted-easy ones, the classifier is doing useful work.
282
+
283
+ **Caveat:** The classifier predicts difficulty as observed under *your* pipelines on *your* corpus—not intrinsic question hardness. Train and evaluate on the same dataset; cross-dataset use is unsupported.
284
+
285
+ ---
286
+
287
+ ## HTTP Quickstart
288
+
289
+ If your retrieval service is already running, point retobs at it and get metrics immediately:
290
+
291
+ ```bash
292
+ # Start the mock server
293
+ pip install fastapi uvicorn rank-bm25
294
+ uvicorn examples.http_quickstart.server:app --port 8000
295
+
296
+ # Benchmark it
297
+ retobs run --config examples/http_quickstart/config.yaml
298
+ ```
299
+
300
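+ The HTTP adapter POSTs `{"query": str, "k": int}` and expects a JSON response listing documents with `id`, `text`, and `score` fields; see "HTTP adapter schema" below for the accepted response shapes and field remapping.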
301
+
302
+ ---
303
+
304
+ ## Quick Test Of The Observatory
305
+
306
+ ```bash
307
+ # 1. Install/update editable package
308
+ source .venv/bin/activate
309
+ pip install -e ".[demo,dashboard,dense,dev,llm-judge]"
310
+
311
+ # 2. Confirm CLI commands are registered
312
+ retobs --help
313
+
314
+ # 3. Generate a starter experiment config
315
+ retobs init --mode bm25+reranker --output my_experiment.yaml
316
+
317
+ # 4. Validate before running
318
+ retobs validate --config my_experiment.yaml
319
+
320
+ # 5. Run the benchmark (stage attribution table printed automatically)
321
+ retobs run --config my_experiment.yaml --no-cache
322
+
323
+ # 6. Open the interactive dashboard
324
+ retobs serve --db .retobs/results.db --port 8000
325
+ ```
326
+
327
+ Open `http://localhost:8000` — move the latency budget slider and watch the stage verdict update live.
328
+
329
+ Load multiple result databases in one dashboard (sidebar tabs per DB):
330
+
331
+ ```bash
332
+ retobs serve --db .retobs/publish_smoke_scifact.db --db .retobs/dashboard_demo.db
333
+ # or comma-separated:
334
+ retobs serve --db .retobs/a.db,.retobs/b.db
335
+ # or env var (colon-separated):
336
+ RETOBS_DASHBOARD_DBS=.retobs/a.db:.retobs/b.db retobs serve
337
+ ```
338
+
339
+ ---
340
+
341
+ ## YAML Stage Combinations
342
+
343
+ You can define stages once and ask `retobs` to expand the exact combinations you want to benchmark.
344
+
345
+ ```yaml
346
+ experiment:
347
+   name: my-rag-sweep
348
+
349
+ dataset:
350
+   type: custom
351
+   name: custom
352
+   queries_path: data/queries.jsonl
353
+   corpus_path: data/corpus.jsonl
354
+   timestamp_field: timestamp
355
+   metadata_fields: [source]
356
+
357
+ stages:
358
+   bm25:
359
+     type: adapter.bm25
360
+     config: {k: 100}
361
+
362
+   dense:
363
+     type: adapter.hf_biencoder
364
+     config:
365
+       model: sentence-transformers/all-MiniLM-L6-v2
366
+       k: 100
367
+
368
+   rerank:
369
+     type: adapter.hf_crossencoder
370
+     config:
371
+       model: cross-encoder/ms-marco-MiniLM-L-6-v2
372
+       k: 10
373
+
374
+ combinations:
375
+   include:
376
+     - [bm25, rerank]
377
+     - [dense, rerank]
378
+   ablations: true  # auto-generates [bm25] and [dense] prefix pipelines
379
+
380
+ metrics:
381
+   recall_at_k: [1, 5, 10, 20]
382
+   precision_at_k: [5, 10]
383
+   ndcg_at_k: [10]
384
+   mrr: true
385
+   map: true
386
+
387
+ execution:
388
+   concurrency: 4
389
+   timeout_seconds: 60
390
+   cache_results: true
391
+
392
+ output:
393
+   store: sqlite
394
+   db_path: .retobs/results.db
395
+ ```
396
+
397
+ Expanded pipeline IDs are stable, for example `bm25`, `dense`, `bm25__rerank`, and `dense__rerank`.
398
+
399
+ Cost is configured for relative tradeoff analysis:
400
+
401
+ ```yaml
402
+ costs:
403
+   bm25:
404
+     per_1k_queries: 0.10
405
+   rerank:
406
+     per_1k_queries: 1.50
407
+ ```
408
+
409
+ `retobs run` and the dashboard both treat this as an **estimated** cost model from your YAML, not measured cloud billing telemetry.
410
+
411
+ > **Stage cache note:** When `execution.cache_results: true`, retrieval stages are cached by
412
+ > `hash(stage_config + upstream_candidates + query_id)`. The upstream candidate fingerprint ensures
413
+ > that two pipelines sharing the same reranker but with different first-stage retrievers (e.g.
414
+ > `bm25→rerank` vs `dense→rerank`) never share reranker snapshots. Stage 0 (first retriever) still
415
+ > shares cache entries across ablation combos as intended. Use `--no-cache` when you want
416
+ > fully independent execution for reproducibility auditing.
417
+
418
+ ### HTTP adapter schema
419
+
420
+ The `adapter.http` stage wraps any REST endpoint. Your server must accept:
421
+
422
+ **Request** — `POST` with JSON body:
423
+
424
+ ```json
425
+ {"query": "user question text", "k": 100}
426
+ ```
427
+
428
+ When query filters are set, a `filters` object is also included.
429
+
430
+ **Response** — JSON in either shape:
431
+
432
+ ```json
433
+ {"documents": [{"id": "doc_1", "text": "...", "score": 0.92}]}
434
+ ```
435
+
436
+ ```json
437
+ [{"id": "doc_1", "text": "...", "score": 0.92}]
438
+ ```
439
+
440
+ Each document object must include the configured ID field (default `id`). Text and score fields default to `text` and `score` but can be remapped:
441
+
442
+ ```yaml
443
+ - type: adapter.http
444
+   url: http://localhost:8080/retrieve
445
+   config:
446
+     k: 100
447
+     id_field: doc_id
448
+     text_field: content
449
+     score_field: relevance
450
+ ```
451
+
452
+ See [`examples/http_quickstart/server.py`](examples/http_quickstart/server.py) for a reference implementation.
453
+
454
+ ### Custom Python retriever via `adapter.import`
455
+
456
+ Use `adapter.import` to load a Python factory callable from your own module without editing retobs internals:
457
+
458
+ ```yaml
459
+ - type: adapter.import
460
+   retriever_id: keyword
461
+   config:
462
+     factory: retriever:build_retriever
463
+     k: 10
464
+ ```
465
+
466
+ Supported factory paths:
467
+
468
+ - `package.module:callable`
469
+ - `package.module.callable`
470
+
471
+ Factory signature:
472
+
473
+ ```python
474
+ def build_retriever(corpus: dict | None, stage_cfg: dict, **kwargs):
475
+     ...
476
+     return retriever_or_reranker, k
477
+ ```
478
+
479
+ Runnable example: [`examples/custom_retriever/`](examples/custom_retriever/)
480
+
481
+ ---
482
+
483
+ ## Custom Dataset Format
484
+
485
+ ### `queries.jsonl`
486
+
487
+ ```json
488
+ {"query_id":"q1","text":"What changed in the refund policy?","relevant_doc_ids":{"doc_17":2,"doc_22":1},"temporal_anchor":"2024-01-15T00:00:00"}
489
+ ```
490
+
491
+ `relevant_doc_ids` can be a list for binary labels or a dict for graded relevance.
492
+
493
+ ### `corpus.jsonl`
494
+
495
+ ```json
496
+ {"id":"doc_17","title":"Refund policy update","text":"Refunds are now processed within 7 days.","timestamp":"2024-01-10T00:00:00"}
497
+ ```
498
+
499
+ ### Optional `qrels.jsonl`
500
+
501
+ ```json
502
+ {"query_id":"q1","doc_id":"doc_17","grade":2}
503
+ ```
504
+
505
+ `qrels.tsv` in TREC-style format is also supported.
506
+
507
+ ---
508
+
509
+ ## LLM-Assisted Labels
510
+
511
+ Gold labels are the default and remain the recommended evaluation source.
512
+
513
+ For unlabeled datasets, you can opt into LLM-assisted labels:
514
+
515
+ ```yaml
516
+ labels:
517
+   mode: pooled_llm_judge  # gold, llm_judge, or pooled_llm_judge
518
+   judge: gemini  # gemini, openai, or anthropic
519
+   model: gemini-2.0-flash
520
+   cache_path: .retobs/llm_judge_cache.db
521
+ ```
522
+
523
+ ---
524
+
525
+ ## Dashboard Features
526
+
527
+
528
+ | Feature | Description |
529
+ | ------------------------ | ----------------------------------------------------------------------------------------- |
530
+ | Stage Attribution | Before/after metric table for each pipeline pair with BH-corrected significance. |
531
+ | Tradeoff Explorer | Latency budget + min quality delta sliders; verdict computed client-side. |
532
+ | Experiment Overview | Headline winner, difficulty buckets, failure-label summary, reproducibility warnings. |
533
+ | Pipeline Architecture | Stage-by-stage flow diagram with per-stage quality and latency. |
534
+ | Stage Combination Matrix | Compact view of quality, latency, and optional cost-per-1k by pipeline/stage. |
535
+ | Query Explorer | Query-level diagnostics with failure labels, missing relevant IDs, and difficulty bucket. |
536
+ | Run Comparison | Side-by-side metrics with query-ID-aligned paired bootstrap p-values. |
537
+ | Recall@K Curves | Recall trends across K with BEIR reference lines when available. |
538
+ | Stage Recall Funnel | Shows how much candidate recall survives through reranking stages. |
539
+ | Latency Breakdown | P50/P95/P99 plus profiling metrics for compute, network, and retries. |
540
+ | Segment Analysis | NDCG@10 by query metadata such as number of relevant docs. |
541
+
542
+
543
+ ---
544
+
545
+ ## Example Runs
546
+
547
+ ### BEIR BM25 Baseline
548
+
549
+ ```bash
550
+ retobs validate --config examples/beir_demo.yaml
551
+ retobs run --config examples/beir_demo.yaml
552
+ retobs serve --db .retobs/beir_demo.db
553
+ ```
554
+
555
+ ### Three-Way nfcorpus Comparison
556
+
557
+ ```bash
558
+ pip install -e ".[demo,dashboard,dense]"
559
+ retobs validate --config examples/nfcorpus_three_way.yaml
560
+ retobs run --config examples/nfcorpus_three_way.yaml --no-cache
561
+ retobs serve --db .retobs/nfcorpus_three_way.db
562
+ ```
563
+
564
+ ### Temporal Recall Demo
565
+
566
+ ```bash
567
+ pip install -e ".[demo,dashboard]"
568
+ python examples/temporal_demo/generate_data.py
569
+ retobs run --config examples/temporal_demo/config.yaml --no-cache
570
+ retobs serve --db .retobs/temporal_demo.db
571
+ ```
572
+
573
+ This demo intentionally includes old and new relevant documents per query so `recall@1` and `temporal_recall@1` diverge when top-ranked hits are stale.
574
+
575
+ ### RRF Hybrid (BM25 + Dense)
576
+
577
+ ```bash
578
+ pip install -e ".[demo,dashboard,dense]"
579
+ retobs run --config examples/rrf_hybrid.yaml
580
+ ```
581
+
582
+ ### Dense vs BM25+Cohere Hybrid
583
+
584
+ ```bash
585
+ pip install -e ".[demo,dashboard,dense,cohere]"
586
+ export COHERE_API_KEY=your-key-here
587
+ retobs run --config examples/hybrid_comparison.yaml
588
+ ```
589
+
590
+ ---
591
+
592
+ ## CLI Reference
593
+
594
+ ```bash
595
+ retobs init --mode MODE --output PATH Generate starter config and sample data
596
+ retobs validate --config PATH [--db PATH] Validate config and dataset before running
597
+ retobs run --config PATH [--no-cache] Run a benchmark experiment
598
+            [--latency-budget-ms N] Print verdict against stage latency delta
599
+ retobs serve --db PATH [--db PATH ...] [--port N] Start dashboard (repeat --db for multiple SQLite files)
600
+ retobs compare RUN_ID_1 RUN_ID_2 --db PATH Compare runs with paired bootstrap tests
601
+ retobs inspect RUN_ID --query QUERY_ID [--pipeline ID] Debug per-query retrieval results
602
+ ```
603
+
604
+ Init modes: `beir`, `custom-jsonl`, `http-endpoint`, `bm25+dense` (includes RRF), `bm25+reranker` (includes ablations).
605
+
606
+ ---
607
+
608
+ ## Run The Test Suite
609
+
610
+ ```bash
611
+ source .venv/bin/activate
612
+ pip install -e ".[demo,dashboard,dense,dev,llm-judge]"
613
+ pytest tests/ -q
614
+ npm --prefix retrieval_observatory/dashboard/ui run build
615
+ python -m compileall retrieval_observatory -q
616
+ ```
617
+
618
+ ---
619
+
620
+ ## Dashboard Development
621
+
622
+ The dashboard UI is **pre-built in the PyPI wheel**, so `retobs serve` works after `pip install` with no Node.js required. When developing from a git clone and editing React sources, rebuild the UI:
623
+
624
+ ```bash
625
+ cd retrieval_observatory/dashboard/ui
626
+ npm install
627
+ npm run dev # hot-reloading dev server on :5173 (proxies API to retobs serve)
628
+ npm run build # rebuild dist/ before python -m build or tagging a release
629
+ ```
630
+
631
+ Or use `make dashboard-dev` / `make dashboard-build` from the repo root.
632
+
633
+ ---
634
+
635
+ ## Optional Dependency Groups
636
+
637
+
638
+ | Group | Installs | Use for |
639
+ | ------------ | --------------------------------------- | ------------------------------------------------------------ |
640
+ | `demo` | beir, datasets, rank-bm25 | Running BEIR datasets with BM25 |
641
+ | `dashboard` | fastapi, uvicorn, python-multipart | Serving the dashboard and accepting uploads |
642
+ | `dense` | sentence-transformers, faiss-cpu, torch | Dense bi-encoder retrieval and local cross-encoder reranking |
643
+ | `dev` | pytest, pytest-asyncio, coverage, respx | Running tests |
644
+ | `cohere` | cohere | Cohere reranking |
645
+ | `langchain` | langchain-core | LangChain adapter (programmatic use) |
646
+ | `llamaindex` | llama-index-core | LlamaIndex adapter (programmatic use) |
647
+ | `pgvector` | asyncpg, pgvector | Pgvector adapter |
648
+ | `llm-judge` | google-generativeai, anthropic, openai | LLM-assisted relevance judging |
649
+
650
+
651
+ The PostgreSQL backend (`asyncpg`) is community-supported and not CI-tested. SQLite is recommended for evaluation workloads.
652
+
653
+ ```bash
654
+ pip install -e ".[demo,dashboard,dense,dev,llm-judge]"
655
+ ```