ragfallback 2.0.2__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragfallback-2.0.2/ragfallback.egg-info → ragfallback-2.1.0}/PKG-INFO +109 -6
- {ragfallback-2.0.2 → ragfallback-2.1.0}/README.md +101 -4
- ragfallback-2.1.0/examples/build_golden_dataset.py +230 -0
- ragfallback-2.1.0/examples/ci_regression_gate.py +185 -0
- ragfallback-2.1.0/examples/mlops_demo.py +350 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/pyproject.toml +9 -2
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/__init__.py +1 -1
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/evaluation/rag_evaluator.py +4 -3
- ragfallback-2.1.0/ragfallback/mlops/__init__.py +25 -0
- ragfallback-2.1.0/ragfallback/mlops/baseline_registry.py +146 -0
- ragfallback-2.1.0/ragfallback/mlops/golden_runner.py +189 -0
- ragfallback-2.1.0/ragfallback/mlops/locust_template.py +74 -0
- ragfallback-2.1.0/ragfallback/mlops/mlflow_logger.py +68 -0
- ragfallback-2.1.0/ragfallback/mlops/query_simulator.py +144 -0
- ragfallback-2.1.0/ragfallback/mlops/ragas_hook.py +208 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0/ragfallback.egg-info}/PKG-INFO +109 -6
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback.egg-info/SOURCES.txt +10 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback.egg-info/requires.txt +7 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/INSTALL_AND_RUN.md +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/LICENSE +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/MANIFEST.in +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/_kb_common.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/chroma_real_kb_demo.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/financial_risk_analysis.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/legal_document_analysis.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/medical_research_synthesis.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/production_reliability_example.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/qdrant_local_demo.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/real_data_demo.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc10_metadata_sanitizer.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc1_retrieval_health.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc2_embedding_guard.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc3_chunk_quality.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc4_context_window.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc5_hybrid_failover.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc6_adaptive_rag.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc6_multi_hop_demo.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc7_rag_evaluator.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc8_context_stitcher.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc9_embedding_probe.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/pytest.ini +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/core/__init__.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/core/adaptive_retriever.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/__init__.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/chunking.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/context_stitcher.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/context_window.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/embedding_guard.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/embedding_probe.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/embedding_validator.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/retrieval_health.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/schema_sanitizer.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/stale_index.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/evaluation/__init__.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/exceptions.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/py.typed +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/retrieval/__init__.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/retrieval/failover.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/retrieval/rerank_guard.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/retrieval/smart_hybrid.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/retrieval/wrappers.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/strategies/__init__.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/strategies/base.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/strategies/multi_hop.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/strategies/query_variations.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/tracking/__init__.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/tracking/cost_tracker.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/tracking/metrics.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/__init__.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/confidence_scorer.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/embedding_factory.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/env.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/llm_factory.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/vector_store_factory.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback.egg-info/dependency_links.txt +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback.egg-info/top_level.txt +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/requirements-dev.txt +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/setup.cfg +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/setup.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/__init__.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/conftest.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/integration/__init__.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/integration/test_adaptive_workflow.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/integration/test_chroma_pipeline.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/__init__.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_adaptive_multi_hop_bridge.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_confidence_scorer.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_cost_tracker.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_diagnostics.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_hybrid_retrieval.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_metrics.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_multi_hop.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_query_variations.py +0 -0
- {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_retrieval.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragfallback
|
|
3
|
-
Version: 2.0.2
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: Prevents silent RAG failures — chunk quality, retrieval fallback, adaptive querying, and answer evaluation in one library.
|
|
5
5
|
Home-page: https://github.com/irfanalidv/ragfallback
|
|
6
6
|
Author: Irfan Ali
|
|
@@ -11,7 +11,7 @@ Project-URL: Documentation, https://github.com/irfanalidv/ragfallback#readme
|
|
|
11
11
|
Project-URL: Repository, https://github.com/irfanalidv/ragfallback
|
|
12
12
|
Project-URL: Issues, https://github.com/irfanalidv/ragfallback/issues
|
|
13
13
|
Keywords: rag,retrieval,llm,fallback,query-variations,langchain,bm25,hybrid-search
|
|
14
|
-
Classifier: Development Status ::
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
15
|
Classifier: Intended Audience :: Developers
|
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.8
|
|
@@ -91,6 +91,12 @@ Requires-Dist: qdrant-client>=1.7.0; extra == "all"
|
|
|
91
91
|
Requires-Dist: weaviate-client>=3.25.0; extra == "all"
|
|
92
92
|
Requires-Dist: rank_bm25>=0.2.2; extra == "all"
|
|
93
93
|
Requires-Dist: cohere>=4.0.0; extra == "all"
|
|
94
|
+
Provides-Extra: mlops
|
|
95
|
+
Requires-Dist: ragas>=0.2.0; extra == "mlops"
|
|
96
|
+
Requires-Dist: mlflow>=2.10.0; extra == "mlops"
|
|
97
|
+
Requires-Dist: locust>=2.20.0; extra == "mlops"
|
|
98
|
+
Requires-Dist: aiohttp>=3.9.0; extra == "mlops"
|
|
99
|
+
Requires-Dist: numpy>=1.24.0; extra == "mlops"
|
|
94
100
|
Dynamic: author
|
|
95
101
|
Dynamic: home-page
|
|
96
102
|
Dynamic: license-file
|
|
@@ -99,15 +105,15 @@ Dynamic: requires-python
|
|
|
99
105
|
# ragfallback
|
|
100
106
|
|
|
101
107
|
[](https://github.com/irfanalidv/ragfallback/blob/main/LICENSE)
|
|
102
|
-
[](https://pypi.org/project/ragfallback/)
|
|
108
|
+
[](https://pypi.org/project/ragfallback/)
|
|
103
109
|
[](https://pypi.org/project/ragfallback/)
|
|
104
110
|
[](https://pepy.tech/project/ragfallback)
|
|
105
111
|
[](https://github.com/irfanalidv/ragfallback/actions/workflows/test.yml)
|
|
112
|
+
[](https://colab.research.google.com/github/irfanalidv/ragfallback/blob/main/ragfallback_colab.ipynb)
|
|
113
|
+
[](https://github.com/irfanalidv/ragfallback/tree/main/ragfallback/mlops)
|
|
106
114
|
|
|
107
115
|
**ragfallback** prevents silent RAG failures across the full pipeline — from bad chunks at ingest, through retrieval outages at runtime, to invisible answer quality degradation in production.
|
|
108
116
|
|
|
109
|
-

|
|
110
|
-
|
|
111
117
|
---
|
|
112
118
|
|
|
113
119
|
## What it prevents
|
|
@@ -123,7 +129,8 @@ Dynamic: requires-python
|
|
|
123
129
|
| 7 | Multi-step questions always fail single-shot RAG | `MultiHopFallbackStrategy` | `uc6_multi_hop_demo.py` |
|
|
124
130
|
| 8 | Index serves stale data after document updates | `StaleIndexDetector` | — |
|
|
125
131
|
| 9 | Answer quality invisible in production | `RAGEvaluator` | `uc7_rag_evaluator.py` |
|
|
126
|
-
| 10 | Cross-boundary answers lost between adjacent chunks | `OverlappingContextStitcher` | `uc8_context_stitcher.py`
|
|
132
|
+
| 10 | Cross-boundary answers lost between adjacent chunks | `OverlappingContextStitcher` | `uc8_context_stitcher.py` |
|
|
133
|
+
| 11 | Metric regression after model/embedder/chunker change | `GoldenRunner` + `BaselineRegistry` | `examples/ci_regression_gate.py` |
|
|
127
134
|
|
|
128
135
|
---
|
|
129
136
|
|
|
@@ -446,6 +453,9 @@ print(ev.batch_summary([score]))
|
|
|
446
453
|
| Financial news RAG | nickmuchi/financial-classification (Apache 2.0) | `python examples/financial_risk_analysis.py` |
|
|
447
454
|
| Legal contract RAG | theatticusproject/cuad-qa (CC BY 4.0) | `python examples/legal_document_analysis.py` |
|
|
448
455
|
| Medical abstract RAG | qiaojin/PubMedQA (MIT) | `python examples/medical_research_synthesis.py` |
|
|
456
|
+
| MLOps: build golden dataset | SQuAD (CC BY-SA 4.0) + SciQ (CC BY-NC 3.0) | `python examples/build_golden_dataset.py` |
|
|
457
|
+
| MLOps: full demo | SQuAD golden set, zero API keys | `python examples/mlops_demo.py` |
|
|
458
|
+
| MLOps: CI regression gate | SQuAD golden set, committed baseline | `python examples/ci_regression_gate.py` |
|
|
449
459
|
|
|
450
460
|
---
|
|
451
461
|
|
|
@@ -483,6 +493,7 @@ pip install ragfallback[chroma,huggingface] # golden path (no API keys)
|
|
|
483
493
|
pip install ragfallback[faiss,huggingface] # FAISS instead of Chroma
|
|
484
494
|
pip install ragfallback[hybrid] # adds BM25 (rank_bm25)
|
|
485
495
|
pip install ragfallback[real-data] # real dataset examples (HuggingFace datasets)
|
|
496
|
+
pip install ragfallback[mlops] # MLOps eval layer (RAGAS + MLflow + Locust)
|
|
486
497
|
```
|
|
487
498
|
|
|
488
499
|
| Extra | Installs |
|
|
@@ -493,6 +504,7 @@ pip install ragfallback[real-data] # real dataset examples (Hu
|
|
|
493
504
|
| `hybrid` | rank_bm25, langchain-community |
|
|
494
505
|
| `real-data` | datasets |
|
|
495
506
|
| `openai` | langchain-openai, openai |
|
|
507
|
+
| `mlops` | ragas, mlflow, locust, aiohttp, numpy |
|
|
496
508
|
|
|
497
509
|
---
|
|
498
510
|
|
|
@@ -509,6 +521,97 @@ from ragfallback.diagnostics import (
|
|
|
509
521
|
from ragfallback.retrieval import SmartThresholdHybridRetriever, FailoverRetriever
|
|
510
522
|
from ragfallback.strategies import QueryVariationsStrategy, MultiHopFallbackStrategy
|
|
511
523
|
from ragfallback.evaluation import RAGEvaluator
|
|
524
|
+
from ragfallback.mlops import (
|
|
525
|
+
RagasHook, RagasReport,
|
|
526
|
+
BaselineRegistry, RegressionError,
|
|
527
|
+
GoldenRunner, GoldenReport,
|
|
528
|
+
QuerySimulator, SimQuery,
|
|
529
|
+
MLflowLogger,
|
|
530
|
+
generate_locustfile,
|
|
531
|
+
)
|
|
532
|
+
```
|
|
533
|
+
|
|
534
|
+
---
|
|
535
|
+
|
|
536
|
+
## MLOps — Evaluation & Regression Gate
|
|
537
|
+
|
|
538
|
+
ragfallback ships a complete MLOps evaluation layer for RAG pipelines.
|
|
539
|
+
No API keys required — all metrics use local heuristics by default,
|
|
540
|
+
with optional RAGAS + MLflow when installed.
|
|
541
|
+
|
|
542
|
+
### Install
|
|
543
|
+
|
|
544
|
+
```bash
|
|
545
|
+
pip install ragfallback[chroma,huggingface,real-data,mlops]
|
|
546
|
+
```
|
|
547
|
+
|
|
548
|
+
### Full eval loop
|
|
549
|
+
|
|
550
|
+
```python
|
|
551
|
+
import asyncio
|
|
552
|
+
from ragfallback.mlops import GoldenRunner, RagasHook, BaselineRegistry
|
|
553
|
+
|
|
554
|
+
# 1 — Build evaluation hook (heuristic by default; RAGAS when installed)
|
|
555
|
+
hook = RagasHook(llm=None, embeddings=embeddings)
|
|
556
|
+
|
|
557
|
+
# 2 — Run against 75 real SQuAD QA pairs
|
|
558
|
+
runner = GoldenRunner(
|
|
559
|
+
retriever=retriever, # AdaptiveRAGRetriever instance
|
|
560
|
+
ragas_hook=hook,
|
|
561
|
+
dataset="examples/golden_qa.json",
|
|
562
|
+
)
|
|
563
|
+
report = asyncio.run(runner.run_async())
|
|
564
|
+
|
|
565
|
+
print(f"Recall@3 : {report.recall_at_3:.3f}")
|
|
566
|
+
print(f"Faithfulness : {report.ragas.faithfulness:.3f}")
|
|
567
|
+
print(f"Latency P95 : {report.latency_p95_ms:.0f}ms")
|
|
568
|
+
print(f"Fallback rate : {report.fallback_rate:.1%}")
|
|
569
|
+
|
|
570
|
+
# 3 — Regression gate: fails if any metric drops > 5% vs baseline
|
|
571
|
+
registry = BaselineRegistry("baselines.json")
|
|
572
|
+
registry.compare_or_fail(report, dataset="my_dataset") # raises RegressionError if degraded
|
|
573
|
+
registry.update(report, dataset="my_dataset") # save new baseline
|
|
574
|
+
```
|
|
575
|
+
|
|
576
|
+
### Adversarial query simulation
|
|
577
|
+
|
|
578
|
+
```python
|
|
579
|
+
from ragfallback.mlops import QuerySimulator
|
|
580
|
+
|
|
581
|
+
sim = QuerySimulator()
|
|
582
|
+
queries = ["What is the refund policy?", "How do API rate limits work?"]
|
|
583
|
+
|
|
584
|
+
# 4 types: short_keyword, long_nl, ambiguous, out_of_domain
|
|
585
|
+
mixed = sim.simulate(queries)
|
|
586
|
+
|
|
587
|
+
# All 4 types for every query — for stress testing
|
|
588
|
+
unhappy = sim.simulate_unhappy_paths(queries)
|
|
589
|
+
```
|
|
590
|
+
|
|
591
|
+
### Load testing
|
|
592
|
+
|
|
593
|
+
```python
|
|
594
|
+
from ragfallback.mlops import generate_locustfile
|
|
595
|
+
|
|
596
|
+
generate_locustfile("locustfile.py", endpoint="http://localhost:8000")
|
|
597
|
+
# Run: locust -f locustfile.py --host http://localhost:8000 --users 50
|
|
598
|
+
```
|
|
599
|
+
|
|
600
|
+
### CI regression gate (GitHub Actions)
|
|
601
|
+
|
|
602
|
+
The included workflow (`mlops-regression-gate` job in `.github/workflows/test.yml`)
|
|
603
|
+
runs on every push to main:
|
|
604
|
+
|
|
605
|
+
1. Pulls 75 SQuAD samples from HuggingFace (open data, CC BY-SA 4.0)
|
|
606
|
+
2. Indexes them in ChromaDB using `all-MiniLM-L6-v2` (no API key)
|
|
607
|
+
3. Runs `GoldenRunner` async — computes recall@3, recall@5, latency P95
|
|
608
|
+
4. Calls `compare_or_fail()` against `examples/baselines.json` (committed)
|
|
609
|
+
5. Fails the pipeline if any metric regresses more than 5%
|
|
610
|
+
|
|
611
|
+
```bash
|
|
612
|
+
# Run the CI gate locally
|
|
613
|
+
python examples/build_golden_dataset.py # one-time setup
|
|
614
|
+
python examples/ci_regression_gate.py # exits 0 (pass) or 1 (fail)
|
|
512
615
|
```
|
|
513
616
|
|
|
514
617
|
---
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
# ragfallback
|
|
2
2
|
|
|
3
3
|
[](https://github.com/irfanalidv/ragfallback/blob/main/LICENSE)
|
|
4
|
-
[](https://pypi.org/project/ragfallback/)
|
|
4
|
+
[](https://pypi.org/project/ragfallback/)
|
|
5
5
|
[](https://pypi.org/project/ragfallback/)
|
|
6
6
|
[](https://pepy.tech/project/ragfallback)
|
|
7
7
|
[](https://github.com/irfanalidv/ragfallback/actions/workflows/test.yml)
|
|
8
|
+
[](https://colab.research.google.com/github/irfanalidv/ragfallback/blob/main/ragfallback_colab.ipynb)
|
|
9
|
+
[](https://github.com/irfanalidv/ragfallback/tree/main/ragfallback/mlops)
|
|
8
10
|
|
|
9
11
|
**ragfallback** prevents silent RAG failures across the full pipeline — from bad chunks at ingest, through retrieval outages at runtime, to invisible answer quality degradation in production.
|
|
10
12
|
|
|
11
|
-

|
|
12
|
-
|
|
13
13
|
---
|
|
14
14
|
|
|
15
15
|
## What it prevents
|
|
@@ -25,7 +25,8 @@
|
|
|
25
25
|
| 7 | Multi-step questions always fail single-shot RAG | `MultiHopFallbackStrategy` | `uc6_multi_hop_demo.py` |
|
|
26
26
|
| 8 | Index serves stale data after document updates | `StaleIndexDetector` | — |
|
|
27
27
|
| 9 | Answer quality invisible in production | `RAGEvaluator` | `uc7_rag_evaluator.py` |
|
|
28
|
-
| 10 | Cross-boundary answers lost between adjacent chunks | `OverlappingContextStitcher` | `uc8_context_stitcher.py`
|
|
28
|
+
| 10 | Cross-boundary answers lost between adjacent chunks | `OverlappingContextStitcher` | `uc8_context_stitcher.py` |
|
|
29
|
+
| 11 | Metric regression after model/embedder/chunker change | `GoldenRunner` + `BaselineRegistry` | `examples/ci_regression_gate.py` |
|
|
29
30
|
|
|
30
31
|
---
|
|
31
32
|
|
|
@@ -348,6 +349,9 @@ print(ev.batch_summary([score]))
|
|
|
348
349
|
| Financial news RAG | nickmuchi/financial-classification (Apache 2.0) | `python examples/financial_risk_analysis.py` |
|
|
349
350
|
| Legal contract RAG | theatticusproject/cuad-qa (CC BY 4.0) | `python examples/legal_document_analysis.py` |
|
|
350
351
|
| Medical abstract RAG | qiaojin/PubMedQA (MIT) | `python examples/medical_research_synthesis.py` |
|
|
352
|
+
| MLOps: build golden dataset | SQuAD (CC BY-SA 4.0) + SciQ (CC BY-NC 3.0) | `python examples/build_golden_dataset.py` |
|
|
353
|
+
| MLOps: full demo | SQuAD golden set, zero API keys | `python examples/mlops_demo.py` |
|
|
354
|
+
| MLOps: CI regression gate | SQuAD golden set, committed baseline | `python examples/ci_regression_gate.py` |
|
|
351
355
|
|
|
352
356
|
---
|
|
353
357
|
|
|
@@ -385,6 +389,7 @@ pip install ragfallback[chroma,huggingface] # golden path (no API keys)
|
|
|
385
389
|
pip install ragfallback[faiss,huggingface] # FAISS instead of Chroma
|
|
386
390
|
pip install ragfallback[hybrid] # adds BM25 (rank_bm25)
|
|
387
391
|
pip install ragfallback[real-data] # real dataset examples (HuggingFace datasets)
|
|
392
|
+
pip install ragfallback[mlops] # MLOps eval layer (RAGAS + MLflow + Locust)
|
|
388
393
|
```
|
|
389
394
|
|
|
390
395
|
| Extra | Installs |
|
|
@@ -395,6 +400,7 @@ pip install ragfallback[real-data] # real dataset examples (Hu
|
|
|
395
400
|
| `hybrid` | rank_bm25, langchain-community |
|
|
396
401
|
| `real-data` | datasets |
|
|
397
402
|
| `openai` | langchain-openai, openai |
|
|
403
|
+
| `mlops` | ragas, mlflow, locust, aiohttp, numpy |
|
|
398
404
|
|
|
399
405
|
---
|
|
400
406
|
|
|
@@ -411,6 +417,97 @@ from ragfallback.diagnostics import (
|
|
|
411
417
|
from ragfallback.retrieval import SmartThresholdHybridRetriever, FailoverRetriever
|
|
412
418
|
from ragfallback.strategies import QueryVariationsStrategy, MultiHopFallbackStrategy
|
|
413
419
|
from ragfallback.evaluation import RAGEvaluator
|
|
420
|
+
from ragfallback.mlops import (
|
|
421
|
+
RagasHook, RagasReport,
|
|
422
|
+
BaselineRegistry, RegressionError,
|
|
423
|
+
GoldenRunner, GoldenReport,
|
|
424
|
+
QuerySimulator, SimQuery,
|
|
425
|
+
MLflowLogger,
|
|
426
|
+
generate_locustfile,
|
|
427
|
+
)
|
|
428
|
+
```
|
|
429
|
+
|
|
430
|
+
---
|
|
431
|
+
|
|
432
|
+
## MLOps — Evaluation & Regression Gate
|
|
433
|
+
|
|
434
|
+
ragfallback ships a complete MLOps evaluation layer for RAG pipelines.
|
|
435
|
+
No API keys required — all metrics use local heuristics by default,
|
|
436
|
+
with optional RAGAS + MLflow when installed.
|
|
437
|
+
|
|
438
|
+
### Install
|
|
439
|
+
|
|
440
|
+
```bash
|
|
441
|
+
pip install ragfallback[chroma,huggingface,real-data,mlops]
|
|
442
|
+
```
|
|
443
|
+
|
|
444
|
+
### Full eval loop
|
|
445
|
+
|
|
446
|
+
```python
|
|
447
|
+
import asyncio
|
|
448
|
+
from ragfallback.mlops import GoldenRunner, RagasHook, BaselineRegistry
|
|
449
|
+
|
|
450
|
+
# 1 — Build evaluation hook (heuristic by default; RAGAS when installed)
|
|
451
|
+
hook = RagasHook(llm=None, embeddings=embeddings)
|
|
452
|
+
|
|
453
|
+
# 2 — Run against 75 real SQuAD QA pairs
|
|
454
|
+
runner = GoldenRunner(
|
|
455
|
+
retriever=retriever, # AdaptiveRAGRetriever instance
|
|
456
|
+
ragas_hook=hook,
|
|
457
|
+
dataset="examples/golden_qa.json",
|
|
458
|
+
)
|
|
459
|
+
report = asyncio.run(runner.run_async())
|
|
460
|
+
|
|
461
|
+
print(f"Recall@3 : {report.recall_at_3:.3f}")
|
|
462
|
+
print(f"Faithfulness : {report.ragas.faithfulness:.3f}")
|
|
463
|
+
print(f"Latency P95 : {report.latency_p95_ms:.0f}ms")
|
|
464
|
+
print(f"Fallback rate : {report.fallback_rate:.1%}")
|
|
465
|
+
|
|
466
|
+
# 3 — Regression gate: fails if any metric drops > 5% vs baseline
|
|
467
|
+
registry = BaselineRegistry("baselines.json")
|
|
468
|
+
registry.compare_or_fail(report, dataset="my_dataset") # raises RegressionError if degraded
|
|
469
|
+
registry.update(report, dataset="my_dataset") # save new baseline
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
### Adversarial query simulation
|
|
473
|
+
|
|
474
|
+
```python
|
|
475
|
+
from ragfallback.mlops import QuerySimulator
|
|
476
|
+
|
|
477
|
+
sim = QuerySimulator()
|
|
478
|
+
queries = ["What is the refund policy?", "How do API rate limits work?"]
|
|
479
|
+
|
|
480
|
+
# 4 types: short_keyword, long_nl, ambiguous, out_of_domain
|
|
481
|
+
mixed = sim.simulate(queries)
|
|
482
|
+
|
|
483
|
+
# All 4 types for every query — for stress testing
|
|
484
|
+
unhappy = sim.simulate_unhappy_paths(queries)
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
### Load testing
|
|
488
|
+
|
|
489
|
+
```python
|
|
490
|
+
from ragfallback.mlops import generate_locustfile
|
|
491
|
+
|
|
492
|
+
generate_locustfile("locustfile.py", endpoint="http://localhost:8000")
|
|
493
|
+
# Run: locust -f locustfile.py --host http://localhost:8000 --users 50
|
|
494
|
+
```
|
|
495
|
+
|
|
496
|
+
### CI regression gate (GitHub Actions)
|
|
497
|
+
|
|
498
|
+
The included workflow (`mlops-regression-gate` job in `.github/workflows/test.yml`)
|
|
499
|
+
runs on every push to main:
|
|
500
|
+
|
|
501
|
+
1. Pulls 75 SQuAD samples from HuggingFace (open data, CC BY-SA 4.0)
|
|
502
|
+
2. Indexes them in ChromaDB using `all-MiniLM-L6-v2` (no API key)
|
|
503
|
+
3. Runs `GoldenRunner` async — computes recall@3, recall@5, latency P95
|
|
504
|
+
4. Calls `compare_or_fail()` against `examples/baselines.json` (committed)
|
|
505
|
+
5. Fails the pipeline if any metric regresses more than 5%
|
|
506
|
+
|
|
507
|
+
```bash
|
|
508
|
+
# Run the CI gate locally
|
|
509
|
+
python examples/build_golden_dataset.py # one-time setup
|
|
510
|
+
python examples/ci_regression_gate.py # exits 0 (pass) or 1 (fail)
|
|
414
511
|
```
|
|
415
512
|
|
|
416
513
|
---
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Build Golden Dataset for ragfallback MLOps Evaluation
|
|
3
|
+
======================================================
|
|
4
|
+
Pulls 75 real QA pairs from SQuAD (Wikipedia, CC BY-SA 4.0) and formats
|
|
5
|
+
them into golden_qa.json for use with GoldenRunner + BaselineRegistry.
|
|
6
|
+
|
|
7
|
+
Also pulls 25 from SciQ for a mixed domain stress set (golden_qa_stress.json).
|
|
8
|
+
|
|
9
|
+
Install : pip install ragfallback[real-data,chroma,huggingface]
|
|
10
|
+
Run : python examples/build_golden_dataset.py
|
|
11
|
+
Output : examples/golden_qa.json (75 SQuAD samples)
|
|
12
|
+
examples/golden_qa_stress.json (25 SQuAD + 25 SciQ mixed)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import hashlib
|
|
18
|
+
import json
|
|
19
|
+
import sys
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any, Dict, List, Tuple
|
|
22
|
+
|
|
23
|
+
# Allow running directly from repo root without pip install -e .
|
|
24
|
+
_repo_root = Path(__file__).resolve().parent.parent
|
|
25
|
+
if (_repo_root / "ragfallback").is_dir() and str(_repo_root) not in sys.path:
|
|
26
|
+
sys.path.insert(0, str(_repo_root))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _doc_id(text: str, prefix: str = "doc") -> str:
|
|
30
|
+
"""Stable deterministic ID from content hash."""
|
|
31
|
+
h = hashlib.md5(text.encode()).hexdigest()[:8]
|
|
32
|
+
return f"{prefix}_{h}"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def build_squad_samples(n: int = 75) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Collect up to ``n`` QA samples from the SQuAD validation split.

    Rows are taken in dataset order. A row is skipped when it has no answer
    text or when its first answer is shorter than three characters. Each
    distinct (stripped) context is registered once in the passage list and
    shared by every sample that refers to it.

    Args:
        n: Maximum number of samples to collect.

    Returns:
        A ``(samples, docs_meta)`` tuple. Each sample is a dict with the keys
        ``"query"``, ``"ground_truth"``, ``"relevant_doc_ids"`` and
        ``"metadata"``; each ``docs_meta`` entry is a dict with the keys
        ``"id"``, ``"text"``, ``"title"`` and ``"source"``.

    Exits the process with status 1 if the ``datasets`` package is missing.
    """
    try:
        from datasets import load_dataset  # type: ignore
    except ImportError:
        print("ERROR: pip install ragfallback[real-data]")
        sys.exit(1)

    print(" Downloading SQuAD validation split...")
    squad = load_dataset("rajpurkar/squad", split="validation")

    # Maps stripped context text -> the doc ID already assigned to it.
    known_passages: Dict[str, str] = {}
    samples: List[Dict[str, Any]] = []
    docs_meta: List[Dict[str, Any]] = []

    for record in squad:
        if len(samples) >= n:
            break

        passage = record["context"].strip()
        query = record["question"].strip()
        answer_texts = record["answers"]["text"]

        # Rows without any answer text cannot serve as ground truth.
        if not answer_texts:
            continue
        gold_answer = answer_texts[0].strip()

        # Answers under three characters are treated as too trivial to keep.
        if len(gold_answer) < 3:
            continue

        passage_id = known_passages.get(passage)
        if passage_id is None:
            # First time this passage is seen: assign an ID and record it.
            passage_id = _doc_id(passage, prefix="squad")
            known_passages[passage] = passage_id
            docs_meta.append(
                {
                    "id": passage_id,
                    "text": passage,
                    "title": record["title"],
                    "source": "squad",
                }
            )

        sample_meta = {
            "source": "squad",
            "title": record["title"],
            "doc_id": passage_id,
        }
        samples.append(
            {
                "query": query,
                "ground_truth": gold_answer,
                "relevant_doc_ids": [passage_id],  # the passage the question was asked about
                "metadata": sample_meta,
            }
        )

    print(f" SQuAD: {len(samples)} samples, {len(docs_meta)} unique passages")
    return samples, docs_meta
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def build_sciq_samples(n: int = 25) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Collect up to ``n`` QA samples from the SciQ test split.

    Rows are taken in dataset order. A row is skipped when its supporting
    passage (after stripping) is shorter than 50 characters. Each distinct
    supporting passage is registered only once in ``docs_meta``, so the
    passage list contains no duplicate IDs and the printed "unique passages"
    count is accurate.

    Args:
        n: Maximum number of samples to collect.

    Returns:
        A ``(samples, docs_meta)`` tuple in the same format as
        ``build_squad_samples``: each sample has the keys ``"query"``,
        ``"ground_truth"``, ``"relevant_doc_ids"`` and ``"metadata"``; each
        ``docs_meta`` entry has ``"id"``, ``"text"``, ``"title"`` and
        ``"source"``.

    Exits the process with status 1 if the ``datasets`` package is missing.
    """
    try:
        from datasets import load_dataset  # type: ignore
    except ImportError:
        print("ERROR: pip install ragfallback[real-data]")
        sys.exit(1)

    print(" Downloading SciQ test split...")
    ds = load_dataset("allenai/sciq", split="test")

    # Maps supporting-passage text -> doc ID, so a passage shared by several
    # questions is written to the registry only once.
    passage_registry: Dict[str, str] = {}
    samples: List[Dict[str, Any]] = []
    docs_meta: List[Dict[str, Any]] = []

    for row in ds:
        if len(samples) >= n:
            break

        # "support" may be missing or None; normalise to an empty string.
        support = (row.get("support") or "").strip()
        question = row["question"].strip()
        answer = row["correct_answer"].strip()

        # SciQ: skip rows with no (or a very short) supporting passage
        if len(support) < 50:
            continue

        doc_id = passage_registry.get(support)
        if doc_id is None:
            doc_id = _doc_id(support, prefix="sciq")
            passage_registry[support] = doc_id
            docs_meta.append(
                {
                    "id": doc_id,
                    "text": support,
                    "title": "SciQ",
                    "source": "sciq",
                }
            )

        samples.append(
            {
                "query": question,
                "ground_truth": answer,
                "relevant_doc_ids": [doc_id],
                "metadata": {
                    "source": "sciq",
                    "doc_id": doc_id,
                },
            }
        )

    print(f" SciQ: {len(samples)} samples, {len(docs_meta)} unique passages")
    return samples, docs_meta
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def write_dataset(samples: List[Dict[str, Any]], path: Path) -> None:
    """Write golden QA samples to ``path`` as a JSON array.

    Only the ``"query"``, ``"ground_truth"`` and ``"relevant_doc_ids"`` keys
    of each sample are written; any other key (such as ``"metadata"``) is
    dropped from the output.

    Args:
        samples: Sample dicts containing at least the three keys above.
        path: Destination file; it is created or overwritten.

    Raises:
        KeyError: If a sample lacks one of the required keys.
    """
    clean = [
        {
            "query": s["query"],
            "ground_truth": s["ground_truth"],
            "relevant_doc_ids": s["relevant_doc_ids"],
        }
        for s in samples
    ]
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned to UTF-8 rather than left to the platform default.
    path.write_text(
        json.dumps(clean, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f" Written: {path} ({len(clean)} samples)")
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def write_docs_registry(docs: List[Dict[str, Any]], path: Path) -> None:
    """Write the passage registry to ``path`` as a JSON array.

    The entries are serialised unchanged, so the file can be used to build a
    vector store from the same passages the golden samples refer to.

    Args:
        docs: Passage dicts to serialise.
        path: Destination file; it is created or overwritten.
    """
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned to UTF-8 rather than left to the platform default.
    path.write_text(
        json.dumps(docs, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f" Written: {path} ({len(docs)} passages)")
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def main() -> None:
    """Build the golden datasets and passage registry next to this script.

    Writes three files into the directory containing this file:
    ``golden_qa.json`` (SQuAD samples), ``golden_qa_stress.json`` (the first
    25 SQuAD samples plus the SciQ samples) and ``golden_docs_registry.json``
    (all SQuAD and SciQ passages). The closing summary reports the number of
    samples actually written rather than the requested counts, so it stays
    correct if a dataset yields fewer usable rows than asked for.
    """
    print("=" * 60)
    print("ragfallback — Build Golden Dataset from Open Data")
    print("=" * 60)

    out_dir = Path(__file__).resolve().parent
    squad_json = out_dir / "golden_qa.json"
    stress_json = out_dir / "golden_qa_stress.json"
    docs_registry = out_dir / "golden_docs_registry.json"

    # --- SQuAD: primary golden dataset ---
    print("\n[1/3] Building primary golden dataset (SQuAD, n=75)...")
    squad_samples, squad_docs = build_squad_samples(n=75)
    write_dataset(squad_samples, squad_json)

    # --- SciQ: stress set ---
    print("\n[2/3] Building stress golden dataset (SciQ, n=25)...")
    sciq_samples, sciq_docs = build_sciq_samples(n=25)

    # Stress set = 25 SQuAD + 25 SciQ (mixed domain)
    stress_samples = squad_samples[:25] + sciq_samples
    write_dataset(stress_samples, stress_json)

    # --- Docs registry ---
    print("\n[3/3] Writing passage registry (for vector store construction)...")
    all_docs = squad_docs + sciq_docs
    write_docs_registry(all_docs, docs_registry)

    # --- Summary ---
    print("\n" + "=" * 60)
    print("DONE. Files written:")
    print(f" {squad_json.name:<35} — {len(squad_samples)} SQuAD samples (primary eval)")
    print(f" {stress_json.name:<35} — {len(stress_samples)} mixed samples (stress eval)")
    print(f" {docs_registry.name:<35} — passage registry")
    print()
    print("Next step:")
    print(" python examples/mlops_demo.py")
    print()
    print("Licenses:")
    print(" SQuAD : CC BY-SA 4.0 (https://huggingface.co/datasets/rajpurkar/squad)")
    print(" SciQ : CC BY-NC 3.0 (https://huggingface.co/datasets/allenai/sciq)")
    print("=" * 60)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
if __name__ == "__main__":
|
|
230
|
+
main()
|