ragfallback 2.0.2__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. {ragfallback-2.0.2/ragfallback.egg-info → ragfallback-2.1.0}/PKG-INFO +109 -6
  2. {ragfallback-2.0.2 → ragfallback-2.1.0}/README.md +101 -4
  3. ragfallback-2.1.0/examples/build_golden_dataset.py +230 -0
  4. ragfallback-2.1.0/examples/ci_regression_gate.py +185 -0
  5. ragfallback-2.1.0/examples/mlops_demo.py +350 -0
  6. {ragfallback-2.0.2 → ragfallback-2.1.0}/pyproject.toml +9 -2
  7. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/__init__.py +1 -1
  8. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/evaluation/rag_evaluator.py +4 -3
  9. ragfallback-2.1.0/ragfallback/mlops/__init__.py +25 -0
  10. ragfallback-2.1.0/ragfallback/mlops/baseline_registry.py +146 -0
  11. ragfallback-2.1.0/ragfallback/mlops/golden_runner.py +189 -0
  12. ragfallback-2.1.0/ragfallback/mlops/locust_template.py +74 -0
  13. ragfallback-2.1.0/ragfallback/mlops/mlflow_logger.py +68 -0
  14. ragfallback-2.1.0/ragfallback/mlops/query_simulator.py +144 -0
  15. ragfallback-2.1.0/ragfallback/mlops/ragas_hook.py +208 -0
  16. {ragfallback-2.0.2 → ragfallback-2.1.0/ragfallback.egg-info}/PKG-INFO +109 -6
  17. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback.egg-info/SOURCES.txt +10 -0
  18. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback.egg-info/requires.txt +7 -0
  19. {ragfallback-2.0.2 → ragfallback-2.1.0}/INSTALL_AND_RUN.md +0 -0
  20. {ragfallback-2.0.2 → ragfallback-2.1.0}/LICENSE +0 -0
  21. {ragfallback-2.0.2 → ragfallback-2.1.0}/MANIFEST.in +0 -0
  22. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/_kb_common.py +0 -0
  23. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/chroma_real_kb_demo.py +0 -0
  24. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/financial_risk_analysis.py +0 -0
  25. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/legal_document_analysis.py +0 -0
  26. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/medical_research_synthesis.py +0 -0
  27. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/production_reliability_example.py +0 -0
  28. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/qdrant_local_demo.py +0 -0
  29. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/real_data_demo.py +0 -0
  30. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc10_metadata_sanitizer.py +0 -0
  31. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc1_retrieval_health.py +0 -0
  32. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc2_embedding_guard.py +0 -0
  33. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc3_chunk_quality.py +0 -0
  34. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc4_context_window.py +0 -0
  35. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc5_hybrid_failover.py +0 -0
  36. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc6_adaptive_rag.py +0 -0
  37. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc6_multi_hop_demo.py +0 -0
  38. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc7_rag_evaluator.py +0 -0
  39. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc8_context_stitcher.py +0 -0
  40. {ragfallback-2.0.2 → ragfallback-2.1.0}/examples/uc9_embedding_probe.py +0 -0
  41. {ragfallback-2.0.2 → ragfallback-2.1.0}/pytest.ini +0 -0
  42. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/core/__init__.py +0 -0
  43. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/core/adaptive_retriever.py +0 -0
  44. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/__init__.py +0 -0
  45. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/chunking.py +0 -0
  46. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/context_stitcher.py +0 -0
  47. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/context_window.py +0 -0
  48. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/embedding_guard.py +0 -0
  49. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/embedding_probe.py +0 -0
  50. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/embedding_validator.py +0 -0
  51. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/retrieval_health.py +0 -0
  52. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/schema_sanitizer.py +0 -0
  53. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/diagnostics/stale_index.py +0 -0
  54. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/evaluation/__init__.py +0 -0
  55. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/exceptions.py +0 -0
  56. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/py.typed +0 -0
  57. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/retrieval/__init__.py +0 -0
  58. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/retrieval/failover.py +0 -0
  59. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/retrieval/rerank_guard.py +0 -0
  60. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/retrieval/smart_hybrid.py +0 -0
  61. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/retrieval/wrappers.py +0 -0
  62. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/strategies/__init__.py +0 -0
  63. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/strategies/base.py +0 -0
  64. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/strategies/multi_hop.py +0 -0
  65. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/strategies/query_variations.py +0 -0
  66. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/tracking/__init__.py +0 -0
  67. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/tracking/cost_tracker.py +0 -0
  68. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/tracking/metrics.py +0 -0
  69. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/__init__.py +0 -0
  70. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/confidence_scorer.py +0 -0
  71. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/embedding_factory.py +0 -0
  72. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/env.py +0 -0
  73. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/llm_factory.py +0 -0
  74. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback/utils/vector_store_factory.py +0 -0
  75. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback.egg-info/dependency_links.txt +0 -0
  76. {ragfallback-2.0.2 → ragfallback-2.1.0}/ragfallback.egg-info/top_level.txt +0 -0
  77. {ragfallback-2.0.2 → ragfallback-2.1.0}/requirements-dev.txt +0 -0
  78. {ragfallback-2.0.2 → ragfallback-2.1.0}/setup.cfg +0 -0
  79. {ragfallback-2.0.2 → ragfallback-2.1.0}/setup.py +0 -0
  80. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/__init__.py +0 -0
  81. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/conftest.py +0 -0
  82. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/integration/__init__.py +0 -0
  83. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/integration/test_adaptive_workflow.py +0 -0
  84. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/integration/test_chroma_pipeline.py +0 -0
  85. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/__init__.py +0 -0
  86. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_adaptive_multi_hop_bridge.py +0 -0
  87. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_confidence_scorer.py +0 -0
  88. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_cost_tracker.py +0 -0
  89. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_diagnostics.py +0 -0
  90. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_hybrid_retrieval.py +0 -0
  91. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_metrics.py +0 -0
  92. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_multi_hop.py +0 -0
  93. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_query_variations.py +0 -0
  94. {ragfallback-2.0.2 → ragfallback-2.1.0}/tests/unit/test_retrieval.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragfallback
3
- Version: 2.0.2
3
+ Version: 2.1.0
4
4
  Summary: Prevents silent RAG failures — chunk quality, retrieval fallback, adaptive querying, and answer evaluation in one library.
5
5
  Home-page: https://github.com/irfanalidv/ragfallback
6
6
  Author: Irfan Ali
@@ -11,7 +11,7 @@ Project-URL: Documentation, https://github.com/irfanalidv/ragfallback#readme
11
11
  Project-URL: Repository, https://github.com/irfanalidv/ragfallback
12
12
  Project-URL: Issues, https://github.com/irfanalidv/ragfallback/issues
13
13
  Keywords: rag,retrieval,llm,fallback,query-variations,langchain,bm25,hybrid-search
14
- Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Development Status :: 4 - Beta
15
15
  Classifier: Intended Audience :: Developers
16
16
  Classifier: Programming Language :: Python :: 3
17
17
  Classifier: Programming Language :: Python :: 3.8
@@ -91,6 +91,12 @@ Requires-Dist: qdrant-client>=1.7.0; extra == "all"
91
91
  Requires-Dist: weaviate-client>=3.25.0; extra == "all"
92
92
  Requires-Dist: rank_bm25>=0.2.2; extra == "all"
93
93
  Requires-Dist: cohere>=4.0.0; extra == "all"
94
+ Provides-Extra: mlops
95
+ Requires-Dist: ragas>=0.2.0; extra == "mlops"
96
+ Requires-Dist: mlflow>=2.10.0; extra == "mlops"
97
+ Requires-Dist: locust>=2.20.0; extra == "mlops"
98
+ Requires-Dist: aiohttp>=3.9.0; extra == "mlops"
99
+ Requires-Dist: numpy>=1.24.0; extra == "mlops"
94
100
  Dynamic: author
95
101
  Dynamic: home-page
96
102
  Dynamic: license-file
@@ -99,15 +105,15 @@ Dynamic: requires-python
99
105
  # ragfallback
100
106
 
101
107
  [![GitHub license](https://img.shields.io/github/license/irfanalidv/ragfallback)](https://github.com/irfanalidv/ragfallback/blob/main/LICENSE)
102
- [![Python version](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11-blue.svg)](https://pypi.org/project/ragfallback/)
108
+ [![Python version](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue.svg)](https://pypi.org/project/ragfallback/)
103
109
  [![PyPI](https://img.shields.io/pypi/v/ragfallback)](https://pypi.org/project/ragfallback/)
104
110
  [![Downloads](https://static.pepy.tech/badge/ragfallback)](https://pepy.tech/project/ragfallback)
105
111
  [![Tests](https://github.com/irfanalidv/ragfallback/actions/workflows/test.yml/badge.svg)](https://github.com/irfanalidv/ragfallback/actions/workflows/test.yml)
112
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/irfanalidv/ragfallback/blob/main/ragfallback_colab.ipynb)
113
+ [![MLOps](https://img.shields.io/badge/MLOps-RAGAS%20%2B%20CI%20Gate-blueviolet)](https://github.com/irfanalidv/ragfallback/tree/main/ragfallback/mlops)
106
114
 
107
115
  **ragfallback** prevents silent RAG failures across the full pipeline — from bad chunks at ingest, through retrieval outages at runtime, to invisible answer quality degradation in production.
108
116
 
109
- ![ragfallback architecture](ragfallback_architecture.png?v=2)
110
-
111
117
  ---
112
118
 
113
119
  ## What it prevents
@@ -123,7 +129,8 @@ Dynamic: requires-python
123
129
  | 7 | Multi-step questions always fail single-shot RAG | `MultiHopFallbackStrategy` | `uc6_multi_hop_demo.py` |
124
130
  | 8 | Index serves stale data after document updates | `StaleIndexDetector` | — |
125
131
  | 9 | Answer quality invisible in production | `RAGEvaluator` | `uc7_rag_evaluator.py` |
126
- | 10 | Cross-boundary answers lost between adjacent chunks | `OverlappingContextStitcher` | `uc8_context_stitcher.py` |
132
+ | 10 | Cross-boundary answers lost between adjacent chunks | `OverlappingContextStitcher` | `uc8_context_stitcher.py` |
133
+ | 11 | Metric regression after model/embedder/chunker change | `GoldenRunner` + `BaselineRegistry` | `examples/ci_regression_gate.py` |
127
134
 
128
135
  ---
129
136
 
@@ -446,6 +453,9 @@ print(ev.batch_summary([score]))
446
453
  | Financial news RAG | nickmuchi/financial-classification (Apache 2.0) | `python examples/financial_risk_analysis.py` |
447
454
  | Legal contract RAG | theatticusproject/cuad-qa (CC BY 4.0) | `python examples/legal_document_analysis.py` |
448
455
  | Medical abstract RAG | qiaojin/PubMedQA (MIT) | `python examples/medical_research_synthesis.py` |
456
+ | MLOps: build golden dataset | SQuAD (CC BY-SA 4.0) + SciQ (CC BY-NC 3.0) | `python examples/build_golden_dataset.py` |
457
+ | MLOps: full demo | SQuAD golden set, zero API keys | `python examples/mlops_demo.py` |
458
+ | MLOps: CI regression gate | SQuAD golden set, committed baseline | `python examples/ci_regression_gate.py` |
449
459
 
450
460
  ---
451
461
 
@@ -483,6 +493,7 @@ pip install ragfallback[chroma,huggingface] # golden path (no API keys)
483
493
  pip install ragfallback[faiss,huggingface] # FAISS instead of Chroma
484
494
  pip install ragfallback[hybrid] # adds BM25 (rank_bm25)
485
495
  pip install ragfallback[real-data] # real dataset examples (HuggingFace datasets)
496
+ pip install ragfallback[mlops] # MLOps eval layer (RAGAS + MLflow + Locust)
486
497
  ```
487
498
 
488
499
  | Extra | Installs |
@@ -493,6 +504,7 @@ pip install ragfallback[real-data] # real dataset examples (Hu
493
504
  | `hybrid` | rank_bm25, langchain-community |
494
505
  | `real-data` | datasets |
495
506
  | `openai` | langchain-openai, openai |
507
+ | `mlops` | ragas, mlflow, locust, aiohttp, numpy |
496
508
 
497
509
  ---
498
510
 
@@ -509,6 +521,97 @@ from ragfallback.diagnostics import (
509
521
  from ragfallback.retrieval import SmartThresholdHybridRetriever, FailoverRetriever
510
522
  from ragfallback.strategies import QueryVariationsStrategy, MultiHopFallbackStrategy
511
523
  from ragfallback.evaluation import RAGEvaluator
524
+ from ragfallback.mlops import (
525
+ RagasHook, RagasReport,
526
+ BaselineRegistry, RegressionError,
527
+ GoldenRunner, GoldenReport,
528
+ QuerySimulator, SimQuery,
529
+ MLflowLogger,
530
+ generate_locustfile,
531
+ )
532
+ ```
533
+
534
+ ---
535
+
536
+ ## MLOps — Evaluation & Regression Gate
537
+
538
+ ragfallback ships a complete MLOps evaluation layer for RAG pipelines.
539
+ No API keys required — all metrics use local heuristics by default,
540
+ with optional RAGAS + MLflow when installed.
541
+
542
+ ### Install
543
+
544
+ ```bash
545
+ pip install ragfallback[chroma,huggingface,real-data,mlops]
546
+ ```
547
+
548
+ ### Full eval loop
549
+
550
+ ```python
551
+ import asyncio
552
+ from ragfallback.mlops import GoldenRunner, RagasHook, BaselineRegistry
553
+
554
+ # 1 — Build evaluation hook (heuristic by default; RAGAS when installed)
555
+ hook = RagasHook(llm=None, embeddings=embeddings)
556
+
557
+ # 2 — Run against 75 real SQuAD QA pairs
558
+ runner = GoldenRunner(
559
+ retriever=retriever, # AdaptiveRAGRetriever instance
560
+ ragas_hook=hook,
561
+ dataset="examples/golden_qa.json",
562
+ )
563
+ report = asyncio.run(runner.run_async())
564
+
565
+ print(f"Recall@3 : {report.recall_at_3:.3f}")
566
+ print(f"Faithfulness : {report.ragas.faithfulness:.3f}")
567
+ print(f"Latency P95 : {report.latency_p95_ms:.0f}ms")
568
+ print(f"Fallback rate : {report.fallback_rate:.1%}")
569
+
570
+ # 3 — Regression gate: fails if any metric drops > 5% vs baseline
571
+ registry = BaselineRegistry("baselines.json")
572
+ registry.compare_or_fail(report, dataset="my_dataset") # raises RegressionError if degraded
573
+ registry.update(report, dataset="my_dataset") # save new baseline
574
+ ```
575
+
576
+ ### Adversarial query simulation
577
+
578
+ ```python
579
+ from ragfallback.mlops import QuerySimulator
580
+
581
+ sim = QuerySimulator()
582
+ queries = ["What is the refund policy?", "How do API rate limits work?"]
583
+
584
+ # 4 types: short_keyword, long_nl, ambiguous, out_of_domain
585
+ mixed = sim.simulate(queries)
586
+
587
+ # All 4 types for every query — for stress testing
588
+ unhappy = sim.simulate_unhappy_paths(queries)
589
+ ```
590
+
591
+ ### Load testing
592
+
593
+ ```python
594
+ from ragfallback.mlops import generate_locustfile
595
+
596
+ generate_locustfile("locustfile.py", endpoint="http://localhost:8000")
597
+ # Run: locust -f locustfile.py --host http://localhost:8000 --users 50
598
+ ```
599
+
600
+ ### CI regression gate (GitHub Actions)
601
+
602
+ The included workflow (`mlops-regression-gate` job in `.github/workflows/test.yml`)
603
+ runs on every push to main:
604
+
605
+ 1. Pulls 75 SQuAD samples from HuggingFace (open data, CC BY-SA 4.0)
606
+ 2. Indexes them in ChromaDB using `all-MiniLM-L6-v2` (no API key)
607
+ 3. Runs `GoldenRunner` async — computes recall@3, recall@5, latency P95
608
+ 4. Calls `compare_or_fail()` against `examples/baselines.json` (committed)
609
+ 5. Fails the pipeline if any metric regresses more than 5%
610
+
611
+ ```bash
612
+ # Run the CI gate locally
613
+ python examples/build_golden_dataset.py # one-time setup
614
+ python examples/ci_regression_gate.py # exits 0 (pass) or 1 (fail)
512
615
  ```
513
616
 
514
617
  ---
@@ -1,15 +1,15 @@
1
1
  # ragfallback
2
2
 
3
3
  [![GitHub license](https://img.shields.io/github/license/irfanalidv/ragfallback)](https://github.com/irfanalidv/ragfallback/blob/main/LICENSE)
4
- [![Python version](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11-blue.svg)](https://pypi.org/project/ragfallback/)
4
+ [![Python version](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue.svg)](https://pypi.org/project/ragfallback/)
5
5
  [![PyPI](https://img.shields.io/pypi/v/ragfallback)](https://pypi.org/project/ragfallback/)
6
6
  [![Downloads](https://static.pepy.tech/badge/ragfallback)](https://pepy.tech/project/ragfallback)
7
7
  [![Tests](https://github.com/irfanalidv/ragfallback/actions/workflows/test.yml/badge.svg)](https://github.com/irfanalidv/ragfallback/actions/workflows/test.yml)
8
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/irfanalidv/ragfallback/blob/main/ragfallback_colab.ipynb)
9
+ [![MLOps](https://img.shields.io/badge/MLOps-RAGAS%20%2B%20CI%20Gate-blueviolet)](https://github.com/irfanalidv/ragfallback/tree/main/ragfallback/mlops)
8
10
 
9
11
  **ragfallback** prevents silent RAG failures across the full pipeline — from bad chunks at ingest, through retrieval outages at runtime, to invisible answer quality degradation in production.
10
12
 
11
- ![ragfallback architecture](ragfallback_architecture.png?v=2)
12
-
13
13
  ---
14
14
 
15
15
  ## What it prevents
@@ -25,7 +25,8 @@
25
25
  | 7 | Multi-step questions always fail single-shot RAG | `MultiHopFallbackStrategy` | `uc6_multi_hop_demo.py` |
26
26
  | 8 | Index serves stale data after document updates | `StaleIndexDetector` | — |
27
27
  | 9 | Answer quality invisible in production | `RAGEvaluator` | `uc7_rag_evaluator.py` |
28
- | 10 | Cross-boundary answers lost between adjacent chunks | `OverlappingContextStitcher` | `uc8_context_stitcher.py` |
28
+ | 10 | Cross-boundary answers lost between adjacent chunks | `OverlappingContextStitcher` | `uc8_context_stitcher.py` |
29
+ | 11 | Metric regression after model/embedder/chunker change | `GoldenRunner` + `BaselineRegistry` | `examples/ci_regression_gate.py` |
29
30
 
30
31
  ---
31
32
 
@@ -348,6 +349,9 @@ print(ev.batch_summary([score]))
348
349
  | Financial news RAG | nickmuchi/financial-classification (Apache 2.0) | `python examples/financial_risk_analysis.py` |
349
350
  | Legal contract RAG | theatticusproject/cuad-qa (CC BY 4.0) | `python examples/legal_document_analysis.py` |
350
351
  | Medical abstract RAG | qiaojin/PubMedQA (MIT) | `python examples/medical_research_synthesis.py` |
352
+ | MLOps: build golden dataset | SQuAD (CC BY-SA 4.0) + SciQ (CC BY-NC 3.0) | `python examples/build_golden_dataset.py` |
353
+ | MLOps: full demo | SQuAD golden set, zero API keys | `python examples/mlops_demo.py` |
354
+ | MLOps: CI regression gate | SQuAD golden set, committed baseline | `python examples/ci_regression_gate.py` |
351
355
 
352
356
  ---
353
357
 
@@ -385,6 +389,7 @@ pip install ragfallback[chroma,huggingface] # golden path (no API keys)
385
389
  pip install ragfallback[faiss,huggingface] # FAISS instead of Chroma
386
390
  pip install ragfallback[hybrid] # adds BM25 (rank_bm25)
387
391
  pip install ragfallback[real-data] # real dataset examples (HuggingFace datasets)
392
+ pip install ragfallback[mlops] # MLOps eval layer (RAGAS + MLflow + Locust)
388
393
  ```
389
394
 
390
395
  | Extra | Installs |
@@ -395,6 +400,7 @@ pip install ragfallback[real-data] # real dataset examples (Hu
395
400
  | `hybrid` | rank_bm25, langchain-community |
396
401
  | `real-data` | datasets |
397
402
  | `openai` | langchain-openai, openai |
403
+ | `mlops` | ragas, mlflow, locust, aiohttp, numpy |
398
404
 
399
405
  ---
400
406
 
@@ -411,6 +417,97 @@ from ragfallback.diagnostics import (
411
417
  from ragfallback.retrieval import SmartThresholdHybridRetriever, FailoverRetriever
412
418
  from ragfallback.strategies import QueryVariationsStrategy, MultiHopFallbackStrategy
413
419
  from ragfallback.evaluation import RAGEvaluator
420
+ from ragfallback.mlops import (
421
+ RagasHook, RagasReport,
422
+ BaselineRegistry, RegressionError,
423
+ GoldenRunner, GoldenReport,
424
+ QuerySimulator, SimQuery,
425
+ MLflowLogger,
426
+ generate_locustfile,
427
+ )
428
+ ```
429
+
430
+ ---
431
+
432
+ ## MLOps — Evaluation & Regression Gate
433
+
434
+ ragfallback ships a complete MLOps evaluation layer for RAG pipelines.
435
+ No API keys required — all metrics use local heuristics by default,
436
+ with optional RAGAS + MLflow when installed.
437
+
438
+ ### Install
439
+
440
+ ```bash
441
+ pip install ragfallback[chroma,huggingface,real-data,mlops]
442
+ ```
443
+
444
+ ### Full eval loop
445
+
446
+ ```python
447
+ import asyncio
448
+ from ragfallback.mlops import GoldenRunner, RagasHook, BaselineRegistry
449
+
450
+ # 1 — Build evaluation hook (heuristic by default; RAGAS when installed)
451
+ hook = RagasHook(llm=None, embeddings=embeddings)
452
+
453
+ # 2 — Run against 75 real SQuAD QA pairs
454
+ runner = GoldenRunner(
455
+ retriever=retriever, # AdaptiveRAGRetriever instance
456
+ ragas_hook=hook,
457
+ dataset="examples/golden_qa.json",
458
+ )
459
+ report = asyncio.run(runner.run_async())
460
+
461
+ print(f"Recall@3 : {report.recall_at_3:.3f}")
462
+ print(f"Faithfulness : {report.ragas.faithfulness:.3f}")
463
+ print(f"Latency P95 : {report.latency_p95_ms:.0f}ms")
464
+ print(f"Fallback rate : {report.fallback_rate:.1%}")
465
+
466
+ # 3 — Regression gate: fails if any metric drops > 5% vs baseline
467
+ registry = BaselineRegistry("baselines.json")
468
+ registry.compare_or_fail(report, dataset="my_dataset") # raises RegressionError if degraded
469
+ registry.update(report, dataset="my_dataset") # save new baseline
470
+ ```
471
+
472
+ ### Adversarial query simulation
473
+
474
+ ```python
475
+ from ragfallback.mlops import QuerySimulator
476
+
477
+ sim = QuerySimulator()
478
+ queries = ["What is the refund policy?", "How do API rate limits work?"]
479
+
480
+ # 4 types: short_keyword, long_nl, ambiguous, out_of_domain
481
+ mixed = sim.simulate(queries)
482
+
483
+ # All 4 types for every query — for stress testing
484
+ unhappy = sim.simulate_unhappy_paths(queries)
485
+ ```
486
+
487
+ ### Load testing
488
+
489
+ ```python
490
+ from ragfallback.mlops import generate_locustfile
491
+
492
+ generate_locustfile("locustfile.py", endpoint="http://localhost:8000")
493
+ # Run: locust -f locustfile.py --host http://localhost:8000 --users 50
494
+ ```
495
+
496
+ ### CI regression gate (GitHub Actions)
497
+
498
+ The included workflow (`mlops-regression-gate` job in `.github/workflows/test.yml`)
499
+ runs on every push to main:
500
+
501
+ 1. Pulls 75 SQuAD samples from HuggingFace (open data, CC BY-SA 4.0)
502
+ 2. Indexes them in ChromaDB using `all-MiniLM-L6-v2` (no API key)
503
+ 3. Runs `GoldenRunner` async — computes recall@3, recall@5, latency P95
504
+ 4. Calls `compare_or_fail()` against `examples/baselines.json` (committed)
505
+ 5. Fails the pipeline if any metric regresses more than 5%
506
+
507
+ ```bash
508
+ # Run the CI gate locally
509
+ python examples/build_golden_dataset.py # one-time setup
510
+ python examples/ci_regression_gate.py # exits 0 (pass) or 1 (fail)
414
511
  ```
415
512
 
416
513
  ---
@@ -0,0 +1,230 @@
1
+ """
2
+ Build Golden Dataset for ragfallback MLOps Evaluation
3
+ ======================================================
4
+ Pulls 75 real QA pairs from SQuAD (Wikipedia, CC BY-SA 4.0) and formats
5
+ them into golden_qa.json for use with GoldenRunner + BaselineRegistry.
6
+
7
+ Also pulls 25 QA pairs from SciQ for a mixed-domain stress set (golden_qa_stress.json).
8
+
9
+ Install : pip install ragfallback[real-data,chroma,huggingface]
10
+ Run : python examples/build_golden_dataset.py
11
+ Output : examples/golden_qa.json (75 SQuAD samples)
12
+ examples/golden_qa_stress.json (25 SQuAD + 25 SciQ mixed)
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import hashlib
18
+ import json
19
+ import sys
20
+ from pathlib import Path
21
+ from typing import Any, Dict, List, Tuple
22
+
23
# When this script is executed straight from a source checkout (i.e. without
# `pip install -e .`), put the repository root on sys.path so the in-tree
# `ragfallback` package directory is importable.
_repo_root = Path(__file__).resolve().parent.parent
if (_repo_root / "ragfallback").is_dir():
    if str(_repo_root) not in sys.path:
        sys.path.insert(0, str(_repo_root))
27
+
28
+
29
def _doc_id(text: str, prefix: str = "doc") -> str:
    """Return ``"<prefix>_<hash>"`` where hash is the first 8 hex chars of MD5(text).

    The same text always yields the same ID, so IDs stay stable across runs.
    """
    digest = hashlib.md5(text.encode()).hexdigest()
    return "{}_{}".format(prefix, digest[:8])
33
+
34
+
35
def build_squad_samples(n: int = 75) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Collect up to ``n`` QA samples from the SQuAD validation split.

    Rows are consumed in dataset order. A row is skipped when it lists no
    answers, or when its first answer is shorter than 3 characters after
    stripping. Each distinct context passage is registered once under a
    content-hash ID, so questions that share a passage share a doc ID.

    Returns:
        (samples, docs_meta) where each sample is a dict with
        {"query", "ground_truth", "relevant_doc_ids", "metadata"}
        and docs_meta holds one {"id", "text", "title", "source"} dict
        per distinct passage.

    Exits the process with status 1 if ``datasets`` cannot be imported.
    """
    try:
        from datasets import load_dataset  # type: ignore
    except ImportError:
        print("ERROR: pip install ragfallback[real-data]")
        sys.exit(1)

    print(" Downloading SQuAD validation split...")
    squad = load_dataset("rajpurkar/squad", split="validation")

    # Maps stripped context text -> doc ID so repeated passages are stored once.
    id_by_passage: Dict[str, str] = {}
    samples: List[Dict[str, Any]] = []
    docs_meta: List[Dict[str, Any]] = []

    for record in squad:
        if len(samples) >= n:
            break

        passage = record["context"].strip()
        query = record["question"].strip()
        answer_texts = record["answers"]["text"]

        # Only the first listed answer is used as ground truth. Rows with no
        # answer, or with a first answer under 3 characters, are dropped.
        reference = answer_texts[0].strip() if answer_texts else ""
        if len(reference) < 3:
            continue

        doc_id = id_by_passage.get(passage)
        if doc_id is None:
            # First time this passage is seen: mint an ID and record it.
            doc_id = _doc_id(passage, prefix="squad")
            id_by_passage[passage] = doc_id
            docs_meta.append(
                {
                    "id": doc_id,
                    "text": passage,
                    "title": record["title"],
                    "source": "squad",
                }
            )

        sample: Dict[str, Any] = {
            "query": query,
            "ground_truth": reference,
            "relevant_doc_ids": [doc_id],  # the passage this question was asked about
            "metadata": {
                "source": "squad",
                "title": record["title"],
                "doc_id": doc_id,
            },
        }
        samples.append(sample)

    print(f" SQuAD: {len(samples)} samples, {len(docs_meta)} unique passages")
    return samples, docs_meta
105
+
106
+
107
def build_sciq_samples(n: int = 25) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Collect up to ``n`` QA samples from the SciQ test split.

    Rows are consumed in dataset order. A row is skipped when its stripped
    ``support`` passage is shorter than 50 characters (this also covers rows
    with no support text). Each distinct support passage is registered once
    in ``docs_meta``, mirroring ``build_squad_samples``, so the reported
    "unique passages" count is accurate even if a passage repeats.

    Returns:
        (samples, docs_meta) in the same format as build_squad_samples:
        each sample has {"query", "ground_truth", "relevant_doc_ids",
        "metadata"}; docs_meta holds one {"id", "text", "title", "source"}
        dict per distinct passage.

    Exits the process with status 1 if ``datasets`` cannot be imported.
    """
    try:
        from datasets import load_dataset  # type: ignore
    except ImportError:
        print("ERROR: pip install ragfallback[real-data]")
        sys.exit(1)

    print(" Downloading SciQ test split...")
    ds = load_dataset("allenai/sciq", split="test")

    # Maps support text -> doc ID so a repeated passage is only stored once.
    passage_registry: Dict[str, str] = {}
    samples: List[Dict[str, Any]] = []
    docs_meta: List[Dict[str, Any]] = []

    for row in ds:
        if len(samples) >= n:
            break

        support = (row.get("support") or "").strip()
        question = row["question"].strip()
        answer = row["correct_answer"].strip()

        # SciQ: skip rows with no (or a very short) supporting passage
        if len(support) < 50:
            continue

        doc_id = passage_registry.get(support)
        if doc_id is None:
            doc_id = _doc_id(support, prefix="sciq")
            passage_registry[support] = doc_id
            docs_meta.append(
                {
                    "id": doc_id,
                    "text": support,
                    "title": "SciQ",
                    "source": "sciq",
                }
            )

        samples.append(
            {
                "query": question,
                "ground_truth": answer,
                "relevant_doc_ids": [doc_id],
                "metadata": {
                    "source": "sciq",
                    "doc_id": doc_id,
                },
            }
        )

    print(f" SciQ: {len(samples)} samples, {len(docs_meta)} unique passages")
    return samples, docs_meta
161
+
162
+
163
def write_dataset(samples: List[Dict[str, Any]], path: Path) -> None:
    """Write samples to ``path`` as a UTF-8 encoded JSON array.

    Only the "query", "ground_truth" and "relevant_doc_ids" keys of each
    sample are written; any other keys (such as "metadata") are dropped.

    Raises:
        KeyError: if a sample lacks one of the three required keys.
    """
    clean = [
        {
            "query": s["query"],
            "ground_truth": s["ground_truth"],
            "relevant_doc_ids": s["relevant_doc_ids"],
        }
        for s in samples
    ]
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned; the platform default (e.g. cp1252) may not encode them.
    path.write_text(json.dumps(clean, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f" Written: {path} ({len(clean)} samples)")
177
+
178
+
179
def write_docs_registry(docs: List[Dict[str, Any]], path: Path) -> None:
    """Write the passage registry to ``path`` as UTF-8 encoded JSON.

    The registry is the list of passage dicts as given; it is useful for
    building a vector store from the same passages the golden set refers to.
    """
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned; the platform default (e.g. cp1252) may not encode them.
    path.write_text(json.dumps(docs, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f" Written: {path} ({len(docs)} passages)")
183
+
184
+
185
def main() -> None:
    """Build the golden datasets and write them next to this script.

    Writes three files into the directory containing this script:
    golden_qa.json (SQuAD samples), golden_qa_stress.json (first 25 SQuAD
    samples plus the SciQ samples) and golden_docs_registry.json (all
    passages from both sources). The closing summary reports the number of
    samples actually written rather than the requested targets, since the
    loaders can return fewer rows than asked for.
    """
    print("=" * 60)
    print("ragfallback — Build Golden Dataset from Open Data")
    print("=" * 60)

    out_dir = Path(__file__).resolve().parent
    squad_json = out_dir / "golden_qa.json"
    stress_json = out_dir / "golden_qa_stress.json"
    docs_registry = out_dir / "golden_docs_registry.json"

    # --- SQuAD: primary golden dataset ---
    print("\n[1/3] Building primary golden dataset (SQuAD, n=75)...")
    squad_samples, squad_docs = build_squad_samples(n=75)
    write_dataset(squad_samples, squad_json)

    # --- SciQ: stress set ---
    print("\n[2/3] Building stress golden dataset (SciQ, n=25)...")
    sciq_samples, sciq_docs = build_sciq_samples(n=25)

    # Stress set = first 25 SQuAD samples + SciQ samples (mixed domain)
    stress_samples = squad_samples[:25] + sciq_samples
    write_dataset(stress_samples, stress_json)

    # --- Docs registry ---
    print("\n[3/3] Writing passage registry (for vector store construction)...")
    all_docs = squad_docs + sciq_docs
    write_docs_registry(all_docs, docs_registry)

    # --- Summary ---
    print("\n" + "=" * 60)
    print("DONE. Files written:")
    print(f" {squad_json.name:<35} — {len(squad_samples)} SQuAD samples (primary eval)")
    print(f" {stress_json.name:<35} — {len(stress_samples)} mixed samples (stress eval)")
    print(f" {docs_registry.name:<35} — passage registry")
    print()
    print("Next step:")
    print(" python examples/mlops_demo.py")
    print()
    print("Licenses:")
    print(" SQuAD : CC BY-SA 4.0 (https://huggingface.co/datasets/rajpurkar/squad)")
    print(" SciQ : CC BY-NC 3.0 (https://huggingface.co/datasets/allenai/sciq)")
    print("=" * 60)


if __name__ == "__main__":
    main()