ragfallback 2.1.0__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragfallback-2.1.0/ragfallback.egg-info → ragfallback-2.2.0}/PKG-INFO +1 -1
- {ragfallback-2.1.0 → ragfallback-2.2.0}/pyproject.toml +1 -1
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/__init__.py +3 -1
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/core/adaptive_retriever.py +281 -4
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/mlops/golden_runner.py +90 -15
- ragfallback-2.2.0/ragfallback/tracking/__init__.py +16 -0
- ragfallback-2.2.0/ragfallback/tracking/cache_monitor.py +245 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0/ragfallback.egg-info}/PKG-INFO +1 -1
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback.egg-info/SOURCES.txt +3 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/requirements-dev.txt +1 -0
- ragfallback-2.2.0/tests/unit/test_async_retriever.py +160 -0
- ragfallback-2.2.0/tests/unit/test_cache_monitor.py +189 -0
- ragfallback-2.1.0/ragfallback/tracking/__init__.py +0 -15
- {ragfallback-2.1.0 → ragfallback-2.2.0}/INSTALL_AND_RUN.md +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/LICENSE +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/MANIFEST.in +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/README.md +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/_kb_common.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/build_golden_dataset.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/chroma_real_kb_demo.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/ci_regression_gate.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/financial_risk_analysis.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/legal_document_analysis.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/medical_research_synthesis.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/mlops_demo.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/production_reliability_example.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/qdrant_local_demo.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/real_data_demo.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc10_metadata_sanitizer.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc1_retrieval_health.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc2_embedding_guard.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc3_chunk_quality.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc4_context_window.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc5_hybrid_failover.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc6_adaptive_rag.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc6_multi_hop_demo.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc7_rag_evaluator.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc8_context_stitcher.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/examples/uc9_embedding_probe.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/pytest.ini +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/core/__init__.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/diagnostics/__init__.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/diagnostics/chunking.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/diagnostics/context_stitcher.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/diagnostics/context_window.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/diagnostics/embedding_guard.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/diagnostics/embedding_probe.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/diagnostics/embedding_validator.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/diagnostics/retrieval_health.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/diagnostics/schema_sanitizer.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/diagnostics/stale_index.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/evaluation/__init__.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/evaluation/rag_evaluator.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/exceptions.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/mlops/__init__.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/mlops/baseline_registry.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/mlops/locust_template.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/mlops/mlflow_logger.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/mlops/query_simulator.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/mlops/ragas_hook.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/py.typed +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/retrieval/__init__.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/retrieval/failover.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/retrieval/rerank_guard.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/retrieval/smart_hybrid.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/retrieval/wrappers.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/strategies/__init__.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/strategies/base.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/strategies/multi_hop.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/strategies/query_variations.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/tracking/cost_tracker.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/tracking/metrics.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/utils/__init__.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/utils/confidence_scorer.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/utils/embedding_factory.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/utils/env.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/utils/llm_factory.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback/utils/vector_store_factory.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback.egg-info/dependency_links.txt +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback.egg-info/requires.txt +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/ragfallback.egg-info/top_level.txt +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/setup.cfg +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/setup.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/__init__.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/conftest.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/integration/__init__.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/integration/test_adaptive_workflow.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/integration/test_chroma_pipeline.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/unit/__init__.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/unit/test_adaptive_multi_hop_bridge.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/unit/test_confidence_scorer.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/unit/test_cost_tracker.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/unit/test_diagnostics.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/unit/test_hybrid_retrieval.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/unit/test_metrics.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/unit/test_multi_hop.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/unit/test_query_variations.py +0 -0
- {ragfallback-2.1.0 → ragfallback-2.2.0}/tests/unit/test_retrieval.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragfallback
|
|
3
|
-
Version: 2.1.0
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: Prevents silent RAG failures — chunk quality, retrieval fallback, adaptive querying, and answer evaluation in one library.
|
|
5
5
|
Home-page: https://github.com/irfanalidv/ragfallback
|
|
6
6
|
Author: Irfan Ali
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "ragfallback"
|
|
7
|
-
version = "2.1.0"
|
|
7
|
+
version = "2.2.0"
|
|
8
8
|
description = "Prevents silent RAG failures — chunk quality, retrieval fallback, adaptive querying, and answer evaluation in one library."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -9,16 +9,18 @@ This module exposes a small curated shortcut only (see ``__all__``).
|
|
|
9
9
|
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
|
-
__version__ = "2.1.0"
|
|
12
|
+
__version__ = "2.2.0"
|
|
13
13
|
__author__ = "Irfan Ali"
|
|
14
14
|
|
|
15
15
|
from ragfallback.core.adaptive_retriever import AdaptiveRAGRetriever, QueryResult
|
|
16
|
+
from ragfallback.tracking.cache_monitor import CacheMonitor
|
|
16
17
|
from ragfallback.tracking.cost_tracker import CostTracker
|
|
17
18
|
from ragfallback.tracking.metrics import MetricsCollector
|
|
18
19
|
|
|
19
20
|
__all__ = [
|
|
20
21
|
"AdaptiveRAGRetriever",
|
|
21
22
|
"QueryResult",
|
|
23
|
+
"CacheMonitor",
|
|
22
24
|
"CostTracker",
|
|
23
25
|
"MetricsCollector",
|
|
24
26
|
]
|
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
"""Retriever wrapper that retries failed or low-confidence queries using pluggable strategies."""
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
|
|
5
|
-
import
|
|
6
|
-
import
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import functools
|
|
7
7
|
import json
|
|
8
|
+
import logging
|
|
8
9
|
import re
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
9
13
|
|
|
10
14
|
from langchain_core.vectorstores import VectorStore
|
|
11
15
|
from langchain_core.language_models import BaseLanguageModel
|
|
@@ -270,6 +274,279 @@ Return your answer in JSON format: {"answer": "...", "source": "..."}"""
|
|
|
270
274
|
intermediate_steps=intermediate_steps if return_intermediate_steps else None
|
|
271
275
|
)
|
|
272
276
|
|
|
277
|
+
async def aquery_with_fallback(
|
|
278
|
+
self,
|
|
279
|
+
question: str,
|
|
280
|
+
context: Optional[Dict[str, Any]] = None,
|
|
281
|
+
return_intermediate_steps: bool = False,
|
|
282
|
+
enforce_budget: bool = False,
|
|
283
|
+
) -> QueryResult:
|
|
284
|
+
"""Async mirror of :meth:`query_with_fallback` using LangChain ``ainvoke``.
|
|
285
|
+
|
|
286
|
+
Uses ``ainvoke`` on the retriever and LLM so concurrent callers can overlap
|
|
287
|
+
I/O-bound work. Falls back transparently to a thread-pool for objects that
|
|
288
|
+
do not implement ``ainvoke``.
|
|
289
|
+
|
|
290
|
+
Args:
|
|
291
|
+
question: The question to answer.
|
|
292
|
+
context: Optional metadata filters / context dict.
|
|
293
|
+
return_intermediate_steps: Include all attempt dicts in the result.
|
|
294
|
+
enforce_budget: Stop early if the cost budget is exceeded.
|
|
295
|
+
|
|
296
|
+
Returns:
|
|
297
|
+
:class:`QueryResult` with the same fields as the sync version.
|
|
298
|
+
"""
|
|
299
|
+
context = context or {}
|
|
300
|
+
intermediate_steps: List[Dict[str, Any]] = []
|
|
301
|
+
total_cost = 0.0
|
|
302
|
+
loop = asyncio.get_event_loop()
|
|
303
|
+
start = loop.time()
|
|
304
|
+
|
|
305
|
+
for strategy_idx, strategy in enumerate(self.strategies):
|
|
306
|
+
if strategy_idx >= self.max_attempts:
|
|
307
|
+
break
|
|
308
|
+
|
|
309
|
+
if enforce_budget and self.cost_tracker.budget_exceeded():
|
|
310
|
+
if self.logger:
|
|
311
|
+
self.logger.warning("Budget exceeded, stopping fallback attempts")
|
|
312
|
+
break
|
|
313
|
+
|
|
314
|
+
attempt_num = strategy_idx + 1
|
|
315
|
+
|
|
316
|
+
# Delegate multi-hop strategies to thread pool (they have no async API)
|
|
317
|
+
if callable(getattr(strategy, "run", None)):
|
|
318
|
+
if self.logger:
|
|
319
|
+
self.logger.debug(
|
|
320
|
+
"strategy %s has run() — delegating to thread pool",
|
|
321
|
+
strategy.__class__.__name__,
|
|
322
|
+
)
|
|
323
|
+
retriever = self.vector_store.as_retriever()
|
|
324
|
+
hop_result = await loop.run_in_executor(
|
|
325
|
+
None,
|
|
326
|
+
functools.partial(
|
|
327
|
+
strategy.run,
|
|
328
|
+
question=question,
|
|
329
|
+
retriever=retriever,
|
|
330
|
+
llm=self.llm,
|
|
331
|
+
),
|
|
332
|
+
)
|
|
333
|
+
step_data: Dict[str, Any] = {
|
|
334
|
+
"attempt": attempt_num,
|
|
335
|
+
"query": question,
|
|
336
|
+
"strategy": "multi_hop",
|
|
337
|
+
"hops": hop_result.total_hops,
|
|
338
|
+
"answer": hop_result.final_answer,
|
|
339
|
+
"confidence": 0.85 if hop_result.success else 0.0,
|
|
340
|
+
"cost": 0.0,
|
|
341
|
+
}
|
|
342
|
+
intermediate_steps.append(step_data)
|
|
343
|
+
if hop_result.success and hop_result.final_answer:
|
|
344
|
+
latency_ms = (loop.time() - start) * 1000
|
|
345
|
+
self.metrics_collector.record_success(
|
|
346
|
+
attempts=attempt_num,
|
|
347
|
+
confidence=0.85,
|
|
348
|
+
cost=total_cost,
|
|
349
|
+
latency_ms=latency_ms,
|
|
350
|
+
strategy_used="multi_hop",
|
|
351
|
+
)
|
|
352
|
+
return QueryResult(
|
|
353
|
+
answer=hop_result.final_answer,
|
|
354
|
+
source="multi_hop",
|
|
355
|
+
confidence=0.85,
|
|
356
|
+
attempts=attempt_num,
|
|
357
|
+
cost=total_cost,
|
|
358
|
+
intermediate_steps=intermediate_steps if return_intermediate_steps else None,
|
|
359
|
+
)
|
|
360
|
+
continue
|
|
361
|
+
|
|
362
|
+
queries = strategy.generate_queries(
|
|
363
|
+
original_query=question,
|
|
364
|
+
context=context,
|
|
365
|
+
attempt=strategy_idx + 1,
|
|
366
|
+
llm=self.llm,
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
for query_idx, query in enumerate(queries):
|
|
370
|
+
attempt_num = strategy_idx * len(queries) + query_idx + 1
|
|
371
|
+
|
|
372
|
+
if attempt_num > self.max_attempts:
|
|
373
|
+
break
|
|
374
|
+
|
|
375
|
+
if self.logger:
|
|
376
|
+
self.logger.info(
|
|
377
|
+
"async attempt %d/%d: %s", attempt_num, self.max_attempts, query[:100]
|
|
378
|
+
)
|
|
379
|
+
|
|
380
|
+
docs = await self._aretrieve_documents(query, context)
|
|
381
|
+
|
|
382
|
+
if not docs:
|
|
383
|
+
if self.logger:
|
|
384
|
+
self.logger.warning("no documents found for query: %s", query)
|
|
385
|
+
intermediate_steps.append(
|
|
386
|
+
{"attempt": attempt_num, "query": query,
|
|
387
|
+
"documents": 0, "confidence": 0.0, "cost": 0.0}
|
|
388
|
+
)
|
|
389
|
+
continue
|
|
390
|
+
|
|
391
|
+
answer, source, confidence, cost = await self._agenerate_answer(
|
|
392
|
+
question=question,
|
|
393
|
+
query=query,
|
|
394
|
+
documents=docs,
|
|
395
|
+
context=context,
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
total_cost += cost
|
|
399
|
+
latency_ms = (loop.time() - start) * 1000
|
|
400
|
+
|
|
401
|
+
step_data = {
|
|
402
|
+
"attempt": attempt_num,
|
|
403
|
+
"query": query,
|
|
404
|
+
"documents": len(docs),
|
|
405
|
+
"answer": answer,
|
|
406
|
+
"source": source,
|
|
407
|
+
"confidence": confidence,
|
|
408
|
+
"cost": cost,
|
|
409
|
+
}
|
|
410
|
+
intermediate_steps.append(step_data)
|
|
411
|
+
|
|
412
|
+
if confidence >= self.min_confidence and answer.lower() not in [
|
|
413
|
+
"x", "not found", "n/a", "unknown"
|
|
414
|
+
]:
|
|
415
|
+
if self.logger:
|
|
416
|
+
self.logger.debug(
|
|
417
|
+
"async attempt %d succeeded (confidence %.2f)",
|
|
418
|
+
attempt_num,
|
|
419
|
+
confidence,
|
|
420
|
+
)
|
|
421
|
+
self.metrics_collector.record_success(
|
|
422
|
+
attempts=attempt_num,
|
|
423
|
+
confidence=confidence,
|
|
424
|
+
cost=total_cost,
|
|
425
|
+
latency_ms=latency_ms,
|
|
426
|
+
strategy_used=strategy.get_name(),
|
|
427
|
+
)
|
|
428
|
+
return QueryResult(
|
|
429
|
+
answer=answer,
|
|
430
|
+
source=source,
|
|
431
|
+
confidence=confidence,
|
|
432
|
+
attempts=attempt_num,
|
|
433
|
+
cost=total_cost,
|
|
434
|
+
intermediate_steps=intermediate_steps if return_intermediate_steps else None,
|
|
435
|
+
)
|
|
436
|
+
|
|
437
|
+
latency_ms = (loop.time() - start) * 1000
|
|
438
|
+
|
|
439
|
+
if self.logger:
|
|
440
|
+
self.logger.warning(
|
|
441
|
+
"all %d async attempts exhausted without meeting confidence threshold",
|
|
442
|
+
len(intermediate_steps),
|
|
443
|
+
)
|
|
444
|
+
|
|
445
|
+
if intermediate_steps:
|
|
446
|
+
best_attempt = max(intermediate_steps, key=lambda x: x.get("confidence", 0.0))
|
|
447
|
+
best_answer = best_attempt.get("answer", "No answer found")
|
|
448
|
+
best_source = best_attempt.get("source", "")
|
|
449
|
+
best_confidence = best_attempt.get("confidence", 0.0)
|
|
450
|
+
else:
|
|
451
|
+
best_answer = "No answer found"
|
|
452
|
+
best_source = ""
|
|
453
|
+
best_confidence = 0.0
|
|
454
|
+
|
|
455
|
+
self.metrics_collector.record_failure(
|
|
456
|
+
attempts=len(intermediate_steps),
|
|
457
|
+
cost=total_cost,
|
|
458
|
+
latency_ms=latency_ms,
|
|
459
|
+
strategy_used=self.strategies[0].get_name() if self.strategies else "unknown",
|
|
460
|
+
)
|
|
461
|
+
|
|
462
|
+
return QueryResult(
|
|
463
|
+
answer=best_answer,
|
|
464
|
+
source=best_source,
|
|
465
|
+
confidence=best_confidence,
|
|
466
|
+
attempts=len(intermediate_steps) or 1,
|
|
467
|
+
cost=total_cost,
|
|
468
|
+
intermediate_steps=intermediate_steps if return_intermediate_steps else None,
|
|
469
|
+
)
|
|
470
|
+
|
|
471
|
+
async def _aretrieve_documents(
|
|
472
|
+
self, query: str, context: Dict[str, Any]
|
|
473
|
+
) -> List[Any]:
|
|
474
|
+
"""Async document retrieval; falls back to thread pool if ``ainvoke`` absent."""
|
|
475
|
+
try:
|
|
476
|
+
search_kwargs = self._build_search_kwargs(context)
|
|
477
|
+
retriever = self.vector_store.as_retriever(search_kwargs=search_kwargs)
|
|
478
|
+
ainvoke = getattr(retriever, "ainvoke", None)
|
|
479
|
+
if ainvoke is not None:
|
|
480
|
+
result = await ainvoke(query)
|
|
481
|
+
return list(result or [])
|
|
482
|
+
# Fall back: run sync invoke in executor
|
|
483
|
+
loop = asyncio.get_event_loop()
|
|
484
|
+
invoke = getattr(retriever, "invoke", retriever.get_relevant_documents)
|
|
485
|
+
return list(
|
|
486
|
+
await loop.run_in_executor(None, functools.partial(invoke, query)) or []
|
|
487
|
+
)
|
|
488
|
+
except Exception as exc:
|
|
489
|
+
if self.logger:
|
|
490
|
+
self.logger.error("async retrieve error: %s", exc)
|
|
491
|
+
return []
|
|
492
|
+
|
|
493
|
+
async def _agenerate_answer(
|
|
494
|
+
self,
|
|
495
|
+
question: str,
|
|
496
|
+
query: str,
|
|
497
|
+
documents: List[Any],
|
|
498
|
+
context: Dict[str, Any],
|
|
499
|
+
) -> Tuple[str, str, float, float]:
|
|
500
|
+
"""Async answer generation; falls back to thread pool if ``ainvoke`` absent."""
|
|
501
|
+
docs_text = self._format_documents(documents)
|
|
502
|
+
prompt = self._build_answer_prompt(question, docs_text, context)
|
|
503
|
+
loop = asyncio.get_event_loop()
|
|
504
|
+
|
|
505
|
+
with self.cost_tracker.track(operation="answer_generation"):
|
|
506
|
+
messages = [
|
|
507
|
+
SystemMessage(content=self.answer_prompt_template),
|
|
508
|
+
HumanMessage(content=prompt),
|
|
509
|
+
]
|
|
510
|
+
try:
|
|
511
|
+
ainvoke = getattr(self.llm, "ainvoke", None)
|
|
512
|
+
if ainvoke is not None:
|
|
513
|
+
response = await ainvoke(messages)
|
|
514
|
+
else:
|
|
515
|
+
response = await loop.run_in_executor(
|
|
516
|
+
None, functools.partial(self.llm.invoke, messages)
|
|
517
|
+
)
|
|
518
|
+
except AttributeError:
|
|
519
|
+
response = await loop.run_in_executor(
|
|
520
|
+
None, functools.partial(self.llm.invoke, messages)
|
|
521
|
+
)
|
|
522
|
+
|
|
523
|
+
answer_text = response.content if hasattr(response, "content") else str(response)
|
|
524
|
+
|
|
525
|
+
if hasattr(response, "response_metadata"):
|
|
526
|
+
metadata = response.response_metadata
|
|
527
|
+
if "token_usage" in metadata:
|
|
528
|
+
usage = metadata["token_usage"]
|
|
529
|
+
self.cost_tracker.record_tokens(
|
|
530
|
+
input_tokens=usage.get("prompt_tokens", 0),
|
|
531
|
+
output_tokens=usage.get("completion_tokens", 0),
|
|
532
|
+
model=getattr(self.llm, "model_name", "gpt-4"),
|
|
533
|
+
)
|
|
534
|
+
|
|
535
|
+
answer, source = self._parse_answer(answer_text)
|
|
536
|
+
scorer = ConfidenceScorer(llm=self.llm)
|
|
537
|
+
confidence = await loop.run_in_executor(
|
|
538
|
+
None,
|
|
539
|
+
functools.partial(
|
|
540
|
+
scorer.score,
|
|
541
|
+
question=question,
|
|
542
|
+
answer=answer,
|
|
543
|
+
documents=documents,
|
|
544
|
+
context=context,
|
|
545
|
+
),
|
|
546
|
+
)
|
|
547
|
+
cost = self.cost_tracker.get_last_cost()
|
|
548
|
+
return answer, source, confidence, cost
|
|
549
|
+
|
|
273
550
|
def _retrieve_documents(
|
|
274
551
|
self,
|
|
275
552
|
query: str,
|
|
@@ -3,12 +3,14 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
+
import functools
|
|
6
7
|
import json
|
|
8
|
+
import logging
|
|
7
9
|
import time
|
|
8
|
-
from dataclasses import dataclass
|
|
10
|
+
from dataclasses import dataclass, field
|
|
9
11
|
from datetime import datetime
|
|
10
12
|
from pathlib import Path
|
|
11
|
-
from typing import Any, Dict, List, Sequence, Set, Union
|
|
13
|
+
from typing import Any, Dict, List, Optional, Sequence, Set, Union
|
|
12
14
|
|
|
13
15
|
import numpy as np
|
|
14
16
|
|
|
@@ -16,6 +18,8 @@ from ragfallback.core.adaptive_retriever import AdaptiveRAGRetriever, QueryResul
|
|
|
16
18
|
from ragfallback.evaluation import recall_at_k
|
|
17
19
|
from ragfallback.mlops.ragas_hook import RagasHook, RagasReport
|
|
18
20
|
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
19
23
|
|
|
20
24
|
@dataclass
|
|
21
25
|
class GoldenReport:
|
|
@@ -30,6 +34,7 @@ class GoldenReport:
|
|
|
30
34
|
n_samples: int
|
|
31
35
|
timestamp: datetime
|
|
32
36
|
per_sample: List[Dict[str, Any]]
|
|
37
|
+
cache_stats: Optional[Dict[str, Any]] = field(default=None)
|
|
33
38
|
|
|
34
39
|
|
|
35
40
|
class GoldenRunner:
|
|
@@ -40,10 +45,22 @@ class GoldenRunner:
|
|
|
40
45
|
retriever: AdaptiveRAGRetriever,
|
|
41
46
|
ragas_hook: RagasHook,
|
|
42
47
|
dataset: Union[str, List[Dict[str, Any]]],
|
|
48
|
+
cache_monitor: Optional[Any] = None,
|
|
43
49
|
) -> None:
|
|
44
|
-
"""Load JSON path or use in-memory list
|
|
50
|
+
"""Load JSON path or use in-memory list; optionally wrap retriever with CacheMonitor.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
retriever: :class:`~ragfallback.core.adaptive_retriever.AdaptiveRAGRetriever`
|
|
54
|
+
instance to query.
|
|
55
|
+
ragas_hook: :class:`~ragfallback.mlops.ragas_hook.RagasHook` for scoring.
|
|
56
|
+
dataset: JSON file path or list of ``{"query", "ground_truth", ...}`` dicts.
|
|
57
|
+
cache_monitor: Optional :class:`~ragfallback.tracking.cache_monitor.CacheMonitor`
|
|
58
|
+
instance. When provided, the retriever's vector store retriever is wrapped
|
|
59
|
+
to track cache hits/misses. Stats appear in ``GoldenReport.cache_stats``.
|
|
60
|
+
"""
|
|
45
61
|
self.retriever = retriever
|
|
46
62
|
self.ragas_hook = ragas_hook
|
|
63
|
+
self._cache_monitor = cache_monitor
|
|
47
64
|
if isinstance(dataset, str):
|
|
48
65
|
raw = Path(dataset).read_text(encoding="utf-8")
|
|
49
66
|
self._dataset = json.loads(raw)
|
|
@@ -57,11 +74,27 @@ class GoldenRunner:
|
|
|
57
74
|
def _retrieve_docs(self, query: str, k: int = 5) -> List[Any]:
|
|
58
75
|
"""Fetch top-``k`` documents for context and id extraction."""
|
|
59
76
|
r = self.retriever.vector_store.as_retriever(search_kwargs={"k": k})
|
|
77
|
+
if self._cache_monitor is not None:
|
|
78
|
+
r = self._cache_monitor.wrap_retriever(r, k=k)
|
|
60
79
|
invoke = getattr(r, "invoke", None)
|
|
61
80
|
if invoke is not None:
|
|
62
81
|
return list(invoke(query) or [])
|
|
63
82
|
return list(r.get_relevant_documents(query))
|
|
64
83
|
|
|
84
|
+
async def _aretrieve_docs(self, query: str, k: int = 5) -> List[Any]:
|
|
85
|
+
"""Async fetch of top-``k`` documents."""
|
|
86
|
+
r = self.retriever.vector_store.as_retriever(search_kwargs={"k": k})
|
|
87
|
+
if self._cache_monitor is not None:
|
|
88
|
+
r = self._cache_monitor.wrap_retriever(r, k=k)
|
|
89
|
+
ainvoke = getattr(r, "ainvoke", None)
|
|
90
|
+
if ainvoke is not None:
|
|
91
|
+
return list(await ainvoke(query) or [])
|
|
92
|
+
loop = asyncio.get_event_loop()
|
|
93
|
+
invoke = getattr(r, "invoke", None)
|
|
94
|
+
if invoke is not None:
|
|
95
|
+
return list(await loop.run_in_executor(None, functools.partial(invoke, query)) or [])
|
|
96
|
+
return []
|
|
97
|
+
|
|
65
98
|
def _doc_ids(self, docs: Sequence[Any]) -> List[str]:
|
|
66
99
|
"""Stable string ids from document metadata or content hash."""
|
|
67
100
|
out: List[str] = []
|
|
@@ -77,9 +110,7 @@ class GoldenRunner:
|
|
|
77
110
|
|
|
78
111
|
def _contexts_from_docs(self, docs: Sequence[Any]) -> List[str]:
|
|
79
112
|
"""Plain-text contexts for evaluation."""
|
|
80
|
-
return [
|
|
81
|
-
(getattr(d, "page_content", str(d)) or "") for d in docs
|
|
82
|
-
]
|
|
113
|
+
return [(getattr(d, "page_content", str(d)) or "") for d in docs]
|
|
83
114
|
|
|
84
115
|
def _ids_from_intermediate(self, result: QueryResult) -> List[str]:
|
|
85
116
|
"""Best-effort doc id list from intermediate steps (often empty)."""
|
|
@@ -92,7 +123,7 @@ class GoldenRunner:
|
|
|
92
123
|
return []
|
|
93
124
|
|
|
94
125
|
def _run_single(self, item: Dict[str, Any]) -> Dict[str, Any]:
|
|
95
|
-
"""Run one golden row and return diagnostics
|
|
126
|
+
"""Run one golden row synchronously and return diagnostics."""
|
|
96
127
|
query = item["query"]
|
|
97
128
|
gt = item.get("ground_truth", "")
|
|
98
129
|
rel_ids: Set[str] = set(str(x) for x in item.get("relevant_doc_ids", []))
|
|
@@ -111,7 +142,6 @@ class GoldenRunner:
|
|
|
111
142
|
|
|
112
143
|
r3 = recall_at_k(retrieved_ids, rel_ids, 3)
|
|
113
144
|
r5 = recall_at_k(retrieved_ids, rel_ids, 5)
|
|
114
|
-
fallback_triggered = result.attempts > 1
|
|
115
145
|
|
|
116
146
|
return {
|
|
117
147
|
"question": query,
|
|
@@ -121,7 +151,50 @@ class GoldenRunner:
|
|
|
121
151
|
"contexts": contexts,
|
|
122
152
|
"latency_ms": latency_ms,
|
|
123
153
|
"retrieved_ids": retrieved_ids,
|
|
124
|
-
"fallback_triggered": fallback_triggered,
|
|
154
|
+
"fallback_triggered": result.attempts > 1,
|
|
155
|
+
"recall_at_3": r3,
|
|
156
|
+
"recall_at_5": r5,
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
async def _arun_single(self, item: Dict[str, Any]) -> Dict[str, Any]:
|
|
160
|
+
"""Run one golden row natively async using :meth:`aquery_with_fallback`."""
|
|
161
|
+
query = item["query"]
|
|
162
|
+
gt = item.get("ground_truth", "")
|
|
163
|
+
rel_ids: Set[str] = set(str(x) for x in item.get("relevant_doc_ids", []))
|
|
164
|
+
|
|
165
|
+
try:
|
|
166
|
+
t0 = time.perf_counter()
|
|
167
|
+
result = await self.retriever.aquery_with_fallback(
|
|
168
|
+
query, return_intermediate_steps=True
|
|
169
|
+
)
|
|
170
|
+
latency_ms = (time.perf_counter() - t0) * 1000.0
|
|
171
|
+
except AttributeError:
|
|
172
|
+
logger.warning(
|
|
173
|
+
"retriever does not support aquery_with_fallback — "
|
|
174
|
+
"falling back to thread pool for query: %s",
|
|
175
|
+
query[:80],
|
|
176
|
+
)
|
|
177
|
+
loop = asyncio.get_event_loop()
|
|
178
|
+
return await loop.run_in_executor(None, self._run_single, item)
|
|
179
|
+
|
|
180
|
+
docs = await self._aretrieve_docs(query, k=5)
|
|
181
|
+
retrieved_ids = self._ids_from_intermediate(result)
|
|
182
|
+
if not retrieved_ids:
|
|
183
|
+
retrieved_ids = self._doc_ids(docs)
|
|
184
|
+
contexts = self._contexts_from_docs(docs)
|
|
185
|
+
|
|
186
|
+
r3 = recall_at_k(retrieved_ids, rel_ids, 3)
|
|
187
|
+
r5 = recall_at_k(retrieved_ids, rel_ids, 5)
|
|
188
|
+
|
|
189
|
+
return {
|
|
190
|
+
"question": query,
|
|
191
|
+
"ground_truth": gt,
|
|
192
|
+
"answer": result.answer,
|
|
193
|
+
"confidence": result.confidence,
|
|
194
|
+
"contexts": contexts,
|
|
195
|
+
"latency_ms": latency_ms,
|
|
196
|
+
"retrieved_ids": retrieved_ids,
|
|
197
|
+
"fallback_triggered": result.attempts > 1,
|
|
125
198
|
"recall_at_3": r3,
|
|
126
199
|
"recall_at_5": r5,
|
|
127
200
|
}
|
|
@@ -139,6 +212,11 @@ class GoldenRunner:
|
|
|
139
212
|
fb = sum(1 for s in per_sample if s.get("fallback_triggered")) / n
|
|
140
213
|
p95 = float(np.percentile(latencies, 95)) if latencies else 0.0
|
|
141
214
|
mean_lat = sum(latencies) / len(latencies) if latencies else 0.0
|
|
215
|
+
cache_stats = (
|
|
216
|
+
self._cache_monitor.get_stats().as_dict()
|
|
217
|
+
if self._cache_monitor is not None
|
|
218
|
+
else None
|
|
219
|
+
)
|
|
142
220
|
return GoldenReport(
|
|
143
221
|
ragas=ragas_report,
|
|
144
222
|
recall_at_3=mean_r3,
|
|
@@ -149,6 +227,7 @@ class GoldenRunner:
|
|
|
149
227
|
n_samples=len(per_sample),
|
|
150
228
|
timestamp=datetime.utcnow(),
|
|
151
229
|
per_sample=per_sample,
|
|
230
|
+
cache_stats=cache_stats,
|
|
152
231
|
)
|
|
153
232
|
|
|
154
233
|
def run(self) -> GoldenReport:
|
|
@@ -169,12 +248,8 @@ class GoldenRunner:
|
|
|
169
248
|
return self._build_report(per_sample, ragas_rep)
|
|
170
249
|
|
|
171
250
|
async def run_async(self) -> GoldenReport:
|
|
172
|
-
"""Evaluate golden rows concurrently
|
|
173
|
-
|
|
174
|
-
tasks = [
|
|
175
|
-
loop.run_in_executor(None, self._run_single, item)
|
|
176
|
-
for item in self._dataset
|
|
177
|
-
]
|
|
251
|
+
"""Evaluate golden rows concurrently using native async, then Ragas async."""
|
|
252
|
+
tasks = [self._arun_single(item) for item in self._dataset]
|
|
178
253
|
per_sample = list(await asyncio.gather(*tasks))
|
|
179
254
|
ragas_samples = [
|
|
180
255
|
{
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Cost tracking, metrics collection, and cache monitoring."""
|
|
2
|
+
|
|
3
|
+
from ragfallback.tracking.cache_monitor import CacheMonitor, CacheStats
|
|
4
|
+
from ragfallback.tracking.cost_tracker import CostTracker, ModelPricing
|
|
5
|
+
from ragfallback.tracking.metrics import MetricsCollector
|
|
6
|
+
|
|
7
|
+
__all__ = ["CacheMonitor", "CacheStats", "CostTracker", "ModelPricing", "MetricsCollector"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|