rag-audit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rag_audit/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """rag-audit — CLI + library to audit and benchmark RAG pipelines."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,12 @@
1
+ """Adapters for vector store backends."""
2
+
3
+ from rag_audit.adapters.base import VectorStoreAdapter
4
+ from rag_audit.adapters.chroma import ChromaDBAdapter
5
+ from rag_audit.adapters.stubs import PineconeAdapter, QdrantAdapter
6
+
7
+ __all__ = [
8
+ "ChromaDBAdapter",
9
+ "PineconeAdapter",
10
+ "QdrantAdapter",
11
+ "VectorStoreAdapter",
12
+ ]
@@ -0,0 +1,11 @@
1
+ from typing import Protocol, runtime_checkable
2
+
3
+
4
@runtime_checkable
class VectorStoreAdapter(Protocol):
    """Structural interface that every vector store backend must satisfy."""

    def add(
        self,
        ids: list[str],
        texts: list[str],
        embeddings: list[list[float]],
    ) -> None:
        """Store the given texts with their ids and embeddings."""
        ...

    def query(self, embedding: list[float], k: int) -> list[str]:
        """Return up to ``k`` stored texts for the query embedding."""
        ...

    def delete(self, ids: list[str]) -> None:
        """Remove the entries with the given ids."""
        ...

    def count(self) -> int:
        """Return the number of stored entries."""
        ...
@@ -0,0 +1,42 @@
1
+ import chromadb
2
+ from chromadb.api import ClientAPI
3
+
4
+
5
class ChromaDBAdapter:
    """Vector store adapter backed by a single ChromaDB collection."""

    def __init__(
        self,
        collection_name: str,
        client: ClientAPI | None = None,
        path: str | None = None,
    ) -> None:
        """Open (or create) ``collection_name``.

        An explicit ``client`` wins; otherwise ``path`` selects a persistent
        client, and with neither an ephemeral client is used.
        """
        if client is None:
            client = (
                chromadb.PersistentClient(path=path)
                if path is not None
                else chromadb.EphemeralClient()
            )
        self._client: ClientAPI = client
        self._collection = self._client.get_or_create_collection(collection_name)

    def add(
        self, ids: list[str], texts: list[str], embeddings: list[list[float]]
    ) -> None:
        """Insert the texts with their ids and embeddings into the collection."""
        self._collection.add(ids=ids, documents=texts, embeddings=embeddings)  # type: ignore[arg-type]

    def query(self, embedding: list[float], k: int) -> list[str]:
        """Return the documents Chroma reports for ``embedding`` (at most ``k``)."""
        # Never ask for more results than the collection currently holds.
        limit = min(k, self._collection.count())
        if limit == 0:
            return []
        results = self._collection.query(
            query_embeddings=[embedding],  # type: ignore[arg-type]
            n_results=limit,
        )
        docs = results["documents"]
        if docs and docs[0] is not None:
            return list(docs[0])
        return []

    def delete(self, ids: list[str]) -> None:
        """Delete the entries with the given ids from the collection."""
        self._collection.delete(ids=ids)

    def count(self) -> int:
        """Return the number of entries in the collection."""
        return int(self._collection.count())
@@ -0,0 +1,30 @@
1
class PineconeAdapter:
    """Placeholder Pinecone adapter; every operation is unimplemented."""

    def add(
        self,
        ids: list[str],
        texts: list[str],
        embeddings: list[list[float]],
    ) -> None:
        """Not implemented yet."""
        raise NotImplementedError

    def query(self, embedding: list[float], k: int) -> list[str]:
        """Not implemented yet."""
        raise NotImplementedError

    def delete(self, ids: list[str]) -> None:
        """Not implemented yet."""
        raise NotImplementedError

    def count(self) -> int:
        """Not implemented yet."""
        raise NotImplementedError
15
+
16
+
17
class QdrantAdapter:
    """Placeholder Qdrant adapter; every operation is unimplemented."""

    def add(
        self,
        ids: list[str],
        texts: list[str],
        embeddings: list[list[float]],
    ) -> None:
        """Not implemented yet."""
        raise NotImplementedError

    def query(self, embedding: list[float], k: int) -> list[str]:
        """Not implemented yet."""
        raise NotImplementedError

    def delete(self, ids: list[str]) -> None:
        """Not implemented yet."""
        raise NotImplementedError

    def count(self) -> int:
        """Not implemented yet."""
        raise NotImplementedError
@@ -0,0 +1,20 @@
1
+ """Chunking strategies and cohesion-based evaluation for RAG pipelines."""
2
+
3
+ from rag_audit.chunker.base import BaseChunker
4
+ from rag_audit.chunker.evaluator import ChunkingEvaluator
5
+ from rag_audit.chunker.models import ChunkingReport, ChunkingStrategyReport
6
+ from rag_audit.chunker.strategies import (
7
+ FixedSizeChunker,
8
+ RecursiveChunker,
9
+ SemanticChunker,
10
+ )
11
+
12
+ __all__ = [
13
+ "BaseChunker",
14
+ "ChunkingEvaluator",
15
+ "ChunkingReport",
16
+ "ChunkingStrategyReport",
17
+ "FixedSizeChunker",
18
+ "RecursiveChunker",
19
+ "SemanticChunker",
20
+ ]
@@ -0,0 +1,16 @@
1
+ import math
2
+ from typing import Protocol, runtime_checkable
3
+
4
+
5
@runtime_checkable
class BaseChunker(Protocol):
    """Structural interface for anything that can split text into chunks."""

    def chunk(self, text: str) -> list[str]:
        """Split ``text`` into a list of chunk strings."""
        ...
8
+
9
+
10
+ def cosine_similarity(a: list[float], b: list[float]) -> float:
11
+ dot = sum(x * y for x, y in zip(a, b, strict=False))
12
+ mag_a = math.sqrt(sum(x * x for x in a))
13
+ mag_b = math.sqrt(sum(x * x for x in b))
14
+ if mag_a == 0.0 or mag_b == 0.0:
15
+ return 0.0
16
+ return dot / (mag_a * mag_b)
@@ -0,0 +1,51 @@
1
+ from langchain_core.embeddings import Embeddings
2
+
3
+ from rag_audit.chunker.base import BaseChunker, cosine_similarity
4
+ from rag_audit.chunker.models import ChunkingReport, ChunkingStrategyReport
5
+
6
+
7
class ChunkingEvaluator:
    """Compare chunking strategies by chunk-to-document embedding similarity."""

    def __init__(self, embeddings: Embeddings) -> None:
        """Keep the embedding model used for both the document and its chunks."""
        self._embeddings = embeddings

    def evaluate(
        self,
        text: str,
        chunkers: dict[str, BaseChunker],
    ) -> ChunkingReport:
        """Evaluate every chunker on ``text`` and name the best one.

        The best strategy is the one with the highest average cohesion; it is
        an empty string when ``chunkers`` is empty.
        """
        reports = [
            self._evaluate_strategy(name, text, chunker)
            for name, chunker in chunkers.items()
        ]
        best = max(reports, key=lambda r: r.avg_cohesion).strategy if reports else ""
        return ChunkingReport(strategies=reports, best_strategy=best)

    def _evaluate_strategy(
        self,
        name: str,
        text: str,
        chunker: BaseChunker,
    ) -> ChunkingStrategyReport:
        """Chunk ``text`` with one strategy and summarise its cohesion scores."""
        chunks = chunker.chunk(text)
        if not chunks:
            return ChunkingStrategyReport(
                strategy=name,
                chunk_count=0,
                avg_cohesion=0.0,
                min_cohesion=0.0,
                max_cohesion=0.0,
                avg_chunk_length=0.0,
            )

        doc_vec = self._embeddings.embed_query(text)
        chunk_vecs = self._embeddings.embed_documents(chunks)
        # The report fields are constrained to [0.0, 1.0], but cosine
        # similarity can be negative (dissimilar vectors) or overshoot 1.0 by a
        # rounding error. Clamp so a valid run never fails model validation.
        cohesions = [
            min(1.0, max(0.0, cosine_similarity(cv, doc_vec))) for cv in chunk_vecs
        ]

        return ChunkingStrategyReport(
            strategy=name,
            chunk_count=len(chunks),
            avg_cohesion=sum(cohesions) / len(cohesions),
            min_cohesion=min(cohesions),
            max_cohesion=max(cohesions),
            avg_chunk_length=sum(len(c) for c in chunks) / len(chunks),
        )
@@ -0,0 +1,15 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
class ChunkingStrategyReport(BaseModel):
    # Cohesion summary for one chunking strategy. All fields are required;
    # cohesion values are validated to lie in [0.0, 1.0].
    strategy: str
    chunk_count: int = Field(ge=0)
    avg_cohesion: float = Field(ge=0.0, le=1.0)
    min_cohesion: float = Field(ge=0.0, le=1.0)
    max_cohesion: float = Field(ge=0.0, le=1.0)
    avg_chunk_length: float = Field(ge=0.0)
11
+
12
+
13
class ChunkingReport(BaseModel):
    # Per-strategy reports plus the name of the winning strategy.
    strategies: list[ChunkingStrategyReport]
    best_strategy: str
@@ -0,0 +1,96 @@
1
+ import re
2
+
3
+ from langchain_core.embeddings import Embeddings
4
+
5
+ from rag_audit.chunker.base import cosine_similarity
6
+
7
+ _RECURSIVE_SEPARATORS = ["\n\n", "\n", ". ", " ", ""]
8
+
9
+
10
class FixedSizeChunker:
    """Split text into fixed-width character windows with optional overlap."""

    def __init__(self, chunk_size: int = 500, overlap: int = 50) -> None:
        """Store the window width and the number of characters shared by neighbours."""
        self._chunk_size = chunk_size
        self._overlap = overlap

    def chunk(self, text: str) -> list[str]:
        """Return the non-blank windows of ``text``.

        Text no longer than ``chunk_size`` is returned as a single chunk (or
        nothing if it is blank). Otherwise windows advance by
        ``chunk_size - overlap`` characters, with a minimum step of 1 so an
        overlap at least as large as the window cannot stall the scan.
        """
        size = self._chunk_size
        if len(text) <= size:
            return [text] if text.strip() else []

        step = max(1, size - self._overlap)
        chunks: list[str] = []
        for start in range(0, len(text), step):
            window = text[start : start + size]
            if window.strip():
                chunks.append(window)
            # Once a window reaches the end of the text, any later window would
            # be a strict suffix of it: a redundant chunk, so stop here.
            if start + size >= len(text):
                break
        return chunks
24
+
25
+
26
class RecursiveChunker:
    """Split text on progressively finer separators until chunks fit.

    Separators are tried in order; pieces that are still too large are split
    again with the remaining separators, ending in plain character windows.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 0) -> None:
        """Store the maximum chunk length and the character-window overlap.

        ``overlap`` is only used by the final character-level fallback.
        """
        self._chunk_size = chunk_size
        self._overlap = overlap

    def chunk(self, text: str) -> list[str]:
        """Split ``text`` using the module's default separator list."""
        return self._split(text, _RECURSIVE_SEPARATORS)

    def _split(self, text: str, separators: list[str]) -> list[str]:
        """Split ``text`` with the first separator, recursing on oversized parts."""
        if len(text) <= self._chunk_size:
            return [text] if text.strip() else []

        sep, *rest = separators

        # An empty separator means "no structure left": cut by characters.
        if not sep:
            return self._split_by_chars(text)

        if sep not in text:
            return self._split(text, rest) if rest else [text]

        parts = [p for p in text.split(sep) if p.strip()]
        chunks: list[str] = []
        current = ""

        # Greedily pack consecutive parts into one chunk while they fit.
        for part in parts:
            joined = f"{current}{sep}{part}" if current else part
            if len(joined) <= self._chunk_size:
                current = joined
                continue
            if current:
                chunks.append(current)
            if len(part) > self._chunk_size:
                # A single part is too large: split it with finer separators.
                chunks.extend(self._split(part, rest if rest else [""]))
                current = ""
            else:
                current = part

        if current:
            chunks.append(current)

        return chunks

    def _split_by_chars(self, text: str) -> list[str]:
        """Cut ``text`` into non-blank character windows of ``chunk_size``."""
        size = self._chunk_size
        step = max(1, size - self._overlap)
        windows: list[str] = []
        for start in range(0, len(text), step):
            window = text[start : start + size]
            if window.strip():
                windows.append(window)
            # A window that reaches the end makes every later window a strict
            # suffix of it, so emitting more would only duplicate text.
            if start + size >= len(text):
                break
        return windows
72
+
73
+
74
class SemanticChunker:
    """Group consecutive sentences whose embeddings are similar enough."""

    def __init__(
        self, embeddings: Embeddings, similarity_threshold: float = 0.8
    ) -> None:
        """Keep the embedding model and the similarity needed to stay in a group."""
        self._embeddings = embeddings
        self._threshold = similarity_threshold

    def chunk(self, text: str) -> list[str]:
        """Split ``text`` into sentences and merge adjacent similar ones.

        A sentence joins the current group when its cosine similarity to the
        previous sentence is at least the threshold; otherwise it starts a
        new group. Text with at most one sentence is returned whole.
        """
        pieces = re.split(r"(?<=[.!?])\s+", text)
        sentences = [piece for piece in pieces if piece.strip()]
        if len(sentences) <= 1:
            return [text] if text.strip() else []

        vecs = self._embeddings.embed_documents(sentences)

        groups: list[list[str]] = [[sentences[0]]]
        for idx, sentence in enumerate(sentences[1:], start=1):
            similar = cosine_similarity(vecs[idx - 1], vecs[idx]) >= self._threshold
            if similar:
                groups[-1].append(sentence)
            else:
                groups.append([sentence])

        merged: list[str] = []
        for group in groups:
            if any(s.strip() for s in group):
                merged.append(" ".join(group))
        return merged
File without changes
rag_audit/cli/main.py ADDED
@@ -0,0 +1,68 @@
1
+ from pathlib import Path
2
+
3
+ import typer
4
+
5
+ from rag_audit import __version__
6
+ from rag_audit.core.config import load_config
7
+ from rag_audit.core.runner import AuditRunner
8
+ from rag_audit.report.models import AuditReport
9
+ from rag_audit.report.renderer import ReportRenderer
10
+
11
+ app = typer.Typer(
12
+ name="rag-audit",
13
+ help="Audit and benchmark your RAG pipelines.",
14
+ add_completion=False,
15
+ )
16
+
17
+
18
@app.command()
def version() -> None:
    # Print the installed rag-audit version. (A comment rather than a
    # docstring: Typer would show a docstring as the command's help text.)
    banner = f"rag-audit v{__version__}"
    typer.echo(banner)
21
+
22
+
23
@app.command()
def run(
    pipeline: str = typer.Argument(
        ..., help="Path to the pipeline config file (JSON)."
    ),
    output: str = typer.Option(
        "audit_result.json", "--output", "-o", help="Output file path."
    ),
) -> None:
    """Audit the pipeline described by a JSON config and save the report."""
    try:
        config = load_config(pipeline)
    except FileNotFoundError:
        typer.secho(f"Config file not found: {pipeline}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from None
    except Exception as exc:
        typer.secho(f"Invalid config: {exc}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from exc

    report = AuditRunner(config).run()
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding, which garbles or rejects non-ASCII text on some systems.
    Path(output).write_text(ReportRenderer().to_json(report), encoding="utf-8")
    typer.echo(f"Audit complete. Report saved to {output}")


@app.command()
def report(
    result: str = typer.Argument(..., help="Path to an audit result JSON file."),
    fmt: str = typer.Option(
        "markdown", "--format", "-f", help="Output format: markdown or json."
    ),
) -> None:
    """Render a saved audit result as Markdown or JSON."""
    # Reject unknown formats up front instead of silently rendering Markdown
    # for a typo such as "--format jsno".
    if fmt not in ("markdown", "json"):
        typer.secho(
            f"Unknown format: {fmt!r} (expected 'markdown' or 'json')",
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=1)

    try:
        # Read with the same explicit encoding that `run` writes with.
        audit_report = AuditReport.model_validate_json(
            Path(result).read_text(encoding="utf-8")
        )
    except FileNotFoundError:
        typer.secho(f"Result file not found: {result}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from None
    except Exception as exc:
        typer.secho(f"Invalid result file: {exc}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from exc

    renderer = ReportRenderer()
    output = (
        renderer.to_json(audit_report)
        if fmt == "json"
        else renderer.to_markdown(audit_report)
    )
    typer.echo(output)
@@ -0,0 +1 @@
1
+ """Core orchestration logic for rag-audit."""
@@ -0,0 +1,25 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Literal
5
+
6
+ from pydantic import BaseModel
7
+
8
+
9
class LLMConfig(BaseModel):
    # Which chat model is used for evaluation: the provider is limited to
    # "openai" or "anthropic"; the model name is passed through as given.
    provider: Literal["openai", "anthropic"] = "openai"
    model: str = "gpt-4o-mini"
12
+
13
+
14
class PipelineConfig(BaseModel):
    # One audited question/answer pair together with its retrieval data.
    pipeline_id: str
    question: str
    answer: str
    # Retrieved chunks and the ground-truth relevant chunks they are
    # compared against.
    contexts: list[str]
    relevant: list[str]
    # Cut-off used for the retrieval metrics.
    k: int = 5
    llm: LLMConfig = LLMConfig()
22
+
23
+
24
def load_config(path: str) -> PipelineConfig:
    """Load and validate a pipeline config from the JSON file at ``path``.

    The file is decoded as UTF-8 (the encoding JSON is specified in) rather
    than the platform's locale encoding, so non-ASCII questions, answers and
    contexts load identically on every system.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        pydantic.ValidationError: if the content is not a valid config.
    """
    return PipelineConfig.model_validate_json(Path(path).read_text(encoding="utf-8"))
@@ -0,0 +1,41 @@
1
+ from langchain_core.language_models import BaseChatModel
2
+
3
+ from rag_audit.core.config import LLMConfig, PipelineConfig
4
+ from rag_audit.metrics.hallucination import HallucinationDetector
5
+ from rag_audit.metrics.retrieval import evaluate_retrieval
6
+ from rag_audit.report.models import AuditReport
7
+
8
+
9
def _build_llm(cfg: LLMConfig) -> BaseChatModel:
    """Instantiate the chat model selected by ``cfg``.

    Provider packages are imported lazily so only the one in use is loaded.
    Any provider other than "openai" is built as an Anthropic model.
    """
    if cfg.provider != "openai":
        from langchain_anthropic import ChatAnthropic

        return ChatAnthropic(model_name=cfg.model)  # type: ignore[call-arg]

    from langchain_openai import ChatOpenAI

    return ChatOpenAI(model=cfg.model)
17
+
18
+
19
class AuditRunner:
    """Run the faithfulness and retrieval checks for one pipeline config."""

    def __init__(self, config: PipelineConfig) -> None:
        """Keep the config and build the hallucination detector for its LLM."""
        self._config = config
        self._detector = HallucinationDetector(llm=_build_llm(config.llm))

    def run(self) -> AuditReport:
        """Evaluate faithfulness, then retrieval, and assemble the report."""
        cfg = self._config
        faithfulness = self._detector.detect(answer=cfg.answer, contexts=cfg.contexts)
        retrieval = evaluate_retrieval(
            retrieved=cfg.contexts, relevant=cfg.relevant, k=cfg.k
        )
        return AuditReport(
            pipeline_id=cfg.pipeline_id,
            question=cfg.question,
            answer=cfg.answer,
            contexts=cfg.contexts,
            retrieval=retrieval,
            faithfulness=faithfulness,
        )
@@ -0,0 +1,11 @@
1
+ """Metrics for evaluating RAG pipeline quality."""
2
+
3
+ from rag_audit.metrics.hallucination import FaithfulnessResult, HallucinationDetector
4
+ from rag_audit.metrics.retrieval import RetrievalResult, evaluate_retrieval
5
+
6
+ __all__ = [
7
+ "FaithfulnessResult",
8
+ "HallucinationDetector",
9
+ "RetrievalResult",
10
+ "evaluate_retrieval",
11
+ ]
@@ -0,0 +1,60 @@
1
+ from langchain_core.language_models import BaseChatModel
2
+ from langchain_core.messages import HumanMessage
3
+ from pydantic import BaseModel, Field
4
+
5
+ _FAITHFULNESS_PROMPT = """\
6
+ You are a faithfulness evaluator. Determine whether the ANSWER is supported by the CONTEXT.
7
+
8
+ CONTEXT:
9
+ {contexts}
10
+
11
+ ANSWER:
12
+ {answer}
13
+
14
+ Evaluate whether every claim in the ANSWER is explicitly supported by the CONTEXT.
15
+ Return only a JSON object with exactly two fields:
16
+ - "score": float from 0.0 (completely unfaithful) to 1.0 (completely faithful)
17
+ - "reasoning": concise explanation of your evaluation\
18
+ """
19
+
20
+
21
class FaithfulnessResult(BaseModel):
    # Judge verdict: a score validated to [0.0, 1.0], the judge's
    # explanation, and whether the score met the detector's threshold.
    score: float = Field(ge=0.0, le=1.0)
    reasoning: str
    is_faithful: bool
25
+
26
+
27
class _JudgeOutput(BaseModel):
    # Shape of the JSON object the judge model is asked to return.
    score: float = Field(ge=0.0, le=1.0)
    reasoning: str
30
+
31
+
32
def _parse_judge_response(content: str, threshold: float) -> FaithfulnessResult:
    """Parse the judge's JSON reply into a ``FaithfulnessResult``.

    A reply wrapped in a Markdown code fence is unwrapped first. The result
    is faithful when the score is at least ``threshold``.

    Raises:
        pydantic.ValidationError: if the reply is not the expected JSON object.
    """
    stripped = content.strip()
    if stripped.startswith("```"):
        # Drop the opening fence line (it may carry a language tag).
        lines = stripped.splitlines()[1:]
        # Drop the last line only when it really is the closing fence; a
        # truncated reply without one must keep its final line of JSON.
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        stripped = "\n".join(lines)
    output = _JudgeOutput.model_validate_json(stripped)
    return FaithfulnessResult(
        score=output.score,
        reasoning=output.reasoning,
        is_faithful=output.score >= threshold,
    )
43
+
44
+
45
class HallucinationDetector:
    """LLM-as-judge check of whether an answer is supported by its contexts."""

    def __init__(self, llm: BaseChatModel, threshold: float = 0.5) -> None:
        """Keep the judge model and the minimum score counted as faithful."""
        self._llm = llm
        self.threshold = threshold

    def detect(self, answer: str, contexts: list[str]) -> FaithfulnessResult:
        """Ask the judge model to score ``answer`` against ``contexts``.

        With no contexts the model is not called and an unfaithful result
        with score 0.0 is returned.
        """
        if not contexts:
            return FaithfulnessResult(
                score=0.0,
                reasoning="No context provided to evaluate faithfulness against.",
                is_faithful=False,
            )

        # Number the contexts from 1 and separate them with blank lines.
        numbered = [f"[{pos}] {ctx}" for pos, ctx in enumerate(contexts, start=1)]
        prompt = _FAITHFULNESS_PROMPT.format(
            answer=answer, contexts="\n\n".join(numbered)
        )
        reply = self._llm.invoke([HumanMessage(content=prompt)])
        return _parse_judge_response(str(reply.content), self.threshold)
@@ -0,0 +1,42 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
class RetrievalResult(BaseModel):
    # Retrieval metrics, each validated to [0.0, 1.0], plus the positive
    # cut-off ``k`` they were computed at.
    precision_at_k: float = Field(ge=0.0, le=1.0)
    recall_at_k: float = Field(ge=0.0, le=1.0)
    mrr: float = Field(ge=0.0, le=1.0)
    k: int = Field(gt=0)
9
+
10
+
11
+ def _precision(retrieved: list[str], relevant: frozenset[str], k: int) -> float:
12
+ top_k = retrieved[:k]
13
+ return sum(1 for doc in top_k if doc in relevant) / k
14
+
15
+
16
+ def _recall(retrieved: list[str], relevant: frozenset[str], k: int) -> float:
17
+ if not relevant:
18
+ return 0.0
19
+ top_k = retrieved[:k]
20
+ return sum(1 for doc in top_k if doc in relevant) / len(relevant)
21
+
22
+
23
+ def _mrr(retrieved: list[str], relevant: frozenset[str]) -> float:
24
+ for rank, doc in enumerate(retrieved, start=1):
25
+ if doc in relevant:
26
+ return 1.0 / rank
27
+ return 0.0
28
+
29
+
30
def evaluate_retrieval(
    retrieved: list[str],
    relevant: list[str],
    k: int,
) -> RetrievalResult:
    """Compute precision@k, recall@k and MRR for a ranked retrieval.

    The cut-off is capped at the number of retrieved docs when there are any,
    so precision is not penalised for a list shorter than ``k``. MRR is
    computed over the whole ``retrieved`` list.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    # Fail early with a clear message; otherwise k == 0 surfaces as a
    # ZeroDivisionError and k < 0 as an opaque model validation error.
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    effective_k = min(k, len(retrieved)) if retrieved else k
    relevant_set = frozenset(relevant)
    return RetrievalResult(
        precision_at_k=_precision(retrieved, relevant_set, effective_k),
        recall_at_k=_recall(retrieved, relevant_set, effective_k),
        mrr=_mrr(retrieved, relevant_set),
        k=effective_k,
    )
@@ -0,0 +1,6 @@
1
+ """Audit report generation for RAG pipelines."""
2
+
3
+ from rag_audit.report.models import AuditReport
4
+ from rag_audit.report.renderer import ReportRenderer
5
+
6
+ __all__ = ["AuditReport", "ReportRenderer"]
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from rag_audit.metrics.hallucination import FaithfulnessResult
8
+ from rag_audit.metrics.retrieval import RetrievalResult
9
+
10
+
11
def _utc_now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    # Imported locally so this block does not rely on a change to the
    # module's import section.
    from datetime import timezone

    return datetime.now(timezone.utc)


class AuditReport(BaseModel):
    # Full result of one audit run: the audited question/answer/contexts plus
    # the retrieval and faithfulness evaluations.
    pipeline_id: str = Field(..., description="Identifier for the audited pipeline.")
    # Timezone-aware UTC timestamp. `datetime.utcnow` is deprecated since
    # Python 3.12 and returns a naive value that is easily mistaken for
    # local time.
    created_at: datetime = Field(default_factory=_utc_now)
    question: str
    answer: str
    contexts: list[str]
    retrieval: RetrievalResult
    faithfulness: FaithfulnessResult
@@ -0,0 +1,58 @@
1
+ from rag_audit.report.models import AuditReport
2
+
3
+ _MARKDOWN_TEMPLATE = """\
4
+ # Audit Report — {pipeline_id}
5
+
6
+ **Date:** {created_at}
7
+
8
+ ## Question
9
+
10
+ {question}
11
+
12
+ ## Answer
13
+
14
+ {answer}
15
+
16
+ ## Retrieval (k={k})
17
+
18
+ | Metric | Score |
19
+ |---|---|
20
+ | Precision@k | {precision:.2%} |
21
+ | Recall@k | {recall:.2%} |
22
+ | MRR | {mrr:.4f} |
23
+
24
+ ## Faithfulness
25
+
26
+ **Score:** {faith_score:.2%} — **Verdict:** {verdict}
27
+
28
+ **Reasoning:** {reasoning}
29
+
30
+ ## Contexts
31
+
32
+ {contexts}
33
+ """
34
+
35
+
36
class ReportRenderer:
    """Serialise an ``AuditReport`` as JSON or as a Markdown document."""

    def to_json(self, report: AuditReport) -> str:
        """Return the report as JSON indented by two spaces."""
        return report.model_dump_json(indent=2)

    def to_markdown(self, report: AuditReport) -> str:
        """Fill the module's Markdown template with the report's values."""
        retrieval = report.retrieval
        faithfulness = report.faithfulness

        # Contexts become a numbered list, one item per line, starting at 1.
        numbered = [f"{pos}. {ctx}" for pos, ctx in enumerate(report.contexts, start=1)]

        if faithfulness.is_faithful:
            verdict = "FAITHFUL"
        else:
            verdict = "HALLUCINATION"

        fields = {
            "pipeline_id": report.pipeline_id,
            "created_at": report.created_at.strftime("%Y-%m-%d %H:%M UTC"),
            "question": report.question,
            "answer": report.answer,
            "k": retrieval.k,
            "precision": retrieval.precision_at_k,
            "recall": retrieval.recall_at_k,
            "mrr": retrieval.mrr,
            "faith_score": faithfulness.score,
            "verdict": verdict,
            "reasoning": faithfulness.reasoning,
            "contexts": "\n".join(numbered),
        }
        return _MARKDOWN_TEMPLATE.format(**fields)
@@ -0,0 +1,225 @@
1
+ Metadata-Version: 2.4
2
+ Name: rag-audit
3
+ Version: 0.1.0
4
+ Summary: CLI + library to audit and benchmark RAG pipelines
5
+ Project-URL: Homepage, https://github.com/andrey-pontes/rag-audit
6
+ Project-URL: Documentation, https://andrey-pontes.github.io/rag-audit/
7
+ Project-URL: Repository, https://github.com/andrey-pontes/rag-audit
8
+ Project-URL: Bug Tracker, https://github.com/andrey-pontes/rag-audit/issues
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 Andrey Pontes
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: audit,evaluation,llm,rag,retrieval
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Programming Language :: Python :: 3.11
36
+ Classifier: Programming Language :: Python :: 3.12
37
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
38
+ Requires-Python: >=3.11
39
+ Requires-Dist: chromadb>=0.5
40
+ Requires-Dist: httpx>=0.27
41
+ Requires-Dist: langchain-anthropic>=0.1
42
+ Requires-Dist: langchain-openai>=0.1
43
+ Requires-Dist: langchain>=0.2
44
+ Requires-Dist: loguru>=0.7
45
+ Requires-Dist: pydantic>=2.0
46
+ Requires-Dist: rich>=13.0
47
+ Requires-Dist: typer>=0.12
48
+ Description-Content-Type: text/markdown
49
+
50
+ # rag-audit
51
+
52
+ CLI + library to audit and benchmark RAG pipelines. Detects hallucinations, measures retrieval quality, compares chunking strategies, and generates structured reports.
53
+
54
+ **[Documentation](https://andrey-pontes.github.io/rag-audit/)**
55
+
56
+ ## Installation
57
+
58
+ ```bash
59
+ pip install rag-audit
60
+ ```
61
+
62
+ Or with [uv](https://docs.astral.sh/uv/):
63
+
64
+ ```bash
65
+ uv add rag-audit
66
+ ```
67
+
68
+ ## Quickstart
69
+
70
+ **1. Create a pipeline config file (`pipeline.json`):**
71
+
72
+ ```json
73
+ {
74
+ "pipeline_id": "my-pipeline",
75
+ "question": "What is the capital of France?",
76
+ "answer": "Paris is the capital of France.",
77
+ "contexts": [
78
+ "Paris is the capital and largest city of France.",
79
+ "France is a country in Western Europe."
80
+ ],
81
+ "relevant": [
82
+ "Paris is the capital and largest city of France."
83
+ ],
84
+ "k": 2,
85
+ "llm": {
86
+ "provider": "openai",
87
+ "model": "gpt-4o-mini"
88
+ }
89
+ }
90
+ ```
91
+
92
+ **2. Run the audit:**
93
+
94
+ ```bash
95
+ export OPENAI_API_KEY=sk-...
96
+ rag-audit run pipeline.json -o result.json
97
+ ```
98
+
99
+ **3. Generate a report:**
100
+
101
+ ```bash
102
+ # Markdown (default)
103
+ rag-audit report result.json
104
+
105
+ # JSON
106
+ rag-audit report result.json --format json
107
+ ```
108
+
109
+ ## Config reference
110
+
111
+ | Field | Type | Description |
112
+ |---|---|---|
113
+ | `pipeline_id` | `string` | Identifier for the pipeline being audited |
114
+ | `question` | `string` | The question posed to the RAG pipeline |
115
+ | `answer` | `string` | The answer generated by the pipeline |
116
+ | `contexts` | `string[]` | Retrieved chunks, in rank order |
117
+ | `relevant` | `string[]` | Ground-truth relevant chunks (for retrieval metrics) |
118
+ | `k` | `int` | Number of top chunks to evaluate (default: `5`) |
119
+ | `llm.provider` | `"openai"` \| `"anthropic"` | LLM provider for the faithfulness judge |
120
+ | `llm.model` | `string` | Model name (e.g. `"gpt-4o-mini"`, `"claude-3-5-haiku-20241022"`) |
121
+
122
+ ## Metrics
123
+
124
+ ### Retrieval
125
+
126
+ | Metric | Description |
127
+ |---|---|
128
+ | **Precision@k** | Fraction of the top-k retrieved chunks that are relevant |
129
+ | **Recall@k** | Fraction of all relevant chunks that appear in the top-k |
130
+ | **MRR** | Mean Reciprocal Rank — 1/rank of the first relevant chunk, searched over all retrieved chunks (0 if none is relevant) |
131
+
132
+ ### Faithfulness
133
+
134
+ | Metric | Description |
135
+ |---|---|
136
+ | **Score** | 0.0–1.0 — how well the answer is grounded in the retrieved contexts |
137
+ | **Verdict** | `FAITHFUL` if score ≥ threshold (default `0.5`), otherwise `HALLUCINATION` |
138
+
139
+ ## Python API
140
+
141
+ ### Audit a pipeline
142
+
143
+ ```python
144
+ from rag_audit.core.config import PipelineConfig, LLMConfig
145
+ from rag_audit.core.runner import AuditRunner
146
+ from rag_audit.report.renderer import ReportRenderer
147
+
148
+ config = PipelineConfig(
149
+ pipeline_id="my-pipeline",
150
+ question="What is the capital of France?",
151
+ answer="Paris is the capital of France.",
152
+ contexts=["Paris is the capital and largest city of France."],
153
+ relevant=["Paris is the capital and largest city of France."],
154
+ k=1,
155
+ llm=LLMConfig(provider="openai", model="gpt-4o-mini"),
156
+ )
157
+
158
+ report = AuditRunner(config).run()
159
+ print(ReportRenderer().to_markdown(report))
160
+ ```
161
+
162
+ ### Compare chunking strategies
163
+
164
+ ```python
165
+ from langchain_openai import OpenAIEmbeddings
166
+ from rag_audit.chunker import ChunkingEvaluator, FixedSizeChunker, RecursiveChunker, SemanticChunker
167
+
168
+ embeddings = OpenAIEmbeddings()
169
+ evaluator = ChunkingEvaluator(embeddings)
170
+
171
+ report = evaluator.evaluate(
172
+ "Your long document text here...",
173
+ {
174
+ "fixed": FixedSizeChunker(chunk_size=500, overlap=50),
175
+ "recursive": RecursiveChunker(chunk_size=500),
176
+ "semantic": SemanticChunker(embeddings, similarity_threshold=0.8),
177
+ },
178
+ )
179
+
180
+ print(f"Best strategy: {report.best_strategy}")
181
+ for s in report.strategies:
182
+ print(f" {s.strategy}: avg_cohesion={s.avg_cohesion:.3f}, chunks={s.chunk_count}")
183
+ ```
184
+
185
+ ### Use a vectorstore adapter
186
+
187
+ ```python
188
+ from rag_audit.adapters import ChromaDBAdapter
189
+
190
+ adapter = ChromaDBAdapter("my-collection")
191
+ adapter.add(ids=["doc1"], texts=["Paris is in France."], embeddings=[[...]])
192
+ results = adapter.query(embedding=[...], k=1)
193
+ ```
194
+
195
+ ## Roadmap
196
+
197
+ - [x] CLI (`rag-audit run`, `rag-audit report`)
198
+ - [x] Hallucination detection (LLM-as-judge)
199
+ - [x] Retrieval metrics (Precision@k, Recall@k, MRR)
200
+ - [x] Structured audit reports (JSON + Markdown)
201
+ - [x] Chunking strategy benchmark (fixed-size vs recursive vs semantic)
202
+ - [x] Vectorstore adapters (ChromaDB — Pinecone and Qdrant coming soon)
203
+ - [x] Documentation (GitHub Pages)
204
+ - [ ] PyPI release
205
+
206
+ ## Development
207
+
208
+ ```bash
209
+ # Install dependencies
210
+ uv sync --group dev
211
+
212
+ # Run tests
213
+ uv run pytest
214
+
215
+ # Lint + format
216
+ uv run ruff check src/ tests/
217
+ uv run ruff format src/ tests/
218
+
219
+ # Type check
220
+ uv run mypy src/rag_audit
221
+
222
+ # Build docs locally
223
+ uv sync --group docs
224
+ uv run mkdocs serve
225
+ ```
@@ -0,0 +1,26 @@
1
+ rag_audit/__init__.py,sha256=F4I8YR81Fo281iRKpkEGgMIWpfTPQ6LIl19bo62Vzlc,95
2
+ rag_audit/adapters/__init__.py,sha256=UBCsVaBtrarw-wbAxZvtWjOGVF-nHnlstOF2HKXK_5U,328
3
+ rag_audit/adapters/base.py,sha256=RwjabhQNKcMaG9Z3e2MsrLGcE4fc55HukVfDq69UM2M,366
4
+ rag_audit/adapters/chroma.py,sha256=Azi0tn9jpQSjWWwwlXW_ov5YXIVU91TqewpSEi4FfdQ,1367
5
+ rag_audit/adapters/stubs.py,sha256=OEoGKJkY2_GNWIiJKUxfSgnDUsFgmmfKXvvPwT5RRYg,816
6
+ rag_audit/chunker/__init__.py,sha256=wATANNgiuL1YB8_jZFFtou1mRCnPg3-rdC2KkyhNk2s,549
7
+ rag_audit/chunker/base.py,sha256=bw2x1eWqKlZUxy7rWkBfyT0FPZUN6noHeNbwzvkoh6g,460
8
+ rag_audit/chunker/evaluator.py,sha256=BudrgEyIuROxaWymRpQx0OtGOUZWaqmy_S0dhDNS0e0,1704
9
+ rag_audit/chunker/models.py,sha256=g5eAzesvtj5y-_4tsbeUFanqQAkdv7CSFMLQjuJIZCQ,450
10
+ rag_audit/chunker/strategies.py,sha256=XfEhsFtH0zqmgmnV3DTpfq1PVL7wtyR0ooaL_9uyw0A,3065
11
+ rag_audit/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ rag_audit/cli/main.py,sha256=1n4HQlb7MCMJDY4QBIvs3Icyi6CN2ySCqNx5KEbtDls,2073
13
+ rag_audit/core/__init__.py,sha256=uWm8nOHY6dJlDjrAyZk-1_7QrAgCs3QR6QeGPQLPdK0,46
14
+ rag_audit/core/config.py,sha256=sHmrnrm6BmVivZ7pri3ng7jOivueWECWvwdj3oUG5B0,541
15
+ rag_audit/core/runner.py,sha256=uvDasgevFWEdxj_u3JMbBX2sZZQIfkB1a2cM13t-GpQ,1405
16
+ rag_audit/metrics/__init__.py,sha256=KksI0lNVykFhJNILwEvYa3UJYL-tWGRUyb5X-o2LIeM,333
17
+ rag_audit/metrics/hallucination.py,sha256=si_AaHSTavXYIegq3IfDacYV9v2NtWi43yuPmoorlko,2016
18
+ rag_audit/metrics/retrieval.py,sha256=zRyoWHaRfB38lJ_VrwAJuh3ydPh93arwMKLF-0HCqUk,1283
19
+ rag_audit/report/__init__.py,sha256=bDMU_bJazD_h8i0uj-R6Z6UDB0PC09JZ1kMdEcFmvNI,196
20
+ rag_audit/report/models.py,sha256=KM100LyLrvSFFpMYgnadwjzlBH9HiI6tsIoN-lVQEMk,535
21
+ rag_audit/report/renderer.py,sha256=76Fj0LLqxeHV5i776-tBSj1ckaPL8ROTZ-J-e9nTB20,1411
22
+ rag_audit-0.1.0.dist-info/METADATA,sha256=w8ji8AWBSVEc-7DhrYDmDZad-M1ir84KIBDIIyCM5A0,6957
23
+ rag_audit-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
24
+ rag_audit-0.1.0.dist-info/entry_points.txt,sha256=aJN6kN55_mRkGxKS_l_BrWen0xvhlvVJvo8AlNVJmhU,53
25
+ rag_audit-0.1.0.dist-info/licenses/LICENSE,sha256=1Aj1te3ZIg2_1l0Oa4KZYVIuqkeF8tyFV9BWcDe-JO8,1070
26
+ rag_audit-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ rag-audit = rag_audit.cli.main:app
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Andrey Pontes
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.