agentic-qa 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentic_qa/__init__.py +103 -0
- agentic_qa/agents/__init__.py +15 -0
- agentic_qa/agents/discovery.py +88 -0
- agentic_qa/agents/executor.py +89 -0
- agentic_qa/agents/judge.py +231 -0
- agentic_qa/agents/red_team.py +162 -0
- agentic_qa/agents/refiner.py +109 -0
- agentic_qa/agents/reporter.py +68 -0
- agentic_qa/graph/__init__.py +1 -0
- agentic_qa/graph/conditions.py +51 -0
- agentic_qa/graph/state.py +75 -0
- agentic_qa/graph/workflow.py +154 -0
- agentic_qa/schemas/__init__.py +1 -0
- agentic_qa/schemas/test_case.py +53 -0
- agentic_qa/schemas/verdict.py +57 -0
- agentic_qa/sut/__init__.py +39 -0
- agentic_qa/sut/api_adapter.py +119 -0
- agentic_qa/sut/base.py +55 -0
- agentic_qa/sut/callable_adapter.py +80 -0
- agentic_qa/sut/financial_rag.py +185 -0
- agentic_qa/utils/__init__.py +1 -0
- agentic_qa/utils/prompt_templates.py +227 -0
- agentic_qa-0.1.0.dist-info/METADATA +180 -0
- agentic_qa-0.1.0.dist-info/RECORD +26 -0
- agentic_qa-0.1.0.dist-info/WHEEL +5 -0
- agentic_qa-0.1.0.dist-info/top_level.txt +1 -0
agentic_qa/__init__.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agentic QA - Main Public API
|
|
3
|
+
|
|
4
|
+
This file exposes the simple `run_autonomous_test` function, allowing developers
|
|
5
|
+
to test their RAG systems in just a few lines of code.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from typing import Callable, Optional
|
|
10
|
+
|
|
11
|
+
# Expose SUT adapters for developers if they want advanced setups
|
|
12
|
+
from agentic_qa.sut.base import BaseSUTAdapter
|
|
13
|
+
from agentic_qa.sut.api_adapter import APIAdapter
|
|
14
|
+
from agentic_qa.sut.callable_adapter import CallableAdapter
|
|
15
|
+
from agentic_qa.sut import set_active_sut
|
|
16
|
+
|
|
17
|
+
# Import the core workflow
|
|
18
|
+
from agentic_qa.graph.workflow import build_qa_graph, get_initial_state
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def run_autonomous_test(
    target_function: Optional[Callable] = None,
    api_endpoint: Optional[str] = None,
    system_name: str = "Target System",
    system_description: str = "A generic RAG system",
    domain: str = "general",
    max_iterations: int = 3,
    tests_per_iteration: int = 5,
    model_name: str = "gpt-4o-mini",
) -> dict:
    """
    Run an autonomous multi-agent QA test against a target system.

    You must provide EITHER a `target_function` (a python function) OR
    an `api_endpoint` (a URL string). When both are supplied,
    `target_function` takes precedence.

    Args:
        target_function: A python function that takes a string query and returns a string answer.
        api_endpoint: A URL endpoint (e.g., http://localhost:8000/chat) to test.
        system_name: The name of the system being tested.
        system_description: A description of what the system does. Highly important for agents!
        domain: The domain of the system (e.g., 'financial', 'healthcare', 'customer support').
        max_iterations: How many times the agents should refine and retry their tests.
        tests_per_iteration: How many tests the Red-Team agent generates per round.
        model_name: The LLM to use for the agents (default: gpt-4o-mini).

    Returns:
        A dictionary containing the final execution state, including the test suite, verdicts,
        failure patterns, and the final Markdown report.

    Raises:
        ValueError: If neither `target_function` nor `api_endpoint` is provided.
        RuntimeError: If the graph stream ends without yielding any state.
    """
    # Validate before touching any process-wide state (environment variables,
    # the globally registered SUT) so an invalid call leaves no side effects.
    if target_function is None and not api_endpoint:
        raise ValueError("You must provide either a `target_function` or an `api_endpoint`.")

    # 1. Publish the run settings through environment variables; the agent
    #    modules read MODEL_NAME and TESTS_PER_ITERATION via os.getenv.
    os.environ["MAX_ITERATIONS"] = str(max_iterations)
    os.environ["TESTS_PER_ITERATION"] = str(tests_per_iteration)
    os.environ["MODEL_NAME"] = model_name

    # 2. Setup the SUT Adapter (a callable wins over an endpoint).
    if target_function is not None:
        adapter = CallableAdapter(
            fn=target_function,
            description=system_description,
            system_name=system_name,
            domain=domain,
        )
    else:
        adapter = APIAdapter(
            endpoint=api_endpoint,
            description=system_description,
            system_name=system_name,
            domain=domain,
        )

    # Register the adapter globally for the Executor Agent to use
    set_active_sut(adapter)

    # 3. Build and Run the Graph
    print(f"š Starting Autonomous QA Test against: {system_name}")
    print(f"Domain: {domain} | Max Iterations: {max_iterations}")

    graph = build_qa_graph()
    initial_state = get_initial_state()
    initial_state["max_iterations"] = max_iterations

    # Stream the graph and keep only the most recent state snapshot.
    final_state = None
    for event in graph.stream(initial_state, stream_mode="values"):
        final_state = event

    if final_state is None:
        # Without this guard the summary below would fail with an opaque
        # AttributeError on None.
        raise RuntimeError("The QA graph finished without producing any state.")

    print("\n✅ Autonomous QA Test Complete!")
    print(f"Coverage Score: {final_state.get('coverage_score', 0):.1%}")
    print(f"Total Failure Patterns Found: {len(final_state.get('failure_patterns', []))}")

    return final_state
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Define what is exported when someone runs `from agentic_qa import *`
|
|
98
|
+
# Public API of the package root: the one-call test runner plus the SUT
# adapter classes imported at the top of this module.
__all__ = [
    "run_autonomous_test",
    "APIAdapter",
    "CallableAdapter",
    "BaseSUTAdapter"
]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Multi-Agent Autonomous QA System - Agents Package"""
|
|
2
|
+
|
|
3
|
+
from agentic_qa.agents.red_team import red_team_node
|
|
4
|
+
from agentic_qa.agents.executor import executor_node
|
|
5
|
+
from agentic_qa.agents.judge import judge_node
|
|
6
|
+
from agentic_qa.agents.refiner import refiner_node
|
|
7
|
+
from agentic_qa.agents.reporter import reporter_node
|
|
8
|
+
|
|
9
|
+
# LangGraph node callables re-exported by the agents package.
# NOTE(review): agents/discovery.py defines `discovery_node`, which is not
# re-exported here — confirm whether that omission is intentional.
__all__ = [
    "red_team_node",
    "executor_node",
    "judge_node",
    "refiner_node",
    "reporter_node",
]
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Discovery Agent (Graphify) — Architecture Mapping.
|
|
3
|
+
|
|
4
|
+
This agent performs White-Box introspection. It analyzes the System Under Test (SUT)
|
|
5
|
+
and maps out its internal architecture (e.g., Vector DB, Chunk Size, Retriever type, LLM).
|
|
6
|
+
This architectural "graph" allows the Red-Team agent to launch hyper-targeted attacks.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from langchain_openai import ChatOpenAI
|
|
11
|
+
from langchain_core.messages import SystemMessage, HumanMessage
|
|
12
|
+
from agentic_qa.graph.state import QAState
|
|
13
|
+
|
|
14
|
+
# System message for the Discovery Agent: instructs the model to infer a
# pipeline-style architecture and call out likely weak points per node.
DISCOVERY_SYSTEM_PROMPT = """You are an elite AI Architecture Mapper (Graphify Agent).

Your mission is to analyze a generic description of an AI/RAG system and deduce its likely internal architecture graph.
You must break down the system into a logical pipeline (e.g., Data Ingestion -> Chunking -> VectorDB -> Retriever -> LLM -> Output).

Think about:
1. What components must exist for this system to work?
2. Where are the likely weak points or bottlenecks between these nodes?
3. What are the assumed configurations (e.g., chunk size, top-k retrieval)?

Output a detailed, graph-like text representation of the architecture. Be specific about potential vulnerabilities at each node."""

# User message template; filled with str.format using the placeholders
# {name}, {domain} and {description}.
DISCOVERY_USER_PROMPT = """Analyze the following System Under Test and map its architecture.

**System Name:** {name}
**Domain:** {domain}
**Description:**
{description}

Provide a detailed architectural breakdown (Graphify) of this system. Highlight the specific nodes (e.g., Retriever, VectorStore, LLM) and the data flow.
Keep it concise but technical."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_llm() -> ChatOpenAI:
    """Build the chat model for the Discovery Agent from environment settings."""
    chosen_model = os.getenv("MODEL_NAME", "gpt-4o-mini")
    api_key = os.getenv("OPENAI_API_KEY")
    api_base = os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1")
    return ChatOpenAI(
        model=chosen_model,
        temperature=0.2,
        openai_api_key=api_key,
        openai_api_base=api_base,
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def discovery_node(state: QAState) -> dict:
    """
    LangGraph node: Discovery Agent.

    Asks the LLM to deduce an architecture description for the SUT from its
    textual description, unless the state already carries one.

    Args:
        state: Current QAState; reads "sut_architecture", "sut_description"
            and "domain".

    Returns:
        An empty update when an architecture is already present, otherwise
        {"sut_architecture": <LLM response text>}.
    """
    banner = f"{'='*60}"
    print(f"\n{banner}")
    print(f"š DISCOVERY AGENT (Graphify) ā Mapping Architecture")
    print(banner)

    # Guard clause: an architecture supplied up front makes the LLM call unnecessary.
    if state.get("sut_architecture"):
        print(" Architecture already provided. Skipping LLM discovery.")
        return {}

    sut_description = state.get("sut_description", "")
    sut_domain = state.get("domain", "")

    print(" Analyzing SUT description to deduce internal graph...")

    model = _get_llm()
    user_prompt = DISCOVERY_USER_PROMPT.format(
        name="Target System",
        domain=sut_domain,
        description=sut_description,
    )
    conversation = [
        SystemMessage(content=DISCOVERY_SYSTEM_PROMPT),
        HumanMessage(content=user_prompt),
    ]
    architecture = model.invoke(conversation).content

    print("\n šŗļø Deduced Architecture Graph:")
    # Only the first ten lines are echoed to the console as a preview.
    preview_lines = []
    for line in architecture.split("\n")[:10]:
        preview_lines.append(f" {line}")
    preview = "\n".join(preview_lines)
    print(f"{preview}\n ...")

    return {"sut_architecture": architecture}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Executor Agent — Generic System Under Test Runner.
|
|
3
|
+
|
|
4
|
+
Executes test cases against ANY connected SUT (RAG, API, function).
|
|
5
|
+
Uses the active SUT adapter from the registry — works with:
|
|
6
|
+
- Built-in Financial RAG demo
|
|
7
|
+
- Any RAG connected via API endpoint
|
|
8
|
+
- Any Python function wrapped as a callable
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import time
|
|
12
|
+
from agentic_qa.graph.state import QAState
|
|
13
|
+
from agentic_qa.sut import get_active_sut
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def executor_node(state: QAState) -> dict:
    """
    LangGraph node: Executor Agent.

    Runs each test case from the current iteration through whatever
    SUT is currently active and collects results.

    Args:
        state: Current QAState; reads "current_iteration" and "test_suite".

    Returns:
        State update {"execution_results": [...]}, one entry per executed test
        with keys "test_id", "sut_output", "execution_time" and "error".
    """
    # Local import: this module only imports `time` at file level.
    import os

    iteration = state.get("current_iteration", 1)
    test_suite = state.get("test_suite", [])

    print(f"\n{'='*60}")
    print(f"ā” EXECUTOR AGENT ā Iteration {iteration}")
    print(f"{'='*60}")

    # Get test cases for the current iteration only
    iter_prefix = f"TC-{iteration:02d}"
    current_tests = [tc for tc in test_suite if tc["id"].startswith(iter_prefix)]

    if not current_tests:
        # No id carries the iteration prefix: fall back to the most recent
        # batch, sized like the Red-Team agent sizes it (TESTS_PER_ITERATION).
        num_per_iter = max(int(os.getenv("TESTS_PER_ITERATION", "5")), 1)
        current_tests = test_suite[-num_per_iter:]

    # Get the active SUT (whatever the user connected)
    sut = get_active_sut()
    print(f" SUT: {sut.name}")
    print(f" Executing {len(current_tests)} test cases...")

    execution_results = []

    for tc in current_tests:
        test_id = tc["id"]
        input_data = tc["input_data"]

        print(f" ā¶ Running [{test_id}]...", end=" ")

        start_time = time.time()
        try:
            output = sut.process(input_data)
            exec_time = time.time() - start_time

            if isinstance(output, dict):
                # Normalize output — adapters return an "output" key
                sut_output = output.get("output", output.get("sut_output", str(output)))
                status = output.get("status", "unknown")
                error = output.get("error") if status == "error" else None
            else:
                # A bare return value is treated as the answer itself rather
                # than being reported as a crash.
                sut_output, status, error = output, "unknown", None

            result = {
                "test_id": test_id,
                # Store the normalized answer, not the repr of the whole payload.
                "sut_output": str(sut_output),
                "execution_time": round(exec_time, 4),
                "error": error,
            }
            print(f"Done ({exec_time:.3f}s) ā Status: {status}")

        except Exception as e:
            # Boundary around arbitrary user code: a crashing SUT is a test
            # result, not a reason to abort the run.
            exec_time = time.time() - start_time
            result = {
                "test_id": test_id,
                "sut_output": "",
                "execution_time": round(exec_time, 4),
                "error": str(e),
            }
            print(f"ERROR ({exec_time:.3f}s) ā {e}")

        execution_results.append(result)

    errors = sum(1 for r in execution_results if r["error"])
    avg_time = sum(r["execution_time"] for r in execution_results) / max(len(execution_results), 1)
    print(f"\n š Execution Summary:")
    print(f" Tests executed: {len(execution_results)}")
    print(f" Errors/crashes: {errors}")
    print(f" Avg exec time: {avg_time:.3f}s")

    return {
        "execution_results": execution_results,
    }
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Judge Agent — LLM-as-Judge Evaluator.
|
|
3
|
+
|
|
4
|
+
This agent evaluates each test case's expected behavior against the SUT's
|
|
5
|
+
actual output, delivering precise pass/fail verdicts with detailed reasoning.
|
|
6
|
+
|
|
7
|
+
Uses the LLM-as-Judge pattern: the LLM acts as an impartial evaluator
|
|
8
|
+
that understands financial domain expectations.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
from langchain_openai import ChatOpenAI
|
|
14
|
+
from langchain_core.messages import SystemMessage, HumanMessage
|
|
15
|
+
from agentic_qa.graph.state import QAState
|
|
16
|
+
from agentic_qa.utils.prompt_templates import JUDGE_SYSTEM_PROMPT, JUDGE_EVALUATION_PROMPT
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _get_llm() -> ChatOpenAI:
    """Create the Judge Agent's chat model from environment settings."""
    settings = {
        "model": os.getenv("MODEL_NAME", "gpt-4o-mini"),
        # Kept low so repeated judgments of the same output stay consistent.
        "temperature": 0.1,
        "openai_api_key": os.getenv("OPENAI_API_KEY"),
        "openai_api_base": os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"),
    }
    return ChatOpenAI(**settings)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _build_test_results_json(state: QAState) -> tuple:
    """
    Combine test cases with execution results for judge evaluation.

    Args:
        state: Current QAState; reads "current_iteration", "test_suite" and
            "execution_results".

    Returns:
        A 2-tuple ``(combined, combined_json)``: the list of merged
        test-case/result dicts and the same list serialized with json.dumps.
    """
    iteration = state.get("current_iteration", 1)
    test_suite = state.get("test_suite", [])
    execution_results = state.get("execution_results", [])

    # Build lookup of execution results by test_id
    results_map = {r["test_id"]: r for r in execution_results}

    # Get current iteration tests
    iter_prefix = f"TC-{iteration:02d}"
    current_tests = [tc for tc in test_suite if tc["id"].startswith(iter_prefix)]
    if not current_tests:
        # No id carries the iteration prefix: fall back to the most recent
        # batch, sized like the Red-Team agent sizes it (TESTS_PER_ITERATION).
        num_per_iter = max(int(os.getenv("TESTS_PER_ITERATION", "5")), 1)
        current_tests = test_suite[-num_per_iter:]

    # Combine test case + result for each
    combined = []
    for tc in current_tests:
        # Tests that were never executed still appear, flagged as "NO OUTPUT".
        result = results_map.get(tc["id"], {})
        combined.append({
            "test_id": tc["id"],
            "input_data": tc["input_data"],
            "expected_behavior": tc["expected_behavior"],
            "edge_case_type": tc["edge_case_type"],
            "difficulty": tc["difficulty"],
            "sut_output": result.get("sut_output", "NO OUTPUT"),
            "execution_error": result.get("error"),
            "execution_time": result.get("execution_time", 0),
        })

    return combined, json.dumps(combined, indent=2)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _parse_verdicts(response_text: str) -> tuple:
    """
    Parse Judge LLM response into structured verdicts.

    The response may be raw JSON or JSON wrapped in a Markdown code fence.
    Anything that cannot be interpreted as the expected object yields the
    fallback result instead of raising.

    Args:
        response_text: Text returned by the judge model.

    Returns:
        A 3-tuple ``(verdicts, pass_rate, summary)``. Each verdict is a dict
        with "test_id", "status", "reasoning", "severity", "failure_category"
        and "confidence". On a parse failure the result is
        ``([], 0.0, "Failed to parse judge evaluation")``.
    """
    try:
        text = response_text.strip()
        # Strip a Markdown code fence if the model added one.
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        data = json.loads(text)
        if not isinstance(data, dict):
            raise ValueError("top-level JSON value is not an object")

        raw_verdicts = data.get("verdicts") or []
        if not isinstance(raw_verdicts, list):
            raise ValueError("'verdicts' is not a list")

        raw_summary = data.get("summary")
        summary = "" if raw_summary is None else str(raw_summary)

        # The model occasionally quotes numbers; unusable values fall back to 0.0.
        try:
            pass_rate = float(data.get("pass_rate", 0.0))
        except (TypeError, ValueError):
            pass_rate = 0.0

        normalized = []
        for v in raw_verdicts:
            if not isinstance(v, dict):
                continue  # skip malformed entries instead of discarding the batch
            try:
                confidence = float(v.get("confidence", 0.5))
            except (TypeError, ValueError):
                confidence = 0.5
            normalized.append({
                "test_id": v.get("test_id") or "unknown",
                # Callers compare against lowercase "pass" / "fail" / "error".
                "status": str(v.get("status") or "error").strip().lower(),
                "reasoning": str(v.get("reasoning") or "No reasoning provided"),
                "severity": str(v.get("severity") or "medium"),
                "failure_category": v.get("failure_category"),
                "confidence": confidence,
            })

        return normalized, pass_rate, summary

    # json.JSONDecodeError is a ValueError subclass, so it is covered here too.
    except (ValueError, KeyError, IndexError, AttributeError, TypeError) as e:
        print(f" ā ļø Failed to parse Judge response: {e}")
        return [], 0.0, "Failed to parse judge evaluation"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _run_ragas_evaluation(combined_results: list) -> dict:
    """
    Score the combined results with the Ragas answer-relevancy metric.

    Best effort: any failure (including a missing optional dependency) is
    reported on the console and produces an empty mapping.

    Args:
        combined_results: Merged test-case/result dicts; each entry must
            provide "input_data", "sut_output" and "test_id".

    Returns:
        Mapping of test_id to {"ragas_answer_relevancy": float}, or {} when
        the evaluation could not run.
    """
    try:
        # Imported lazily so that a missing package lands in the except below.
        from datasets import Dataset
        from ragas import evaluate
        from ragas.metrics import answer_relevancy
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings
        import os

        # 1. Build the models explicitly instead of relying on Ragas defaults.
        judge_model = ChatOpenAI(
            model=os.getenv("MODEL_NAME", "gpt-4o-mini"),
            openai_api_key=os.getenv("OPENAI_API_KEY"),
        )
        embedder = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

        # Column-oriented payload for Ragas; every row gets the same
        # placeholder context string.
        columns = {
            "question": [str(item["input_data"]) for item in combined_results],
            "answer": [str(item["sut_output"]) for item in combined_results],
            "contexts": [["Black-box evaluation context"] for _ in combined_results],
        }
        dataset = Dataset.from_dict(columns)

        print(" š§® Running RAGAS mathematical evaluation (Answer Relevancy)...")
        # 2. Hand the explicit LLM and embeddings to evaluate().
        outcome = evaluate(
            dataset,
            metrics=[answer_relevancy],
            llm=judge_model,
            embeddings=embedder,
            raise_exceptions=False,
        )
        frame = outcome.to_pandas()

        # Frame rows are matched back to test ids through the row index.
        return {
            combined_results[position]["test_id"]: {
                "ragas_answer_relevancy": float(record.get("answer_relevancy", 0.0))
            }
            for position, record in frame.iterrows()
        }

    except Exception as e:
        print(f" ā ļø Ragas evaluation skipped/failed: {e}")
        return {}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def judge_node(state: QAState) -> dict:
    """
    LangGraph node: Judge Agent.

    Evaluates SUT outputs against expected behaviors using LLM-as-Judge and
    attaches Ragas answer-relevancy scores where available.

    Args:
        state: Current QAState

    Returns:
        State update with "judge_verdicts", "all_verdicts" and
        "iteration_pass_rates" (a one-element list holding this iteration's
        pass rate).
    """
    iteration = state.get("current_iteration", 1)

    print(f"\n{'='*60}")
    print(f"āļø JUDGE AGENT ā Iteration {iteration}")
    print(f"{'='*60}")
    print(f" Evaluating test results...")

    # Build the evaluation context
    combined_results, test_results_json = _build_test_results_json(state)

    # Run mathematical Ragas evaluation
    ragas_scores = _run_ragas_evaluation(combined_results)

    evaluation_prompt = JUDGE_EVALUATION_PROMPT.format(
        test_results_json=test_results_json
    )

    # Call the LLM — format system prompt with domain context
    llm = _get_llm()
    system_prompt = JUDGE_SYSTEM_PROMPT.format(
        sut_description=state.get("sut_description", "Unknown System"),
        domain=state.get("domain", "general"),
    )
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=evaluation_prompt),
    ]

    response = llm.invoke(messages)

    # Parse verdicts
    verdicts, pass_rate, summary = _parse_verdicts(response.content)

    # Inject Ragas scores into the verdicts
    for v in verdicts:
        tid = v["test_id"]
        if tid in ragas_scores:
            v["ragas_scores"] = ragas_scores[tid]

    # Print results
    pass_count = sum(1 for v in verdicts if v["status"] == "pass")
    fail_count = sum(1 for v in verdicts if v["status"] == "fail")
    error_count = sum(1 for v in verdicts if v["status"] == "error")

    # The pass rate comes from the model's JSON; format it defensively so a
    # non-numeric value cannot abort the node while printing.
    try:
        pass_rate_text = f"{float(pass_rate):.1%}"
    except (TypeError, ValueError):
        pass_rate_text = str(pass_rate)

    print(f"\n š Verdict Summary:")
    print(f" ✅ Passed: {pass_count}")
    print(f" ā Failed: {fail_count}")
    print(f" š„ Errors: {error_count}")
    print(f" š Pass Rate: {pass_rate_text}")
    print(f"\n Detailed Verdicts:")

    for v in verdicts:
        icon = "✅" if v["status"] == "pass" else "ā" if v["status"] == "fail" else "š„"
        score_str = ""
        if "ragas_scores" in v:
            rel = v['ragas_scores'].get('ragas_answer_relevancy', 0)
            score_str = f"| Relevancy: {rel:.2f}"

        # str() guards the width/slice formatting against null or non-string
        # fields in the model output.
        status_text = str(v['status']).upper()
        severity_text = str(v['severity'])
        reasoning_text = str(v['reasoning'])[:50]
        print(f" {icon} [{v['test_id']}] {status_text:7s} | {severity_text:8s} {score_str} | {reasoning_text}...")

    if summary:
        print(f"\n š¬ Summary: {str(summary)[:100]}...")

    return {
        "judge_verdicts": verdicts,
        "all_verdicts": verdicts,
        "iteration_pass_rates": [pass_rate],
    }
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Red-Team Agent — Adversarial Test Case Generator.
|
|
3
|
+
|
|
4
|
+
This agent autonomously generates diverse, creative adversarial test inputs
|
|
5
|
+
targeting edge cases and vulnerabilities in the financial document SUT.
|
|
6
|
+
|
|
7
|
+
On iteration 1, it generates broad-spectrum tests. On subsequent iterations,
|
|
8
|
+
it uses failure patterns from the Refiner Agent to generate increasingly
|
|
9
|
+
targeted and sophisticated test cases.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
from langchain_openai import ChatOpenAI
|
|
15
|
+
from langchain_core.messages import SystemMessage, HumanMessage
|
|
16
|
+
from agentic_qa.graph.state import QAState
|
|
17
|
+
from agentic_qa.utils.prompt_templates import (
|
|
18
|
+
RED_TEAM_SYSTEM_PROMPT,
|
|
19
|
+
RED_TEAM_GENERATION_PROMPT,
|
|
20
|
+
RED_TEAM_REFINEMENT_CONTEXT,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _get_llm() -> ChatOpenAI:
    """Construct the Red-Team Agent's chat model from environment settings."""
    # High temperature on purpose: adversarial inputs should be varied.
    creative_temperature = 0.9
    llm = ChatOpenAI(
        model=os.getenv("MODEL_NAME", "gpt-4o-mini"),
        temperature=creative_temperature,
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        openai_api_base=os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"),
    )
    return llm
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _build_failure_context(state: QAState) -> str:
    """
    Build context from previous failures to guide targeted generation.

    Args:
        state: Current QAState; reads "failure_patterns" and "all_verdicts".

    Returns:
        A fixed first-iteration instruction when no history exists, otherwise
        RED_TEAM_REFINEMENT_CONTEXT filled with the last ten failure patterns,
        the verdict totals and the five most frequent failure categories.
    """
    failure_patterns = state.get("failure_patterns", [])
    all_verdicts = state.get("all_verdicts", [])

    if not failure_patterns and not all_verdicts:
        return "This is the FIRST iteration. Generate a diverse initial test suite covering all edge case categories."

    # Verdicts with status "fail" or "error" both count as failures.
    failed = [v for v in all_verdicts if v.get("status") in ("fail", "error")]
    total_tests = len(all_verdicts)
    total_failures = len(failed)

    # Find top failure categories. `or` rather than a .get default: a verdict
    # may carry the key with a null value, which would otherwise be reported
    # as the category "None".
    categories = {}
    for v in failed:
        cat = v.get("failure_category") or "unknown"
        categories[cat] = categories.get(cat, 0) + 1

    top_categories = sorted(categories.items(), key=lambda x: x[1], reverse=True)[:5]
    top_str = ", ".join(f"{cat} ({count})" for cat, count in top_categories)

    return RED_TEAM_REFINEMENT_CONTEXT.format(
        failure_patterns="\n".join(f" - {p}" for p in failure_patterns[-10:]),
        total_tests=total_tests,
        total_failures=total_failures,
        top_failure_categories=top_str or "None identified yet",
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _parse_test_cases(response_text: str, iteration: int) -> list:
    """
    Parse LLM response into structured test cases.

    Accepts raw JSON or JSON inside a Markdown code fence. The payload may be
    an object with a "test_cases" list or a bare list of test cases. Entries
    that are not JSON objects are skipped, and missing or null fields receive
    defaults.

    Args:
        response_text: Text returned by the Red-Team model.
        iteration: Current iteration number, used to build default ids of the
            form ``TC-<iteration:02d><index:02d>``.

    Returns:
        A list of test-case dicts with "id", "input_data",
        "expected_behavior", "edge_case_type", "difficulty" and "rationale".
        If the response cannot be parsed, a single fallback test case.
    """
    def _field(entry: dict, key: str, default):
        # A key present with a JSON null is treated like a missing key.
        value = entry.get(key)
        return default if value is None else value

    try:
        # Handle cases where LLM wraps JSON in markdown code blocks
        text = response_text.strip()
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        data = json.loads(text)
        if isinstance(data, dict):
            test_cases = data.get("test_cases") or []
        elif isinstance(data, list):
            test_cases = data
        else:
            raise ValueError("JSON payload is neither an object nor a list")
        if not isinstance(test_cases, list):
            raise ValueError("'test_cases' is not a list")

        # Validate and normalize each test case
        normalized = []
        for i, tc in enumerate(test_cases):
            if not isinstance(tc, dict):
                continue
            normalized.append({
                "id": _field(tc, "id", f"TC-{iteration:02d}{i+1:02d}"),
                "input_data": _field(tc, "input_data", ""),
                "expected_behavior": _field(tc, "expected_behavior", ""),
                "edge_case_type": _field(tc, "edge_case_type", "adversarial"),
                "difficulty": _field(tc, "difficulty", "medium"),
                "rationale": _field(tc, "rationale", ""),
            })

        return normalized

    # json.JSONDecodeError is a ValueError subclass, so it is covered here too.
    except (ValueError, KeyError, IndexError, AttributeError, TypeError) as e:
        print(f" ā ļø Failed to parse Red-Team response: {e}")
        # Fallback: generate a single default test case
        return [{
            "id": f"TC-{iteration:02d}01",
            "input_data": "Test with empty input",
            "expected_behavior": "Should handle empty or minimal input gracefully",
            "edge_case_type": "missing_data",
            "difficulty": "medium",
            "rationale": "Fallback test case due to parsing error",
        }]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def red_team_node(state: QAState) -> dict:
    """
    LangGraph node: Red-Team Agent.

    Generates adversarial test cases for the current iteration.
    Uses failure patterns from previous iterations to create more targeted tests.

    Args:
        state: Current QAState

    Returns:
        State update with new test_suite entries and incremented iteration
    """
    iteration = state.get("current_iteration", 0) + 1
    num_tests = int(os.getenv("TESTS_PER_ITERATION", "5"))

    print(f"\n{'='*60}")
    print(f"š“ RED-TEAM AGENT ā Iteration {iteration}")
    print(f"{'='*60}")
    print(f" Generating {num_tests} adversarial test cases...")

    # Resolve the SUT description and domain once so the system prompt and
    # the generation prompt cannot describe different systems.
    sut_description = state.get("sut_description", "Unknown System")
    domain = state.get("domain", "general")

    # Build the prompt
    failure_context = _build_failure_context(state)

    generation_prompt = RED_TEAM_GENERATION_PROMPT.format(
        num_tests=num_tests,
        sut_description=sut_description,
        domain=domain,
        iteration=iteration,
        iteration_prefix=f"{iteration:02d}",
        failure_context=failure_context,
    )

    # Call the LLM — format system prompt with domain context
    llm = _get_llm()
    system_prompt = RED_TEAM_SYSTEM_PROMPT.format(
        sut_description=sut_description,
        domain=domain,
        sut_architecture=state.get("sut_architecture", "Architecture unknown. Treat as black-box."),
    )
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=generation_prompt),
    ]

    response = llm.invoke(messages)

    # Parse test cases from LLM response
    test_cases = _parse_test_cases(response.content, iteration)

    print(f" ✅ Generated {len(test_cases)} test cases:")
    for tc in test_cases:
        # str() keeps the preview from crashing when the model emits
        # structured (non-string) values for these fields.
        kind = str(tc['edge_case_type'])
        level = str(tc['difficulty'])
        preview = str(tc['input_data'])[:60]
        print(f" [{tc['id']}] {kind:20s} | {level:6s} | {preview}...")

    return {
        "test_suite": test_cases,
        "current_iteration": iteration,
    }
|