agentic-qa 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agentic_qa/__init__.py ADDED
@@ -0,0 +1,103 @@
1
+ """
2
+ Agentic QA - Main Public API
3
+
4
+ This file exposes the simple `run_autonomous_test` function, allowing developers
5
+ to test their RAG systems in just a few lines of code.
6
+ """
7
+
8
+ import os
9
+ from typing import Callable, Optional
10
+
11
+ # Expose SUT adapters for developers if they want advanced setups
12
+ from agentic_qa.sut.base import BaseSUTAdapter
13
+ from agentic_qa.sut.api_adapter import APIAdapter
14
+ from agentic_qa.sut.callable_adapter import CallableAdapter
15
+ from agentic_qa.sut import set_active_sut
16
+
17
+ # Import the core workflow
18
+ from agentic_qa.graph.workflow import build_qa_graph, get_initial_state
19
+
20
+
21
def run_autonomous_test(
    target_function: Optional[Callable] = None,
    api_endpoint: Optional[str] = None,
    system_name: str = "Target System",
    system_description: str = "A generic RAG system",
    domain: str = "general",
    max_iterations: int = 3,
    tests_per_iteration: int = 5,
    model_name: str = "gpt-4o-mini",
) -> dict:
    """
    Run an autonomous multi-agent QA test against a target system.

    You must provide EITHER a `target_function` (a python function) OR
    an `api_endpoint` (a URL string). If both are given, the callable
    `target_function` takes precedence.

    Args:
        target_function: A python callable that takes a string query and returns a string answer.
        api_endpoint: A URL endpoint (e.g., http://localhost:8000/chat) to test.
        system_name: The name of the system being tested.
        system_description: A description of what the system does. Highly important for agents!
        domain: The domain of the system (e.g., 'financial', 'healthcare', 'customer support').
        max_iterations: How many times the agents should refine and retry their tests.
        tests_per_iteration: How many tests the Red-Team agent generates per round.
        model_name: The LLM to use for the agents (default: gpt-4o-mini).

    Returns:
        The last state emitted by the workflow graph (test suite, verdicts,
        failure patterns, final report, ...).

    Raises:
        ValueError: If neither a callable `target_function` nor a non-empty
            `api_endpoint` is supplied.
        RuntimeError: If the workflow graph finishes without emitting any state.
    """
    # 1. Export the run configuration through environment variables
    #    (process-wide side effect that outlives this call).
    os.environ["MAX_ITERATIONS"] = str(max_iterations)
    os.environ["TESTS_PER_ITERATION"] = str(tests_per_iteration)
    os.environ["MODEL_NAME"] = model_name

    # 2. Setup the SUT Adapter.
    # `callable()` instead of truthiness: a callable object may legitimately be
    # falsy (e.g. it defines __len__), and a truthy non-callable must not be
    # wrapped as if it were a function.
    if callable(target_function):
        adapter = CallableAdapter(
            fn=target_function,
            description=system_description,
            system_name=system_name,
            domain=domain,
        )
    elif api_endpoint:
        adapter = APIAdapter(
            endpoint=api_endpoint,
            description=system_description,
            system_name=system_name,
            domain=domain,
        )
    else:
        raise ValueError("You must provide either a `target_function` or an `api_endpoint`.")

    # Register the adapter globally for the Executor Agent to use
    set_active_sut(adapter)

    # 3. Build and Run the Graph
    print(f"šŸš€ Starting Autonomous QA Test against: {system_name}")
    print(f"Domain: {domain} | Max Iterations: {max_iterations}")

    graph = build_qa_graph()
    initial_state = get_initial_state()
    initial_state["max_iterations"] = max_iterations

    # Stream the graph to provide real-time console feedback; the last
    # emitted value is the final state.
    final_state = None
    for event in graph.stream(initial_state, stream_mode="values"):
        final_state = event

    if final_state is None:
        # Without this guard an empty stream surfaces as an opaque
        # AttributeError on `None.get` below.
        raise RuntimeError("The QA graph finished without producing any state.")

    # `or` guards against keys that are present but set to None.
    coverage = final_state.get("coverage_score") or 0
    patterns = final_state.get("failure_patterns") or []

    print("\nāœ… Autonomous QA Test Complete!")
    print(f"Coverage Score: {coverage:.1%}")
    print(f"Total Failure Patterns Found: {len(patterns)}")

    return final_state
95
+
96
+
97
+ # Define what is exported when someone runs `from agentic_qa import *`
98
# Public names of the top-level package.
# NOTE(review): `set_active_sut` is imported by this module but is not listed
# here — confirm whether leaving it out of the star-export is intentional.
__all__ = [
    "run_autonomous_test",
    "APIAdapter",
    "CallableAdapter",
    "BaseSUTAdapter"
]
@@ -0,0 +1,15 @@
1
+ """Multi-Agent Autonomous QA System - Agents Package"""
2
+
3
+ from agentic_qa.agents.red_team import red_team_node
4
+ from agentic_qa.agents.executor import executor_node
5
+ from agentic_qa.agents.judge import judge_node
6
+ from agentic_qa.agents.refiner import refiner_node
7
+ from agentic_qa.agents.reporter import reporter_node
8
+
9
# Node callables re-exported by the agents package.
# NOTE(review): no `discovery_node` is imported or re-exported by this
# package module — confirm whether that omission is intentional.
__all__ = [
    "red_team_node",
    "executor_node",
    "judge_node",
    "refiner_node",
    "reporter_node",
]
@@ -0,0 +1,88 @@
1
+ """
2
+ Discovery Agent (Graphify) — Architecture Mapping.
3
+
4
+ This agent approximates white-box introspection: from the System Under Test (SUT) description it
5
+ asks an LLM to deduce the likely internal architecture (e.g., Vector DB, Chunk Size, Retriever type, LLM).
6
+ This architectural "graph" allows the Red-Team agent to launch hyper-targeted attacks.
7
+ """
8
+
9
+ import os
10
+ from langchain_openai import ChatOpenAI
11
+ from langchain_core.messages import SystemMessage, HumanMessage
12
+ from agentic_qa.graph.state import QAState
13
+
14
# System-role instructions for the discovery LLM call; sent unformatted as the
# SystemMessage content by `discovery_node`.
DISCOVERY_SYSTEM_PROMPT = """You are an elite AI Architecture Mapper (Graphify Agent).

Your mission is to analyze a generic description of an AI/RAG system and deduce its likely internal architecture graph.
You must break down the system into a logical pipeline (e.g., Data Ingestion -> Chunking -> VectorDB -> Retriever -> LLM -> Output).

Think about:
1. What components must exist for this system to work?
2. Where are the likely weak points or bottlenecks between these nodes?
3. What are the assumed configurations (e.g., chunk size, top-k retrieval)?

Output a detailed, graph-like text representation of the architecture. Be specific about potential vulnerabilities at each node."""

# User-role template; `discovery_node` fills it via str.format with the
# `name`, `domain` and `description` placeholders.
DISCOVERY_USER_PROMPT = """Analyze the following System Under Test and map its architecture.

**System Name:** {name}
**Domain:** {domain}
**Description:**
{description}

Provide a detailed architectural breakdown (Graphify) of this system. Highlight the specific nodes (e.g., Retriever, VectorStore, LLM) and the data flow.
Keep it concise but technical."""
35
+
36
+
37
def _get_llm() -> ChatOpenAI:
    """Create the chat model used by the Discovery Agent.

    Model name, API key and API base URL are read from the MODEL_NAME,
    OPENAI_API_KEY and OPENAI_API_BASE environment variables.
    """
    model_name = os.getenv("MODEL_NAME", "gpt-4o-mini")
    api_key = os.getenv("OPENAI_API_KEY")
    api_base = os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1")

    llm_settings = {
        "model": model_name,
        "temperature": 0.2,
        "openai_api_key": api_key,
        "openai_api_base": api_base,
    }
    return ChatOpenAI(**llm_settings)
44
+
45
+
46
def _active_sut_name(default: str = "Target System") -> str:
    """Return the registered SUT adapter's `name`, or *default* if unavailable."""
    try:
        # Imported lazily so this lookup stays strictly best-effort.
        from agentic_qa.sut import get_active_sut

        name = getattr(get_active_sut(), "name", None)
    except Exception:
        # NOTE(review): how the registry behaves when no SUT is registered is
        # not visible from this module; any failure falls back to the default.
        return default
    return str(name) if name else default


def discovery_node(state: QAState) -> dict:
    """
    LangGraph node: Discovery Agent.

    Runs once at the beginning of the pipeline to map the SUT architecture.

    Args:
        state: Current QAState; reads `sut_architecture`, `sut_description`
            and `domain`.

    Returns:
        `{}` when `sut_architecture` is already populated, otherwise
        `{"sut_architecture": <LLM-deduced architecture text>}`.
    """
    print(f"\n{'='*60}")
    print(f"šŸ” DISCOVERY AGENT (Graphify) — Mapping Architecture")
    print(f"{'='*60}")

    # If the architecture is already provided (e.g. by the adapter), skip LLM discovery
    if state.get("sut_architecture"):
        print(" Architecture already provided. Skipping LLM discovery.")
        return {}

    description = state.get("sut_description", "") or ""
    domain = state.get("domain", "") or ""

    print(" Analyzing SUT description to deduce internal graph...")

    llm = _get_llm()
    prompt = DISCOVERY_USER_PROMPT.format(
        # Use the connected system's real name rather than a fixed placeholder,
        # so the prompt matches what the user registered.
        name=_active_sut_name(),
        domain=domain,
        description=description,
    )

    messages = [
        SystemMessage(content=DISCOVERY_SYSTEM_PROMPT),
        HumanMessage(content=prompt),
    ]

    response = llm.invoke(messages)
    content = response.content
    # Message content is not guaranteed to be a plain string; coerce so the
    # line split below cannot fail.
    architecture = content if isinstance(content, str) else str(content)

    print("\n šŸ—ŗļø Deduced Architecture Graph:")
    # Print the first few lines as a preview; show the ellipsis marker only
    # when lines were actually cut off.
    lines = architecture.split("\n")
    preview = "\n".join([f" {line}" for line in lines[:10]])
    truncation_marker = "\n ..." if len(lines) > 10 else ""
    print(f"{preview}{truncation_marker}")

    return {
        "sut_architecture": architecture
    }
@@ -0,0 +1,89 @@
1
+ """
2
+ Executor Agent — Generic System Under Test Runner.
3
+
4
+ Executes test cases against ANY connected SUT (RAG, API, function).
5
+ Uses the active SUT adapter from the registry — works with:
6
+ - Built-in Financial RAG demo
7
+ - Any RAG connected via API endpoint
8
+ - Any Python function wrapped as a callable
9
+ """
10
+
11
+ import time
12
+ from agentic_qa.graph.state import QAState
13
+ from agentic_qa.sut import get_active_sut
14
+
15
+
16
def _fallback_batch_size(default: int = 5) -> int:
    """Return the configured tests-per-iteration count, or *default* if unset/invalid."""
    import os  # local import: this module only imports `time` at file level

    try:
        size = int(os.getenv("TESTS_PER_ITERATION", ""))
    except ValueError:
        return default
    return size if size > 0 else default


def _normalize_sut_output(output) -> tuple:
    """Reduce a raw adapter response to `(answer_text, status, error)`."""
    if not isinstance(output, dict):
        # A bare (non-dict) return value is treated as the answer itself
        # instead of failing on the missing `.get`.
        return ("" if output is None else str(output)), "unknown", None

    answer = output.get("output", output.get("sut_output", str(output)))
    status = output.get("status", "unknown")
    error = None
    if status == "error":
        # Never lose an error status just because no message was attached.
        error = output.get("error") or "SUT reported an error status"
    return ("" if answer is None else str(answer)), status, error


def executor_node(state: QAState) -> dict:
    """
    LangGraph node: Executor Agent.

    Runs each test case from the current iteration through whatever
    SUT is currently active and collects results.

    Args:
        state: Current QAState; reads `current_iteration` and `test_suite`.

    Returns:
        `{"execution_results": [...]}` with one entry per executed test,
        each holding `test_id`, `sut_output`, `execution_time` and `error`.
    """
    iteration = state.get("current_iteration", 1)
    test_suite = state.get("test_suite", [])

    print(f"\n{'='*60}")
    print(f"⚔ EXECUTOR AGENT — Iteration {iteration}")
    print(f"{'='*60}")

    # Get test cases for the current iteration only
    iter_prefix = f"TC-{iteration:02d}"
    current_tests = [tc for tc in test_suite if tc["id"].startswith(iter_prefix)]

    if not current_tests:
        # No id matches the iteration prefix: take the newest batch, sized by
        # the same TESTS_PER_ITERATION setting the Red-Team agent generates with.
        current_tests = test_suite[-_fallback_batch_size():]

    # Get the active SUT (whatever the user connected)
    sut = get_active_sut()
    print(f" SUT: {sut.name}")
    print(f" Executing {len(current_tests)} test cases...")

    execution_results = []

    for tc in current_tests:
        test_id = tc["id"]
        input_data = tc["input_data"]

        print(f" ā–¶ Running [{test_id}]...", end=" ")

        # perf_counter is monotonic, so durations cannot go negative on clock adjustments.
        started = time.perf_counter()
        try:
            output = sut.process(input_data)
        except Exception as e:
            # A crash inside the SUT is itself a test outcome, not a pipeline failure.
            exec_time = time.perf_counter() - started
            result = {
                "test_id": test_id,
                "sut_output": "",
                "execution_time": round(exec_time, 4),
                "error": str(e),
            }
            print(f"ERROR ({exec_time:.3f}s) — {e}")
        else:
            exec_time = time.perf_counter() - started
            # Store the normalized answer, not the repr of the whole response
            # dict, so downstream evaluation sees the actual SUT answer.
            sut_output, status, error = _normalize_sut_output(output)
            result = {
                "test_id": test_id,
                "sut_output": sut_output,
                "execution_time": round(exec_time, 4),
                "error": error,
            }
            print(f"Done ({exec_time:.3f}s) — Status: {status}")

        execution_results.append(result)

    errors = sum(1 for r in execution_results if r["error"])
    avg_time = sum(r["execution_time"] for r in execution_results) / max(len(execution_results), 1)
    print(f"\n šŸ“Š Execution Summary:")
    print(f" Tests executed: {len(execution_results)}")
    print(f" Errors/crashes: {errors}")
    print(f" Avg exec time: {avg_time:.3f}s")

    return {
        "execution_results": execution_results,
    }
@@ -0,0 +1,231 @@
1
+ """
2
+ Judge Agent — LLM-as-Judge Evaluator.
3
+
4
+ This agent evaluates each test case's expected behavior against the SUT's
5
+ actual output, delivering precise pass/fail verdicts with detailed reasoning.
6
+
7
+ Uses the LLM-as-Judge pattern: the LLM acts as an impartial evaluator
8
+ that understands the expectations of the SUT's configured domain.
9
+ """
10
+
11
+ import json
12
+ import os
13
+ from langchain_openai import ChatOpenAI
14
+ from langchain_core.messages import SystemMessage, HumanMessage
15
+ from agentic_qa.graph.state import QAState
16
+ from agentic_qa.utils.prompt_templates import JUDGE_SYSTEM_PROMPT, JUDGE_EVALUATION_PROMPT
17
+
18
+
19
def _get_llm() -> ChatOpenAI:
    """Create the chat model the Judge Agent uses for its evaluations."""
    model_name = os.getenv("MODEL_NAME", "gpt-4o-mini")
    api_key = os.getenv("OPENAI_API_KEY")
    api_base = os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1")

    # A low temperature keeps the judgments consistent and precise.
    judge_temperature = 0.1

    return ChatOpenAI(
        model=model_name,
        temperature=judge_temperature,
        openai_api_key=api_key,
        openai_api_base=api_base,
    )
27
+
28
+
29
+ def _build_test_results_json(state: QAState) -> str:
30
+ """Combine test cases with execution results for judge evaluation."""
31
+ iteration = state.get("current_iteration", 1)
32
+ test_suite = state.get("test_suite", [])
33
+ execution_results = state.get("execution_results", [])
34
+
35
+ # Build lookup of execution results by test_id
36
+ results_map = {r["test_id"]: r for r in execution_results}
37
+
38
+ # Get current iteration tests
39
+ iter_prefix = f"TC-{iteration:02d}"
40
+ current_tests = [tc for tc in test_suite if tc["id"].startswith(iter_prefix)]
41
+ if not current_tests:
42
+ num_per_iter = 5
43
+ current_tests = test_suite[-num_per_iter:]
44
+
45
+ # Combine test case + result for each
46
+ combined = []
47
+ for tc in current_tests:
48
+ result = results_map.get(tc["id"], {})
49
+ combined.append({
50
+ "test_id": tc["id"],
51
+ "input_data": tc["input_data"],
52
+ "expected_behavior": tc["expected_behavior"],
53
+ "edge_case_type": tc["edge_case_type"],
54
+ "difficulty": tc["difficulty"],
55
+ "sut_output": result.get("sut_output", "NO OUTPUT"),
56
+ "execution_error": result.get("error"),
57
+ "execution_time": result.get("execution_time", 0),
58
+ })
59
+
60
+ return combined, json.dumps(combined, indent=2)
61
+
62
+
63
+ def _parse_verdicts(response_text: str) -> tuple:
64
+ """Parse Judge LLM response into structured verdicts."""
65
+ try:
66
+ text = response_text.strip()
67
+ if "```json" in text:
68
+ text = text.split("```json")[1].split("```")[0].strip()
69
+ elif "```" in text:
70
+ text = text.split("```")[1].split("```")[0].strip()
71
+
72
+ data = json.loads(text)
73
+ verdicts = data.get("verdicts", [])
74
+ pass_rate = data.get("pass_rate", 0.0)
75
+ summary = data.get("summary", "")
76
+
77
+ normalized = []
78
+ for v in verdicts:
79
+ normalized.append({
80
+ "test_id": v.get("test_id", "unknown"),
81
+ "status": v.get("status", "error"),
82
+ "reasoning": v.get("reasoning", "No reasoning provided"),
83
+ "severity": v.get("severity", "medium"),
84
+ "failure_category": v.get("failure_category"),
85
+ "confidence": float(v.get("confidence", 0.5)),
86
+ })
87
+
88
+ return normalized, pass_rate, summary
89
+
90
+ except (json.JSONDecodeError, KeyError, IndexError) as e:
91
+ print(f" āš ļø Failed to parse Judge response: {e}")
92
+ return [], 0.0, "Failed to parse judge evaluation"
93
+
94
+
95
+ def _run_ragas_evaluation(combined_results: list) -> dict:
96
+ """Run Ragas mathematical metrics on the results."""
97
+ try:
98
+ from datasets import Dataset
99
+ from ragas import evaluate
100
+ from ragas.metrics import answer_relevancy
101
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
102
+ import os
103
+
104
+ # 1. Explicitly initialize the models so Ragas doesn't try to guess
105
+ eval_llm = ChatOpenAI(
106
+ model=os.getenv("MODEL_NAME", "gpt-4o-mini"),
107
+ openai_api_key=os.getenv("OPENAI_API_KEY")
108
+ )
109
+ eval_embeddings = OpenAIEmbeddings(
110
+ openai_api_key=os.getenv("OPENAI_API_KEY")
111
+ )
112
+
113
+ # Format for Ragas
114
+ data = {
115
+ "question": [],
116
+ "answer": [],
117
+ "contexts": [],
118
+ }
119
+
120
+ for r in combined_results:
121
+ data["question"].append(str(r["input_data"]))
122
+ data["answer"].append(str(r["sut_output"]))
123
+ data["contexts"].append(["Black-box evaluation context"])
124
+
125
+ dataset = Dataset.from_dict(data)
126
+
127
+ print(" 🧮 Running RAGAS mathematical evaluation (Answer Relevancy)...")
128
+ # 2. Pass the LLM and Embeddings directly into evaluate
129
+ result = evaluate(
130
+ dataset,
131
+ metrics=[answer_relevancy],
132
+ llm=eval_llm,
133
+ embeddings=eval_embeddings,
134
+ raise_exceptions=False
135
+ )
136
+ scores_df = result.to_pandas()
137
+
138
+ ragas_scores = {}
139
+ for idx, row in scores_df.iterrows():
140
+ test_id = combined_results[idx]["test_id"]
141
+ ragas_scores[test_id] = {
142
+ "ragas_answer_relevancy": float(row.get("answer_relevancy", 0.0))
143
+ }
144
+
145
+ return ragas_scores
146
+
147
+ except Exception as e:
148
+ print(f" āš ļø Ragas evaluation skipped/failed: {e}")
149
+ return {}
150
+
151
+
152
def judge_node(state: QAState) -> dict:
    """
    LangGraph node: Judge Agent.

    Scores the current iteration: merges test cases with their execution
    results, collects Ragas relevancy scores, asks the judge LLM for
    verdicts, and prints a console summary.

    Args:
        state: Current QAState

    Returns:
        State update with `judge_verdicts`, `all_verdicts` and
        `iteration_pass_rates`
    """
    round_number = state.get("current_iteration", 1)
    divider = "=" * 60

    print(f"\n{divider}")
    print(f"āš–ļø JUDGE AGENT — Iteration {round_number}")
    print(divider)
    print(" Evaluating test results...")

    # Evaluation context: merged rows plus their JSON rendering
    combined_results, test_results_json = _build_test_results_json(state)

    # Mathematical (Ragas) scores, keyed by test id
    ragas_scores = _run_ragas_evaluation(combined_results)

    evaluation_prompt = JUDGE_EVALUATION_PROMPT.format(
        test_results_json=test_results_json
    )

    # The system prompt carries the SUT description and domain
    llm = _get_llm()
    system_prompt = JUDGE_SYSTEM_PROMPT.format(
        sut_description=state.get("sut_description", "Unknown System"),
        domain=state.get("domain", "general"),
    )
    response = llm.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=evaluation_prompt),
    ])

    verdicts, pass_rate, summary = _parse_verdicts(response.content)

    # Attach the Ragas scores to the matching verdicts (in place)
    for verdict in verdicts:
        verdict_id = verdict["test_id"]
        if verdict_id in ragas_scores:
            verdict["ragas_scores"] = ragas_scores[verdict_id]

    # Tally outcomes in a single pass
    pass_count = fail_count = error_count = 0
    for verdict in verdicts:
        outcome = verdict["status"]
        if outcome == "pass":
            pass_count += 1
        elif outcome == "fail":
            fail_count += 1
        elif outcome == "error":
            error_count += 1

    print("\n šŸ“‹ Verdict Summary:")
    print(f" āœ… Passed: {pass_count}")
    print(f" āŒ Failed: {fail_count}")
    print(f" šŸ’„ Errors: {error_count}")
    print(f" šŸ“ˆ Pass Rate: {pass_rate:.1%}")
    print("\n Detailed Verdicts:")

    for verdict in verdicts:
        outcome = verdict["status"]
        if outcome == "pass":
            icon = "āœ…"
        elif outcome == "fail":
            icon = "āŒ"
        else:
            icon = "šŸ’„"

        relevancy_note = ""
        if "ragas_scores" in verdict:
            relevancy = verdict["ragas_scores"].get("ragas_answer_relevancy", 0)
            relevancy_note = f"| Relevancy: {relevancy:.2f}"

        print(f" {icon} [{verdict['test_id']}] {outcome.upper():7s} | {verdict['severity']:8s} {relevancy_note} | {verdict['reasoning'][:50]}...")

    if summary:
        print(f"\n šŸ’¬ Summary: {summary[:100]}...")

    state_update = {
        "judge_verdicts": verdicts,
        "all_verdicts": verdicts,
        "iteration_pass_rates": [pass_rate],
    }
    return state_update
@@ -0,0 +1,162 @@
1
+ """
2
+ Red-Team Agent — Adversarial Test Case Generator.
3
+
4
+ This agent autonomously generates diverse, creative adversarial test inputs
5
+ targeting edge cases and vulnerabilities in the connected System Under Test (SUT).
6
+
7
+ On iteration 1, it generates broad-spectrum tests. On subsequent iterations,
8
+ it uses failure patterns from the Refiner Agent to generate increasingly
9
+ targeted and sophisticated test cases.
10
+ """
11
+
12
+ import json
13
+ import os
14
+ from langchain_openai import ChatOpenAI
15
+ from langchain_core.messages import SystemMessage, HumanMessage
16
+ from agentic_qa.graph.state import QAState
17
+ from agentic_qa.utils.prompt_templates import (
18
+ RED_TEAM_SYSTEM_PROMPT,
19
+ RED_TEAM_GENERATION_PROMPT,
20
+ RED_TEAM_REFINEMENT_CONTEXT,
21
+ )
22
+
23
+
24
def _get_llm() -> ChatOpenAI:
    """Create the chat model the Red-Team Agent generates test cases with."""
    model_name = os.getenv("MODEL_NAME", "gpt-4o-mini")
    api_key = os.getenv("OPENAI_API_KEY")
    api_base = os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1")

    # A high temperature favours creative, diverse adversarial inputs.
    creative_temperature = 0.9

    return ChatOpenAI(
        model=model_name,
        temperature=creative_temperature,
        openai_api_key=api_key,
        openai_api_base=api_base,
    )
32
+
33
+
34
+ def _build_failure_context(state: QAState) -> str:
35
+ """Build context from previous failures to guide targeted generation."""
36
+ failure_patterns = state.get("failure_patterns", [])
37
+ all_verdicts = state.get("all_verdicts", [])
38
+
39
+ if not failure_patterns and not all_verdicts:
40
+ return "This is the FIRST iteration. Generate a diverse initial test suite covering all edge case categories."
41
+
42
+ # Count failures
43
+ total_tests = len(all_verdicts)
44
+ total_failures = sum(1 for v in all_verdicts if v.get("status") in ("fail", "error"))
45
+
46
+ # Find top failure categories
47
+ categories = {}
48
+ for v in all_verdicts:
49
+ if v.get("status") in ("fail", "error"):
50
+ cat = v.get("failure_category", "unknown")
51
+ categories[cat] = categories.get(cat, 0) + 1
52
+
53
+ top_categories = sorted(categories.items(), key=lambda x: x[1], reverse=True)[:5]
54
+ top_str = ", ".join([f"{cat} ({count})" for cat, count in top_categories])
55
+
56
+ return RED_TEAM_REFINEMENT_CONTEXT.format(
57
+ failure_patterns="\n".join(f" - {p}" for p in failure_patterns[-10:]),
58
+ total_tests=total_tests,
59
+ total_failures=total_failures,
60
+ top_failure_categories=top_str or "None identified yet",
61
+ )
62
+
63
+
64
+ def _parse_test_cases(response_text: str, iteration: int) -> list:
65
+ """Parse LLM response into structured test cases."""
66
+ try:
67
+ # Try to extract JSON from the response
68
+ # Handle cases where LLM wraps JSON in markdown code blocks
69
+ text = response_text.strip()
70
+ if "```json" in text:
71
+ text = text.split("```json")[1].split("```")[0].strip()
72
+ elif "```" in text:
73
+ text = text.split("```")[1].split("```")[0].strip()
74
+
75
+ data = json.loads(text)
76
+ test_cases = data.get("test_cases", [])
77
+
78
+ # Validate and normalize each test case
79
+ normalized = []
80
+ for i, tc in enumerate(test_cases):
81
+ normalized.append({
82
+ "id": tc.get("id", f"TC-{iteration:02d}{i+1:02d}"),
83
+ "input_data": tc.get("input_data", ""),
84
+ "expected_behavior": tc.get("expected_behavior", ""),
85
+ "edge_case_type": tc.get("edge_case_type", "adversarial"),
86
+ "difficulty": tc.get("difficulty", "medium"),
87
+ "rationale": tc.get("rationale", ""),
88
+ })
89
+
90
+ return normalized
91
+
92
+ except (json.JSONDecodeError, KeyError, IndexError) as e:
93
+ print(f" āš ļø Failed to parse Red-Team response: {e}")
94
+ # Fallback: generate a single default test case
95
+ return [{
96
+ "id": f"TC-{iteration:02d}01",
97
+ "input_data": "Test with empty input",
98
+ "expected_behavior": "Should handle empty or minimal input gracefully",
99
+ "edge_case_type": "missing_data",
100
+ "difficulty": "medium",
101
+ "rationale": "Fallback test case due to parsing error",
102
+ }]
103
+
104
+
105
def red_team_node(state: QAState) -> dict:
    """
    LangGraph node: Red-Team Agent.

    Generates adversarial test cases for the current iteration.
    Uses failure patterns from previous iterations to create more targeted tests.

    Args:
        state: Current QAState

    Returns:
        State update with new test_suite entries and incremented iteration
    """
    iteration = (state.get("current_iteration") or 0) + 1

    # Tolerate a missing or malformed TESTS_PER_ITERATION instead of crashing the node.
    try:
        num_tests = int(os.getenv("TESTS_PER_ITERATION", "5"))
    except ValueError:
        num_tests = 5
    if num_tests <= 0:
        num_tests = 5

    print(f"\n{'='*60}")
    print(f"šŸ”“ RED-TEAM AGENT — Iteration {iteration}")
    print(f"{'='*60}")
    print(f" Generating {num_tests} adversarial test cases...")

    # One set of defaults for both prompts, so the generation prompt and the
    # system prompt never describe two different systems.
    sut_description = state.get("sut_description") or "Unknown System"
    domain = state.get("domain") or "general"
    # `or` so an empty architecture string also falls back to black-box mode.
    sut_architecture = state.get("sut_architecture") or "Architecture unknown. Treat as black-box."

    # Build the prompt
    failure_context = _build_failure_context(state)

    generation_prompt = RED_TEAM_GENERATION_PROMPT.format(
        num_tests=num_tests,
        sut_description=sut_description,
        domain=domain,
        iteration=iteration,
        iteration_prefix=f"{iteration:02d}",
        failure_context=failure_context,
    )

    # Call the LLM — format system prompt with domain context
    llm = _get_llm()
    system_prompt = RED_TEAM_SYSTEM_PROMPT.format(
        sut_description=sut_description,
        domain=domain,
        sut_architecture=sut_architecture,
    )
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=generation_prompt),
    ]

    response = llm.invoke(messages)

    # Parse test cases from LLM response
    test_cases = _parse_test_cases(response.content, iteration)

    print(f" āœ… Generated {len(test_cases)} test cases:")
    for tc in test_cases:
        # str() keeps the preview from failing on structured (non-string) fields.
        print(f" [{tc['id']}] {str(tc['edge_case_type']):20s} | {str(tc['difficulty']):6s} | {str(tc['input_data'])[:60]}...")

    return {
        "test_suite": test_cases,
        "current_iteration": iteration,
    }