agentic-qa 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. agentic_qa-0.1.0/PKG-INFO +180 -0
  2. agentic_qa-0.1.0/README.md +150 -0
  3. agentic_qa-0.1.0/agentic_qa/__init__.py +103 -0
  4. agentic_qa-0.1.0/agentic_qa/agents/__init__.py +15 -0
  5. agentic_qa-0.1.0/agentic_qa/agents/discovery.py +88 -0
  6. agentic_qa-0.1.0/agentic_qa/agents/executor.py +89 -0
  7. agentic_qa-0.1.0/agentic_qa/agents/judge.py +231 -0
  8. agentic_qa-0.1.0/agentic_qa/agents/red_team.py +162 -0
  9. agentic_qa-0.1.0/agentic_qa/agents/refiner.py +109 -0
  10. agentic_qa-0.1.0/agentic_qa/agents/reporter.py +68 -0
  11. agentic_qa-0.1.0/agentic_qa/graph/__init__.py +1 -0
  12. agentic_qa-0.1.0/agentic_qa/graph/conditions.py +51 -0
  13. agentic_qa-0.1.0/agentic_qa/graph/state.py +75 -0
  14. agentic_qa-0.1.0/agentic_qa/graph/workflow.py +154 -0
  15. agentic_qa-0.1.0/agentic_qa/schemas/__init__.py +1 -0
  16. agentic_qa-0.1.0/agentic_qa/schemas/test_case.py +53 -0
  17. agentic_qa-0.1.0/agentic_qa/schemas/verdict.py +57 -0
  18. agentic_qa-0.1.0/agentic_qa/sut/__init__.py +39 -0
  19. agentic_qa-0.1.0/agentic_qa/sut/api_adapter.py +119 -0
  20. agentic_qa-0.1.0/agentic_qa/sut/base.py +55 -0
  21. agentic_qa-0.1.0/agentic_qa/sut/callable_adapter.py +80 -0
  22. agentic_qa-0.1.0/agentic_qa/sut/financial_rag.py +185 -0
  23. agentic_qa-0.1.0/agentic_qa/utils/__init__.py +1 -0
  24. agentic_qa-0.1.0/agentic_qa/utils/prompt_templates.py +227 -0
  25. agentic_qa-0.1.0/agentic_qa.egg-info/PKG-INFO +180 -0
  26. agentic_qa-0.1.0/agentic_qa.egg-info/SOURCES.txt +29 -0
  27. agentic_qa-0.1.0/agentic_qa.egg-info/dependency_links.txt +1 -0
  28. agentic_qa-0.1.0/agentic_qa.egg-info/requires.txt +10 -0
  29. agentic_qa-0.1.0/agentic_qa.egg-info/top_level.txt +1 -0
  30. agentic_qa-0.1.0/setup.cfg +4 -0
  31. agentic_qa-0.1.0/setup.py +36 -0
@@ -0,0 +1,180 @@
1
+ Metadata-Version: 2.4
2
+ Name: agentic_qa
3
+ Version: 0.1.0
4
+ Summary: Autonomous Agentic QA System for testing RAG pipelines and LLM systems.
5
+ Home-page: https://github.com/yourusername/multi-agent-qa
6
+ Author: Your Name
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.10
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: langgraph>=0.2.0
13
+ Requires-Dist: langchain>=0.3.0
14
+ Requires-Dist: langchain-openai>=0.2.0
15
+ Requires-Dist: langchain-core>=0.3.0
16
+ Requires-Dist: langsmith>=0.1.0
17
+ Requires-Dist: pydantic>=2.0.0
18
+ Requires-Dist: python-dotenv>=1.0.0
19
+ Requires-Dist: requests>=2.30.0
20
+ Requires-Dist: ragas>=0.1.0
21
+ Requires-Dist: datasets>=2.0.0
22
+ Dynamic: author
23
+ Dynamic: classifier
24
+ Dynamic: description
25
+ Dynamic: description-content-type
26
+ Dynamic: home-page
27
+ Dynamic: requires-dist
28
+ Dynamic: requires-python
29
+ Dynamic: summary
30
+
31
+ # 🛡️ Agentic QA: Autonomous Multi-Agent Testing for RAG & LLMs
32
+
33
+ **Agentic QA** is a Python library that autonomously generates adversarial test cases, executes them against your RAG/LLM system, evaluates the results, and self-improves its testing coverage—all without human intervention.
34
+
35
+ Unlike traditional testing frameworks (like RAGAS or TruLens) that evaluate outputs against static, human-written inputs, **Agentic QA acts as an active red-team**, dynamically generating the tricky edge cases needed to break your system.
36
+
37
+ ![Python](https://img.shields.io/badge/Python-3.10+-3776AB?logo=python&logoColor=white)
38
+ ![LangGraph](https://img.shields.io/badge/LangGraph-0.2+-1C3C3C?logo=langchain&logoColor=white)
39
+ ![LangSmith](https://img.shields.io/badge/LangSmith-Monitored-FF6B35?logo=langchain&logoColor=white)
40
+ ![Streamlit](https://img.shields.io/badge/Streamlit-Dashboard-FF4B4B?logo=streamlit&logoColor=white)
41
+
42
+ ---
43
+
44
+ ## 🚀 Quick Start
45
+
46
+ ### 1. Installation
47
+
48
+ Install the library locally:
49
+
50
+ ```bash
51
+ git clone https://github.com/yourusername/multi-agent-qa.git
52
+ cd multi-agent-qa
53
+ pip install -e .
54
+ ```
55
+
56
+ Ensure you have your `.env` configured with your API keys:
57
+
58
+ ```bash
59
+ cp .env.example .env
60
+ # Edit .env and provide OPENAI_API_KEY
61
+ ```
62
+
63
+ ### 2. Using the Python Library
64
+
65
+ You can test any RAG or LLM pipeline in just a few lines of code.
66
+
67
+ #### Option A: Testing a Python Function
68
+ If your RAG system is a Python function in your codebase:
69
+
70
+ ```python
71
+ import agentic_qa
72
+
73
+ # Your existing RAG or Chatbot function
74
+ def my_custom_rag(query: str) -> str:
75
+ # Example: return my_langchain_pipeline.invoke(query)
76
+ return "This is my AI response."
77
+
78
+ # Run the autonomous testing loop
79
+ report = agentic_qa.run_autonomous_test(
80
+ target_function=my_custom_rag,
81
+ system_name="YouTube Video Q&A",
82
+ system_description="A chatbot that answers questions about YouTube transcripts.",
83
+ domain="video content",
84
+ max_iterations=3, # How many times agents learn and retry
85
+ tests_per_iteration=5 # Tests generated per round
86
+ )
87
+ ```
88
+
89
+ #### Option B: Testing an API Endpoint
90
+ If your system is deployed behind a REST API (FastAPI, Flask, LangServe):
91
+
92
+ ```python
93
+ import agentic_qa
94
+
95
+ report = agentic_qa.run_autonomous_test(
96
+ api_endpoint="http://localhost:8000/api/chat",
97
+ system_name="Customer Support Bot",
98
+ system_description="An AI that resolves customer support tickets.",
99
+ domain="customer support"
100
+ )
101
+ ```
102
+
103
+ ### 3. Using the Streamlit UI
104
+
105
+ If you prefer a visual dashboard to monitor the agents in real-time, run the included Streamlit app:
106
+
107
+ ```bash
108
+ streamlit run app.py
109
+ ```
110
+
111
+ From the UI, you can connect your API endpoint or use the built-in mock system for a demonstration.
112
+
113
+ ---
114
+
115
+ ## 🏗️ Architecture
116
+
117
+ The framework is powered by 5 autonomous agents built with LangGraph:
118
+
119
+ ```
120
+ START ──▶ 🔴 Red-Team Agent ──▶ ⚡ Executor Agent ──▶ ⚖️ Judge Agent ──▶ Decision
121
+                  ▲                                                          │
122
+                  │                                                    ┌─────┴─────┐
123
+                  │                                                    ▼           ▼
124
+                  └──────────────── 🔧 Refiner Agent          📊 Reporter Agent
125
+                                     (loop back)                    (END)
126
+ ```
127
+
128
+ ### Agent Roles
129
+
130
+ | Agent | Role |
131
+ |-------|------|
132
+ | 🔴 **Red-Team** | Generates adversarial test inputs targeting edge cases (prompt injections, boundary values, etc.). |
133
+ | ⚡ **Executor** | Runs tests through the target system and captures the outputs. |
134
+ | ⚖️ **Judge** | Evaluates the outputs using an LLM-as-a-Judge pattern with strict pass/fail criteria. |
135
+ | 🔧 **Refiner** | Analyzes the judge's failure patterns and instructs the Red-Team on how to exploit weaknesses in the next iteration. |
136
+ | 📊 **Reporter** | Compiles a comprehensive final Markdown QA report. |
137
+
138
+ ---
139
+
140
+ ## 🧠 What Makes This Novel
141
+
142
+ | Traditional Testing Tools (RAGAS, TruLens) | Agentic QA |
143
+ |----------------------------------------|-------------|
144
+ | Measures outputs against static inputs | **Generates** the adversarial inputs autonomously |
145
+ | Human writes test cases | AI agents write and refine test cases |
146
+ | One-shot evaluation | **Self-improving loop** with pattern learning |
147
+ | Relies heavily on reference data | Relies on behavioral boundaries and edge-case testing |
148
+
149
+ ---
150
+
151
+ ## 📂 Project Structure
152
+
153
+ ```text
154
+ multi-agent-qa/
155
+ ├── agentic_qa/
156
+ │   ├── __init__.py     # Clean developer API (run_autonomous_test)
157
+ │   ├── agents/         # 5 LangGraph agent definitions
158
+ │   ├── graph/          # State definitions and LangGraph flow
159
+ │   ├── schemas/        # Pydantic validation models
160
+ │   ├── sut/            # Adapters (API, Callable, Base)
161
+ │   └── utils/          # Prompt templates
162
+ ├── setup.py            # Package configuration
163
+ ├── app.py              # Streamlit Dashboard UI
164
+ ├── .env                # API Keys configuration
165
+ └── README.md
166
+ ```
167
+
168
+ ## 📡 LangSmith Monitoring
169
+
170
+ All agent interactions are automatically traced via LangSmith if configured in `.env`.
171
+
172
+ ```env
173
+ LANGCHAIN_TRACING_V2=true
174
+ LANGCHAIN_API_KEY=your-langsmith-api-key
175
+ LANGCHAIN_PROJECT=agentic-qa
176
+ ```
177
+
178
+ ## 📄 License
179
+
180
+ MIT
@@ -0,0 +1,150 @@
1
+ # 🛡️ Agentic QA: Autonomous Multi-Agent Testing for RAG & LLMs
2
+
3
+ **Agentic QA** is a Python library that autonomously generates adversarial test cases, executes them against your RAG/LLM system, evaluates the results, and self-improves its testing coverage—all without human intervention.
4
+
5
+ Unlike traditional testing frameworks (like RAGAS or TruLens) that evaluate outputs against static, human-written inputs, **Agentic QA acts as an active red-team**, dynamically generating the tricky edge cases needed to break your system.
6
+
7
+ ![Python](https://img.shields.io/badge/Python-3.10+-3776AB?logo=python&logoColor=white)
8
+ ![LangGraph](https://img.shields.io/badge/LangGraph-0.2+-1C3C3C?logo=langchain&logoColor=white)
9
+ ![LangSmith](https://img.shields.io/badge/LangSmith-Monitored-FF6B35?logo=langchain&logoColor=white)
10
+ ![Streamlit](https://img.shields.io/badge/Streamlit-Dashboard-FF4B4B?logo=streamlit&logoColor=white)
11
+
12
+ ---
13
+
14
+ ## 🚀 Quick Start
15
+
16
+ ### 1. Installation
17
+
18
+ Install the library locally:
19
+
20
+ ```bash
21
+ git clone https://github.com/yourusername/multi-agent-qa.git
22
+ cd multi-agent-qa
23
+ pip install -e .
24
+ ```
25
+
26
+ Ensure you have your `.env` configured with your API keys:
27
+
28
+ ```bash
29
+ cp .env.example .env
30
+ # Edit .env and provide OPENAI_API_KEY
31
+ ```
32
+
33
+ ### 2. Using the Python Library
34
+
35
+ You can test any RAG or LLM pipeline in just a few lines of code.
36
+
37
+ #### Option A: Testing a Python Function
38
+ If your RAG system is a Python function in your codebase:
39
+
40
+ ```python
41
+ import agentic_qa
42
+
43
+ # Your existing RAG or Chatbot function
44
+ def my_custom_rag(query: str) -> str:
45
+ # Example: return my_langchain_pipeline.invoke(query)
46
+ return "This is my AI response."
47
+
48
+ # Run the autonomous testing loop
49
+ report = agentic_qa.run_autonomous_test(
50
+ target_function=my_custom_rag,
51
+ system_name="YouTube Video Q&A",
52
+ system_description="A chatbot that answers questions about YouTube transcripts.",
53
+ domain="video content",
54
+ max_iterations=3, # How many times agents learn and retry
55
+ tests_per_iteration=5 # Tests generated per round
56
+ )
57
+ ```
58
+
59
+ #### Option B: Testing an API Endpoint
60
+ If your system is deployed behind a REST API (FastAPI, Flask, LangServe):
61
+
62
+ ```python
63
+ import agentic_qa
64
+
65
+ report = agentic_qa.run_autonomous_test(
66
+ api_endpoint="http://localhost:8000/api/chat",
67
+ system_name="Customer Support Bot",
68
+ system_description="An AI that resolves customer support tickets.",
69
+ domain="customer support"
70
+ )
71
+ ```
72
+
73
+ ### 3. Using the Streamlit UI
74
+
75
+ If you prefer a visual dashboard to monitor the agents in real-time, run the included Streamlit app:
76
+
77
+ ```bash
78
+ streamlit run app.py
79
+ ```
80
+
81
+ From the UI, you can connect your API endpoint or use the built-in mock system for a demonstration.
82
+
83
+ ---
84
+
85
+ ## 🏗️ Architecture
86
+
87
+ The framework is powered by 5 autonomous agents built with LangGraph:
88
+
89
+ ```
90
+ START ──▶ 🔴 Red-Team Agent ──▶ ⚡ Executor Agent ──▶ ⚖️ Judge Agent ──▶ Decision
91
+                  ▲                                                          │
92
+                  │                                                    ┌─────┴─────┐
93
+                  │                                                    ▼           ▼
94
+                  └──────────────── 🔧 Refiner Agent          📊 Reporter Agent
95
+                                     (loop back)                    (END)
96
+ ```
97
+
98
+ ### Agent Roles
99
+
100
+ | Agent | Role |
101
+ |-------|------|
102
+ | 🔴 **Red-Team** | Generates adversarial test inputs targeting edge cases (prompt injections, boundary values, etc.). |
103
+ | ⚡ **Executor** | Runs tests through the target system and captures the outputs. |
104
+ | ⚖️ **Judge** | Evaluates the outputs using an LLM-as-a-Judge pattern with strict pass/fail criteria. |
105
+ | 🔧 **Refiner** | Analyzes the judge's failure patterns and instructs the Red-Team on how to exploit weaknesses in the next iteration. |
106
+ | 📊 **Reporter** | Compiles a comprehensive final Markdown QA report. |
107
+
108
+ ---
109
+
110
+ ## 🧠 What Makes This Novel
111
+
112
+ | Traditional Testing Tools (RAGAS, TruLens) | Agentic QA |
113
+ |----------------------------------------|-------------|
114
+ | Measures outputs against static inputs | **Generates** the adversarial inputs autonomously |
115
+ | Human writes test cases | AI agents write and refine test cases |
116
+ | One-shot evaluation | **Self-improving loop** with pattern learning |
117
+ | Relies heavily on reference data | Relies on behavioral boundaries and edge-case testing |
118
+
119
+ ---
120
+
121
+ ## 📂 Project Structure
122
+
123
+ ```text
124
+ multi-agent-qa/
125
+ ├── agentic_qa/
126
+ │   ├── __init__.py     # Clean developer API (run_autonomous_test)
127
+ │   ├── agents/         # 5 LangGraph agent definitions
128
+ │   ├── graph/          # State definitions and LangGraph flow
129
+ │   ├── schemas/        # Pydantic validation models
130
+ │   ├── sut/            # Adapters (API, Callable, Base)
131
+ │   └── utils/          # Prompt templates
132
+ ├── setup.py            # Package configuration
133
+ ├── app.py              # Streamlit Dashboard UI
134
+ ├── .env                # API Keys configuration
135
+ └── README.md
136
+ ```
137
+
138
+ ## 📡 LangSmith Monitoring
139
+
140
+ All agent interactions are automatically traced via LangSmith if configured in `.env`.
141
+
142
+ ```env
143
+ LANGCHAIN_TRACING_V2=true
144
+ LANGCHAIN_API_KEY=your-langsmith-api-key
145
+ LANGCHAIN_PROJECT=agentic-qa
146
+ ```
147
+
148
+ ## 📄 License
149
+
150
+ MIT
@@ -0,0 +1,103 @@
1
+ """
2
+ Agentic QA - Main Public API
3
+
4
+ This file exposes the simple `run_autonomous_test` function, allowing developers
5
+ to test their RAG systems in just a few lines of code.
6
+ """
7
+
8
+ import os
9
+ from typing import Callable, Optional
10
+
11
+ # Expose SUT adapters for developers if they want advanced setups
12
+ from agentic_qa.sut.base import BaseSUTAdapter
13
+ from agentic_qa.sut.api_adapter import APIAdapter
14
+ from agentic_qa.sut.callable_adapter import CallableAdapter
15
+ from agentic_qa.sut import set_active_sut
16
+
17
+ # Import the core workflow
18
+ from agentic_qa.graph.workflow import build_qa_graph, get_initial_state
19
+
20
+
21
def run_autonomous_test(
    target_function: Optional[Callable] = None,
    api_endpoint: Optional[str] = None,
    system_name: str = "Target System",
    system_description: str = "A generic RAG system",
    domain: str = "general",
    max_iterations: int = 3,
    tests_per_iteration: int = 5,
    model_name: str = "gpt-4o-mini",
) -> dict:
    """
    Run an autonomous multi-agent QA test against a target system.

    You must provide EITHER a `target_function` (a python function) OR
    an `api_endpoint` (a URL string). If both are given, `target_function`
    takes precedence and `api_endpoint` is ignored.

    Side effects: sets the `MAX_ITERATIONS`, `TESTS_PER_ITERATION` and
    `MODEL_NAME` environment variables for the current process and registers
    the created adapter as the globally active SUT.

    Args:
        target_function: A python function that takes a string query and returns a string answer.
        api_endpoint: A URL endpoint (e.g., http://localhost:8000/chat) to test.
        system_name: The name of the system being tested.
        system_description: A description of what the system does. Highly important for agents!
        domain: The domain of the system (e.g., 'financial', 'healthcare', 'customer support').
        max_iterations: How many times the agents should refine and retry their tests.
        tests_per_iteration: How many tests the Red-Team agent generates per round.
        model_name: The LLM to use for the agents (default: gpt-4o-mini).

    Returns:
        A dictionary containing the final execution state, including the test suite, verdicts,
        failure patterns, and the final Markdown report.

    Raises:
        ValueError: If neither `target_function` nor `api_endpoint` is provided.
        TypeError: If `target_function` is provided but is not callable.
        RuntimeError: If the graph finishes without emitting any state.
    """
    # Validate the arguments first so that a rejected call leaves no global
    # state (environment variables, active SUT) behind.
    if target_function is None and not api_endpoint:
        raise ValueError("You must provide either a `target_function` or an `api_endpoint`.")
    if target_function is not None and not callable(target_function):
        raise TypeError(
            f"`target_function` must be callable, got {type(target_function).__name__}."
        )

    # 1. Configure the environment variables the agents read at run time.
    os.environ["MAX_ITERATIONS"] = str(max_iterations)
    os.environ["TESTS_PER_ITERATION"] = str(tests_per_iteration)
    os.environ["MODEL_NAME"] = model_name

    # 2. Set up the SUT adapter (the callable wins when both targets are given).
    if target_function is not None:
        adapter = CallableAdapter(
            fn=target_function,
            description=system_description,
            system_name=system_name,
            domain=domain,
        )
    else:
        adapter = APIAdapter(
            endpoint=api_endpoint,
            description=system_description,
            system_name=system_name,
            domain=domain,
        )

    # Register the adapter globally for the Executor Agent to use.
    set_active_sut(adapter)

    # 3. Build and run the graph.
    print(f"🚀 Starting Autonomous QA Test against: {system_name}")
    print(f"Domain: {domain} | Max Iterations: {max_iterations}")

    graph = build_qa_graph()
    initial_state = get_initial_state()
    initial_state["max_iterations"] = max_iterations

    # Stream the graph so the agents' console output appears as they run;
    # the last emitted snapshot is the final state.
    final_state = None
    for snapshot in graph.stream(initial_state, stream_mode="values"):
        final_state = snapshot

    if final_state is None:
        # Without this guard the summary below would fail with an opaque
        # AttributeError on `None.get`.
        raise RuntimeError("The QA graph finished without emitting any state.")

    # `or` fallbacks also cover keys that are present but set to None.
    coverage = final_state.get("coverage_score") or 0
    patterns = final_state.get("failure_patterns") or []

    print("\n✅ Autonomous QA Test Complete!")
    print(f"Coverage Score: {coverage:.1%}")
    print(f"Total Failure Patterns Found: {len(patterns)}")

    return final_state
95
+
96
+
97
+ # Define what is exported when someone runs `from agentic_qa import *`
98
+ __all__ = [
99
+ "run_autonomous_test",
100
+ "APIAdapter",
101
+ "CallableAdapter",
102
+ "BaseSUTAdapter"
103
+ ]
@@ -0,0 +1,15 @@
1
+ """Multi-Agent Autonomous QA System - Agents Package"""
2
+
3
+ from agentic_qa.agents.red_team import red_team_node
4
+ from agentic_qa.agents.executor import executor_node
5
+ from agentic_qa.agents.judge import judge_node
6
+ from agentic_qa.agents.refiner import refiner_node
7
+ from agentic_qa.agents.reporter import reporter_node
8
+
9
+ __all__ = [
10
+ "red_team_node",
11
+ "executor_node",
12
+ "judge_node",
13
+ "refiner_node",
14
+ "reporter_node",
15
+ ]
@@ -0,0 +1,88 @@
1
+ """
2
+ Discovery Agent (Graphify) — Architecture Mapping.
3
+
4
+ This agent performs White-Box introspection. It analyzes the System Under Test (SUT)
5
+ and maps out its internal architecture (e.g., Vector DB, Chunk Size, Retriever type, LLM).
6
+ This architectural "graph" allows the Red-Team agent to launch hyper-targeted attacks.
7
+ """
8
+
9
+ import os
10
+ from langchain_openai import ChatOpenAI
11
+ from langchain_core.messages import SystemMessage, HumanMessage
12
+ from agentic_qa.graph.state import QAState
13
+
14
# System message for the Discovery agent: frames the model as an architecture
# mapper and asks for a graph-like breakdown with per-node vulnerabilities.
DISCOVERY_SYSTEM_PROMPT = """You are an elite AI Architecture Mapper (Graphify Agent).

Your mission is to analyze a generic description of an AI/RAG system and deduce its likely internal architecture graph.
You must break down the system into a logical pipeline (e.g., Data Ingestion -> Chunking -> VectorDB -> Retriever -> LLM -> Output).

Think about:
1. What components must exist for this system to work?
2. Where are the likely weak points or bottlenecks between these nodes?
3. What are the assumed configurations (e.g., chunk size, top-k retrieval)?

Output a detailed, graph-like text representation of the architecture. Be specific about potential vulnerabilities at each node."""

# Human message template; filled via str.format with the keyword arguments
# `name`, `domain` and `description`.
DISCOVERY_USER_PROMPT = """Analyze the following System Under Test and map its architecture.

**System Name:** {name}
**Domain:** {domain}
**Description:**
{description}

Provide a detailed architectural breakdown (Graphify) of this system. Highlight the specific nodes (e.g., Retriever, VectorStore, LLM) and the data flow.
Keep it concise but technical."""
35
+
36
+
37
def _get_llm() -> ChatOpenAI:
    """Build the chat model used by the Discovery agent from environment settings.

    Reads `MODEL_NAME` (default "gpt-4o-mini"), `OPENAI_API_KEY` and
    `OPENAI_API_BASE` (default "https://api.openai.com/v1") at call time.
    """
    model_name = os.getenv("MODEL_NAME", "gpt-4o-mini")
    api_key = os.getenv("OPENAI_API_KEY")
    api_base = os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1")

    llm = ChatOpenAI(
        model=model_name,
        temperature=0.2,
        openai_api_key=api_key,
        openai_api_base=api_base,
    )
    return llm
44
+
45
+
46
def discovery_node(state: QAState) -> dict:
    """
    LangGraph node: Discovery Agent.

    Runs once at the beginning of the pipeline to map the SUT architecture.

    Args:
        state: Current graph state. Reads `sut_architecture`,
            `sut_description` and `domain`.

    Returns:
        An empty dict when `sut_architecture` is already set (nothing to
        update); otherwise `{"sut_architecture": <LLM response content>}`.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print("🔍 DISCOVERY AGENT (Graphify) — Mapping Architecture")
    print(banner)

    # If the architecture is already provided (e.g. by the adapter), skip the
    # LLM call entirely -- checked first so no other work is done on this path.
    if state.get("sut_architecture"):
        print("  Architecture already provided. Skipping LLM discovery.")
        return {}

    description = state.get("sut_description", "")
    domain = state.get("domain", "")

    print("  Analyzing SUT description to deduce internal graph...")

    llm = _get_llm()
    # NOTE(review): the system name is hard-coded rather than taken from the
    # state or the active adapter -- confirm whether the real name is available.
    prompt = DISCOVERY_USER_PROMPT.format(
        name="Target System",
        domain=domain,
        description=description,
    )

    messages = [
        SystemMessage(content=DISCOVERY_SYSTEM_PROMPT),
        HumanMessage(content=prompt),
    ]

    response = llm.invoke(messages)
    architecture = response.content

    print("\n  🗺️ Deduced Architecture Graph:")
    # Show only the first few lines as a preview; the full text is returned.
    preview_limit = 10
    lines = architecture.split("\n")
    for line in lines[:preview_limit]:
        print(f"    {line}")
    if len(lines) > preview_limit:
        # Only signal truncation when something was actually cut off.
        print("    ...")

    return {
        "sut_architecture": architecture,
    }
@@ -0,0 +1,89 @@
1
+ """
2
+ Executor Agent — Generic System Under Test Runner.
3
+
4
+ Executes test cases against ANY connected SUT (RAG, API, function).
5
+ Uses the active SUT adapter from the registry — works with:
6
+ - Built-in Financial RAG demo
7
+ - Any RAG connected via API endpoint
8
+ - Any Python function wrapped as a callable
9
+ """
10
+
11
+ import time
12
+ from agentic_qa.graph.state import QAState
13
+ from agentic_qa.sut import get_active_sut
14
+
15
+
16
+ def executor_node(state: QAState) -> dict:
17
+ """
18
+ LangGraph node: Executor Agent.
19
+
20
+ Runs each test case from the current iteration through whatever
21
+ SUT is currently active and collects results.
22
+ """
23
+ iteration = state.get("current_iteration", 1)
24
+ test_suite = state.get("test_suite", [])
25
+
26
+ print(f"\n{'='*60}")
27
+ print(f"โšก EXECUTOR AGENT โ€” Iteration {iteration}")
28
+ print(f"{'='*60}")
29
+
30
+ # Get test cases for the current iteration only
31
+ iter_prefix = f"TC-{iteration:02d}"
32
+ current_tests = [tc for tc in test_suite if tc["id"].startswith(iter_prefix)]
33
+
34
+ if not current_tests:
35
+ num_per_iter = 5
36
+ current_tests = test_suite[-num_per_iter:]
37
+
38
+ # Get the active SUT (whatever the user connected)
39
+ sut = get_active_sut()
40
+ print(f" SUT: {sut.name}")
41
+ print(f" Executing {len(current_tests)} test cases...")
42
+
43
+ execution_results = []
44
+
45
+ for tc in current_tests:
46
+ test_id = tc["id"]
47
+ input_data = tc["input_data"]
48
+
49
+ print(f" โ–ถ Running [{test_id}]...", end=" ")
50
+
51
+ start_time = time.time()
52
+ try:
53
+ output = sut.process(input_data)
54
+ exec_time = time.time() - start_time
55
+
56
+ # Normalize output โ€” adapters return "output" key
57
+ sut_output = output.get("output", output.get("sut_output", str(output)))
58
+ status = output.get("status", "unknown")
59
+
60
+ result = {
61
+ "test_id": test_id,
62
+ "sut_output": str(output),
63
+ "execution_time": round(exec_time, 4),
64
+ "error": output.get("error") if status == "error" else None,
65
+ }
66
+ print(f"Done ({exec_time:.3f}s) โ€” Status: {status}")
67
+
68
+ except Exception as e:
69
+ exec_time = time.time() - start_time
70
+ result = {
71
+ "test_id": test_id,
72
+ "sut_output": "",
73
+ "execution_time": round(exec_time, 4),
74
+ "error": str(e),
75
+ }
76
+ print(f"ERROR ({exec_time:.3f}s) โ€” {e}")
77
+
78
+ execution_results.append(result)
79
+
80
+ errors = sum(1 for r in execution_results if r["error"])
81
+ avg_time = sum(r["execution_time"] for r in execution_results) / max(len(execution_results), 1)
82
+ print(f"\n ๐Ÿ“Š Execution Summary:")
83
+ print(f" Tests executed: {len(execution_results)}")
84
+ print(f" Errors/crashes: {errors}")
85
+ print(f" Avg exec time: {avg_time:.3f}s")
86
+
87
+ return {
88
+ "execution_results": execution_results,
89
+ }