agentic-qa 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentic_qa-0.1.0/PKG-INFO +180 -0
- agentic_qa-0.1.0/README.md +150 -0
- agentic_qa-0.1.0/agentic_qa/__init__.py +103 -0
- agentic_qa-0.1.0/agentic_qa/agents/__init__.py +15 -0
- agentic_qa-0.1.0/agentic_qa/agents/discovery.py +88 -0
- agentic_qa-0.1.0/agentic_qa/agents/executor.py +89 -0
- agentic_qa-0.1.0/agentic_qa/agents/judge.py +231 -0
- agentic_qa-0.1.0/agentic_qa/agents/red_team.py +162 -0
- agentic_qa-0.1.0/agentic_qa/agents/refiner.py +109 -0
- agentic_qa-0.1.0/agentic_qa/agents/reporter.py +68 -0
- agentic_qa-0.1.0/agentic_qa/graph/__init__.py +1 -0
- agentic_qa-0.1.0/agentic_qa/graph/conditions.py +51 -0
- agentic_qa-0.1.0/agentic_qa/graph/state.py +75 -0
- agentic_qa-0.1.0/agentic_qa/graph/workflow.py +154 -0
- agentic_qa-0.1.0/agentic_qa/schemas/__init__.py +1 -0
- agentic_qa-0.1.0/agentic_qa/schemas/test_case.py +53 -0
- agentic_qa-0.1.0/agentic_qa/schemas/verdict.py +57 -0
- agentic_qa-0.1.0/agentic_qa/sut/__init__.py +39 -0
- agentic_qa-0.1.0/agentic_qa/sut/api_adapter.py +119 -0
- agentic_qa-0.1.0/agentic_qa/sut/base.py +55 -0
- agentic_qa-0.1.0/agentic_qa/sut/callable_adapter.py +80 -0
- agentic_qa-0.1.0/agentic_qa/sut/financial_rag.py +185 -0
- agentic_qa-0.1.0/agentic_qa/utils/__init__.py +1 -0
- agentic_qa-0.1.0/agentic_qa/utils/prompt_templates.py +227 -0
- agentic_qa-0.1.0/agentic_qa.egg-info/PKG-INFO +180 -0
- agentic_qa-0.1.0/agentic_qa.egg-info/SOURCES.txt +29 -0
- agentic_qa-0.1.0/agentic_qa.egg-info/dependency_links.txt +1 -0
- agentic_qa-0.1.0/agentic_qa.egg-info/requires.txt +10 -0
- agentic_qa-0.1.0/agentic_qa.egg-info/top_level.txt +1 -0
- agentic_qa-0.1.0/setup.cfg +4 -0
- agentic_qa-0.1.0/setup.py +36 -0
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agentic_qa
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Autonomous Agentic QA System for testing RAG pipelines and LLM systems.
|
|
5
|
+
Home-page: https://github.com/yourusername/multi-agent-qa
|
|
6
|
+
Author: Your Name
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: langgraph>=0.2.0
|
|
13
|
+
Requires-Dist: langchain>=0.3.0
|
|
14
|
+
Requires-Dist: langchain-openai>=0.2.0
|
|
15
|
+
Requires-Dist: langchain-core>=0.3.0
|
|
16
|
+
Requires-Dist: langsmith>=0.1.0
|
|
17
|
+
Requires-Dist: pydantic>=2.0.0
|
|
18
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
19
|
+
Requires-Dist: requests>=2.30.0
|
|
20
|
+
Requires-Dist: ragas>=0.1.0
|
|
21
|
+
Requires-Dist: datasets>=2.0.0
|
|
22
|
+
Dynamic: author
|
|
23
|
+
Dynamic: classifier
|
|
24
|
+
Dynamic: description
|
|
25
|
+
Dynamic: description-content-type
|
|
26
|
+
Dynamic: home-page
|
|
27
|
+
Dynamic: requires-dist
|
|
28
|
+
Dynamic: requires-python
|
|
29
|
+
Dynamic: summary
|
|
30
|
+
|
|
31
|
+
# 🛡️ Agentic QA: Autonomous Multi-Agent Testing for RAG & LLMs
|
|
32
|
+
|
|
33
|
+
**Agentic QA** is a Python library that autonomously generates adversarial test cases, executes them against your RAG/LLM system, evaluates the results, and self-improves its testing coverage—all without human intervention.
|
|
34
|
+
|
|
35
|
+
Unlike traditional testing frameworks (like RAGAS or TruLens) that evaluate outputs against static, human-written inputs, **Agentic QA acts as an active red-team**, dynamically generating the tricky edge cases needed to break your system.
|
|
36
|
+
|
|
37
|
+

|
|
38
|
+

|
|
39
|
+

|
|
40
|
+

|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## 🚀 Quick Start
|
|
45
|
+
|
|
46
|
+
### 1. Installation
|
|
47
|
+
|
|
48
|
+
Install the library locally:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
git clone https://github.com/yourusername/multi-agent-qa.git
|
|
52
|
+
cd multi-agent-qa
|
|
53
|
+
pip install -e .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Ensure you have your `.env` configured with your API keys:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
cp .env.example .env
|
|
60
|
+
# Edit .env and provide OPENAI_API_KEY
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### 2. Using the Python Library
|
|
64
|
+
|
|
65
|
+
You can test any RAG or LLM pipeline in just a few lines of code.
|
|
66
|
+
|
|
67
|
+
#### Option A: Testing a Python Function
|
|
68
|
+
If your RAG system is a Python function in your codebase:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
import agentic_qa
|
|
72
|
+
|
|
73
|
+
# Your existing RAG or Chatbot function
|
|
74
|
+
def my_custom_rag(query: str) -> str:
|
|
75
|
+
# Example: return my_langchain_pipeline.invoke(query)
|
|
76
|
+
return "This is my AI response."
|
|
77
|
+
|
|
78
|
+
# Run the autonomous testing loop
|
|
79
|
+
report = agentic_qa.run_autonomous_test(
|
|
80
|
+
target_function=my_custom_rag,
|
|
81
|
+
system_name="YouTube Video Q&A",
|
|
82
|
+
system_description="A chatbot that answers questions about YouTube transcripts.",
|
|
83
|
+
domain="video content",
|
|
84
|
+
max_iterations=3, # How many times agents learn and retry
|
|
85
|
+
tests_per_iteration=5 # Tests generated per round
|
|
86
|
+
)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
#### Option B: Testing an API Endpoint
|
|
90
|
+
If your system is deployed behind a REST API (FastAPI, Flask, LangServe):
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import agentic_qa
|
|
94
|
+
|
|
95
|
+
report = agentic_qa.run_autonomous_test(
|
|
96
|
+
api_endpoint="http://localhost:8000/api/chat",
|
|
97
|
+
system_name="Customer Support Bot",
|
|
98
|
+
system_description="An AI that resolves customer support tickets.",
|
|
99
|
+
domain="customer support"
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### 3. Using the Streamlit UI
|
|
104
|
+
|
|
105
|
+
If you prefer a visual dashboard to monitor the agents in real-time, run the included Streamlit app:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
streamlit run app.py
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
From the UI, you can connect your API endpoint or use the built-in mock system for a demonstration.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## 🏗️ Architecture
|
|
116
|
+
|
|
117
|
+
The framework is powered by 5 autonomous agents built with LangGraph:
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
START โโโถ ๐ด Red-Team Agent โโโถ โก Executor Agent โโโถ โ๏ธ Judge Agent โโโถ Decision
|
|
121
|
+
โฒ โ
|
|
122
|
+
โ โโโโโโโดโโโโโโ
|
|
123
|
+
โ โผ โผ
|
|
124
|
+
โโโโโโโโโโโโโโโโโ ๐ง Refiner Agent ๐ Reporter Agent
|
|
125
|
+
(loop back) (END)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Agent Roles
|
|
129
|
+
|
|
130
|
+
| Agent | Role |
|
|
131
|
+
|-------|------|
|
|
132
|
+
| 🔴 **Red-Team** | Generates adversarial test inputs targeting edge cases (prompt injections, boundary values, etc.). |
|
|
133
|
+
| ⚡ **Executor** | Runs tests through the target system and captures the outputs. |
|
|
134
|
+
| ⚖️ **Judge** | Evaluates the outputs using an LLM-as-a-Judge pattern with strict pass/fail criteria. |
|
|
135
|
+
| 🔧 **Refiner** | Analyzes the judge's failure patterns and instructs the Red-Team on how to exploit weaknesses in the next iteration. |
|
|
136
|
+
| ๐ **Reporter** | Compiles a comprehensive final Markdown QA report. |
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## 🧠 What Makes This Novel
|
|
141
|
+
|
|
142
|
+
| Traditional Testing Tools (RAGAS, TruLens) | Agentic QA |
|
|
143
|
+
|----------------------------------------|-------------|
|
|
144
|
+
| Measures outputs against static inputs | **Generates** the adversarial inputs autonomously |
|
|
145
|
+
| Human writes test cases | AI agents write and refine test cases |
|
|
146
|
+
| One-shot evaluation | **Self-improving loop** with pattern learning |
|
|
147
|
+
| Relies heavily on reference data | Relies on behavioral boundaries and edge-case testing |
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## ๐ Project Structure
|
|
152
|
+
|
|
153
|
+
```text
|
|
154
|
+
multi-agent-qa/
|
|
155
|
+
โโโ agentic_qa/
|
|
156
|
+
โ โโโ __init__.py # Clean developer API (run_autonomous_test)
|
|
157
|
+
โ โโโ agents/ # 5 LangGraph agent definitions
|
|
158
|
+
โ โโโ graph/ # State definitions and LangGraph flow
|
|
159
|
+
โ โโโ schemas/ # Pydantic validation models
|
|
160
|
+
โ โโโ sut/ # Adapters (API, Callable, Base)
|
|
161
|
+
โ โโโ utils/ # Prompt templates
|
|
162
|
+
โโโ setup.py # Package configuration
|
|
163
|
+
โโโ app.py # Streamlit Dashboard UI
|
|
164
|
+
โโโ .env # API Keys configuration
|
|
165
|
+
โโโ README.md
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## 📡 LangSmith Monitoring
|
|
169
|
+
|
|
170
|
+
All agent interactions are automatically traced via LangSmith if configured in `.env`.
|
|
171
|
+
|
|
172
|
+
```env
|
|
173
|
+
LANGCHAIN_TRACING_V2=true
|
|
174
|
+
LANGCHAIN_API_KEY=your-langsmith-api-key
|
|
175
|
+
LANGCHAIN_PROJECT=agentic-qa
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## ๐ License
|
|
179
|
+
|
|
180
|
+
MIT
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# 🛡️ Agentic QA: Autonomous Multi-Agent Testing for RAG & LLMs
|
|
2
|
+
|
|
3
|
+
**Agentic QA** is a Python library that autonomously generates adversarial test cases, executes them against your RAG/LLM system, evaluates the results, and self-improves its testing coverage—all without human intervention.
|
|
4
|
+
|
|
5
|
+
Unlike traditional testing frameworks (like RAGAS or TruLens) that evaluate outputs against static, human-written inputs, **Agentic QA acts as an active red-team**, dynamically generating the tricky edge cases needed to break your system.
|
|
6
|
+
|
|
7
|
+

|
|
8
|
+

|
|
9
|
+

|
|
10
|
+

|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## 🚀 Quick Start
|
|
15
|
+
|
|
16
|
+
### 1. Installation
|
|
17
|
+
|
|
18
|
+
Install the library locally:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
git clone https://github.com/yourusername/multi-agent-qa.git
|
|
22
|
+
cd multi-agent-qa
|
|
23
|
+
pip install -e .
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Ensure you have your `.env` configured with your API keys:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
cp .env.example .env
|
|
30
|
+
# Edit .env and provide OPENAI_API_KEY
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### 2. Using the Python Library
|
|
34
|
+
|
|
35
|
+
You can test any RAG or LLM pipeline in just a few lines of code.
|
|
36
|
+
|
|
37
|
+
#### Option A: Testing a Python Function
|
|
38
|
+
If your RAG system is a Python function in your codebase:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import agentic_qa
|
|
42
|
+
|
|
43
|
+
# Your existing RAG or Chatbot function
|
|
44
|
+
def my_custom_rag(query: str) -> str:
|
|
45
|
+
# Example: return my_langchain_pipeline.invoke(query)
|
|
46
|
+
return "This is my AI response."
|
|
47
|
+
|
|
48
|
+
# Run the autonomous testing loop
|
|
49
|
+
report = agentic_qa.run_autonomous_test(
|
|
50
|
+
target_function=my_custom_rag,
|
|
51
|
+
system_name="YouTube Video Q&A",
|
|
52
|
+
system_description="A chatbot that answers questions about YouTube transcripts.",
|
|
53
|
+
domain="video content",
|
|
54
|
+
max_iterations=3, # How many times agents learn and retry
|
|
55
|
+
tests_per_iteration=5 # Tests generated per round
|
|
56
|
+
)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
#### Option B: Testing an API Endpoint
|
|
60
|
+
If your system is deployed behind a REST API (FastAPI, Flask, LangServe):
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
import agentic_qa
|
|
64
|
+
|
|
65
|
+
report = agentic_qa.run_autonomous_test(
|
|
66
|
+
api_endpoint="http://localhost:8000/api/chat",
|
|
67
|
+
system_name="Customer Support Bot",
|
|
68
|
+
system_description="An AI that resolves customer support tickets.",
|
|
69
|
+
domain="customer support"
|
|
70
|
+
)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### 3. Using the Streamlit UI
|
|
74
|
+
|
|
75
|
+
If you prefer a visual dashboard to monitor the agents in real-time, run the included Streamlit app:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
streamlit run app.py
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
From the UI, you can connect your API endpoint or use the built-in mock system for a demonstration.
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## 🏗️ Architecture
|
|
86
|
+
|
|
87
|
+
The framework is powered by 5 autonomous agents built with LangGraph:
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
START โโโถ ๐ด Red-Team Agent โโโถ โก Executor Agent โโโถ โ๏ธ Judge Agent โโโถ Decision
|
|
91
|
+
โฒ โ
|
|
92
|
+
โ โโโโโโโดโโโโโโ
|
|
93
|
+
โ โผ โผ
|
|
94
|
+
โโโโโโโโโโโโโโโโโ ๐ง Refiner Agent ๐ Reporter Agent
|
|
95
|
+
(loop back) (END)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Agent Roles
|
|
99
|
+
|
|
100
|
+
| Agent | Role |
|
|
101
|
+
|-------|------|
|
|
102
|
+
| 🔴 **Red-Team** | Generates adversarial test inputs targeting edge cases (prompt injections, boundary values, etc.). |
|
|
103
|
+
| ⚡ **Executor** | Runs tests through the target system and captures the outputs. |
|
|
104
|
+
| ⚖️ **Judge** | Evaluates the outputs using an LLM-as-a-Judge pattern with strict pass/fail criteria. |
|
|
105
|
+
| 🔧 **Refiner** | Analyzes the judge's failure patterns and instructs the Red-Team on how to exploit weaknesses in the next iteration. |
|
|
106
|
+
| ๐ **Reporter** | Compiles a comprehensive final Markdown QA report. |
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## 🧠 What Makes This Novel
|
|
111
|
+
|
|
112
|
+
| Traditional Testing Tools (RAGAS, TruLens) | Agentic QA |
|
|
113
|
+
|----------------------------------------|-------------|
|
|
114
|
+
| Measures outputs against static inputs | **Generates** the adversarial inputs autonomously |
|
|
115
|
+
| Human writes test cases | AI agents write and refine test cases |
|
|
116
|
+
| One-shot evaluation | **Self-improving loop** with pattern learning |
|
|
117
|
+
| Relies heavily on reference data | Relies on behavioral boundaries and edge-case testing |
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## ๐ Project Structure
|
|
122
|
+
|
|
123
|
+
```text
|
|
124
|
+
multi-agent-qa/
|
|
125
|
+
โโโ agentic_qa/
|
|
126
|
+
โ โโโ __init__.py # Clean developer API (run_autonomous_test)
|
|
127
|
+
โ โโโ agents/ # 5 LangGraph agent definitions
|
|
128
|
+
โ โโโ graph/ # State definitions and LangGraph flow
|
|
129
|
+
โ โโโ schemas/ # Pydantic validation models
|
|
130
|
+
โ โโโ sut/ # Adapters (API, Callable, Base)
|
|
131
|
+
โ โโโ utils/ # Prompt templates
|
|
132
|
+
โโโ setup.py # Package configuration
|
|
133
|
+
โโโ app.py # Streamlit Dashboard UI
|
|
134
|
+
โโโ .env # API Keys configuration
|
|
135
|
+
โโโ README.md
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## 📡 LangSmith Monitoring
|
|
139
|
+
|
|
140
|
+
All agent interactions are automatically traced via LangSmith if configured in `.env`.
|
|
141
|
+
|
|
142
|
+
```env
|
|
143
|
+
LANGCHAIN_TRACING_V2=true
|
|
144
|
+
LANGCHAIN_API_KEY=your-langsmith-api-key
|
|
145
|
+
LANGCHAIN_PROJECT=agentic-qa
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## ๐ License
|
|
149
|
+
|
|
150
|
+
MIT
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agentic QA - Main Public API
|
|
3
|
+
|
|
4
|
+
This file exposes the simple `run_autonomous_test` function, allowing developers
|
|
5
|
+
to test their RAG systems in just a few lines of code.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from typing import Callable, Optional
|
|
10
|
+
|
|
11
|
+
# Expose SUT adapters for developers if they want advanced setups
|
|
12
|
+
from agentic_qa.sut.base import BaseSUTAdapter
|
|
13
|
+
from agentic_qa.sut.api_adapter import APIAdapter
|
|
14
|
+
from agentic_qa.sut.callable_adapter import CallableAdapter
|
|
15
|
+
from agentic_qa.sut import set_active_sut
|
|
16
|
+
|
|
17
|
+
# Import the core workflow
|
|
18
|
+
from agentic_qa.graph.workflow import build_qa_graph, get_initial_state
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def run_autonomous_test(
    target_function: Optional[Callable] = None,
    api_endpoint: Optional[str] = None,
    system_name: str = "Target System",
    system_description: str = "A generic RAG system",
    domain: str = "general",
    max_iterations: int = 3,
    tests_per_iteration: int = 5,
    model_name: str = "gpt-4o-mini",
) -> dict:
    """
    Run an autonomous multi-agent QA test against a target system.

    You must provide EITHER a `target_function` (a python function) OR
    an `api_endpoint` (a URL string). If both are given, `target_function`
    takes precedence and `api_endpoint` is ignored.

    Side effects: sets the `MAX_ITERATIONS`, `TESTS_PER_ITERATION` and
    `MODEL_NAME` environment variables, registers the adapter as the
    process-wide active SUT, and prints progress to stdout.

    Args:
        target_function: A python function that takes a string query and returns a string answer.
        api_endpoint: A URL endpoint (e.g., http://localhost:8000/chat) to test.
        system_name: The name of the system being tested.
        system_description: A description of what the system does. Highly important for agents!
        domain: The domain of the system (e.g., 'financial', 'healthcare', 'customer support').
        max_iterations: How many times the agents should refine and retry their tests.
        tests_per_iteration: How many tests the Red-Team agent generates per round.
        model_name: The LLM to use for the agents (default: gpt-4o-mini).

    Returns:
        A dictionary containing the final execution state, including the test suite, verdicts,
        failure patterns, and the final Markdown report.

    Raises:
        ValueError: If neither `target_function` nor `api_endpoint` is provided.
        RuntimeError: If the graph stream finishes without emitting any state.
    """
    # Validate before touching any global state, so a rejected call leaves
    # os.environ and the active-SUT registry exactly as they were.
    # `is not None` (rather than truthiness) keeps callable objects that
    # happen to be falsy (e.g. define __len__ == 0) usable as targets.
    if target_function is None and not api_endpoint:
        raise ValueError("You must provide either a `target_function` or an `api_endpoint`.")

    # 1. Configure Environment variables needed by LangGraph
    os.environ["MAX_ITERATIONS"] = str(max_iterations)
    os.environ["TESTS_PER_ITERATION"] = str(tests_per_iteration)
    os.environ["MODEL_NAME"] = model_name

    # 2. Setup the SUT Adapter
    if target_function is not None:
        adapter = CallableAdapter(
            fn=target_function,
            description=system_description,
            system_name=system_name,
            domain=domain,
        )
    else:
        adapter = APIAdapter(
            endpoint=api_endpoint,
            description=system_description,
            system_name=system_name,
            domain=domain,
        )

    # Register the adapter globally for the Executor Agent to use
    set_active_sut(adapter)

    # 3. Build and Run the Graph
    print(f"🚀 Starting Autonomous QA Test against: {system_name}")
    print(f"Domain: {domain} | Max Iterations: {max_iterations}")

    graph = build_qa_graph()
    initial_state = get_initial_state()
    initial_state["max_iterations"] = max_iterations

    final_state = None
    # Stream the graph to provide real-time console feedback; each event is
    # a state snapshot, so the last one seen is the final state.
    for event in graph.stream(initial_state, stream_mode="values"):
        final_state = event

    if final_state is None:
        # Without this guard an empty stream would surface as an opaque
        # AttributeError on `None.get` below.
        raise RuntimeError("The QA graph finished without emitting any state.")

    print("\n✅ Autonomous QA Test Complete!")
    print(f"Coverage Score: {final_state.get('coverage_score', 0):.1%}")
    print(f"Total Failure Patterns Found: {len(final_state.get('failure_patterns', []))}")

    return final_state
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Define what is exported when someone runs `from agentic_qa import *`
|
|
98
|
+
__all__ = [
|
|
99
|
+
"run_autonomous_test",
|
|
100
|
+
"APIAdapter",
|
|
101
|
+
"CallableAdapter",
|
|
102
|
+
"BaseSUTAdapter"
|
|
103
|
+
]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Multi-Agent Autonomous QA System - Agents Package"""
|
|
2
|
+
|
|
3
|
+
from agentic_qa.agents.red_team import red_team_node
|
|
4
|
+
from agentic_qa.agents.executor import executor_node
|
|
5
|
+
from agentic_qa.agents.judge import judge_node
|
|
6
|
+
from agentic_qa.agents.refiner import refiner_node
|
|
7
|
+
from agentic_qa.agents.reporter import reporter_node
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"red_team_node",
|
|
11
|
+
"executor_node",
|
|
12
|
+
"judge_node",
|
|
13
|
+
"refiner_node",
|
|
14
|
+
"reporter_node",
|
|
15
|
+
]
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Discovery Agent (Graphify) — Architecture Mapping.
|
|
3
|
+
|
|
4
|
+
This agent performs White-Box introspection. It analyzes the System Under Test (SUT)
|
|
5
|
+
and maps out its internal architecture (e.g., Vector DB, Chunk Size, Retriever type, LLM).
|
|
6
|
+
This architectural "graph" allows the Red-Team agent to launch hyper-targeted attacks.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from langchain_openai import ChatOpenAI
|
|
11
|
+
from langchain_core.messages import SystemMessage, HumanMessage
|
|
12
|
+
from agentic_qa.graph.state import QAState
|
|
13
|
+
|
|
14
|
+
DISCOVERY_SYSTEM_PROMPT = """You are an elite AI Architecture Mapper (Graphify Agent).
|
|
15
|
+
|
|
16
|
+
Your mission is to analyze a generic description of an AI/RAG system and deduce its likely internal architecture graph.
|
|
17
|
+
You must break down the system into a logical pipeline (e.g., Data Ingestion -> Chunking -> VectorDB -> Retriever -> LLM -> Output).
|
|
18
|
+
|
|
19
|
+
Think about:
|
|
20
|
+
1. What components must exist for this system to work?
|
|
21
|
+
2. Where are the likely weak points or bottlenecks between these nodes?
|
|
22
|
+
3. What are the assumed configurations (e.g., chunk size, top-k retrieval)?
|
|
23
|
+
|
|
24
|
+
Output a detailed, graph-like text representation of the architecture. Be specific about potential vulnerabilities at each node."""
|
|
25
|
+
|
|
26
|
+
DISCOVERY_USER_PROMPT = """Analyze the following System Under Test and map its architecture.
|
|
27
|
+
|
|
28
|
+
**System Name:** {name}
|
|
29
|
+
**Domain:** {domain}
|
|
30
|
+
**Description:**
|
|
31
|
+
{description}
|
|
32
|
+
|
|
33
|
+
Provide a detailed architectural breakdown (Graphify) of this system. Highlight the specific nodes (e.g., Retriever, VectorStore, LLM) and the data flow.
|
|
34
|
+
Keep it concise but technical."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_llm() -> ChatOpenAI:
    """Build the chat model used by the Discovery Agent.

    Configuration is read from the environment on every call:
    `MODEL_NAME` (default "gpt-4o-mini"), `OPENAI_API_KEY`, and
    `OPENAI_API_BASE` (default "https://api.openai.com/v1").
    The sampling temperature is fixed at 0.2.
    """
    env = os.environ
    llm_settings = {
        "model": env.get("MODEL_NAME", "gpt-4o-mini"),
        "temperature": 0.2,
        "openai_api_key": env.get("OPENAI_API_KEY"),
        "openai_api_base": env.get("OPENAI_API_BASE", "https://api.openai.com/v1"),
    }
    return ChatOpenAI(**llm_settings)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def discovery_node(state: QAState) -> dict:
    """
    LangGraph node: Discovery Agent.

    Runs once at the beginning of the pipeline to map the SUT architecture.

    If the state already carries a non-empty `sut_architecture`, no LLM call
    is made and an empty update is returned. Otherwise the SUT description
    and domain are sent to the LLM and its reply is stored as the
    architecture.

    Args:
        state: Current graph state; `sut_architecture`, `sut_description`
            and `domain` are read from it.

    Returns:
        `{}` when the architecture was already provided, otherwise
        `{"sut_architecture": <LLM response content>}`.
    """
    print(f"\n{'='*60}")
    print("🔍 DISCOVERY AGENT (Graphify) — Mapping Architecture")
    print(f"{'='*60}")

    # If the architecture is already provided (e.g. by the adapter), skip LLM discovery
    if state.get("sut_architecture"):
        print(" Architecture already provided. Skipping LLM discovery.")
        return {}

    print(" Analyzing SUT description to deduce internal graph...")

    # NOTE(review): the system name is a fixed placeholder; the real SUT name
    # is not read from state here — confirm whether a state key should feed it.
    prompt = DISCOVERY_USER_PROMPT.format(
        name="Target System",
        domain=state.get("domain", ""),
        description=state.get("sut_description", ""),
    )

    messages = [
        SystemMessage(content=DISCOVERY_SYSTEM_PROMPT),
        HumanMessage(content=prompt),
    ]

    response = _get_llm().invoke(messages)
    architecture = response.content

    print("\n 🗺️ Deduced Architecture Graph:")
    # Print the first few lines as a preview
    preview_limit = 10
    arch_lines = architecture.split("\n")
    print("\n".join(f" {line}" for line in arch_lines[:preview_limit]))
    # Only signal truncation when lines were actually left out.
    if len(arch_lines) > preview_limit:
        print(" ...")

    return {
        "sut_architecture": architecture
    }
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Executor Agent — Generic System Under Test Runner.
|
|
3
|
+
|
|
4
|
+
Executes test cases against ANY connected SUT (RAG, API, function).
|
|
5
|
+
Uses the active SUT adapter from the registry — works with:
|
|
6
|
+
- Built-in Financial RAG demo
|
|
7
|
+
- Any RAG connected via API endpoint
|
|
8
|
+
- Any Python function wrapped as a callable
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import time
|
|
12
|
+
from agentic_qa.graph.state import QAState
|
|
13
|
+
from agentic_qa.sut import get_active_sut
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def executor_node(state: QAState) -> dict:
    """
    LangGraph node: Executor Agent.

    Runs each test case from the current iteration through whatever
    SUT is currently active and collects results.

    Test cases are selected by the id prefix `TC-<iteration:02d>`. When no
    id matches, the last N entries of the suite are used instead, where N
    comes from the `TESTS_PER_ITERATION` environment variable (default 5).

    Args:
        state: Current graph state; `current_iteration` and `test_suite`
            are read from it. Each test case must provide `id` and
            `input_data`.

    Returns:
        `{"execution_results": [...]}` with one dict per executed test:
        `test_id`, `sut_output` (string), `execution_time` (seconds,
        rounded to 4 places) and `error` (message or None).
    """
    # Function-scope import: this module only imports `time` at file level.
    import os

    iteration = state.get("current_iteration", 1)
    test_suite = state.get("test_suite", [])

    print(f"\n{'='*60}")
    print(f"⚡ EXECUTOR AGENT — Iteration {iteration}")
    print(f"{'='*60}")

    # Get test cases for the current iteration only
    iter_prefix = f"TC-{iteration:02d}"
    current_tests = [tc for tc in test_suite if tc["id"].startswith(iter_prefix)]

    if not current_tests:
        # Fall back to the most recent batch. Honour the configured batch
        # size instead of assuming 5.
        try:
            num_per_iter = int(os.getenv("TESTS_PER_ITERATION", "5"))
        except ValueError:
            num_per_iter = 5
        if num_per_iter <= 0:
            # `[-0:]` would select the whole suite and a negative count
            # would drop leading tests, so use the default instead.
            num_per_iter = 5
        current_tests = test_suite[-num_per_iter:]

    # Get the active SUT (whatever the user connected)
    sut = get_active_sut()
    print(f" SUT: {sut.name}")
    print(f" Executing {len(current_tests)} test cases...")

    execution_results = []

    for tc in current_tests:
        test_id = tc["id"]
        input_data = tc["input_data"]

        print(f" ▶ Running [{test_id}]...", end=" ")

        # perf_counter is monotonic, so wall-clock adjustments cannot
        # distort the measured duration.
        start_time = time.perf_counter()
        try:
            output = sut.process(input_data)
        except Exception as e:
            # Deliberately broad: a crashing SUT is a test finding to be
            # recorded, not a reason to abort the whole run.
            exec_time = time.perf_counter() - start_time
            result = {
                "test_id": test_id,
                "sut_output": "",
                "execution_time": round(exec_time, 4),
                "error": str(e),
            }
            print(f"ERROR ({exec_time:.3f}s) — {e}")
        else:
            exec_time = time.perf_counter() - start_time

            # Normalize output — adapters return "output" key
            if isinstance(output, dict):
                sut_output = output.get("output", output.get("sut_output", str(output)))
                status = output.get("status", "unknown")
                error = output.get("error") if status == "error" else None
            else:
                # A non-dict return is still a valid answer; do not let it
                # be misreported as a crash.
                sut_output, status, error = output, "unknown", None

            result = {
                "test_id": test_id,
                # Store the normalized answer, not the adapter's whole dict repr.
                "sut_output": str(sut_output),
                "execution_time": round(exec_time, 4),
                "error": error,
            }
            print(f"Done ({exec_time:.3f}s) — Status: {status}")

        execution_results.append(result)

    errors = sum(1 for r in execution_results if r["error"])
    avg_time = sum(r["execution_time"] for r in execution_results) / max(len(execution_results), 1)
    print("\n 📊 Execution Summary:")
    print(f" Tests executed: {len(execution_results)}")
    print(f" Errors/crashes: {errors}")
    print(f" Avg exec time: {avg_time:.3f}s")

    return {
        "execution_results": execution_results,
    }
|