alita-sdk 0.3.603__py3-none-any.whl → 0.3.609__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of alita-sdk might be problematic.
- alita_sdk/cli/agents.py +108 -826
- alita_sdk/cli/testcases/__init__.py +94 -0
- alita_sdk/cli/testcases/data_generation.py +119 -0
- alita_sdk/cli/testcases/discovery.py +96 -0
- alita_sdk/cli/testcases/executor.py +84 -0
- alita_sdk/cli/testcases/logger.py +85 -0
- alita_sdk/cli/testcases/parser.py +172 -0
- alita_sdk/cli/testcases/prompts.py +91 -0
- alita_sdk/cli/testcases/reporting.py +125 -0
- alita_sdk/cli/testcases/setup.py +108 -0
- alita_sdk/cli/testcases/test_runner.py +282 -0
- alita_sdk/cli/testcases/utils.py +39 -0
- alita_sdk/cli/testcases/validation.py +90 -0
- alita_sdk/cli/testcases/workflow.py +196 -0
- alita_sdk/configurations/openapi.py +2 -2
- alita_sdk/runtime/clients/artifact.py +1 -1
- alita_sdk/runtime/tools/artifact.py +253 -8
- alita_sdk/runtime/tools/llm.py +12 -11
- alita_sdk/tools/bitbucket/api_wrapper.py +31 -30
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
- alita_sdk/tools/confluence/api_wrapper.py +8 -1
- alita_sdk/tools/elitea_base.py +40 -36
- alita_sdk/tools/figma/api_wrapper.py +140 -83
- alita_sdk/tools/github/graphql_client_wrapper.py +1 -0
- alita_sdk/tools/utils/text_operations.py +156 -52
- {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.609.dist-info}/METADATA +1 -1
- {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.609.dist-info}/RECORD +31 -18
- {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.609.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.609.dist-info}/entry_points.txt +0 -0
- {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.609.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.609.dist-info}/top_level.txt +0 -0
alita_sdk/cli/agents.py
CHANGED

@@ -36,439 +36,44 @@ from .callbacks import create_cli_callback, CLICallbackHandler
 from .input_handler import get_input_handler, styled_input, styled_selection_input
 # Context management for chat history
 from .context import CLIContextManager, CLIMessage, purge_old_sessions as purge_context_sessions
+# Test execution utilities
+from .testcases import (
+    parse_test_case,
+    resolve_toolkit_config_path,
+    build_bulk_data_gen_prompt,
+    build_single_test_execution_prompt,
+    build_single_test_validation_prompt,
+    extract_json_from_text,
+    create_fallback_result_for_test,
+    print_validation_diagnostics,
+    TestLogCapture,
+    create_executor_from_cache,
+    cleanup_executor_cache,
+    extract_toolkit_name,
+    # New helper functions
+    load_test_runner_agent,
+    load_data_generator_agent,
+    load_validator_agent,
+    discover_test_case_files,
+    validate_test_case_files,
+    print_test_execution_header,
+    execute_bulk_data_generation,
+    execute_single_test_case,
+    validate_single_test_case,
+    generate_summary_report,
+    save_structured_report,
+    print_test_execution_summary,
+    # Workflow orchestration
+    parse_all_test_cases,
+    filter_test_cases_needing_data_gen,
+    execute_all_test_cases,
+)
 
 logger = logging.getLogger(__name__)
 
 # Create a rich console for beautiful output
 console = Console()
 
-
-def resolve_toolkit_config_path(config_path_str: str, test_file: Path, test_cases_dir: Path) -> Optional[str]:
-    """
-    Resolve toolkit configuration file path from test case.
-
-    Tries multiple locations in order:
-    1. Absolute path
-    2. Relative to test case file directory
-    3. Relative to test cases directory
-    4. Relative to workspace root
-
-    Args:
-        config_path_str: Config path from test case
-        test_file: Path to the test case file
-        test_cases_dir: Path to test cases directory
-
-    Returns:
-        Absolute path to config file if found, None otherwise
-    """
-    if not config_path_str:
-        return None
-
-    # Normalize path separators
-    config_path_str = config_path_str.replace('\\', '/')
-
-    # Try absolute path first
-    config_path = Path(config_path_str)
-    if config_path.is_absolute() and config_path.exists():
-        return str(config_path)
-
-    # Try relative to test case file directory
-    config_path = test_file.parent / config_path_str
-    if config_path.exists():
-        return str(config_path)
-
-    # Try relative to test_cases_dir
-    config_path = test_cases_dir / config_path_str
-    if config_path.exists():
-        return str(config_path)
-
-    # Try relative to workspace root
-    workspace_root = Path.cwd()
-    config_path = workspace_root / config_path_str
-    if config_path.exists():
-        return str(config_path)
-
-    return None
-
-
-def parse_test_case(test_case_path: str) -> Dict[str, Any]:
-    """
-    Parse a test case markdown file to extract configuration, steps, and expectations.
-
-    Args:
-        test_case_path: Path to the test case markdown file
-
-    Returns:
-        Dictionary containing:
-        - name: Test case name
-        - objective: Test objective
-        - config_path: Path to toolkit config file
-        - generate_test_data: Boolean flag indicating if test data generation is needed (default: True)
-        - test_data_config: Dictionary of test data configuration from table
-        - prerequisites: Pre-requisites section text
-        - variables: List of variable placeholders found (e.g., {{TEST_PR_NUMBER}})
-        - steps: List of test steps with their descriptions
-        - expectations: List of expectations/assertions
-    """
-    path = Path(test_case_path)
-    if not path.exists():
-        raise FileNotFoundError(f"Test case not found: {test_case_path}")
-
-    content = path.read_text(encoding='utf-8')
-
-    # Extract test case name from the first heading
-    name_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
-    name = name_match.group(1) if name_match else path.stem
-
-    # Extract objective
-    objective_match = re.search(r'##\s+Objective\s*\n\n(.+?)(?=\n\n##|\Z)', content, re.DOTALL)
-    objective = objective_match.group(1).strip() if objective_match else ""
-
-    # Extract config path and generateTestData flag
-    config_section_match = re.search(r'##\s+Config\s*\n\n(.+?)(?=\n\n##|\Z)', content, re.DOTALL)
-    config_path = None
-    generate_test_data = True  # Default to True if not specified
-
-    if config_section_match:
-        config_section = config_section_match.group(1)
-        # Extract path
-        path_match = re.search(r'path:\s*(.+?)(?=\n|$)', config_section, re.MULTILINE)
-        if path_match:
-            config_path = path_match.group(1).strip()
-
-        # Extract generateTestData flag
-        gen_data_match = re.search(r'generateTestData\s*:\s*(true|false)', config_section, re.IGNORECASE)
-        if gen_data_match:
-            generate_test_data = gen_data_match.group(1).lower() == 'true'
-
-    # Extract Test Data Configuration section as a raw fenced code block string
-    # NOTE: We intentionally store the entire section as a single string rather than parsing
-    # individual table rows. This preserves the original formatting for downstream tools
-    # which may prefer the raw markdown block.
-    test_data_config = None
-    config_section_match = re.search(r'##\s+Test Data Configuration\s*\n(.+?)(?=\n##|\Z)', content, re.DOTALL)
-    if config_section_match:
-        config_section = config_section_match.group(1).strip()
-        # Store as a fenced code block to make it clear this is a raw block of text
-        test_data_config = f"\n{config_section}\n"
-
-    # Extract Pre-requisites section
-    prerequisites = ""
-    prereq_match = re.search(r'##\s+Pre-requisites\s*\n\n(.+?)(?=\n\n##|\Z)', content, re.DOTALL)
-    if prereq_match:
-        prerequisites = prereq_match.group(1).strip()
-
-    # Find all variable placeholders ({{VARIABLE_NAME}})
-    variables = list(set(re.findall(r'\{\{([A-Z_]+)\}\}', content)))
-
-    # Extract test steps and expectations
-    steps = []
-    expectations = []
-
-    # Find all Step sections
-    step_pattern = r'###\s+Step\s+(\d+):\s+(.+?)\n\n(.+?)(?=\n\n###|\n\n##|\Z)'
-    for step_match in re.finditer(step_pattern, content, re.DOTALL):
-        step_num = step_match.group(1)
-        step_title = step_match.group(2).strip()
-        step_content = step_match.group(3).strip()
-
-        # Extract the actual instruction (first paragraph before "Expectation:")
-        instruction_match = re.search(r'(.+?)(?=\n\n\*\*Expectation:\*\*|\Z)', step_content, re.DOTALL)
-        instruction = instruction_match.group(1).strip() if instruction_match else step_content
-
-        # Extract expectation if present
-        expectation_match = re.search(r'\*\*Expectation:\*\*\s+(.+)', step_content, re.DOTALL)
-        expectation = expectation_match.group(1).strip() if expectation_match else None
-
-        steps.append({
-            'number': int(step_num),
-            'title': step_title,
-            'instruction': instruction,
-            'expectation': expectation
-        })
-
-        if expectation:
-            expectations.append({
-                'step': int(step_num),
-                'description': expectation
-            })
-
-    return {
-        'name': name,
-        'objective': objective,
-        'config_path': config_path,
-        'generate_test_data': generate_test_data,
-        'test_data_config': test_data_config,
-        'prerequisites': prerequisites,
-        'variables': variables,
-        'steps': steps,
-        'expectations': expectations
-    }
-
-
-def validate_test_output(output: str, expectation: str) -> tuple[bool, str]:
-    """
-    Validate test output against expectations.
-
-    Args:
-        output: The actual output from the agent
-        expectation: The expected result description
-
-    Returns:
-        Tuple of (passed: bool, details: str)
-    """
-    # Simple keyword-based validation
-    # Extract key phrases from expectation
-
-    # Common patterns in expectations
-    if "contains" in expectation.lower():
-        # Extract what should be contained
-        contains_match = re.search(r'contains.*?["`]([^"`]+)["`]', expectation, re.IGNORECASE)
-        if contains_match:
-            expected_text = contains_match.group(1)
-            if expected_text in output:
-                return True, f"Output contains expected text: '{expected_text}'"
-            else:
-                return False, f"Output does not contain expected text: '{expected_text}'"
-
-    if "without errors" in expectation.lower() or "runs without errors" in expectation.lower():
-        # Check for common error indicators
-        error_indicators = ['error', 'exception', 'failed', 'traceback']
-        has_error = any(indicator in output.lower() for indicator in error_indicators)
-        if not has_error:
-            return True, "Execution completed without errors"
-        else:
-            return False, "Execution encountered errors"
-
-    # Default: assume pass if output is non-empty
-    if output and len(output.strip()) > 0:
-        return True, "Output generated successfully"
-
-    return False, "No output generated"
-
-
-def _build_bulk_data_gen_prompt(parsed_test_cases: list) -> str:
-    """Build consolidated requirements text for bulk test data generation."""
-    requirements = []
-    for idx, tc in enumerate(parsed_test_cases, 1):
-        test_case = tc['data']
-        test_file = tc['file']
-        # Build parts for this test case (do not include separator lines here;
-        # the entire block is wrapped with separators at the top-level)
-        parts = [f"Test Case #{idx}: {test_case['name']}", f"File: {test_file.name}", ""]
-
-        if test_case.get('test_data_config'):
-            parts.append("Test Data Configuration:")
-            td = test_case['test_data_config']
-            raw_lines = str(td).splitlines()
-            for line in raw_lines:
-                parts.append(f"{line}")
-
-        if test_case.get('prerequisites'):
-            parts.append(f"\nPre-requisites:\n{test_case['prerequisites']}")
-
-        requirements.append("\n".join(parts))
-
-    # If no requirements were collected, return an empty string to avoid
-    # producing a prompt with only separator lines.
-    if not requirements:
-        return ""
-
-    # Use a visible divider between test cases so each entry is clearly separated
-    divider = '-' * 40
-    body = f"\n\n{divider}\n\n".join(requirements)
-    return f"{('='*60)}\n\n{body}\n\n{('='*60)}"
-
-
-def _build_single_test_execution_prompt(test_case_info: dict, test_number: int) -> str:
-    """Build execution prompt for a single test case."""
-    test_case = test_case_info['data']
-    test_file = test_case_info['file']
-
-    parts = [
-        f"\n{'='*80}",
-        f"TEST CASE #{test_number}: {test_case['name']}",
-        f"File: {test_file.name}",
-        f"{'='*80}"
-    ]
-
-    if test_case['steps']:
-        for step in test_case['steps']:
-            parts.append(f"\nStep {step['number']}: {step['title']}")
-            parts.append(step['instruction'])
-    else:
-        parts.append("\n(No steps defined)")
-
-    return "\n".join(parts)
-
-
-def _build_single_test_validation_prompt(test_case_info: dict, test_number: int, execution_output: str) -> str:
-    """Build validation prompt for a single test case."""
-    test_case = test_case_info['data']
-
-    parts = [
-        f"\nTest Case #{test_number}: {test_case['name']}"
-    ]
-
-    if test_case['steps']:
-        for step in test_case['steps']:
-            parts.append(f"  Step {step['number']}: {step['title']}")
-            if step['expectation']:
-                parts.append(f"    Expected: {step['expectation']}")
-
-    parts.append(f"\n\nActual Execution Results:\n{execution_output}\n")
-
-    # Escape quotes in test name for valid JSON in prompt
-    escaped_test_name = test_case['name'].replace('"', '\\"')
-
-    parts.append(f"""\nBased on the execution results above, validate this test case.
-{{
-  "test_number": {test_number},
-  "test_name": "{escaped_test_name}"
-}}
-""")
-
-    return "\n".join(parts)
-
-
-def _extract_json_from_text(text: str) -> dict:
-    """Extract JSON object from text using brace counting."""
-    start_idx = text.find('{')
-    if start_idx == -1:
-        raise ValueError("No JSON found in text")
-
-    brace_count = 0
-    end_idx = -1
-    for i, char in enumerate(text[start_idx:], start=start_idx):
-        if char == '{':
-            brace_count += 1
-        elif char == '}':
-            brace_count -= 1
-            if brace_count == 0:
-                end_idx = i + 1
-                break
-
-    if end_idx == -1:
-        raise ValueError("Could not find matching closing brace")
-
-    return json.loads(text[start_idx:end_idx])
-
-
-def _create_fallback_result_for_test(test_case: dict, test_file: Path, reason: str = 'Validation failed') -> dict:
-    """Create a fallback result for a single test case with detailed step information.
-
-    Args:
-        test_case: Parsed test case data
-        test_file: Path to test case file
-        reason: Reason for fallback
-
-    Returns:
-        Fallback test result dict with step details
-    """
-    fallback_steps = []
-    for step_info in test_case.get('steps', []):
-        fallback_steps.append({
-            'step_number': step_info['number'],
-            'title': step_info['title'],
-            'passed': False,
-            'details': reason
-        })
-
-    return {
-        'title': test_case['name'],
-        'passed': False,
-        'file': test_file.name,
-        'step_results': fallback_steps,
-        'validation_error': reason
-    }
-
-
-def _cleanup_executor_cache(cache: Dict[str, tuple], cache_name: str = "executor") -> None:
-    """Clean up executor cache resources.
-
-    Args:
-        cache: Dictionary of cached executors
-        cache_name: Name of cache for logging
-    """
-    console.print(f"[dim]Cleaning up {cache_name} cache...[/dim]")
-    for cache_key, cached_items in cache.items():
-        try:
-            # Extract memory from tuple (second element)
-            memory = cached_items[1] if len(cached_items) > 1 else None
-
-            # Close SQLite memory connection
-            if memory and hasattr(memory, 'conn') and memory.conn:
-                memory.conn.close()
-        except Exception as e:
-            logger.debug(f"Error cleaning up {cache_name} cache for {cache_key}: {e}")
-
-
-def _create_executor_from_cache(cache: Dict[str, tuple], cache_key: str,
-                                client, agent_def: Dict, toolkit_config_path: Optional[str],
-                                config, model: Optional[str], temperature: Optional[float],
-                                max_tokens: Optional[int], work_dir: Optional[str]) -> tuple:
-    """Get or create executor from cache.
-
-    Args:
-        cache: Executor cache dictionary
-        cache_key: Key for caching
-        client: API client
-        agent_def: Agent definition
-        toolkit_config_path: Path to toolkit config
-        config: CLI configuration
-        model: Model override
-        temperature: Temperature override
-        max_tokens: Max tokens override
-        work_dir: Working directory
-
-    Returns:
-        Tuple of (agent_executor, memory, mcp_session_manager)
-    """
-    if cache_key in cache:
-        return cache[cache_key]
-
-    # Create new executor
-    from langgraph.checkpoint.sqlite import SqliteSaver
-    import sqlite3
-
-    memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
-    toolkit_config_tuple = (toolkit_config_path,) if toolkit_config_path else ()
-
-    agent_executor, mcp_session_manager, _, _, _, _, _ = _setup_local_agent_executor(
-        client, agent_def, toolkit_config_tuple, config, model, temperature,
-        max_tokens, memory, work_dir
-    )
-
-    # Cache the executor
-    cached_tuple = (agent_executor, memory, mcp_session_manager)
-    cache[cache_key] = cached_tuple
-    return cached_tuple
-
-
-def _print_validation_diagnostics(validation_output: str) -> None:
-    """Print diagnostic information for validation output.
-
-    Args:
-        validation_output: The validation output to diagnose
-    """
-    console.print(f"\n[bold red]🔍 Diagnostic Information:[/bold red]")
-    console.print(f"[dim]Output length: {len(validation_output)} characters[/dim]")
-
-    # Check for key JSON elements
-    has_json = '{' in validation_output and '}' in validation_output
-    has_fields = 'test_number' in validation_output and 'steps' in validation_output
-
-    console.print(f"[dim]Has JSON structure: {has_json}[/dim]")
-    console.print(f"[dim]Has required fields: {has_fields}[/dim]")
-
-    # Show relevant excerpt
-    if len(validation_output) > 400:
-        console.print(f"\n[red]First 200 chars:[/red] [dim]{validation_output[:200]}[/dim]")
-        console.print(f"[red]Last 200 chars:[/red] [dim]{validation_output[-200:]}[/dim]")
-    else:
-        console.print(f"\n[red]Full output:[/red] [dim]{validation_output}[/dim]")
-
-
 def _get_alita_system_prompt(config) -> str:
     """
     Get the Alita system prompt from user config or fallback to default.
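For reference, `parse_test_case` (removed here and now imported from `.testcases`) reads a markdown test case and returns a plain dict. The sketch below shows the layout the regexes above look for and the shape of the result; the file name and all field values are hypothetical, and it assumes the extracted `testcases` parser keeps the same contract as the removed in-file implementation.

```python
# Hypothetical test case (e.g. test_cases/TC-001-create-branch.md) in the layout
# the removed parse_test_case() regexes expect; names and values are made up.
SAMPLE_TEST_CASE = """\
# TC-001: Create Branch

## Objective

Verify the toolkit can create a branch.

## Config

path: configs/bitbucket.json
generateTestData: false

## Pre-requisites

Repository {{TEST_REPO}} exists.

### Step 1: Create the branch

Create a branch named {{TEST_BRANCH}}.

**Expectation:** Output contains "created".
"""

# Shape of the dict parse_test_case() is expected to return for the file above.
# Keys come from the removed docstring; the order of 'variables' is not
# guaranteed because it is built from a set.
expected = {
    'name': 'TC-001: Create Branch',
    'objective': 'Verify the toolkit can create a branch.',
    'config_path': 'configs/bitbucket.json',
    'generate_test_data': False,
    'test_data_config': None,
    'prerequisites': 'Repository {{TEST_REPO}} exists.',
    'variables': ['TEST_REPO', 'TEST_BRANCH'],
    'steps': [{'number': 1, 'title': 'Create the branch',
               'instruction': 'Create a branch named {{TEST_BRANCH}}.',
               'expectation': 'Output contains "created".'}],
    'expectations': [{'step': 1, 'description': 'Output contains "created".'}],
}
print(expected['name'], len(expected['steps']))
```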
@@ -3343,12 +2948,15 @@ def agent_run(ctx, agent_source: str, message: str, version: Optional[str],
               help='Path to test validator agent definition file (default: .alita/agents/test-validator.agent.md)')
 @click.option('--skip-data-generation', is_flag=True,
               help='Skip test data generation step')
+@click.option('--verbose', '-v', type=click.Choice(['quiet', 'default', 'debug']), default='default',
+              help='Output verbosity level: quiet (final output only), default (tool calls + outputs), debug (all including LLM calls)')
 @click.pass_context
 def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir: str,
                        test_case_files: tuple, model: Optional[str], temperature: Optional[float],
                        max_tokens: Optional[int], work_dir: str,
                        data_generator: str, validator: Optional[str],
-                       skip_data_generation: bool):
+                       skip_data_generation: bool,
+                       verbose: str):
     """
     Execute test cases from a directory and save results.
 
@@ -3383,6 +2991,10 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
     config = ctx.obj['config']
     client = get_client(ctx)
 
+    # Setup verbose level
+    show_verbose = verbose != 'quiet'
+    debug_mode = verbose == 'debug'
+
     # Sanity-check committed defaults (should exist; fail early with a clear message if not)
     if results_dir and not Path(results_dir).exists():
         raise click.ClickException(
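The new `--verbose` option only feeds the two booleans added above, which are then passed as `verbose=`/`debug=` keyword arguments to the `execute_bulk_data_generation` and `execute_all_test_cases` helpers in the final hunk below. A quick sketch of the mapping (not part of the package, derived directly from the two added lines):

```python
# How each --verbose choice maps onto the two flags introduced above.
for verbose in ('quiet', 'default', 'debug'):
    show_verbose = verbose != 'quiet'   # False only for 'quiet'
    debug_mode = verbose == 'debug'     # True only for 'debug'
    print(f"{verbose:>7}: show_verbose={show_verbose}, debug_mode={debug_mode}")
```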
@@ -3391,428 +3003,98 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
         )
 
     try:
-        # Load agent
-
-        if not agent_source_path.exists():
-            default_path = Path('.alita') / 'agents' / 'test-runner.agent.md'
-            if agent_source_path == default_path:
-                raise click.ClickException(
-                    f"Default agent definition not found: {agent_source}. "
-                    f"Run this command from the repo root (so {default_path} resolves correctly) "
-                    f"or pass --agent_source explicitly."
-                )
-            raise click.ClickException(f"Agent definition not found: {agent_source}")
-
-        agent_def = load_agent_definition(agent_source)
-        agent_name = agent_def.get('name', Path(agent_source).stem)
+        # Load test runner agent
+        agent_def, agent_name = load_test_runner_agent(agent_source)
 
-        # Find
+        # Find and filter test case files
         test_cases_path = Path(test_cases_dir)
+        test_case_files_list = discover_test_case_files(test_cases_dir, test_case_files)
 
-        #
-        if test_case_files:
-            # User specified specific test case files
-            test_case_files_set = set(test_case_files)
-            all_test_cases = sorted(test_cases_path.rglob('TC-*.md'))
-            test_case_files_list = [
-                tc for tc in all_test_cases
-                if tc.name in test_case_files_set
-            ]
-
-            # Check if all specified files were found
-            found_names = {tc.name for tc in test_case_files_list}
-            not_found = test_case_files_set - found_names
-            if not_found:
-                console.print(f"[yellow]⚠ Warning: Test case files not found: {', '.join(not_found)}[/yellow]")
-        else:
-            # Execute all test cases
-            test_case_files_list = sorted(test_cases_path.rglob('TC-*.md'))
-
-        if not test_case_files_list:
-            if test_case_files:
-                console.print(f"[yellow]No matching test case files found in {test_cases_dir}[/yellow]")
-            else:
-                console.print(f"[yellow]No test case files found in {test_cases_dir}[/yellow]")
+        # Validate that test cases were found
+        if not validate_test_case_files(test_case_files_list, test_cases_dir, test_case_files):
             return
 
-
-
-        console.print(f"Test Cases: {len(test_case_files_list)}")
-        if test_case_files:
-            console.print(f"Selected: [cyan]{', '.join(test_case_files)}[/cyan]")
-        console.print(f"Results Directory: {results_dir}\n")
-
-        data_gen_def = None
-        if data_generator and not skip_data_generation:
-            try:
-                data_gen_def = load_agent_definition(data_generator)
-                data_gen_name = data_gen_def.get('name', Path(data_generator).stem)
-                console.print(f"Data Generator Agent: [bold]{data_gen_name}[/bold]\n")
-            except Exception as e:
-                console.print(f"[yellow]⚠ Warning: Failed to setup data generator: {e}[/yellow]")
-                console.print("[yellow]Continuing with test execution...[/yellow]\n")
-                logger.debug(f"Data generator setup error: {e}", exc_info=True)
-
-        # Load validator agent definition
-        validator_def = None
-        validator_agent_name = "Default Validator"
+        # Print execution header
+        print_test_execution_header(agent_name, test_case_files_list, test_case_files, results_dir)
 
-        #
-
-        if not validator_path:
-            # Default to .alita/agents/test-validator.agent.md
-            default_validator = Path.cwd() / '.alita' / 'agents' / 'test-validator.agent.md'
-            if default_validator.exists():
-                validator_path = str(default_validator)
+        # Load data generator agent (if applicable)
+        data_gen_def = load_data_generator_agent(data_generator, skip_data_generation)
 
-
-
-                validator_def = load_agent_definition(validator_path)
-                validator_agent_name = validator_def.get('name', Path(validator_path).stem)
-                console.print(f"Validator Agent: [bold]{validator_agent_name}[/bold]")
-                console.print(f"[dim]Using: {validator_path}[/dim]\n")
-            except Exception as e:
-                console.print(f"[yellow]⚠ Warning: Failed to load validator agent: {e}[/yellow]")
-                console.print(f"[yellow]Will use test runner agent for validation[/yellow]\n")
-                logger.debug(f"Validator load error: {e}", exc_info=True)
-        else:
-            console.print(f"[dim]No validator agent specified, using test runner agent for validation[/dim]\n")
+        # Load validator agent
+        validator_def, validator_agent_name, validator_path = load_validator_agent(validator)
 
         # Store bulk data generation chat history to pass to test executors
         bulk_gen_chat_history = []
 
-        # Parse all test cases upfront
+        # Parse all test cases upfront
         parsed_test_cases = []
-
-            try:
-                test_case = parse_test_case(str(test_file))
-                parsed_test_cases.append({
-                    'file': test_file,
-                    'data': test_case
-                })
-            except Exception as e:
-                console.print(f"[yellow]⚠ Warning: Failed to parse {test_file.name}: {e}[/yellow]")
-                logger.debug(f"Parse error for {test_file.name}: {e}", exc_info=True)
-
-        # Filter test cases that need data generation
-        test_cases_needing_data_gen = [
-            tc for tc in parsed_test_cases
-            if tc['data'].get('generate_test_data', True)
-        ]
+        test_cases_needing_data_gen = []
 
-        #
-
-
-
-
+        # Create master log for entire test execution session
+        results_path = Path(results_dir)
+        session_name = f"test-execution-{test_cases_path.name}"
+
+        # Use the callbacks module console so tool-call panels are printed and captured.
+        from .callbacks import console as callbacks_console
+        with TestLogCapture(results_path, session_name, console=callbacks_console) as master_log:
+            # Write header information to log
+            master_log.print(f"\n[bold cyan]🧪 Test Execution Started[/bold cyan]")
+            master_log.print(f"Agent: [bold]{agent_name}[/bold]")
+            master_log.print(f"Test Cases: {len(test_case_files_list)}")
+            if test_case_files:
+                master_log.print(f"Selected: [cyan]{', '.join(test_case_files)}[/cyan]")
+            master_log.print(f"Results Directory: {results_dir}\n")
 
-
+            if data_gen_def:
+                data_gen_name = data_gen_def.get('name', Path(data_generator).stem if data_generator else 'Data Generator')
+                master_log.print(f"Data Generator Agent: [bold]{data_gen_name}[/bold]\n")
 
-
-
-
-
-
-
-            # Use first test case's config or empty tuple
-            first_config_path = None
-            if parsed_test_cases:
-                first_tc = parsed_test_cases[0]
-                first_config_path = resolve_toolkit_config_path(
-                    first_tc['data'].get('config_path', ''),
-                    first_tc['file'],
-                    test_cases_path
-                )
-
-            data_gen_config_tuple = (first_config_path,) if first_config_path else ()
-            data_gen_executor, _, _, _, _, _, _ = _setup_local_agent_executor(
-                client, data_gen_def, data_gen_config_tuple, config,
-                model, temperature, max_tokens, bulk_memory, work_dir
-            )
-
-            if data_gen_executor:
-                with console.status("[yellow]Generating test data for all test cases...[/yellow]", spinner="dots"):
-                    bulk_gen_result = data_gen_executor.invoke({
-                        "input": bulk_data_gen_prompt,
-                        "chat_history": []
-                    })
-                bulk_gen_output = extract_output_from_result(bulk_gen_result)
-                console.print(f"[green]✓ Bulk test data generation completed[/green]")
-                console.print(f"[dim]{bulk_gen_output}...[/dim]\n")
-
-                # Store chat history from data generation to pass to test executors
-                bulk_gen_chat_history = [
-                    {"role": "user", "content": bulk_data_gen_prompt},
-                    {"role": "assistant", "content": bulk_gen_output}
-                ]
-            else:
-                console.print(f"[yellow]⚠ Warning: Data generator has no executor[/yellow]\n")
-        except Exception as e:
-            console.print(f"[yellow]⚠ Warning: Bulk data generation failed: {e}[/yellow]")
-            console.print("[yellow]Continuing with test execution...[/yellow]\n")
-            logger.debug(f"Bulk data generation error: {e}", exc_info=True)
-
-        # Execute test cases sequentially with executor caching
-        if not parsed_test_cases:
-            console.print("[yellow]No test cases to execute[/yellow]")
-            return
-
-        console.print(f"\n[bold yellow]📋 Executing test cases sequentially...[/bold yellow]\n")
-
-        # Show data generation context availability
-        if bulk_gen_chat_history:
-            console.print(f"[dim]✓ Data generation history available ({len(bulk_gen_chat_history)} messages) - shared with all test cases[/dim]\n")
-        else:
-            console.print(f"[dim]ℹ No data generation history (skipped or disabled)[/dim]\n")
-
-        # Executor cache: key = toolkit_config_path, value = (agent_executor, memory, mcp_session_manager)
-        executor_cache = {}
-
-        # Validation executor cache: separate isolated executors for validation
-        # key = toolkit_config_path, value = (agent_executor, memory, mcp_session_manager)
-        validation_executor_cache = {}
-
-        # Execute each test case sequentially
-        test_results = []
-        total_tests = len(parsed_test_cases)
-
-        for idx, tc_info in enumerate(parsed_test_cases, 1):
-            test_case = tc_info['data']
-            test_file = tc_info['file']
-            test_name = test_case['name']
+            if validator_def:
+                master_log.print(f"Validator Agent: [bold]{validator_agent_name}[/bold]")
+                master_log.print(f"[dim]Using: {validator_path}[/dim]\n")
+            else:
+                master_log.print(f"[dim]No validator agent specified, using test runner agent for validation[/dim]\n")
 
-            #
-
+            # Parse all test cases
+            parsed_test_cases = parse_all_test_cases(test_case_files_list, master_log)
+            test_cases_needing_data_gen = filter_test_cases_needing_data_gen(parsed_test_cases)
 
-
-
-
-
-
+            # Bulk test data generation (if enabled)
+            if data_gen_def and not skip_data_generation and test_cases_needing_data_gen:
+                bulk_gen_chat_history = execute_bulk_data_generation(
+                    data_gen_def, test_cases_needing_data_gen, parsed_test_cases,
+                    test_cases_path, client, config, model, temperature, max_tokens,
+                    work_dir, master_log, _setup_local_agent_executor,
+                    verbose=show_verbose,
+                    debug=debug_mode,
                 )
-
-
-
-
-
-
-
-
-
-            )
-
-            # Build execution prompt for single test case
-            execution_prompt = _build_single_test_execution_prompt(tc_info, idx)
-            console.print(f"[dim]Executing with {len(bulk_gen_chat_history)} history messages[/dim]")
-            console.print(f"[dim]Executing test case with the prompt {execution_prompt}[/dim]")
-
-            # Execute test case
-            execution_output = ""
-            if agent_executor:
-                with console.status(f"[yellow]Executing test case...[/yellow]", spinner="dots"):
-                    exec_result = agent_executor.invoke({
-                        "input": execution_prompt,
-                        "chat_history": bulk_gen_chat_history  # ONLY data gen history, no accumulation
-                    }, config={"configurable": {"thread_id": thread_id}})
-                    execution_output = extract_output_from_result(exec_result)
-
-                console.print(f"[green]✓ Test case executed[/green]")
-                console.print(f"[dim]{execution_output}[/dim]\n")
-
-                # Append execution to bulk gen chat history for validation
-                test_case_history_start = len(bulk_gen_chat_history)
-                bulk_gen_chat_history.extend([
-                    {"role": "user", "content": execution_prompt},
-                    {"role": "assistant", "content": execution_output}
-                ])
-
-                # No history accumulation - each test case is independent
-            else:
-                console.print(f"[red]✗ No agent executor available[/red]")
-                # Create fallback result for this test
-                test_results.append({
-                    'title': test_name,
-                    'passed': False,
-                    'file': test_file.name,
-                    'step_results': []
-                })
-                continue
-
-            # Validate test case using validation executor with accumulated history
-            validation_prompt = _build_single_test_validation_prompt(tc_info, idx, execution_output)
-
-            console.print(f"[bold yellow]🔍 Validating test case (with execution history)...[/bold yellow]")
-            console.print(f"[dim]{validation_prompt}[/dim]\n")
-
-            # Create or retrieve isolated validation executor
-            validation_cache_key = f"{cache_key}_validation"
-            validation_agent_def = validator_def if validator_def else agent_def
-
-            validation_executor, validation_memory, validation_mcp_session = _create_executor_from_cache(
-                validation_executor_cache, validation_cache_key, client, validation_agent_def,
-                toolkit_config_path, config, model, temperature, max_tokens, work_dir
-            )
-
-            if validation_cache_key not in validation_executor_cache:
-                console.print(f"[dim]Created new isolated validation executor[/dim]")
-            else:
-                console.print(f"[dim]Using cached validation executor[/dim]")
-
-            # For validation, use a separate thread with accumulated chat history (data gen + execution)
-            # This provides context to the validator about the test execution
-            validation_thread_id = f"validation_{idx}_{uuid.uuid4().hex[:8]}"
-
-            validation_output = ""
-            if validation_executor:
-                with console.status(f"[yellow]Validating test case...[/yellow]", spinner="dots"):
-                    validation_result = validation_executor.invoke({
-                        "input": validation_prompt,
-                        "chat_history": bulk_gen_chat_history  # Includes data gen and execution history
-                    }, {"configurable": {"thread_id": validation_thread_id}})
-
-                    validation_output = extract_output_from_result(validation_result)
-            else:
-                console.print(f"[red]✗ No validation executor available[/red]")
-                validation_output = "{}"
-
-            # No further history update - validation completes the cycle
-
-            # Parse validation JSON
-            try:
-                validation_json = _extract_json_from_text(validation_output)
-                step_results = validation_json.get('steps', [])
-
-                # Determine if test passed (all steps must pass)
-                test_passed = all(step.get('passed', False) for step in step_results) if step_results else False
-
-                if test_passed:
-                    console.print(f"[bold green]✅ Test PASSED: {test_name}[/bold green]")
-                else:
-                    console.print(f"[bold red]❌ Test FAILED: {test_name}[/bold red]")
-
-                # Display individual step results
-                for step_result in step_results:
-                    step_num = step_result.get('step_number')
-                    step_title = step_result.get('title', '')
-                    passed = step_result.get('passed', False)
-                    details = step_result.get('details', '')
-
-                    if passed:
-                        console.print(f"  [green]✓ Step {step_num}: {step_title}[/green]")
-                        console.print(f"    [dim]{details}[/dim]")
-                    else:
-                        console.print(f"  [red]✗ Step {step_num}: {step_title}[/red]")
-                        console.print(f"    [dim]{details}[/dim]")
-
-                console.print()
-
-                # Store result
-                test_results.append({
-                    'title': test_name,
-                    'passed': test_passed,
-                    'file': test_file.name,
-                    'step_results': step_results
-                })
-
-            except Exception as e:
-                logger.debug(f"Validation parsing failed for {test_name}: {e}", exc_info=True)
-                console.print(f"[yellow]⚠ Warning: Could not parse validation results for {test_name}[/yellow]")
-                console.print(f"[yellow]Error: {str(e)}[/yellow]")
-
-                # Enhanced diagnostic output
-                _print_validation_diagnostics(validation_output)
-
-                # Generate fallback result using helper function
-                console.print(f"\n[yellow]🔄 Generating fallback validation result...[/yellow]")
-                fallback_result = _create_fallback_result_for_test(
-                    test_case,
-                    test_file,
-                    f'Validation failed - could not parse validator output: {str(e)}'
-                )
-                console.print(f"[dim]Created {len(fallback_result['step_results'])} fallback step results[/dim]\n")
-
-                test_results.append(fallback_result)
-                console.print()
-
-            # After validation, remove the test case execution from history to prevent accumulation
-            # Remove the entries added for this test case
-            del bulk_gen_chat_history[test_case_history_start:]
-
-        except Exception as e:
-            logger.debug(f"Test execution failed for {test_name}: {e}", exc_info=True)
-            console.print(f"[red]✗ Test execution failed: {e}[/red]")
-
-            # Create fallback result using helper function
-            fallback_result = _create_fallback_result_for_test(
-                test_case,
-                test_file,
-                f'Test execution failed: {str(e)}'
-            )
-            test_results.append(fallback_result)
-            console.print()
-
-        # Cleanup: Close executor cache resources
-        _cleanup_executor_cache(executor_cache, "executor")
-        _cleanup_executor_cache(validation_executor_cache, "validation executor")
-
-        # Calculate totals
-        total_tests = len(test_results)
-        passed_tests = sum(1 for r in test_results if r['passed'])
-        failed_tests = total_tests - passed_tests
-
-        # Generate summary report
-        console.print(f"\n[bold]{'='*60}[/bold]")
-        console.print(f"[bold cyan]📊 Test Execution Summary[/bold cyan]")
-        console.print(f"[bold]{'='*60}[/bold]\n")
-
-        summary_table = Table(box=box.ROUNDED, border_style="cyan")
-        summary_table.add_column("Metric", style="bold")
-        summary_table.add_column("Value", justify="right")
-
-        summary_table.add_row("Total Tests", str(total_tests))
-        summary_table.add_row("Passed", f"[green]{passed_tests}[/green]")
-        summary_table.add_row("Failed", f"[red]{failed_tests}[/red]")
-
-        if total_tests > 0:
-            pass_rate = (passed_tests / total_tests) * 100
-            summary_table.add_row("Pass Rate", f"{pass_rate:.1f}%")
-
-        console.print(summary_table)
+
+            # Execute all test cases
+            test_results = execute_all_test_cases(
+                parsed_test_cases, bulk_gen_chat_history, test_cases_path,
+                agent_def, validator_def, client, config, model, temperature,
+                max_tokens, work_dir, master_log, _setup_local_agent_executor,
+                verbose=show_verbose,
+                debug=debug_mode,
+            )
 
-        #
-        overall_result = "pass" if failed_tests == 0 else "fail"
+        # End of master_log context - log file saved automatically
 
-
-
-                {
-                    "title": r['title'],
-                    "passed": r['passed'],
-                    "steps": r.get('step_results', [])
-                }
-                for r in test_results
-            ],
-            "overall_result": overall_result,
-            "summary": {
-                "total_tests": total_tests,
-                "passed": passed_tests,
-                "failed": failed_tests,
-                "pass_rate": f"{pass_rate:.1f}%" if total_tests > 0 else "0%"
-            },
-            "timestamp": datetime.now().isoformat()
-        }
+        # Print test execution summary
+        print_test_execution_summary(test_results, results_dir, session_name)
 
-        # Save structured report
-
-
-
+        # Save structured JSON report
+        log_file = None
+        toolkit_name = session_name.replace('test-execution-', '')
+        toolkit_dir = results_path / toolkit_name
+        log_files = sorted(toolkit_dir.glob(f"*{session_name}.txt")) if toolkit_dir.exists() else []
+        if log_files:
+            log_file = log_files[0]
 
-
-        with open(summary_file, 'w') as f:
-            json.dump(structured_report, f, indent=2)
-        console.print(f"[green]✓ Summary saved to {summary_file}[/green]\n")
+        save_structured_report(test_results, results_dir, log_file)
 
         # Exit with error code if any tests failed
+        failed_tests = sum(1 for r in test_results if not r['passed'])
         if failed_tests > 0:
             sys.exit(1)
 
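Both the removed inline logic and the new `testcases` helpers treat the validator agent's reply as free text containing one JSON object with per-step results. The sketch below is hedged: the field names (`steps`, `step_number`, `title`, `passed`, `details`) and the "all steps must pass" rule are taken from the removed code, the brace scan mirrors the removed `_extract_json_from_text`, and it is assumed the new `extract_json_from_text` helper behaves the same way; the reply text itself is made up.

```python
import json

# Example of the JSON object the validator agent is expected to emit,
# embedded in surrounding prose (values are illustrative).
validator_reply = """Validation finished.
{
  "test_number": 1,
  "test_name": "TC-001: Create Branch",
  "steps": [
    {"step_number": 1, "title": "Create the branch", "passed": true, "details": "Branch created"},
    {"step_number": 2, "title": "Verify branch", "passed": false, "details": "Branch not found"}
  ]
}"""

# The removed code extracted the first balanced {...} block from the reply
# and marked the test as passed only if every step passed.
start = validator_reply.find('{')
depth, end = 0, -1
for i, ch in enumerate(validator_reply[start:], start=start):
    depth += ch == '{'
    depth -= ch == '}'
    if ch == '}' and depth == 0:
        end = i + 1
        break
validation = json.loads(validator_reply[start:end])
steps = validation.get('steps', [])
test_passed = all(s.get('passed', False) for s in steps) if steps else False
print(test_passed)  # False - step 2 failed
```

The structured report that used to be assembled inline (and is now produced by `save_structured_report`) carries the same per-test entries plus an `overall_result`, a `summary` with totals and pass rate, and a timestamp.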