alita-sdk 0.3.603__py3-none-any.whl → 0.3.611__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of alita-sdk might be problematic.

Files changed (34)
  1. alita_sdk/cli/agents.py +108 -826
  2. alita_sdk/cli/testcases/__init__.py +94 -0
  3. alita_sdk/cli/testcases/data_generation.py +119 -0
  4. alita_sdk/cli/testcases/discovery.py +96 -0
  5. alita_sdk/cli/testcases/executor.py +84 -0
  6. alita_sdk/cli/testcases/logger.py +85 -0
  7. alita_sdk/cli/testcases/parser.py +172 -0
  8. alita_sdk/cli/testcases/prompts.py +91 -0
  9. alita_sdk/cli/testcases/reporting.py +125 -0
  10. alita_sdk/cli/testcases/setup.py +108 -0
  11. alita_sdk/cli/testcases/test_runner.py +282 -0
  12. alita_sdk/cli/testcases/utils.py +39 -0
  13. alita_sdk/cli/testcases/validation.py +90 -0
  14. alita_sdk/cli/testcases/workflow.py +196 -0
  15. alita_sdk/configurations/openapi.py +2 -2
  16. alita_sdk/runtime/clients/artifact.py +1 -1
  17. alita_sdk/runtime/langchain/langraph_agent.py +21 -6
  18. alita_sdk/runtime/tools/artifact.py +253 -8
  19. alita_sdk/runtime/tools/function.py +25 -6
  20. alita_sdk/runtime/tools/llm.py +12 -11
  21. alita_sdk/runtime/utils/serialization.py +155 -0
  22. alita_sdk/tools/bitbucket/api_wrapper.py +31 -30
  23. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
  24. alita_sdk/tools/confluence/api_wrapper.py +8 -1
  25. alita_sdk/tools/elitea_base.py +40 -36
  26. alita_sdk/tools/figma/api_wrapper.py +140 -83
  27. alita_sdk/tools/github/graphql_client_wrapper.py +1 -0
  28. alita_sdk/tools/utils/text_operations.py +156 -52
  29. {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.611.dist-info}/METADATA +1 -1
  30. {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.611.dist-info}/RECORD +34 -20
  31. {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.611.dist-info}/WHEEL +0 -0
  32. {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.611.dist-info}/entry_points.txt +0 -0
  33. {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.611.dist-info}/licenses/LICENSE +0 -0
  34. {alita_sdk-0.3.603.dist-info → alita_sdk-0.3.611.dist-info}/top_level.txt +0 -0
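Most of this release refactors alita_sdk/cli/agents.py: the test-case helpers previously defined inline there (test-case parsing, prompt building, executor caching, validation) move into the new alita_sdk.cli.testcases package, which agents.py now imports from. Below is a minimal sketch of how two of the relocated helpers could be called, based only on the signatures visible in the diff that follows; the test-case file path and directory layout are illustrative assumptions, not files shipped with the package.

```python
from pathlib import Path

# Names taken from the new `from .testcases import (...)` block in agents.py;
# the test-case path used here is a hypothetical example.
from alita_sdk.cli.testcases import parse_test_case, resolve_toolkit_config_path

test_file = Path("test_cases/TC-001-example.md")
test_case = parse_test_case(str(test_file))        # dict with name, objective, steps, expectations, ...
toolkit_config = resolve_toolkit_config_path(
    test_case.get("config_path", ""),              # config path declared in the test case, if any
    test_file,
    test_file.parent,                              # treat the file's directory as the test-cases dir
)
print(test_case["name"], toolkit_config)
```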
alita_sdk/cli/agents.py CHANGED
@@ -36,439 +36,44 @@ from .callbacks import create_cli_callback, CLICallbackHandler
36
36
  from .input_handler import get_input_handler, styled_input, styled_selection_input
37
37
  # Context management for chat history
38
38
  from .context import CLIContextManager, CLIMessage, purge_old_sessions as purge_context_sessions
39
+ # Test execution utilities
40
+ from .testcases import (
41
+ parse_test_case,
42
+ resolve_toolkit_config_path,
43
+ build_bulk_data_gen_prompt,
44
+ build_single_test_execution_prompt,
45
+ build_single_test_validation_prompt,
46
+ extract_json_from_text,
47
+ create_fallback_result_for_test,
48
+ print_validation_diagnostics,
49
+ TestLogCapture,
50
+ create_executor_from_cache,
51
+ cleanup_executor_cache,
52
+ extract_toolkit_name,
53
+ # New helper functions
54
+ load_test_runner_agent,
55
+ load_data_generator_agent,
56
+ load_validator_agent,
57
+ discover_test_case_files,
58
+ validate_test_case_files,
59
+ print_test_execution_header,
60
+ execute_bulk_data_generation,
61
+ execute_single_test_case,
62
+ validate_single_test_case,
63
+ generate_summary_report,
64
+ save_structured_report,
65
+ print_test_execution_summary,
66
+ # Workflow orchestration
67
+ parse_all_test_cases,
68
+ filter_test_cases_needing_data_gen,
69
+ execute_all_test_cases,
70
+ )
39
71
 
40
72
  logger = logging.getLogger(__name__)
41
73
 
42
74
  # Create a rich console for beautiful output
43
75
  console = Console()
44
76
 
45
-
46
- def resolve_toolkit_config_path(config_path_str: str, test_file: Path, test_cases_dir: Path) -> Optional[str]:
47
- """
48
- Resolve toolkit configuration file path from test case.
49
-
50
- Tries multiple locations in order:
51
- 1. Absolute path
52
- 2. Relative to test case file directory
53
- 3. Relative to test cases directory
54
- 4. Relative to workspace root
55
-
56
- Args:
57
- config_path_str: Config path from test case
58
- test_file: Path to the test case file
59
- test_cases_dir: Path to test cases directory
60
-
61
- Returns:
62
- Absolute path to config file if found, None otherwise
63
- """
64
- if not config_path_str:
65
- return None
66
-
67
- # Normalize path separators
68
- config_path_str = config_path_str.replace('\\', '/')
69
-
70
- # Try absolute path first
71
- config_path = Path(config_path_str)
72
- if config_path.is_absolute() and config_path.exists():
73
- return str(config_path)
74
-
75
- # Try relative to test case file directory
76
- config_path = test_file.parent / config_path_str
77
- if config_path.exists():
78
- return str(config_path)
79
-
80
- # Try relative to test_cases_dir
81
- config_path = test_cases_dir / config_path_str
82
- if config_path.exists():
83
- return str(config_path)
84
-
85
- # Try relative to workspace root
86
- workspace_root = Path.cwd()
87
- config_path = workspace_root / config_path_str
88
- if config_path.exists():
89
- return str(config_path)
90
-
91
- return None
92
-
93
-
94
- def parse_test_case(test_case_path: str) -> Dict[str, Any]:
95
- """
96
- Parse a test case markdown file to extract configuration, steps, and expectations.
97
-
98
- Args:
99
- test_case_path: Path to the test case markdown file
100
-
101
- Returns:
102
- Dictionary containing:
103
- - name: Test case name
104
- - objective: Test objective
105
- - config_path: Path to toolkit config file
106
- - generate_test_data: Boolean flag indicating if test data generation is needed (default: True)
107
- - test_data_config: Dictionary of test data configuration from table
108
- - prerequisites: Pre-requisites section text
109
- - variables: List of variable placeholders found (e.g., {{TEST_PR_NUMBER}})
110
- - steps: List of test steps with their descriptions
111
- - expectations: List of expectations/assertions
112
- """
113
- path = Path(test_case_path)
114
- if not path.exists():
115
- raise FileNotFoundError(f"Test case not found: {test_case_path}")
116
-
117
- content = path.read_text(encoding='utf-8')
118
-
119
- # Extract test case name from the first heading
120
- name_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
121
- name = name_match.group(1) if name_match else path.stem
122
-
123
- # Extract objective
124
- objective_match = re.search(r'##\s+Objective\s*\n\n(.+?)(?=\n\n##|\Z)', content, re.DOTALL)
125
- objective = objective_match.group(1).strip() if objective_match else ""
126
-
127
- # Extract config path and generateTestData flag
128
- config_section_match = re.search(r'##\s+Config\s*\n\n(.+?)(?=\n\n##|\Z)', content, re.DOTALL)
129
- config_path = None
130
- generate_test_data = True # Default to True if not specified
131
-
132
- if config_section_match:
133
- config_section = config_section_match.group(1)
134
- # Extract path
135
- path_match = re.search(r'path:\s*(.+?)(?=\n|$)', config_section, re.MULTILINE)
136
- if path_match:
137
- config_path = path_match.group(1).strip()
138
-
139
- # Extract generateTestData flag
140
- gen_data_match = re.search(r'generateTestData\s*:\s*(true|false)', config_section, re.IGNORECASE)
141
- if gen_data_match:
142
- generate_test_data = gen_data_match.group(1).lower() == 'true'
143
-
144
- # Extract Test Data Configuration section as a raw fenced code block string
145
- # NOTE: We intentionally store the entire section as a single string rather than parsing
146
- # individual table rows. This preserves the original formatting for downstream tools
147
- # which may prefer the raw markdown block.
148
- test_data_config = None
149
- config_section_match = re.search(r'##\s+Test Data Configuration\s*\n(.+?)(?=\n##|\Z)', content, re.DOTALL)
150
- if config_section_match:
151
- config_section = config_section_match.group(1).strip()
152
- # Store as a fenced code block to make it clear this is a raw block of text
153
- test_data_config = f"\n{config_section}\n"
154
-
155
- # Extract Pre-requisites section
156
- prerequisites = ""
157
- prereq_match = re.search(r'##\s+Pre-requisites\s*\n\n(.+?)(?=\n\n##|\Z)', content, re.DOTALL)
158
- if prereq_match:
159
- prerequisites = prereq_match.group(1).strip()
160
-
161
- # Find all variable placeholders ({{VARIABLE_NAME}})
162
- variables = list(set(re.findall(r'\{\{([A-Z_]+)\}\}', content)))
163
-
164
- # Extract test steps and expectations
165
- steps = []
166
- expectations = []
167
-
168
- # Find all Step sections
169
- step_pattern = r'###\s+Step\s+(\d+):\s+(.+?)\n\n(.+?)(?=\n\n###|\n\n##|\Z)'
170
- for step_match in re.finditer(step_pattern, content, re.DOTALL):
171
- step_num = step_match.group(1)
172
- step_title = step_match.group(2).strip()
173
- step_content = step_match.group(3).strip()
174
-
175
- # Extract the actual instruction (first paragraph before "Expectation:")
176
- instruction_match = re.search(r'(.+?)(?=\n\n\*\*Expectation:\*\*|\Z)', step_content, re.DOTALL)
177
- instruction = instruction_match.group(1).strip() if instruction_match else step_content
178
-
179
- # Extract expectation if present
180
- expectation_match = re.search(r'\*\*Expectation:\*\*\s+(.+)', step_content, re.DOTALL)
181
- expectation = expectation_match.group(1).strip() if expectation_match else None
182
-
183
- steps.append({
184
- 'number': int(step_num),
185
- 'title': step_title,
186
- 'instruction': instruction,
187
- 'expectation': expectation
188
- })
189
-
190
- if expectation:
191
- expectations.append({
192
- 'step': int(step_num),
193
- 'description': expectation
194
- })
195
-
196
- return {
197
- 'name': name,
198
- 'objective': objective,
199
- 'config_path': config_path,
200
- 'generate_test_data': generate_test_data,
201
- 'test_data_config': test_data_config,
202
- 'prerequisites': prerequisites,
203
- 'variables': variables,
204
- 'steps': steps,
205
- 'expectations': expectations
206
- }
207
-
208
-
209
- def validate_test_output(output: str, expectation: str) -> tuple[bool, str]:
210
- """
211
- Validate test output against expectations.
212
-
213
- Args:
214
- output: The actual output from the agent
215
- expectation: The expected result description
216
-
217
- Returns:
218
- Tuple of (passed: bool, details: str)
219
- """
220
- # Simple keyword-based validation
221
- # Extract key phrases from expectation
222
-
223
- # Common patterns in expectations
224
- if "contains" in expectation.lower():
225
- # Extract what should be contained
226
- contains_match = re.search(r'contains.*?["`]([^"`]+)["`]', expectation, re.IGNORECASE)
227
- if contains_match:
228
- expected_text = contains_match.group(1)
229
- if expected_text in output:
230
- return True, f"Output contains expected text: '{expected_text}'"
231
- else:
232
- return False, f"Output does not contain expected text: '{expected_text}'"
233
-
234
- if "without errors" in expectation.lower() or "runs without errors" in expectation.lower():
235
- # Check for common error indicators
236
- error_indicators = ['error', 'exception', 'failed', 'traceback']
237
- has_error = any(indicator in output.lower() for indicator in error_indicators)
238
- if not has_error:
239
- return True, "Execution completed without errors"
240
- else:
241
- return False, "Execution encountered errors"
242
-
243
- # Default: assume pass if output is non-empty
244
- if output and len(output.strip()) > 0:
245
- return True, "Output generated successfully"
246
-
247
- return False, "No output generated"
248
-
249
-
250
- def _build_bulk_data_gen_prompt(parsed_test_cases: list) -> str:
251
- """Build consolidated requirements text for bulk test data generation."""
252
- requirements = []
253
- for idx, tc in enumerate(parsed_test_cases, 1):
254
- test_case = tc['data']
255
- test_file = tc['file']
256
- # Build parts for this test case (do not include separator lines here;
257
- # the entire block is wrapped with separators at the top-level)
258
- parts = [f"Test Case #{idx}: {test_case['name']}", f"File: {test_file.name}", ""]
259
-
260
- if test_case.get('test_data_config'):
261
- parts.append("Test Data Configuration:")
262
- td = test_case['test_data_config']
263
- raw_lines = str(td).splitlines()
264
- for line in raw_lines:
265
- parts.append(f"{line}")
266
-
267
- if test_case.get('prerequisites'):
268
- parts.append(f"\nPre-requisites:\n{test_case['prerequisites']}")
269
-
270
- requirements.append("\n".join(parts))
271
-
272
- # If no requirements were collected, return an empty string to avoid
273
- # producing a prompt with only separator lines.
274
- if not requirements:
275
- return ""
276
-
277
- # Use a visible divider between test cases so each entry is clearly separated
278
- divider = '-' * 40
279
- body = f"\n\n{divider}\n\n".join(requirements)
280
- return f"{('='*60)}\n\n{body}\n\n{('='*60)}"
281
-
282
-
283
- def _build_single_test_execution_prompt(test_case_info: dict, test_number: int) -> str:
284
- """Build execution prompt for a single test case."""
285
- test_case = test_case_info['data']
286
- test_file = test_case_info['file']
287
-
288
- parts = [
289
- f"\n{'='*80}",
290
- f"TEST CASE #{test_number}: {test_case['name']}",
291
- f"File: {test_file.name}",
292
- f"{'='*80}"
293
- ]
294
-
295
- if test_case['steps']:
296
- for step in test_case['steps']:
297
- parts.append(f"\nStep {step['number']}: {step['title']}")
298
- parts.append(step['instruction'])
299
- else:
300
- parts.append("\n(No steps defined)")
301
-
302
- return "\n".join(parts)
303
-
304
-
305
- def _build_single_test_validation_prompt(test_case_info: dict, test_number: int, execution_output: str) -> str:
306
- """Build validation prompt for a single test case."""
307
- test_case = test_case_info['data']
308
-
309
- parts = [
310
- f"\nTest Case #{test_number}: {test_case['name']}"
311
- ]
312
-
313
- if test_case['steps']:
314
- for step in test_case['steps']:
315
- parts.append(f" Step {step['number']}: {step['title']}")
316
- if step['expectation']:
317
- parts.append(f" Expected: {step['expectation']}")
318
-
319
- parts.append(f"\n\nActual Execution Results:\n{execution_output}\n")
320
-
321
- # Escape quotes in test name for valid JSON in prompt
322
- escaped_test_name = test_case['name'].replace('"', '\\"')
323
-
324
- parts.append(f"""\nBased on the execution results above, validate this test case.
325
- {{
326
- "test_number": {test_number},
327
- "test_name": "{escaped_test_name}"
328
- }}
329
- """)
330
-
331
- return "\n".join(parts)
332
-
333
-
334
- def _extract_json_from_text(text: str) -> dict:
335
- """Extract JSON object from text using brace counting."""
336
- start_idx = text.find('{')
337
- if start_idx == -1:
338
- raise ValueError("No JSON found in text")
339
-
340
- brace_count = 0
341
- end_idx = -1
342
- for i, char in enumerate(text[start_idx:], start=start_idx):
343
- if char == '{':
344
- brace_count += 1
345
- elif char == '}':
346
- brace_count -= 1
347
- if brace_count == 0:
348
- end_idx = i + 1
349
- break
350
-
351
- if end_idx == -1:
352
- raise ValueError("Could not find matching closing brace")
353
-
354
- return json.loads(text[start_idx:end_idx])
355
-
356
-
357
- def _create_fallback_result_for_test(test_case: dict, test_file: Path, reason: str = 'Validation failed') -> dict:
358
- """Create a fallback result for a single test case with detailed step information.
359
-
360
- Args:
361
- test_case: Parsed test case data
362
- test_file: Path to test case file
363
- reason: Reason for fallback
364
-
365
- Returns:
366
- Fallback test result dict with step details
367
- """
368
- fallback_steps = []
369
- for step_info in test_case.get('steps', []):
370
- fallback_steps.append({
371
- 'step_number': step_info['number'],
372
- 'title': step_info['title'],
373
- 'passed': False,
374
- 'details': reason
375
- })
376
-
377
- return {
378
- 'title': test_case['name'],
379
- 'passed': False,
380
- 'file': test_file.name,
381
- 'step_results': fallback_steps,
382
- 'validation_error': reason
383
- }
384
-
385
-
386
- def _cleanup_executor_cache(cache: Dict[str, tuple], cache_name: str = "executor") -> None:
387
- """Clean up executor cache resources.
388
-
389
- Args:
390
- cache: Dictionary of cached executors
391
- cache_name: Name of cache for logging
392
- """
393
- console.print(f"[dim]Cleaning up {cache_name} cache...[/dim]")
394
- for cache_key, cached_items in cache.items():
395
- try:
396
- # Extract memory from tuple (second element)
397
- memory = cached_items[1] if len(cached_items) > 1 else None
398
-
399
- # Close SQLite memory connection
400
- if memory and hasattr(memory, 'conn') and memory.conn:
401
- memory.conn.close()
402
- except Exception as e:
403
- logger.debug(f"Error cleaning up {cache_name} cache for {cache_key}: {e}")
404
-
405
-
406
- def _create_executor_from_cache(cache: Dict[str, tuple], cache_key: str,
407
- client, agent_def: Dict, toolkit_config_path: Optional[str],
408
- config, model: Optional[str], temperature: Optional[float],
409
- max_tokens: Optional[int], work_dir: Optional[str]) -> tuple:
410
- """Get or create executor from cache.
411
-
412
- Args:
413
- cache: Executor cache dictionary
414
- cache_key: Key for caching
415
- client: API client
416
- agent_def: Agent definition
417
- toolkit_config_path: Path to toolkit config
418
- config: CLI configuration
419
- model: Model override
420
- temperature: Temperature override
421
- max_tokens: Max tokens override
422
- work_dir: Working directory
423
-
424
- Returns:
425
- Tuple of (agent_executor, memory, mcp_session_manager)
426
- """
427
- if cache_key in cache:
428
- return cache[cache_key]
429
-
430
- # Create new executor
431
- from langgraph.checkpoint.sqlite import SqliteSaver
432
- import sqlite3
433
-
434
- memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
435
- toolkit_config_tuple = (toolkit_config_path,) if toolkit_config_path else ()
436
-
437
- agent_executor, mcp_session_manager, _, _, _, _, _ = _setup_local_agent_executor(
438
- client, agent_def, toolkit_config_tuple, config, model, temperature,
439
- max_tokens, memory, work_dir
440
- )
441
-
442
- # Cache the executor
443
- cached_tuple = (agent_executor, memory, mcp_session_manager)
444
- cache[cache_key] = cached_tuple
445
- return cached_tuple
446
-
447
-
448
- def _print_validation_diagnostics(validation_output: str) -> None:
449
- """Print diagnostic information for validation output.
450
-
451
- Args:
452
- validation_output: The validation output to diagnose
453
- """
454
- console.print(f"\n[bold red]🔍 Diagnostic Information:[/bold red]")
455
- console.print(f"[dim]Output length: {len(validation_output)} characters[/dim]")
456
-
457
- # Check for key JSON elements
458
- has_json = '{' in validation_output and '}' in validation_output
459
- has_fields = 'test_number' in validation_output and 'steps' in validation_output
460
-
461
- console.print(f"[dim]Has JSON structure: {has_json}[/dim]")
462
- console.print(f"[dim]Has required fields: {has_fields}[/dim]")
463
-
464
- # Show relevant excerpt
465
- if len(validation_output) > 400:
466
- console.print(f"\n[red]First 200 chars:[/red] [dim]{validation_output[:200]}[/dim]")
467
- console.print(f"[red]Last 200 chars:[/red] [dim]{validation_output[-200:]}[/dim]")
468
- else:
469
- console.print(f"\n[red]Full output:[/red] [dim]{validation_output}[/dim]")
470
-
471
-
472
77
  def _get_alita_system_prompt(config) -> str:
473
78
  """
474
79
  Get the Alita system prompt from user config or fallback to default.
@@ -3343,12 +2948,15 @@ def agent_run(ctx, agent_source: str, message: str, version: Optional[str],
3343
2948
  help='Path to test validator agent definition file (default: .alita/agents/test-validator.agent.md)')
3344
2949
  @click.option('--skip-data-generation', is_flag=True,
3345
2950
  help='Skip test data generation step')
2951
+ @click.option('--verbose', '-v', type=click.Choice(['quiet', 'default', 'debug']), default='default',
2952
+ help='Output verbosity level: quiet (final output only), default (tool calls + outputs), debug (all including LLM calls)')
3346
2953
  @click.pass_context
3347
2954
  def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir: str,
3348
2955
  test_case_files: tuple, model: Optional[str], temperature: Optional[float],
3349
2956
  max_tokens: Optional[int], work_dir: str,
3350
2957
  data_generator: str, validator: Optional[str],
3351
- skip_data_generation: bool):
2958
+ skip_data_generation: bool,
2959
+ verbose: str):
3352
2960
  """
3353
2961
  Execute test cases from a directory and save results.
3354
2962
 
@@ -3383,6 +2991,10 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
3383
2991
  config = ctx.obj['config']
3384
2992
  client = get_client(ctx)
3385
2993
 
2994
+ # Setup verbose level
2995
+ show_verbose = verbose != 'quiet'
2996
+ debug_mode = verbose == 'debug'
2997
+
3386
2998
  # Sanity-check committed defaults (should exist; fail early with a clear message if not)
3387
2999
  if results_dir and not Path(results_dir).exists():
3388
3000
  raise click.ClickException(
@@ -3391,428 +3003,98 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
3391
3003
  )
3392
3004
 
3393
3005
  try:
3394
- # Load agent definition
3395
- agent_source_path = Path(agent_source)
3396
- if not agent_source_path.exists():
3397
- default_path = Path('.alita') / 'agents' / 'test-runner.agent.md'
3398
- if agent_source_path == default_path:
3399
- raise click.ClickException(
3400
- f"Default agent definition not found: {agent_source}. "
3401
- f"Run this command from the repo root (so {default_path} resolves correctly) "
3402
- f"or pass --agent_source explicitly."
3403
- )
3404
- raise click.ClickException(f"Agent definition not found: {agent_source}")
3405
-
3406
- agent_def = load_agent_definition(agent_source)
3407
- agent_name = agent_def.get('name', Path(agent_source).stem)
3006
+ # Load test runner agent
3007
+ agent_def, agent_name = load_test_runner_agent(agent_source)
3408
3008
 
3409
- # Find all test case files (recursively search subdirectories)
3009
+ # Find and filter test case files
3410
3010
  test_cases_path = Path(test_cases_dir)
3011
+ test_case_files_list = discover_test_case_files(test_cases_dir, test_case_files)
3411
3012
 
3412
- # Filter test cases based on --test-case options
3413
- if test_case_files:
3414
- # User specified specific test case files
3415
- test_case_files_set = set(test_case_files)
3416
- all_test_cases = sorted(test_cases_path.rglob('TC-*.md'))
3417
- test_case_files_list = [
3418
- tc for tc in all_test_cases
3419
- if tc.name in test_case_files_set
3420
- ]
3421
-
3422
- # Check if all specified files were found
3423
- found_names = {tc.name for tc in test_case_files_list}
3424
- not_found = test_case_files_set - found_names
3425
- if not_found:
3426
- console.print(f"[yellow]⚠ Warning: Test case files not found: {', '.join(not_found)}[/yellow]")
3427
- else:
3428
- # Execute all test cases
3429
- test_case_files_list = sorted(test_cases_path.rglob('TC-*.md'))
3430
-
3431
- if not test_case_files_list:
3432
- if test_case_files:
3433
- console.print(f"[yellow]No matching test case files found in {test_cases_dir}[/yellow]")
3434
- else:
3435
- console.print(f"[yellow]No test case files found in {test_cases_dir}[/yellow]")
3013
+ # Validate that test cases were found
3014
+ if not validate_test_case_files(test_case_files_list, test_cases_dir, test_case_files):
3436
3015
  return
3437
3016
 
3438
- console.print(f"\n[bold cyan]🧪 Test Execution Started[/bold cyan]")
3439
- console.print(f"Agent: [bold]{agent_name}[/bold]")
3440
- console.print(f"Test Cases: {len(test_case_files_list)}")
3441
- if test_case_files:
3442
- console.print(f"Selected: [cyan]{', '.join(test_case_files)}[/cyan]")
3443
- console.print(f"Results Directory: {results_dir}\n")
3444
-
3445
- data_gen_def = None
3446
- if data_generator and not skip_data_generation:
3447
- try:
3448
- data_gen_def = load_agent_definition(data_generator)
3449
- data_gen_name = data_gen_def.get('name', Path(data_generator).stem)
3450
- console.print(f"Data Generator Agent: [bold]{data_gen_name}[/bold]\n")
3451
- except Exception as e:
3452
- console.print(f"[yellow]⚠ Warning: Failed to setup data generator: {e}[/yellow]")
3453
- console.print("[yellow]Continuing with test execution...[/yellow]\n")
3454
- logger.debug(f"Data generator setup error: {e}", exc_info=True)
3455
-
3456
- # Load validator agent definition
3457
- validator_def = None
3458
- validator_agent_name = "Default Validator"
3017
+ # Print execution header
3018
+ print_test_execution_header(agent_name, test_case_files_list, test_case_files, results_dir)
3459
3019
 
3460
- # Try to load validator from specified path or default location
3461
- validator_path = validator
3462
- if not validator_path:
3463
- # Default to .alita/agents/test-validator.agent.md
3464
- default_validator = Path.cwd() / '.alita' / 'agents' / 'test-validator.agent.md'
3465
- if default_validator.exists():
3466
- validator_path = str(default_validator)
3020
+ # Load data generator agent (if applicable)
3021
+ data_gen_def = load_data_generator_agent(data_generator, skip_data_generation)
3467
3022
 
3468
- if validator_path and Path(validator_path).exists():
3469
- try:
3470
- validator_def = load_agent_definition(validator_path)
3471
- validator_agent_name = validator_def.get('name', Path(validator_path).stem)
3472
- console.print(f"Validator Agent: [bold]{validator_agent_name}[/bold]")
3473
- console.print(f"[dim]Using: {validator_path}[/dim]\n")
3474
- except Exception as e:
3475
- console.print(f"[yellow]⚠ Warning: Failed to load validator agent: {e}[/yellow]")
3476
- console.print(f"[yellow]Will use test runner agent for validation[/yellow]\n")
3477
- logger.debug(f"Validator load error: {e}", exc_info=True)
3478
- else:
3479
- console.print(f"[dim]No validator agent specified, using test runner agent for validation[/dim]\n")
3023
+ # Load validator agent
3024
+ validator_def, validator_agent_name, validator_path = load_validator_agent(validator)
3480
3025
 
3481
3026
  # Store bulk data generation chat history to pass to test executors
3482
3027
  bulk_gen_chat_history = []
3483
3028
 
3484
- # Parse all test cases upfront for bulk data generation
3029
+ # Parse all test cases upfront
3485
3030
  parsed_test_cases = []
3486
- for test_file in test_case_files_list:
3487
- try:
3488
- test_case = parse_test_case(str(test_file))
3489
- parsed_test_cases.append({
3490
- 'file': test_file,
3491
- 'data': test_case
3492
- })
3493
- except Exception as e:
3494
- console.print(f"[yellow]⚠ Warning: Failed to parse {test_file.name}: {e}[/yellow]")
3495
- logger.debug(f"Parse error for {test_file.name}: {e}", exc_info=True)
3496
-
3497
- # Filter test cases that need data generation
3498
- test_cases_needing_data_gen = [
3499
- tc for tc in parsed_test_cases
3500
- if tc['data'].get('generate_test_data', True)
3501
- ]
3031
+ test_cases_needing_data_gen = []
3502
3032
 
3503
- # Bulk test data generation (if enabled)
3504
- if data_gen_def and not skip_data_generation and test_cases_needing_data_gen:
3505
- console.print(f"\n[bold yellow]🔧 Bulk Test Data Generation[/bold yellow]")
3506
- console.print(f"Generating test data for {len(test_cases_needing_data_gen)} test cases...\n")
3507
- console.print(f"[dim]Skipping {len(parsed_test_cases) - len(test_cases_needing_data_gen)} test cases with generateTestData: false[/dim]\n")
3033
+ # Create master log for entire test execution session
3034
+ results_path = Path(results_dir)
3035
+ session_name = f"test-execution-{test_cases_path.name}"
3036
+
3037
+ # Use the callbacks module console so tool-call panels are printed and captured.
3038
+ from .callbacks import console as callbacks_console
3039
+ with TestLogCapture(results_path, session_name, console=callbacks_console) as master_log:
3040
+ # Write header information to log
3041
+ master_log.print(f"\n[bold cyan]🧪 Test Execution Started[/bold cyan]")
3042
+ master_log.print(f"Agent: [bold]{agent_name}[/bold]")
3043
+ master_log.print(f"Test Cases: {len(test_case_files_list)}")
3044
+ if test_case_files:
3045
+ master_log.print(f"Selected: [cyan]{', '.join(test_case_files)}[/cyan]")
3046
+ master_log.print(f"Results Directory: {results_dir}\n")
3508
3047
 
3509
- bulk_data_gen_prompt = _build_bulk_data_gen_prompt(test_cases_needing_data_gen)
3048
+ if data_gen_def:
3049
+ data_gen_name = data_gen_def.get('name', Path(data_generator).stem if data_generator else 'Data Generator')
3050
+ master_log.print(f"Data Generator Agent: [bold]{data_gen_name}[/bold]\n")
3510
3051
 
3511
- console.print(f"Executing test data generation prompt \n{bulk_data_gen_prompt}\n")
3512
-
3513
- try:
3514
- # Setup data generator agent
3515
- bulk_memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
3516
-
3517
- # Use first test case's config or empty tuple
3518
- first_config_path = None
3519
- if parsed_test_cases:
3520
- first_tc = parsed_test_cases[0]
3521
- first_config_path = resolve_toolkit_config_path(
3522
- first_tc['data'].get('config_path', ''),
3523
- first_tc['file'],
3524
- test_cases_path
3525
- )
3526
-
3527
- data_gen_config_tuple = (first_config_path,) if first_config_path else ()
3528
- data_gen_executor, _, _, _, _, _, _ = _setup_local_agent_executor(
3529
- client, data_gen_def, data_gen_config_tuple, config,
3530
- model, temperature, max_tokens, bulk_memory, work_dir
3531
- )
3532
-
3533
- if data_gen_executor:
3534
- with console.status("[yellow]Generating test data for all test cases...[/yellow]", spinner="dots"):
3535
- bulk_gen_result = data_gen_executor.invoke({
3536
- "input": bulk_data_gen_prompt,
3537
- "chat_history": []
3538
- })
3539
- bulk_gen_output = extract_output_from_result(bulk_gen_result)
3540
- console.print(f"[green]✓ Bulk test data generation completed[/green]")
3541
- console.print(f"[dim]{bulk_gen_output}...[/dim]\n")
3542
-
3543
- # Store chat history from data generation to pass to test executors
3544
- bulk_gen_chat_history = [
3545
- {"role": "user", "content": bulk_data_gen_prompt},
3546
- {"role": "assistant", "content": bulk_gen_output}
3547
- ]
3548
- else:
3549
- console.print(f"[yellow]⚠ Warning: Data generator has no executor[/yellow]\n")
3550
- except Exception as e:
3551
- console.print(f"[yellow]⚠ Warning: Bulk data generation failed: {e}[/yellow]")
3552
- console.print("[yellow]Continuing with test execution...[/yellow]\n")
3553
- logger.debug(f"Bulk data generation error: {e}", exc_info=True)
3554
-
3555
- # Execute test cases sequentially with executor caching
3556
- if not parsed_test_cases:
3557
- console.print("[yellow]No test cases to execute[/yellow]")
3558
- return
3559
-
3560
- console.print(f"\n[bold yellow]📋 Executing test cases sequentially...[/bold yellow]\n")
3561
-
3562
- # Show data generation context availability
3563
- if bulk_gen_chat_history:
3564
- console.print(f"[dim]✓ Data generation history available ({len(bulk_gen_chat_history)} messages) - shared with all test cases[/dim]\n")
3565
- else:
3566
- console.print(f"[dim]ℹ No data generation history (skipped or disabled)[/dim]\n")
3567
-
3568
- # Executor cache: key = toolkit_config_path, value = (agent_executor, memory, mcp_session_manager)
3569
- executor_cache = {}
3570
-
3571
- # Validation executor cache: separate isolated executors for validation
3572
- # key = toolkit_config_path, value = (agent_executor, memory, mcp_session_manager)
3573
- validation_executor_cache = {}
3574
-
3575
- # Execute each test case sequentially
3576
- test_results = []
3577
- total_tests = len(parsed_test_cases)
3578
-
3579
- for idx, tc_info in enumerate(parsed_test_cases, 1):
3580
- test_case = tc_info['data']
3581
- test_file = tc_info['file']
3582
- test_name = test_case['name']
3052
+ if validator_def:
3053
+ master_log.print(f"Validator Agent: [bold]{validator_agent_name}[/bold]")
3054
+ master_log.print(f"[dim]Using: {validator_path}[/dim]\n")
3055
+ else:
3056
+ master_log.print(f"[dim]No validator agent specified, using test runner agent for validation[/dim]\n")
3583
3057
 
3584
- # Display progress
3585
- console.print(f"[bold cyan]Test Case {idx}/{total_tests} - {test_name}[/bold cyan]")
3058
+ # Parse all test cases
3059
+ parsed_test_cases = parse_all_test_cases(test_case_files_list, master_log)
3060
+ test_cases_needing_data_gen = filter_test_cases_needing_data_gen(parsed_test_cases)
3586
3061
 
3587
- try:
3588
- # Resolve toolkit config path for this test case
3589
- toolkit_config_path = resolve_toolkit_config_path(
3590
- test_case.get('config_path', ''),
3591
- test_file,
3592
- test_cases_path
3062
+ # Bulk test data generation (if enabled)
3063
+ if data_gen_def and not skip_data_generation and test_cases_needing_data_gen:
3064
+ bulk_gen_chat_history = execute_bulk_data_generation(
3065
+ data_gen_def, test_cases_needing_data_gen, parsed_test_cases,
3066
+ test_cases_path, client, config, model, temperature, max_tokens,
3067
+ work_dir, master_log, _setup_local_agent_executor,
3068
+ verbose=show_verbose,
3069
+ debug=debug_mode,
3593
3070
  )
3594
-
3595
- # Use cache key (None if no config)
3596
- cache_key = toolkit_config_path if toolkit_config_path else '__no_config__'
3597
- thread_id = f"test_case_{idx}_{uuid.uuid4().hex[:8]}"
3598
-
3599
- # Get or create executor from cache
3600
- agent_executor, memory, mcp_session_manager = _create_executor_from_cache(
3601
- executor_cache, cache_key, client, agent_def, toolkit_config_path,
3602
- config, model, temperature, max_tokens, work_dir
3603
- )
3604
-
3605
- # Build execution prompt for single test case
3606
- execution_prompt = _build_single_test_execution_prompt(tc_info, idx)
3607
- console.print(f"[dim]Executing with {len(bulk_gen_chat_history)} history messages[/dim]")
3608
- console.print(f"[dim]Executing test case with the prompt {execution_prompt}[/dim]")
3609
-
3610
- # Execute test case
3611
- execution_output = ""
3612
- if agent_executor:
3613
- with console.status(f"[yellow]Executing test case...[/yellow]", spinner="dots"):
3614
- exec_result = agent_executor.invoke({
3615
- "input": execution_prompt,
3616
- "chat_history": bulk_gen_chat_history # ONLY data gen history, no accumulation
3617
- }, config={"configurable": {"thread_id": thread_id}})
3618
- execution_output = extract_output_from_result(exec_result)
3619
-
3620
- console.print(f"[green]✓ Test case executed[/green]")
3621
- console.print(f"[dim]{execution_output}[/dim]\n")
3622
-
3623
- # Append execution to bulk gen chat history for validation
3624
- test_case_history_start = len(bulk_gen_chat_history)
3625
- bulk_gen_chat_history.extend([
3626
- {"role": "user", "content": execution_prompt},
3627
- {"role": "assistant", "content": execution_output}
3628
- ])
3629
-
3630
- # No history accumulation - each test case is independent
3631
- else:
3632
- console.print(f"[red]✗ No agent executor available[/red]")
3633
- # Create fallback result for this test
3634
- test_results.append({
3635
- 'title': test_name,
3636
- 'passed': False,
3637
- 'file': test_file.name,
3638
- 'step_results': []
3639
- })
3640
- continue
3641
-
3642
- # Validate test case using validation executor with accumulated history
3643
- validation_prompt = _build_single_test_validation_prompt(tc_info, idx, execution_output)
3644
-
3645
- console.print(f"[bold yellow]🔍 Validating test case (with execution history)...[/bold yellow]")
3646
- console.print(f"[dim]{validation_prompt}[/dim]\n")
3647
-
3648
- # Create or retrieve isolated validation executor
3649
- validation_cache_key = f"{cache_key}_validation"
3650
- validation_agent_def = validator_def if validator_def else agent_def
3651
-
3652
- validation_executor, validation_memory, validation_mcp_session = _create_executor_from_cache(
3653
- validation_executor_cache, validation_cache_key, client, validation_agent_def,
3654
- toolkit_config_path, config, model, temperature, max_tokens, work_dir
3655
- )
3656
-
3657
- if validation_cache_key not in validation_executor_cache:
3658
- console.print(f"[dim]Created new isolated validation executor[/dim]")
3659
- else:
3660
- console.print(f"[dim]Using cached validation executor[/dim]")
3661
-
3662
- # For validation, use a separate thread with accumulated chat history (data gen + execution)
3663
- # This provides context to the validator about the test execution
3664
- validation_thread_id = f"validation_{idx}_{uuid.uuid4().hex[:8]}"
3665
-
3666
- validation_output = ""
3667
- if validation_executor:
3668
- with console.status(f"[yellow]Validating test case...[/yellow]", spinner="dots"):
3669
- validation_result = validation_executor.invoke({
3670
- "input": validation_prompt,
3671
- "chat_history": bulk_gen_chat_history # Includes data gen and execution history
3672
- }, {"configurable": {"thread_id": validation_thread_id}})
3673
-
3674
- validation_output = extract_output_from_result(validation_result)
3675
- else:
3676
- console.print(f"[red]✗ No validation executor available[/red]")
3677
- validation_output = "{}"
3678
-
3679
- # No further history update - validation completes the cycle
3680
-
3681
- # Parse validation JSON
3682
- try:
3683
- validation_json = _extract_json_from_text(validation_output)
3684
- step_results = validation_json.get('steps', [])
3685
-
3686
- # Determine if test passed (all steps must pass)
3687
- test_passed = all(step.get('passed', False) for step in step_results) if step_results else False
3688
-
3689
- if test_passed:
3690
- console.print(f"[bold green]✅ Test PASSED: {test_name}[/bold green]")
3691
- else:
3692
- console.print(f"[bold red]❌ Test FAILED: {test_name}[/bold red]")
3693
-
3694
- # Display individual step results
3695
- for step_result in step_results:
3696
- step_num = step_result.get('step_number')
3697
- step_title = step_result.get('title', '')
3698
- passed = step_result.get('passed', False)
3699
- details = step_result.get('details', '')
3700
-
3701
- if passed:
3702
- console.print(f" [green]✓ Step {step_num}: {step_title}[/green]")
3703
- console.print(f" [dim]{details}[/dim]")
3704
- else:
3705
- console.print(f" [red]✗ Step {step_num}: {step_title}[/red]")
3706
- console.print(f" [dim]{details}[/dim]")
3707
-
3708
- console.print()
3709
-
3710
- # Store result
3711
- test_results.append({
3712
- 'title': test_name,
3713
- 'passed': test_passed,
3714
- 'file': test_file.name,
3715
- 'step_results': step_results
3716
- })
3717
-
3718
- except Exception as e:
3719
- logger.debug(f"Validation parsing failed for {test_name}: {e}", exc_info=True)
3720
- console.print(f"[yellow]⚠ Warning: Could not parse validation results for {test_name}[/yellow]")
3721
- console.print(f"[yellow]Error: {str(e)}[/yellow]")
3722
-
3723
- # Enhanced diagnostic output
3724
- _print_validation_diagnostics(validation_output)
3725
-
3726
- # Generate fallback result using helper function
3727
- console.print(f"\n[yellow]🔄 Generating fallback validation result...[/yellow]")
3728
- fallback_result = _create_fallback_result_for_test(
3729
- test_case,
3730
- test_file,
3731
- f'Validation failed - could not parse validator output: {str(e)}'
3732
- )
3733
- console.print(f"[dim]Created {len(fallback_result['step_results'])} fallback step results[/dim]\n")
3734
-
3735
- test_results.append(fallback_result)
3736
- console.print()
3737
-
3738
- # After validation, remove the test case execution from history to prevent accumulation
3739
- # Remove the entries added for this test case
3740
- del bulk_gen_chat_history[test_case_history_start:]
3741
-
3742
- except Exception as e:
3743
- logger.debug(f"Test execution failed for {test_name}: {e}", exc_info=True)
3744
- console.print(f"[red]✗ Test execution failed: {e}[/red]")
3745
-
3746
- # Create fallback result using helper function
3747
- fallback_result = _create_fallback_result_for_test(
3748
- test_case,
3749
- test_file,
3750
- f'Test execution failed: {str(e)}'
3751
- )
3752
- test_results.append(fallback_result)
3753
- console.print()
3754
-
3755
- # Cleanup: Close executor cache resources
3756
- _cleanup_executor_cache(executor_cache, "executor")
3757
- _cleanup_executor_cache(validation_executor_cache, "validation executor")
3758
-
3759
- # Calculate totals
3760
- total_tests = len(test_results)
3761
- passed_tests = sum(1 for r in test_results if r['passed'])
3762
- failed_tests = total_tests - passed_tests
3763
-
3764
- # Generate summary report
3765
- console.print(f"\n[bold]{'='*60}[/bold]")
3766
- console.print(f"[bold cyan]📊 Test Execution Summary[/bold cyan]")
3767
- console.print(f"[bold]{'='*60}[/bold]\n")
3768
-
3769
- summary_table = Table(box=box.ROUNDED, border_style="cyan")
3770
- summary_table.add_column("Metric", style="bold")
3771
- summary_table.add_column("Value", justify="right")
3772
-
3773
- summary_table.add_row("Total Tests", str(total_tests))
3774
- summary_table.add_row("Passed", f"[green]{passed_tests}[/green]")
3775
- summary_table.add_row("Failed", f"[red]{failed_tests}[/red]")
3776
-
3777
- if total_tests > 0:
3778
- pass_rate = (passed_tests / total_tests) * 100
3779
- summary_table.add_row("Pass Rate", f"{pass_rate:.1f}%")
3780
-
3781
- console.print(summary_table)
3071
+
3072
+ # Execute all test cases
3073
+ test_results = execute_all_test_cases(
3074
+ parsed_test_cases, bulk_gen_chat_history, test_cases_path,
3075
+ agent_def, validator_def, client, config, model, temperature,
3076
+ max_tokens, work_dir, master_log, _setup_local_agent_executor,
3077
+ verbose=show_verbose,
3078
+ debug=debug_mode,
3079
+ )
3782
3080
 
3783
- # Generate structured JSON report
3784
- overall_result = "pass" if failed_tests == 0 else "fail"
3081
+ # End of master_log context - log file saved automatically
3785
3082
 
3786
- structured_report = {
3787
- "test_cases": [
3788
- {
3789
- "title": r['title'],
3790
- "passed": r['passed'],
3791
- "steps": r.get('step_results', [])
3792
- }
3793
- for r in test_results
3794
- ],
3795
- "overall_result": overall_result,
3796
- "summary": {
3797
- "total_tests": total_tests,
3798
- "passed": passed_tests,
3799
- "failed": failed_tests,
3800
- "pass_rate": f"{pass_rate:.1f}%" if total_tests > 0 else "0%"
3801
- },
3802
- "timestamp": datetime.now().isoformat()
3803
- }
3083
+ # Print test execution summary
3084
+ print_test_execution_summary(test_results, results_dir, session_name)
3804
3085
 
3805
- # Save structured report
3806
- results_path = Path(results_dir)
3807
- results_path.mkdir(parents=True, exist_ok=True)
3808
- summary_file = results_path / "test_execution_summary.json"
3086
+ # Save structured JSON report
3087
+ log_file = None
3088
+ toolkit_name = session_name.replace('test-execution-', '')
3089
+ toolkit_dir = results_path / toolkit_name
3090
+ log_files = sorted(toolkit_dir.glob(f"*{session_name}.txt")) if toolkit_dir.exists() else []
3091
+ if log_files:
3092
+ log_file = log_files[0]
3809
3093
 
3810
- console.print(f"\n[bold yellow]💾 Saving test execution summary...[/bold yellow]")
3811
- with open(summary_file, 'w') as f:
3812
- json.dump(structured_report, f, indent=2)
3813
- console.print(f"[green]✓ Summary saved to {summary_file}[/green]\n")
3094
+ save_structured_report(test_results, results_dir, log_file)
3814
3095
 
3815
3096
  # Exit with error code if any tests failed
3097
+ failed_tests = sum(1 for r in test_results if not r['passed'])
3816
3098
  if failed_tests > 0:
3817
3099
  sys.exit(1)
3818
3100