alita-sdk 0.3.562__py3-none-any.whl → 0.3.584__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.

Files changed (74)
  1. alita_sdk/cli/agents.py +358 -165
  2. alita_sdk/configurations/openapi.py +227 -15
  3. alita_sdk/runtime/langchain/langraph_agent.py +93 -20
  4. alita_sdk/runtime/langchain/utils.py +30 -14
  5. alita_sdk/runtime/toolkits/artifact.py +2 -1
  6. alita_sdk/runtime/toolkits/mcp.py +4 -2
  7. alita_sdk/runtime/toolkits/skill_router.py +1 -1
  8. alita_sdk/runtime/toolkits/vectorstore.py +1 -1
  9. alita_sdk/runtime/tools/data_analysis.py +1 -1
  10. alita_sdk/runtime/tools/llm.py +30 -11
  11. alita_sdk/runtime/utils/constants.py +5 -1
  12. alita_sdk/tools/ado/repos/__init__.py +2 -1
  13. alita_sdk/tools/ado/test_plan/__init__.py +2 -1
  14. alita_sdk/tools/ado/wiki/__init__.py +2 -1
  15. alita_sdk/tools/ado/work_item/__init__.py +2 -1
  16. alita_sdk/tools/advanced_jira_mining/__init__.py +2 -1
  17. alita_sdk/tools/aws/delta_lake/__init__.py +2 -1
  18. alita_sdk/tools/azure_ai/search/__init__.py +2 -1
  19. alita_sdk/tools/bitbucket/__init__.py +2 -1
  20. alita_sdk/tools/browser/__init__.py +1 -1
  21. alita_sdk/tools/carrier/__init__.py +1 -1
  22. alita_sdk/tools/cloud/aws/__init__.py +2 -1
  23. alita_sdk/tools/cloud/azure/__init__.py +2 -1
  24. alita_sdk/tools/cloud/gcp/__init__.py +2 -1
  25. alita_sdk/tools/cloud/k8s/__init__.py +2 -1
  26. alita_sdk/tools/code/linter/__init__.py +2 -1
  27. alita_sdk/tools/code/sonar/__init__.py +2 -1
  28. alita_sdk/tools/confluence/__init__.py +2 -1
  29. alita_sdk/tools/custom_open_api/__init__.py +2 -1
  30. alita_sdk/tools/elastic/__init__.py +2 -1
  31. alita_sdk/tools/figma/__init__.py +51 -5
  32. alita_sdk/tools/figma/api_wrapper.py +1157 -123
  33. alita_sdk/tools/figma/figma_client.py +73 -0
  34. alita_sdk/tools/figma/toon_tools.py +2748 -0
  35. alita_sdk/tools/github/__init__.py +2 -1
  36. alita_sdk/tools/gitlab/__init__.py +2 -1
  37. alita_sdk/tools/gitlab/api_wrapper.py +32 -0
  38. alita_sdk/tools/gitlab_org/__init__.py +2 -1
  39. alita_sdk/tools/google/bigquery/__init__.py +2 -1
  40. alita_sdk/tools/google_places/__init__.py +2 -1
  41. alita_sdk/tools/jira/__init__.py +2 -1
  42. alita_sdk/tools/keycloak/__init__.py +2 -1
  43. alita_sdk/tools/localgit/__init__.py +2 -1
  44. alita_sdk/tools/memory/__init__.py +1 -1
  45. alita_sdk/tools/ocr/__init__.py +2 -1
  46. alita_sdk/tools/openapi/__init__.py +227 -15
  47. alita_sdk/tools/openapi/api_wrapper.py +1276 -802
  48. alita_sdk/tools/pandas/__init__.py +3 -2
  49. alita_sdk/tools/postman/__init__.py +2 -1
  50. alita_sdk/tools/pptx/__init__.py +2 -1
  51. alita_sdk/tools/qtest/__init__.py +2 -1
  52. alita_sdk/tools/rally/__init__.py +2 -1
  53. alita_sdk/tools/report_portal/__init__.py +2 -1
  54. alita_sdk/tools/salesforce/__init__.py +2 -1
  55. alita_sdk/tools/servicenow/__init__.py +2 -1
  56. alita_sdk/tools/sharepoint/__init__.py +2 -1
  57. alita_sdk/tools/slack/__init__.py +3 -2
  58. alita_sdk/tools/sql/__init__.py +2 -1
  59. alita_sdk/tools/testio/__init__.py +2 -1
  60. alita_sdk/tools/testrail/__init__.py +2 -1
  61. alita_sdk/tools/utils/content_parser.py +68 -2
  62. alita_sdk/tools/xray/__init__.py +2 -1
  63. alita_sdk/tools/yagmail/__init__.py +2 -1
  64. alita_sdk/tools/zephyr/__init__.py +2 -1
  65. alita_sdk/tools/zephyr_enterprise/__init__.py +2 -1
  66. alita_sdk/tools/zephyr_essential/__init__.py +2 -1
  67. alita_sdk/tools/zephyr_scale/__init__.py +2 -1
  68. alita_sdk/tools/zephyr_squad/__init__.py +2 -1
  69. {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/METADATA +1 -1
  70. {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/RECORD +74 -72
  71. {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/WHEEL +0 -0
  72. {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/entry_points.txt +0 -0
  73. {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/licenses/LICENSE +0 -0
  74. {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/top_level.txt +0 -0
alita_sdk/cli/agents.py CHANGED
@@ -275,63 +275,62 @@ def _build_bulk_data_gen_prompt(parsed_test_cases: list) -> str:
 {'='*60}"""
 
 
-def _build_bulk_execution_prompt(parsed_test_cases: list) -> str:
-    """Build consolidated prompt for bulk test execution."""
-    parts = []
-
-    for idx, tc_info in enumerate(parsed_test_cases, 1):
-        test_case = tc_info['data']
-        test_file = tc_info['file']
-
-        parts.append(f"\n{'='*80}\nTEST CASE #{idx}: {test_case['name']}\nFile: {test_file.name}\n{'='*80}")
-
-        if test_case['steps']:
-            for step in test_case['steps']:
-                parts.append(f"\nStep {step['number']}: {step['title']}\n{step['instruction']}")
-                if step['expectation']:
-                    parts.append(f"Expected Result: {step['expectation']}")
-        else:
-            parts.append("\n(No steps defined)")
+def _build_single_test_execution_prompt(test_case_info: dict, test_number: int) -> str:
+    """Build execution prompt for a single test case."""
+    test_case = test_case_info['data']
+    test_file = test_case_info['file']
+
+    parts = [
+        f"\n{'='*80}",
+        f"TEST CASE #{test_number}: {test_case['name']}",
+        f"File: {test_file.name}",
+        f"{'='*80}",
+        "\nList all the tools you have in your environment. Execute the following steps in sequential order and report results:"
+    ]
+
+    if test_case['steps']:
+        for step in test_case['steps']:
+            parts.append(f"\nStep {step['number']}: {step['title']}")
+            parts.append(step['instruction'])
+    else:
+        parts.append("\n(No steps defined)")
 
     return "\n".join(parts)
 
 
-def _build_validation_prompt(parsed_test_cases: list, execution_output: str) -> str:
-    """Build prompt for bulk validation of test results."""
-    parts = ["You are a test validator. Review the test execution results and validate each test case.\n\nTest Cases to Validate:\n"]
+def _build_single_test_validation_prompt(test_case_info: dict, test_number: int, execution_output: str) -> str:
+    """Build validation prompt for a single test case."""
+    test_case = test_case_info['data']
+
+    parts = [
+        "Review the test execution results and validate this test case and provide the output in JSON format.\n",
+        f"\nTest Case #{test_number}: {test_case['name']}"
+    ]
 
-    for idx, tc_info in enumerate(parsed_test_cases, 1):
-        test_case = tc_info['data']
-        parts.append(f"\nTest Case #{idx}: {test_case['name']}")
-        if test_case['steps']:
-            for step in test_case['steps']:
-                parts.append(f"  Step {step['number']}: {step['title']}")
-                if step['expectation']:
-                    parts.append(f"    Expected: {step['expectation']}")
+    if test_case['steps']:
+        for step in test_case['steps']:
+            parts.append(f"  Step {step['number']}: {step['title']}")
+            if step['expectation']:
+                parts.append(f"    Expected: {step['expectation']}")
 
     parts.append(f"\n\nActual Execution Results:\n{execution_output}\n")
-    parts.append(f"""\nBased on the execution results above, validate each test case.
+
+    # Escape quotes in test name for valid JSON in prompt
+    escaped_test_name = test_case['name'].replace('"', '\\"')
+
+    parts.append(f"""\nBased on the execution results above, validate this test case.
 
-Respond with valid JSON in this EXACT format:
+Respond ONLY with valid JSON in this EXACT format (no additional text before or after):
 {{
-  "test_cases": [
-    {{
-      "test_number": 1,
-      "test_name": "<test case name>",
-      "steps": [
-        {{"step_number": 1, "title": "<step title>", "passed": true/false, "details": "<brief explanation>"}},
-        {{"step_number": 2, "title": "<step title>", "passed": true/false, "details": "<brief explanation>"}}
-      ]
-    }},
-    {{
-      "test_number": 2,
-      "test_name": "<test case name>",
-      "steps": [...]
-    }}
+  "test_number": {test_number},
+  "test_name": "{escaped_test_name}",
+  "steps": [
+    {{"step_number": 1, "title": "<step title>", "passed": true/false, "details": "<brief explanation>"}},
+    {{"step_number": 2, "title": "<step title>", "passed": true/false, "details": "<brief explanation>"}}
   ]
 }}
 
-Validate all {len(parsed_test_cases)} test cases and their steps.""")
+IMPORTANT: Return ONLY the JSON object. Do not include any explanatory text before or after the JSON.""")
 
     return "\n".join(parts)
 
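The rewritten `_build_single_test_validation_prompt` pins the validator to a per-test JSON schema and pre-escapes double quotes in the test name so it can be embedded in the JSON template literally. A minimal sketch of the escaping step and a conforming response (the sample values are illustrative, not taken from the package):

```python
import json

test_number = 1
test_name = 'Login "happy path" flow'  # illustrative name containing quotes

# Mirror the prompt builder: escape quotes so the name embeds safely in JSON
escaped_test_name = test_name.replace('"', '\\"')

# A response in the EXACT format the new prompt demands
response = f'''{{
  "test_number": {test_number},
  "test_name": "{escaped_test_name}",
  "steps": [
    {{"step_number": 1, "title": "Open login page", "passed": true, "details": "page loaded"}}
  ]
}}'''

parsed = json.loads(response)
assert parsed["test_name"] == test_name  # JSON unescaping restores the raw name
```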
@@ -359,17 +358,119 @@ def _extract_json_from_text(text: str) -> dict:
     return json.loads(text[start_idx:end_idx])
 
 
-def _create_fallback_results(parsed_test_cases: list) -> tuple[list, int, int, int]:
-    """Create fallback results when execution/validation fails."""
-    test_results = []
-    for tc_info in parsed_test_cases:
-        test_results.append({
-            'title': tc_info['data']['name'],
+def _create_fallback_result_for_test(test_case: dict, test_file: Path, reason: str = 'Validation failed') -> dict:
+    """Create a fallback result for a single test case with detailed step information.
+
+    Args:
+        test_case: Parsed test case data
+        test_file: Path to test case file
+        reason: Reason for fallback
+
+    Returns:
+        Fallback test result dict with step details
+    """
+    fallback_steps = []
+    for step_info in test_case.get('steps', []):
+        fallback_steps.append({
+            'step_number': step_info['number'],
+            'title': step_info['title'],
             'passed': False,
-            'file': tc_info['file'].name,
-            'step_results': []
+            'details': reason
         })
-    return test_results, len(parsed_test_cases), 0, len(parsed_test_cases)
+
+    return {
+        'title': test_case['name'],
+        'passed': False,
+        'file': test_file.name,
+        'step_results': fallback_steps,
+        'validation_error': reason
+    }
+
+
+def _cleanup_executor_cache(cache: Dict[str, tuple], cache_name: str = "executor") -> None:
+    """Clean up executor cache resources.
+
+    Args:
+        cache: Dictionary of cached executors
+        cache_name: Name of cache for logging
+    """
+    console.print(f"[dim]Cleaning up {cache_name} cache...[/dim]")
+    for cache_key, cached_items in cache.items():
+        try:
+            # Extract memory from tuple (second element)
+            memory = cached_items[1] if len(cached_items) > 1 else None
+
+            # Close SQLite memory connection
+            if memory and hasattr(memory, 'conn') and memory.conn:
+                memory.conn.close()
+        except Exception as e:
+            logger.debug(f"Error cleaning up {cache_name} cache for {cache_key}: {e}")
+
+
+def _create_executor_from_cache(cache: Dict[str, tuple], cache_key: str,
+                                client, agent_def: Dict, toolkit_config_path: Optional[str],
+                                config, model: Optional[str], temperature: Optional[float],
+                                max_tokens: Optional[int], work_dir: Optional[str]) -> tuple:
+    """Get or create executor from cache.
+
+    Args:
+        cache: Executor cache dictionary
+        cache_key: Key for caching
+        client: API client
+        agent_def: Agent definition
+        toolkit_config_path: Path to toolkit config
+        config: CLI configuration
+        model: Model override
+        temperature: Temperature override
+        max_tokens: Max tokens override
+        work_dir: Working directory
+
+    Returns:
+        Tuple of (agent_executor, memory, mcp_session_manager)
+    """
+    if cache_key in cache:
+        return cache[cache_key]
+
+    # Create new executor
+    from langgraph.checkpoint.sqlite import SqliteSaver
+    import sqlite3
+
+    memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
+    toolkit_config_tuple = (toolkit_config_path,) if toolkit_config_path else ()
+
+    agent_executor, mcp_session_manager, _, _, _, _, _ = _setup_local_agent_executor(
+        client, agent_def, toolkit_config_tuple, config, model, temperature,
+        max_tokens, memory, work_dir
+    )
+
+    # Cache the executor
+    cached_tuple = (agent_executor, memory, mcp_session_manager)
+    cache[cache_key] = cached_tuple
+    return cached_tuple
+
+
+def _print_validation_diagnostics(validation_output: str) -> None:
+    """Print diagnostic information for validation output.
+
+    Args:
+        validation_output: The validation output to diagnose
+    """
+    console.print(f"\n[bold red]🔍 Diagnostic Information:[/bold red]")
+    console.print(f"[dim]Output length: {len(validation_output)} characters[/dim]")
+
+    # Check for key JSON elements
+    has_json = '{' in validation_output and '}' in validation_output
+    has_fields = 'test_number' in validation_output and 'steps' in validation_output
+
+    console.print(f"[dim]Has JSON structure: {has_json}[/dim]")
+    console.print(f"[dim]Has required fields: {has_fields}[/dim]")
+
+    # Show relevant excerpt
+    if len(validation_output) > 400:
+        console.print(f"\n[red]First 200 chars:[/red] [dim]{validation_output[:200]}[/dim]")
+        console.print(f"[red]Last 200 chars:[/red] [dim]{validation_output[-200:]}[/dim]")
+    else:
+        console.print(f"\n[red]Full output:[/red] [dim]{validation_output}[/dim]")
 
 
 def _get_alita_system_prompt(config) -> str:
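The new `_create_executor_from_cache` and `_cleanup_executor_cache` helpers are a plain get-or-create dictionary cache keyed by toolkit config, plus an explicit close of each cached SQLite-backed memory after the run. A stripped-down sketch of the same pattern, with a hypothetical `build_executor` standing in for the SDK's `_setup_local_agent_executor`:

```python
import sqlite3
from typing import Dict, Optional, Tuple

def build_executor(toolkit_config_path: Optional[str]) -> Tuple[str, sqlite3.Connection, None]:
    # Hypothetical stand-in: the real helper wires up the agent executor,
    # a SqliteSaver over an in-memory connection, and an MCP session manager.
    memory = sqlite3.connect(":memory:", check_same_thread=False)
    return (f"executor[{toolkit_config_path}]", memory, None)

def get_or_create(cache: Dict[str, tuple], toolkit_config_path: Optional[str]) -> tuple:
    key = toolkit_config_path or '__no_config__'  # same sentinel the CLI uses
    if key not in cache:
        cache[key] = build_executor(toolkit_config_path)
    return cache[key]

def cleanup(cache: Dict[str, tuple]) -> None:
    for _, memory, _ in cache.values():
        memory.close()  # the SDK version closes memory.conn on the SqliteSaver

cache: Dict[str, tuple] = {}
first = get_or_create(cache, None)
second = get_or_create(cache, None)
assert first is second  # cache hit: the executor is built once per config
cleanup(cache)
```

The payoff shows up in the rewritten `execute_test_cases` below: test cases sharing a toolkit config reuse one executor instead of rebuilding the agent, memory, and MCP session for every test.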
@@ -3226,13 +3327,16 @@ def agent_run(ctx, agent_source: str, message: str, version: Optional[str],
               help='Grant agent filesystem access to this directory')
 @click.option('--data-generator', type=click.Path(exists=True),
               help='Path to test data generator agent definition file')
+@click.option('--validator', type=click.Path(exists=True),
+              help='Path to test validator agent definition file (default: .alita/agents/test-validator.agent.md)')
 @click.option('--skip-data-generation', is_flag=True,
               help='Skip test data generation step')
 @click.pass_context
 def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir: str,
                        test_case_files: tuple, model: Optional[str], temperature: Optional[float],
                        max_tokens: Optional[int], work_dir: Optional[str],
-                       data_generator: Optional[str], skip_data_generation: bool):
+                       data_generator: Optional[str], validator: Optional[str],
+                       skip_data_generation: bool):
     """
     Execute test cases from a directory and save results.
 
@@ -3259,6 +3363,11 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
         alita execute-test-cases ./agent.json --test-cases-dir ./tests --results-dir ./results \
             --skip-data-generation --model gpt-4o
     """
+    # Import dependencies at function start
+    import sqlite3
+    import uuid
+    from langgraph.checkpoint.sqlite import SqliteSaver
+
     config = ctx.obj['config']
     client = get_client(ctx)
 
@@ -3317,11 +3426,30 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
        console.print("[yellow]Continuing with test execution...[/yellow]\n")
        logger.debug(f"Data generator setup error: {e}", exc_info=True)
 
-    # Track overall results
-    total_tests = 0
-    passed_tests = 0
-    failed_tests = 0
-    test_results = []  # Store structured results for final report
+    # Load validator agent definition
+    validator_def = None
+    validator_agent_name = "Default Validator"
+
+    # Try to load validator from specified path or default location
+    validator_path = validator
+    if not validator_path:
+        # Default to .alita/agents/test-validator.agent.md
+        default_validator = Path.cwd() / '.alita' / 'agents' / 'test-validator.agent.md'
+        if default_validator.exists():
+            validator_path = str(default_validator)
+
+    if validator_path and Path(validator_path).exists():
+        try:
+            validator_def = load_agent_definition(validator_path)
+            validator_agent_name = validator_def.get('name', Path(validator_path).stem)
+            console.print(f"Validator Agent: [bold]{validator_agent_name}[/bold]")
+            console.print(f"[dim]Using: {validator_path}[/dim]\n")
+        except Exception as e:
+            console.print(f"[yellow]⚠ Warning: Failed to load validator agent: {e}[/yellow]")
+            console.print(f"[yellow]Will use test runner agent for validation[/yellow]\n")
+            logger.debug(f"Validator load error: {e}", exc_info=True)
+    else:
+        console.print(f"[dim]No validator agent specified, using test runner agent for validation[/dim]\n")
 
     # Store bulk data generation chat history to pass to test executors
     bulk_gen_chat_history = []
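The `--validator` fallback above resolves in a fixed order: an explicit path wins, then the conventional `.alita/agents/test-validator.agent.md` under the current working directory, and finally the test runner agent itself. The same resolution, extracted into a standalone sketch (simplified: the CLI additionally re-checks that an explicit path exists before loading it):

```python
from pathlib import Path
from typing import Optional

def resolve_validator_path(validator: Optional[str]) -> Optional[str]:
    """Explicit --validator wins; otherwise probe the conventional default.

    Returns None when neither is available, in which case the CLI falls
    back to the test runner agent for validation.
    """
    if validator:
        return validator
    default = Path.cwd() / '.alita' / 'agents' / 'test-validator.agent.md'
    return str(default) if default.exists() else None

print(resolve_validator_path(None))  # None unless the default file exists locally
```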
@@ -3357,7 +3485,6 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
 
        try:
            # Setup data generator agent
-           from langgraph.checkpoint.sqlite import SqliteSaver
            bulk_memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
 
            # Use first test case's config or empty tuple
@@ -3398,138 +3525,204 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
            console.print("[yellow]Continuing with test execution...[/yellow]\n")
            logger.debug(f"Bulk data generation error: {e}", exc_info=True)
 
-    # Execute ALL test cases in one bulk operation
+    # Execute test cases sequentially with executor caching
    if not parsed_test_cases:
        console.print("[yellow]No test cases to execute[/yellow]")
        return
 
-    console.print(f"\n[bold yellow]📋 Executing ALL test cases in bulk...[/bold yellow]\n")
-
-    # Use first test case's config for agent setup
-    first_tc = parsed_test_cases[0]
-    first_test_file = first_tc['file']
-    toolkit_config_path = resolve_toolkit_config_path(
-        first_tc['data'].get('config_path', ''),
-        first_test_file,
-        test_cases_path
-    )
-    toolkit_config_tuple = (toolkit_config_path,) if toolkit_config_path else ()
-
-    # Create memory for bulk execution
-    from langgraph.checkpoint.sqlite import SqliteSaver
-    memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
+    console.print(f"\n[bold yellow]📋 Executing test cases sequentially...[/bold yellow]\n")
 
-    # Initialize chat history with bulk data generation context
-    chat_history = bulk_gen_chat_history.copy()
+    # Show data generation context availability
+    if bulk_gen_chat_history:
+        console.print(f"[dim]✓ Data generation history available ({len(bulk_gen_chat_history)} messages) - shared with all test cases[/dim]\n")
+    else:
+        console.print(f"[dim]ℹ No data generation history (skipped or disabled)[/dim]\n")
 
-    # Setup agent executor
-    agent_executor, _, _, _, _, _, _ = _setup_local_agent_executor(
-        client, agent_def, toolkit_config_tuple, config, model, temperature, max_tokens, memory, work_dir
-    )
+    # Executor cache: key = toolkit_config_path, value = (agent_executor, memory, mcp_session_manager)
+    executor_cache = {}
 
-    # Build bulk execution prompt
-    bulk_all_prompt = _build_bulk_execution_prompt(parsed_test_cases)
-
-    console.print(f"Executing the prompt: {bulk_all_prompt}\n")
+    # Validation executor cache: separate isolated executors for validation
+    # key = toolkit_config_path, value = (agent_executor, memory, mcp_session_manager)
+    validation_executor_cache = {}
 
-    # Execute all test cases in bulk
+    # Execute each test case sequentially
    test_results = []
-    all_execution_output = ""
+    total_tests = len(parsed_test_cases)
 
-    try:
-        if agent_executor:
-            with console.status(f"[yellow]Executing {len(parsed_test_cases)} test cases in bulk...[/yellow]", spinner="dots"):
-                bulk_result = agent_executor.invoke({
-                    "input": bulk_all_prompt,
-                    "chat_history": chat_history
+    for idx, tc_info in enumerate(parsed_test_cases, 1):
+        test_case = tc_info['data']
+        test_file = tc_info['file']
+        test_name = test_case['name']
+
+        # Display progress
+        console.print(f"[bold cyan]Test Case {idx}/{total_tests} - {test_name}[/bold cyan]")
+
+        try:
+            # Resolve toolkit config path for this test case
+            toolkit_config_path = resolve_toolkit_config_path(
+                test_case.get('config_path', ''),
+                test_file,
+                test_cases_path
+            )
+
+            # Use cache key (None if no config)
+            cache_key = toolkit_config_path if toolkit_config_path else '__no_config__'
+            thread_id = f"test_case_{idx}_{uuid.uuid4().hex[:8]}"
+
+            # Get or create executor from cache
+            agent_executor, memory, mcp_session_manager = _create_executor_from_cache(
+                executor_cache, cache_key, client, agent_def, toolkit_config_path,
+                config, model, temperature, max_tokens, work_dir
+            )
+
+            # Build execution prompt for single test case
+            execution_prompt = _build_single_test_execution_prompt(tc_info, idx)
+            console.print(f"[dim]Executing with {len(bulk_gen_chat_history)} history messages[/dim]")
+
+            # Execute test case
+            execution_output = ""
+            if agent_executor:
+                with console.status(f"[yellow]Executing test case...[/yellow]", spinner="dots"):
+                    exec_result = agent_executor.invoke({
+                        "input": execution_prompt,
+                        "chat_history": bulk_gen_chat_history  # ONLY data gen history, no accumulation
+                    }, config={"configurable": {"thread_id": thread_id}})
+                    execution_output = extract_output_from_result(exec_result)
+
+                console.print(f"[green]✓ Test case executed[/green]")
+                console.print(f"[dim]{execution_output}[/dim]\n")
+
+                # No history accumulation - each test case is independent
+            else:
+                console.print(f"[red]✗ No agent executor available[/red]")
+                # Create fallback result for this test
+                test_results.append({
+                    'title': test_name,
+                    'passed': False,
+                    'file': test_file.name,
+                    'step_results': []
                })
-            all_execution_output = extract_output_from_result(bulk_result)
+                continue
 
-            console.print(f"[green]✓ All test cases executed[/green]")
-            console.print(f"[dim]{all_execution_output}...[/dim]\n")
+            # Validate test case using ISOLATED validation executor
+            validation_prompt = _build_single_test_validation_prompt(tc_info, idx, execution_output)
 
-            # Update chat history
-            chat_history.append({"role": "user", "content": bulk_all_prompt})
-            chat_history.append({"role": "assistant", "content": all_execution_output})
+            console.print(f"[bold yellow]🔍 Validating test case (isolated context)...[/bold yellow]")
 
-            # Now validate ALL test cases in bulk
-            console.print(f"[bold yellow]✅ Validating all test cases...[/bold yellow]\n")
+            # Create or retrieve isolated validation executor
+            validation_cache_key = f"{cache_key}_validation"
+            validation_agent_def = validator_def if validator_def else agent_def
 
-            validation_prompt = _build_validation_prompt(parsed_test_cases, all_execution_output)
-
-            console.print(f"[dim]{validation_prompt}[/dim]\n")
+            validation_executor, validation_memory, validation_mcp_session = _create_executor_from_cache(
+                validation_executor_cache, validation_cache_key, client, validation_agent_def,
+                toolkit_config_path, config, model, temperature, max_tokens, work_dir
+            )
 
-            with console.status("[yellow]Validating all results...[/yellow]", spinner="dots"):
-                validation_result = agent_executor.invoke({
-                    "input": validation_prompt,
-                    "chat_history": chat_history
-                })
-
-            validation_output = extract_output_from_result(validation_result)
+            if validation_cache_key not in validation_executor_cache:
+                console.print(f"[dim]Created new isolated validation executor[/dim]")
+            else:
+                console.print(f"[dim]Using cached validation executor[/dim]")
+
+            # For validation, use a separate thread with NO chat history (isolated from data gen)
+            # This prevents the agent from using tools and encourages direct JSON output
+            validation_thread_id = f"validation_{idx}_{uuid.uuid4().hex[:8]}"
+
+            validation_output = ""
+            if validation_executor:
+                with console.status(f"[yellow]Validating test case...[/yellow]", spinner="dots"):
+                    validation_result = validation_executor.invoke({
+                        "input": validation_prompt,
+                        "chat_history": []  # ISOLATED: No data gen history for validation
+                    }, {"configurable": {"thread_id": validation_thread_id}})
+
+                validation_output = extract_output_from_result(validation_result)
+            else:
+                console.print(f"[red]✗ No validation executor available[/red]")
+                validation_output = "{}"
+
+            console.print(f"[bold cyan]Full LLM Validation Response:[/bold cyan]")
+            console.print(f"[dim]{validation_output}[/dim]\n")
 
-            console.print(f"[dim]Validation Response: {validation_output}...[/dim]\n")
+            # No history update - validation is isolated from test execution
 
            # Parse validation JSON
            try:
                validation_json = _extract_json_from_text(validation_output)
-                test_cases_results = validation_json.get('test_cases', [])
+                step_results = validation_json.get('steps', [])
 
-                # Process results for each test case
-                total_tests = 0
-                passed_tests = 0
-                failed_tests = 0
+                # Determine if test passed (all steps must pass)
+                test_passed = all(step.get('passed', False) for step in step_results) if step_results else False
 
-                for tc_result in test_cases_results:
-                    test_name = tc_result.get('test_name', f"Test #{tc_result.get('test_number', '?')}")
-                    step_results = tc_result.get('steps', [])
-
-                    # Determine if test passed (all steps must pass)
-                    test_passed = all(step.get('passed', False) for step in step_results) if step_results else False
+                if test_passed:
+                    console.print(f"[bold green]✅ Test PASSED: {test_name}[/bold green]")
+                else:
+                    console.print(f"[bold red]❌ Test FAILED: {test_name}[/bold red]")
+
+                # Display individual step results
+                for step_result in step_results:
+                    step_num = step_result.get('step_number')
+                    step_title = step_result.get('title', '')
+                    passed = step_result.get('passed', False)
+                    details = step_result.get('details', '')
 
-                    total_tests += 1
-                    if test_passed:
-                        passed_tests += 1
-                        console.print(f"[bold green]✅ Test PASSED: {test_name}[/bold green]")
+                    if passed:
+                        console.print(f"  [green]✓ Step {step_num}: {step_title}[/green]")
+                        console.print(f"    [dim]{details}[/dim]")
                    else:
-                        failed_tests += 1
-                        console.print(f"[bold red]❌ Test FAILED: {test_name}[/bold red]")
-
-                    # Display individual step results
-                    for step_result in step_results:
-                        step_num = step_result.get('step_number')
-                        step_title = step_result.get('title', '')
-                        passed = step_result.get('passed', False)
-                        details = step_result.get('details', '')
-
-                        if passed:
-                            console.print(f"  [green]✓ Step {step_num}: {step_title}[/green]")
-                            console.print(f"    [dim]{details}[/dim]")
-                        else:
-                            console.print(f"  [red]✗ Step {step_num}: {step_title}[/red]")
-                            console.print(f"    [dim]{details}[/dim]")
-
-                    console.print()
-
-                    # Store result
-                    test_results.append({
-                        'title': test_name,
-                        'passed': test_passed,
-                        'file': parsed_test_cases[tc_result.get('test_number', 1) - 1]['file'].name if tc_result.get('test_number', 1) - 1 < len(parsed_test_cases) else 'unknown',
-                        'step_results': step_results
-                    })
+                        console.print(f"  [red]✗ Step {step_num}: {step_title}[/red]")
+                        console.print(f"    [dim]{details}[/dim]")
+
+                console.print()
+
+                # Store result
+                test_results.append({
+                    'title': test_name,
+                    'passed': test_passed,
+                    'file': test_file.name,
+                    'step_results': step_results
+                })
 
            except Exception as e:
-                logger.debug(f"Validation parsing failed: {e}")
-                console.print(f"[yellow]⚠ Warning: Could not parse validation results: {e}[/yellow]\n")
-                test_results, total_tests, passed_tests, failed_tests = _create_fallback_results(parsed_test_cases)
-        else:
-            console.print(f"[red]✗ No agent executor available[/red]\n")
-            test_results, total_tests, passed_tests, failed_tests = _create_fallback_results(parsed_test_cases)
+                logger.debug(f"Validation parsing failed for {test_name}: {e}", exc_info=True)
+                console.print(f"[yellow]⚠ Warning: Could not parse validation results for {test_name}[/yellow]")
+                console.print(f"[yellow]Error: {str(e)}[/yellow]")
 
-    except Exception as e:
-        console.print(f"[red]✗ Bulk execution failed: {e}[/red]\n")
-        logger.debug(f"Bulk execution error: {e}", exc_info=True)
-        test_results, total_tests, passed_tests, failed_tests = _create_fallback_results(parsed_test_cases)
+                # Enhanced diagnostic output
+                _print_validation_diagnostics(validation_output)
+
+                # Generate fallback result using helper function
+                console.print(f"\n[yellow]🔄 Generating fallback validation result...[/yellow]")
+                fallback_result = _create_fallback_result_for_test(
+                    test_case,
+                    test_file,
+                    f'Validation failed - could not parse validator output: {str(e)}'
+                )
+                console.print(f"[dim]Created {len(fallback_result['step_results'])} fallback step results[/dim]\n")
+
+                test_results.append(fallback_result)
+                console.print()
+
+        except Exception as e:
+            logger.debug(f"Test execution failed for {test_name}: {e}", exc_info=True)
+            console.print(f"[red]✗ Test execution failed: {e}[/red]")
+
+            # Create fallback result using helper function
+            fallback_result = _create_fallback_result_for_test(
+                test_case,
+                test_file,
+                f'Test execution failed: {str(e)}'
+            )
+            test_results.append(fallback_result)
+            console.print()
+
+    # Cleanup: Close executor cache resources
+    _cleanup_executor_cache(executor_cache, "executor")
+    _cleanup_executor_cache(validation_executor_cache, "validation executor")
+
+    # Calculate totals
+    total_tests = len(test_results)
+    passed_tests = sum(1 for r in test_results if r['passed'])
+    failed_tests = total_tests - passed_tests
 
    # Generate summary report
    console.print(f"\n[bold]{'='*60}[/bold]")
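With per-test validation, the verdict is derived purely from the parsed step results: every step must report `passed: true`, and an empty step list counts as a failure. A small sketch of that rule, using an outermost-brace extraction in the spirit of `_extract_json_from_text` (the sample validator output is invented):

```python
import json

raw = """Sure! Here is the validation:
{
  "test_number": 2,
  "test_name": "Checkout flow",
  "steps": [
    {"step_number": 1, "title": "Add item", "passed": true, "details": "cart updated"},
    {"step_number": 2, "title": "Pay", "passed": false, "details": "gateway timeout"}
  ]
}"""

# Take the outermost {...} span, tolerating chatter around the JSON
start, end = raw.find('{'), raw.rfind('}') + 1
validation_json = json.loads(raw[start:end])

# Same rule as the CLI: all steps must pass, and no steps means failure
step_results = validation_json.get('steps', [])
test_passed = all(step.get('passed', False) for step in step_results) if step_results else False
print(test_passed)  # False: step 2 failed
```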