alita-sdk 0.3.554__py3-none-any.whl → 0.3.602__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of alita-sdk might be problematic.

Files changed (116)
  1. alita_sdk/cli/agent_executor.py +2 -1
  2. alita_sdk/cli/agent_loader.py +34 -4
  3. alita_sdk/cli/agents.py +433 -203
  4. alita_sdk/configurations/openapi.py +227 -15
  5. alita_sdk/runtime/clients/client.py +4 -2
  6. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  7. alita_sdk/runtime/langchain/assistant.py +61 -11
  8. alita_sdk/runtime/langchain/constants.py +419 -171
  9. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -2
  10. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
  11. alita_sdk/runtime/langchain/langraph_agent.py +106 -21
  12. alita_sdk/runtime/langchain/utils.py +30 -14
  13. alita_sdk/runtime/toolkits/__init__.py +3 -0
  14. alita_sdk/runtime/toolkits/artifact.py +2 -1
  15. alita_sdk/runtime/toolkits/mcp.py +6 -3
  16. alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
  17. alita_sdk/runtime/toolkits/skill_router.py +2 -2
  18. alita_sdk/runtime/toolkits/tools.py +64 -2
  19. alita_sdk/runtime/toolkits/vectorstore.py +1 -1
  20. alita_sdk/runtime/tools/artifact.py +15 -0
  21. alita_sdk/runtime/tools/data_analysis.py +183 -0
  22. alita_sdk/runtime/tools/llm.py +30 -11
  23. alita_sdk/runtime/tools/mcp_server_tool.py +6 -3
  24. alita_sdk/runtime/tools/router.py +2 -4
  25. alita_sdk/runtime/tools/sandbox.py +9 -6
  26. alita_sdk/runtime/utils/constants.py +5 -1
  27. alita_sdk/runtime/utils/mcp_client.py +1 -1
  28. alita_sdk/runtime/utils/mcp_sse_client.py +1 -1
  29. alita_sdk/runtime/utils/toolkit_utils.py +2 -0
  30. alita_sdk/tools/__init__.py +3 -1
  31. alita_sdk/tools/ado/repos/__init__.py +26 -8
  32. alita_sdk/tools/ado/repos/repos_wrapper.py +78 -52
  33. alita_sdk/tools/ado/test_plan/__init__.py +3 -2
  34. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
  35. alita_sdk/tools/ado/utils.py +1 -18
  36. alita_sdk/tools/ado/wiki/__init__.py +2 -1
  37. alita_sdk/tools/ado/wiki/ado_wrapper.py +23 -1
  38. alita_sdk/tools/ado/work_item/__init__.py +3 -2
  39. alita_sdk/tools/ado/work_item/ado_wrapper.py +23 -1
  40. alita_sdk/tools/advanced_jira_mining/__init__.py +2 -1
  41. alita_sdk/tools/aws/delta_lake/__init__.py +2 -1
  42. alita_sdk/tools/azure_ai/search/__init__.py +2 -1
  43. alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
  44. alita_sdk/tools/base_indexer_toolkit.py +15 -6
  45. alita_sdk/tools/bitbucket/__init__.py +2 -1
  46. alita_sdk/tools/bitbucket/api_wrapper.py +1 -1
  47. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +3 -3
  48. alita_sdk/tools/browser/__init__.py +1 -1
  49. alita_sdk/tools/carrier/__init__.py +1 -1
  50. alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
  51. alita_sdk/tools/cloud/aws/__init__.py +2 -1
  52. alita_sdk/tools/cloud/azure/__init__.py +2 -1
  53. alita_sdk/tools/cloud/gcp/__init__.py +2 -1
  54. alita_sdk/tools/cloud/k8s/__init__.py +2 -1
  55. alita_sdk/tools/code/linter/__init__.py +2 -1
  56. alita_sdk/tools/code/sonar/__init__.py +2 -1
  57. alita_sdk/tools/code_indexer_toolkit.py +19 -2
  58. alita_sdk/tools/confluence/__init__.py +7 -6
  59. alita_sdk/tools/confluence/api_wrapper.py +2 -2
  60. alita_sdk/tools/custom_open_api/__init__.py +2 -1
  61. alita_sdk/tools/elastic/__init__.py +2 -1
  62. alita_sdk/tools/elitea_base.py +28 -9
  63. alita_sdk/tools/figma/__init__.py +52 -6
  64. alita_sdk/tools/figma/api_wrapper.py +1158 -123
  65. alita_sdk/tools/figma/figma_client.py +73 -0
  66. alita_sdk/tools/figma/toon_tools.py +2748 -0
  67. alita_sdk/tools/github/__init__.py +2 -1
  68. alita_sdk/tools/github/github_client.py +56 -92
  69. alita_sdk/tools/github/schemas.py +4 -4
  70. alita_sdk/tools/gitlab/__init__.py +2 -1
  71. alita_sdk/tools/gitlab/api_wrapper.py +118 -38
  72. alita_sdk/tools/gitlab_org/__init__.py +2 -1
  73. alita_sdk/tools/gitlab_org/api_wrapper.py +60 -62
  74. alita_sdk/tools/google/bigquery/__init__.py +2 -1
  75. alita_sdk/tools/google_places/__init__.py +2 -1
  76. alita_sdk/tools/jira/__init__.py +2 -1
  77. alita_sdk/tools/keycloak/__init__.py +2 -1
  78. alita_sdk/tools/localgit/__init__.py +2 -1
  79. alita_sdk/tools/memory/__init__.py +1 -1
  80. alita_sdk/tools/ocr/__init__.py +2 -1
  81. alita_sdk/tools/openapi/__init__.py +227 -15
  82. alita_sdk/tools/openapi/api_wrapper.py +1287 -802
  83. alita_sdk/tools/pandas/__init__.py +11 -5
  84. alita_sdk/tools/pandas/api_wrapper.py +38 -25
  85. alita_sdk/tools/postman/__init__.py +2 -1
  86. alita_sdk/tools/pptx/__init__.py +2 -1
  87. alita_sdk/tools/qtest/__init__.py +21 -2
  88. alita_sdk/tools/qtest/api_wrapper.py +430 -13
  89. alita_sdk/tools/rally/__init__.py +2 -1
  90. alita_sdk/tools/rally/api_wrapper.py +1 -1
  91. alita_sdk/tools/report_portal/__init__.py +2 -1
  92. alita_sdk/tools/salesforce/__init__.py +2 -1
  93. alita_sdk/tools/servicenow/__init__.py +2 -1
  94. alita_sdk/tools/sharepoint/__init__.py +2 -1
  95. alita_sdk/tools/sharepoint/api_wrapper.py +2 -2
  96. alita_sdk/tools/slack/__init__.py +3 -2
  97. alita_sdk/tools/slack/api_wrapper.py +2 -2
  98. alita_sdk/tools/sql/__init__.py +3 -2
  99. alita_sdk/tools/testio/__init__.py +2 -1
  100. alita_sdk/tools/testrail/__init__.py +2 -1
  101. alita_sdk/tools/utils/content_parser.py +77 -3
  102. alita_sdk/tools/utils/text_operations.py +163 -71
  103. alita_sdk/tools/xray/__init__.py +3 -2
  104. alita_sdk/tools/yagmail/__init__.py +2 -1
  105. alita_sdk/tools/zephyr/__init__.py +2 -1
  106. alita_sdk/tools/zephyr_enterprise/__init__.py +2 -1
  107. alita_sdk/tools/zephyr_essential/__init__.py +2 -1
  108. alita_sdk/tools/zephyr_scale/__init__.py +3 -2
  109. alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
  110. alita_sdk/tools/zephyr_squad/__init__.py +2 -1
  111. {alita_sdk-0.3.554.dist-info → alita_sdk-0.3.602.dist-info}/METADATA +7 -6
  112. {alita_sdk-0.3.554.dist-info → alita_sdk-0.3.602.dist-info}/RECORD +116 -111
  113. {alita_sdk-0.3.554.dist-info → alita_sdk-0.3.602.dist-info}/WHEEL +0 -0
  114. {alita_sdk-0.3.554.dist-info → alita_sdk-0.3.602.dist-info}/entry_points.txt +0 -0
  115. {alita_sdk-0.3.554.dist-info → alita_sdk-0.3.602.dist-info}/licenses/LICENSE +0 -0
  116. {alita_sdk-0.3.554.dist-info → alita_sdk-0.3.602.dist-info}/top_level.txt +0 -0
alita_sdk/cli/agents.py CHANGED
@@ -141,15 +141,16 @@ def parse_test_case(test_case_path: str) -> Dict[str, Any]:
  if gen_data_match:
  generate_test_data = gen_data_match.group(1).lower() == 'true'
 
- # Extract Test Data Configuration table
- test_data_config = {}
+ # Extract Test Data Configuration section as a raw fenced code block string
+ # NOTE: We intentionally store the entire section as a single string rather than parsing
+ # individual table rows. This preserves the original formatting for downstream tools
+ # which may prefer the raw markdown block.
+ test_data_config = None
  config_section_match = re.search(r'##\s+Test Data Configuration\s*\n(.+?)(?=\n##|\Z)', content, re.DOTALL)
  if config_section_match:
- config_section = config_section_match.group(1)
- # Parse markdown table (format: | Parameter | Value | Description |)
- table_rows = re.findall(r'\|\s*\*\*([^*]+)\*\*\s*\|\s*`?([^|`]+)`?\s*\|', config_section)
- for param, value in table_rows:
- test_data_config[param.strip()] = value.strip()
+ config_section = config_section_match.group(1).strip()
+ # Store as a fenced code block to make it clear this is a raw block of text
+ test_data_config = f"\n{config_section}\n"
 
  # Extract Pre-requisites section
  prerequisites = ""
@@ -252,86 +253,80 @@ def _build_bulk_data_gen_prompt(parsed_test_cases: list) -> str:
  for idx, tc in enumerate(parsed_test_cases, 1):
  test_case = tc['data']
  test_file = tc['file']
-
+ # Build parts for this test case (do not include separator lines here;
+ # the entire block is wrapped with separators at the top-level)
  parts = [f"Test Case #{idx}: {test_case['name']}", f"File: {test_file.name}", ""]
 
  if test_case.get('test_data_config'):
  parts.append("Test Data Configuration:")
- for param, value in test_case['test_data_config'].items():
- parts.append(f" - {param}: {value}")
-
+ td = test_case['test_data_config']
+ raw_lines = str(td).splitlines()
+ for line in raw_lines:
+ parts.append(f"{line}")
+
  if test_case.get('prerequisites'):
  parts.append(f"\nPre-requisites:\n{test_case['prerequisites']}")
 
- if test_case.get('variables'):
- parts.append(f"\nVariables to generate: {', '.join(test_case['variables'])}")
-
  requirements.append("\n".join(parts))
 
- return f"""{'='*60}
-
- {chr(10).join(requirements)}
-
- {'='*60}"""
-
-
- def _build_bulk_execution_prompt(parsed_test_cases: list) -> str:
- """Build consolidated prompt for bulk test execution."""
- parts = []
-
- for idx, tc_info in enumerate(parsed_test_cases, 1):
- test_case = tc_info['data']
- test_file = tc_info['file']
-
- parts.append(f"\n{'='*80}\nTEST CASE #{idx}: {test_case['name']}\nFile: {test_file.name}\n{'='*80}")
-
- if test_case['steps']:
- for step in test_case['steps']:
- parts.append(f"\nStep {step['number']}: {step['title']}\n{step['instruction']}")
- if step['expectation']:
- parts.append(f"Expected Result: {step['expectation']}")
- else:
- parts.append("\n(No steps defined)")
+ # If no requirements were collected, return an empty string to avoid
+ # producing a prompt with only separator lines.
+ if not requirements:
+ return ""
+
+ # Use a visible divider between test cases so each entry is clearly separated
+ divider = '-' * 40
+ body = f"\n\n{divider}\n\n".join(requirements)
+ return f"{('='*60)}\n\n{body}\n\n{('='*60)}"
+
+
+ def _build_single_test_execution_prompt(test_case_info: dict, test_number: int) -> str:
+ """Build execution prompt for a single test case."""
+ test_case = test_case_info['data']
+ test_file = test_case_info['file']
+
+ parts = [
+ f"\n{'='*80}",
+ f"TEST CASE #{test_number}: {test_case['name']}",
+ f"File: {test_file.name}",
+ f"{'='*80}"
+ ]
+
+ if test_case['steps']:
+ for step in test_case['steps']:
+ parts.append(f"\nStep {step['number']}: {step['title']}")
+ parts.append(step['instruction'])
+ else:
+ parts.append("\n(No steps defined)")
 
  return "\n".join(parts)
 
 
- def _build_validation_prompt(parsed_test_cases: list, execution_output: str) -> str:
- """Build prompt for bulk validation of test results."""
- parts = ["You are a test validator. Review the test execution results and validate each test case.\n\nTest Cases to Validate:\n"]
+ def _build_single_test_validation_prompt(test_case_info: dict, test_number: int, execution_output: str) -> str:
+ """Build validation prompt for a single test case."""
+ test_case = test_case_info['data']
+
+ parts = [
+ f"\nTest Case #{test_number}: {test_case['name']}"
+ ]
 
- for idx, tc_info in enumerate(parsed_test_cases, 1):
- test_case = tc_info['data']
- parts.append(f"\nTest Case #{idx}: {test_case['name']}")
- if test_case['steps']:
- for step in test_case['steps']:
- parts.append(f" Step {step['number']}: {step['title']}")
- if step['expectation']:
- parts.append(f" Expected: {step['expectation']}")
+ if test_case['steps']:
+ for step in test_case['steps']:
+ parts.append(f" Step {step['number']}: {step['title']}")
+ if step['expectation']:
+ parts.append(f" Expected: {step['expectation']}")
 
  parts.append(f"\n\nActual Execution Results:\n{execution_output}\n")
- parts.append(f"""\nBased on the execution results above, validate each test case.
-
- Respond with valid JSON in this EXACT format:
- {{
- "test_cases": [
- {{
- "test_number": 1,
- "test_name": "<test case name>",
- "steps": [
- {{"step_number": 1, "title": "<step title>", "passed": true/false, "details": "<brief explanation>"}},
- {{"step_number": 2, "title": "<step title>", "passed": true/false, "details": "<brief explanation>"}}
- ]
- }},
- {{
- "test_number": 2,
- "test_name": "<test case name>",
- "steps": [...]
- }}
- ]
- }}
-
- Validate all {len(parsed_test_cases)} test cases and their steps.""")
+
+ # Escape quotes in test name for valid JSON in prompt
+ escaped_test_name = test_case['name'].replace('"', '\\"')
+
+ parts.append(f"""\nBased on the execution results above, validate this test case.
+ {{
+ "test_number": {test_number},
+ "test_name": "{escaped_test_name}"
+ }}
+ """)
 
  return "\n".join(parts)
 
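
The new per-test prompt builders replace the old bulk prompts. A rough, self-contained illustration of the execution prompt shape they produce; the tc_info structure (keys 'file', 'data', 'steps') is inferred from how the surrounding code indexes it, and the sample values are made up:

from pathlib import Path

tc_info = {
    "file": Path("TC-001.md"),
    "data": {
        "name": "Login works",
        "steps": [
            {"number": 1, "title": "Open login page", "instruction": "Navigate to /login", "expectation": "Form is shown"},
        ],
    },
}

test_case = tc_info["data"]
parts = [
    f"\n{'='*80}",
    f"TEST CASE #1: {test_case['name']}",
    f"File: {tc_info['file'].name}",
    f"{'='*80}",
]
for step in test_case["steps"]:
    parts.append(f"\nStep {step['number']}: {step['title']}")
    parts.append(step["instruction"])

print("\n".join(parts))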
@@ -359,17 +354,119 @@ def _extract_json_from_text(text: str) -> dict:
  return json.loads(text[start_idx:end_idx])
 
 
- def _create_fallback_results(parsed_test_cases: list) -> tuple[list, int, int, int]:
- """Create fallback results when execution/validation fails."""
- test_results = []
- for tc_info in parsed_test_cases:
- test_results.append({
- 'title': tc_info['data']['name'],
+ def _create_fallback_result_for_test(test_case: dict, test_file: Path, reason: str = 'Validation failed') -> dict:
+ """Create a fallback result for a single test case with detailed step information.
+
+ Args:
+ test_case: Parsed test case data
+ test_file: Path to test case file
+ reason: Reason for fallback
+
+ Returns:
+ Fallback test result dict with step details
+ """
+ fallback_steps = []
+ for step_info in test_case.get('steps', []):
+ fallback_steps.append({
+ 'step_number': step_info['number'],
+ 'title': step_info['title'],
  'passed': False,
- 'file': tc_info['file'].name,
- 'step_results': []
+ 'details': reason
  })
- return test_results, len(parsed_test_cases), 0, len(parsed_test_cases)
+
+ return {
+ 'title': test_case['name'],
+ 'passed': False,
+ 'file': test_file.name,
+ 'step_results': fallback_steps,
+ 'validation_error': reason
+ }
+
+
+ def _cleanup_executor_cache(cache: Dict[str, tuple], cache_name: str = "executor") -> None:
+ """Clean up executor cache resources.
+
+ Args:
+ cache: Dictionary of cached executors
+ cache_name: Name of cache for logging
+ """
+ console.print(f"[dim]Cleaning up {cache_name} cache...[/dim]")
+ for cache_key, cached_items in cache.items():
+ try:
+ # Extract memory from tuple (second element)
+ memory = cached_items[1] if len(cached_items) > 1 else None
+
+ # Close SQLite memory connection
+ if memory and hasattr(memory, 'conn') and memory.conn:
+ memory.conn.close()
+ except Exception as e:
+ logger.debug(f"Error cleaning up {cache_name} cache for {cache_key}: {e}")
+
+
+ def _create_executor_from_cache(cache: Dict[str, tuple], cache_key: str,
+ client, agent_def: Dict, toolkit_config_path: Optional[str],
+ config, model: Optional[str], temperature: Optional[float],
+ max_tokens: Optional[int], work_dir: Optional[str]) -> tuple:
+ """Get or create executor from cache.
+
+ Args:
+ cache: Executor cache dictionary
+ cache_key: Key for caching
+ client: API client
+ agent_def: Agent definition
+ toolkit_config_path: Path to toolkit config
+ config: CLI configuration
+ model: Model override
+ temperature: Temperature override
+ max_tokens: Max tokens override
+ work_dir: Working directory
+
+ Returns:
+ Tuple of (agent_executor, memory, mcp_session_manager)
+ """
+ if cache_key in cache:
+ return cache[cache_key]
+
+ # Create new executor
+ from langgraph.checkpoint.sqlite import SqliteSaver
+ import sqlite3
+
+ memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
+ toolkit_config_tuple = (toolkit_config_path,) if toolkit_config_path else ()
+
+ agent_executor, mcp_session_manager, _, _, _, _, _ = _setup_local_agent_executor(
+ client, agent_def, toolkit_config_tuple, config, model, temperature,
+ max_tokens, memory, work_dir
+ )
+
+ # Cache the executor
+ cached_tuple = (agent_executor, memory, mcp_session_manager)
+ cache[cache_key] = cached_tuple
+ return cached_tuple
+
+
+ def _print_validation_diagnostics(validation_output: str) -> None:
+ """Print diagnostic information for validation output.
+
+ Args:
+ validation_output: The validation output to diagnose
+ """
+ console.print(f"\n[bold red]🔍 Diagnostic Information:[/bold red]")
+ console.print(f"[dim]Output length: {len(validation_output)} characters[/dim]")
+
+ # Check for key JSON elements
+ has_json = '{' in validation_output and '}' in validation_output
+ has_fields = 'test_number' in validation_output and 'steps' in validation_output
+
+ console.print(f"[dim]Has JSON structure: {has_json}[/dim]")
+ console.print(f"[dim]Has required fields: {has_fields}[/dim]")
+
+ # Show relevant excerpt
+ if len(validation_output) > 400:
+ console.print(f"\n[red]First 200 chars:[/red] [dim]{validation_output[:200]}[/dim]")
+ console.print(f"[red]Last 200 chars:[/red] [dim]{validation_output[-200:]}[/dim]")
+ else:
+ console.print(f"\n[red]Full output:[/red] [dim]{validation_output}[/dim]")
 
 
  def _get_alita_system_prompt(config) -> str:
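
The cache helpers added here follow a plain get-or-create pattern keyed by toolkit config path, with a cleanup pass that closes the SQLite connection held by each cached memory. A stripped-down sketch of the same idea; the Memory class and the "fake-executor" string are placeholders, not the SDK's real objects:

import sqlite3
from typing import Callable, Dict, Tuple

class Memory:
    """Placeholder for a SqliteSaver-style memory object exposing a .conn attribute."""
    def __init__(self) -> None:
        self.conn = sqlite3.connect(":memory:", check_same_thread=False)

def get_or_create(cache: Dict[str, Tuple], key: str, factory: Callable[[], Tuple]) -> Tuple:
    # Reuse the executor/memory pair already built for this toolkit config
    if key not in cache:
        cache[key] = factory()
    return cache[key]

def cleanup(cache: Dict[str, Tuple]) -> None:
    # Close each cached memory's SQLite connection, as _cleanup_executor_cache does
    for items in cache.values():
        memory = items[1] if len(items) > 1 else None
        if memory is not None and getattr(memory, "conn", None):
            memory.conn.close()

cache: Dict[str, Tuple] = {}
executor, memory = get_or_create(cache, "__no_config__", lambda: ("fake-executor", Memory()))
cleanup(cache)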
@@ -1263,6 +1360,10 @@ def agent_show(ctx, agent_source: str, version: Optional[str]):
  details.append("Temperature: ", style="bold")
  details.append(f"{agent_def['temperature']}\n", style="cyan")
 
+ if agent_def.get('persona'):
+ details.append("Persona: ", style="bold")
+ details.append(f"{agent_def['persona']}\n", style="cyan")
+
  panel = Panel(
  details,
  title=f"Local Agent: {agent_def.get('name', 'Unknown')}",
@@ -3212,27 +3313,42 @@ def agent_run(ctx, agent_source: str, message: str, version: Optional[str],
 
 
  @agent.command('execute-test-cases')
- @click.argument('agent_source')
+ @click.option(
+ '--agent_source',
+ '--agent-source',
+ 'agent_source',
+ required=False,
+ default=str(Path('.alita') / 'agents' / 'test-runner.agent.md'),
+ show_default=True,
+ type=click.Path(exists=False, file_okay=True, dir_okay=False),
+ help='Path to test runner agent definition file'
+ )
  @click.option('--test-cases-dir', required=True, type=click.Path(exists=True, file_okay=False, dir_okay=True),
  help='Directory containing test case files')
- @click.option('--results-dir', required=True, type=click.Path(file_okay=False, dir_okay=True),
+ @click.option('--results-dir', required=False, default=str(Path('.alita') / 'tests' / 'results'),
+ type=click.Path(file_okay=False, dir_okay=True),
  help='Directory where test results will be saved')
  @click.option('--test-case', 'test_case_files', multiple=True,
  help='Specific test case file(s) to execute (e.g., TC-001.md). Can specify multiple times. If not specified, executes all test cases.')
  @click.option('--model', help='Override LLM model')
  @click.option('--temperature', type=float, help='Override temperature')
  @click.option('--max-tokens', type=int, help='Override max tokens')
- @click.option('--dir', 'work_dir', type=click.Path(exists=True, file_okay=False, dir_okay=True),
+ @click.option('--dir', 'work_dir', required=False, default=str(Path('.alita')),
+ type=click.Path(exists=True, file_okay=False, dir_okay=True),
  help='Grant agent filesystem access to this directory')
- @click.option('--data-generator', type=click.Path(exists=True),
+ @click.option('--data-generator', required=False, default=str(Path('.alita') / 'agents' / 'test-data-generator.agent.md'),
+ type=click.Path(exists=True),
  help='Path to test data generator agent definition file')
+ @click.option('--validator', type=click.Path(exists=True),
+ help='Path to test validator agent definition file (default: .alita/agents/test-validator.agent.md)')
  @click.option('--skip-data-generation', is_flag=True,
  help='Skip test data generation step')
  @click.pass_context
  def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir: str,
  test_case_files: tuple, model: Optional[str], temperature: Optional[float],
- max_tokens: Optional[int], work_dir: Optional[str],
- data_generator: Optional[str], skip_data_generation: bool):
+ max_tokens: Optional[int], work_dir: str,
+ data_generator: str, validator: Optional[str],
+ skip_data_generation: bool):
  """
  Execute test cases from a directory and save results.
 
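
The option block above accepts both --agent_source and --agent-source spellings and falls back to a committed default path. A self-contained sketch of that click pattern (the demo command name is made up; the option itself mirrors the diff):

from pathlib import Path
import click

@click.command()
@click.option(
    '--agent_source', '--agent-source', 'agent_source',  # both spellings feed one parameter
    default=str(Path('.alita') / 'agents' / 'test-runner.agent.md'),
    show_default=True,
    type=click.Path(exists=False, file_okay=True, dir_okay=False),
    help='Path to test runner agent definition file',
)
def demo(agent_source: str) -> None:
    click.echo(f"Using agent definition: {agent_source}")

if __name__ == '__main__':
    demo()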
@@ -3247,24 +3363,44 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
  - Generates a test result file
  4. Saves all results to RESULTS_DIR
 
- AGENT_SOURCE: Path to agent definition file (e.g., .github/agents/test-runner.agent.md)
+ --agent_source: Path to test runner agent definition file
 
  \b
  Examples:
- alita execute-test-cases ./agent.json --test-cases-dir ./tests --results-dir ./results
- alita execute-test-cases ./agent.json --test-cases-dir ./tests --results-dir ./results \
+ alita agent execute-test-cases --test-cases-dir ./tests --results-dir ./results
+ alita agent execute-test-cases --agent_source ./agent.json --test-cases-dir ./tests --results-dir ./results \
  --data-generator ./data-gen.json
- alita execute-test-cases ./agent.json --test-cases-dir ./tests --results-dir ./results \
+ alita agent execute-test-cases --agent_source ./agent.json --test-cases-dir ./tests --results-dir ./results \
  --test-case TC-001.md --test-case TC-002.md
- alita execute-test-cases ./agent.json --test-cases-dir ./tests --results-dir ./results \
+ alita agent execute-test-cases --agent_source ./agent.json --test-cases-dir ./tests --results-dir ./results \
  --skip-data-generation --model gpt-4o
  """
+ # Import dependencies at function start
+ import sqlite3
+ import uuid
+ from langgraph.checkpoint.sqlite import SqliteSaver
+
  config = ctx.obj['config']
  client = get_client(ctx)
+
+ # Sanity-check committed defaults (should exist; fail early with a clear message if not)
+ if results_dir and not Path(results_dir).exists():
+ raise click.ClickException(
+ f"Results directory not found: {results_dir}. "
+ f"If you are running outside the repo root, pass --results-dir explicitly."
+ )
 
  try:
  # Load agent definition
- if not Path(agent_source).exists():
+ agent_source_path = Path(agent_source)
+ if not agent_source_path.exists():
+ default_path = Path('.alita') / 'agents' / 'test-runner.agent.md'
+ if agent_source_path == default_path:
+ raise click.ClickException(
+ f"Default agent definition not found: {agent_source}. "
+ f"Run this command from the repo root (so {default_path} resolves correctly) "
+ f"or pass --agent_source explicitly."
+ )
  raise click.ClickException(f"Agent definition not found: {agent_source}")
 
  agent_def = load_agent_definition(agent_source)
@@ -3317,11 +3453,30 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
  console.print("[yellow]Continuing with test execution...[/yellow]\n")
  logger.debug(f"Data generator setup error: {e}", exc_info=True)
 
- # Track overall results
- total_tests = 0
- passed_tests = 0
- failed_tests = 0
- test_results = [] # Store structured results for final report
+ # Load validator agent definition
+ validator_def = None
+ validator_agent_name = "Default Validator"
+
+ # Try to load validator from specified path or default location
+ validator_path = validator
+ if not validator_path:
+ # Default to .alita/agents/test-validator.agent.md
+ default_validator = Path.cwd() / '.alita' / 'agents' / 'test-validator.agent.md'
+ if default_validator.exists():
+ validator_path = str(default_validator)
+
+ if validator_path and Path(validator_path).exists():
+ try:
+ validator_def = load_agent_definition(validator_path)
+ validator_agent_name = validator_def.get('name', Path(validator_path).stem)
+ console.print(f"Validator Agent: [bold]{validator_agent_name}[/bold]")
+ console.print(f"[dim]Using: {validator_path}[/dim]\n")
+ except Exception as e:
+ console.print(f"[yellow]⚠ Warning: Failed to load validator agent: {e}[/yellow]")
+ console.print(f"[yellow]Will use test runner agent for validation[/yellow]\n")
+ logger.debug(f"Validator load error: {e}", exc_info=True)
+ else:
+ console.print(f"[dim]No validator agent specified, using test runner agent for validation[/dim]\n")
 
  # Store bulk data generation chat history to pass to test executors
  bulk_gen_chat_history = []
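
The validator is resolved in three steps: an explicit --validator path, then the conventional .alita/agents/test-validator.agent.md under the current directory, otherwise the test runner agent is reused for validation. A hypothetical condensed sketch of that resolution order:

from pathlib import Path
from typing import Optional

def resolve_validator_path(explicit: Optional[str]) -> Optional[str]:
    # Prefer an explicit --validator path, then the conventional default location
    if explicit:
        return explicit
    default = Path.cwd() / '.alita' / 'agents' / 'test-validator.agent.md'
    return str(default) if default.exists() else None

# Returning None (or failing to load the file) means validation falls back to the runner agent
print(resolve_validator_path(None))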
@@ -3353,11 +3508,10 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
 
  bulk_data_gen_prompt = _build_bulk_data_gen_prompt(test_cases_needing_data_gen)
 
- console.print(f"Executing test data generation prompt {bulk_data_gen_prompt}\n")
+ console.print(f"Executing test data generation prompt \n{bulk_data_gen_prompt}\n")
 
  try:
  # Setup data generator agent
- from langgraph.checkpoint.sqlite import SqliteSaver
  bulk_memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
 
  # Use first test case's config or empty tuple
@@ -3398,138 +3552,214 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
  console.print("[yellow]Continuing with test execution...[/yellow]\n")
  logger.debug(f"Bulk data generation error: {e}", exc_info=True)
 
- # Execute ALL test cases in one bulk operation
+ # Execute test cases sequentially with executor caching
  if not parsed_test_cases:
  console.print("[yellow]No test cases to execute[/yellow]")
  return
 
- console.print(f"\n[bold yellow]📋 Executing ALL test cases in bulk...[/bold yellow]\n")
-
- # Use first test case's config for agent setup
- first_tc = parsed_test_cases[0]
- first_test_file = first_tc['file']
- toolkit_config_path = resolve_toolkit_config_path(
- first_tc['data'].get('config_path', ''),
- first_test_file,
- test_cases_path
- )
- toolkit_config_tuple = (toolkit_config_path,) if toolkit_config_path else ()
-
- # Create memory for bulk execution
- from langgraph.checkpoint.sqlite import SqliteSaver
- memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
+ console.print(f"\n[bold yellow]📋 Executing test cases sequentially...[/bold yellow]\n")
 
- # Initialize chat history with bulk data generation context
- chat_history = bulk_gen_chat_history.copy()
+ # Show data generation context availability
+ if bulk_gen_chat_history:
+ console.print(f"[dim]✓ Data generation history available ({len(bulk_gen_chat_history)} messages) - shared with all test cases[/dim]\n")
+ else:
+ console.print(f"[dim]ℹ No data generation history (skipped or disabled)[/dim]\n")
 
- # Setup agent executor
- agent_executor, _, _, _, _, _, _ = _setup_local_agent_executor(
- client, agent_def, toolkit_config_tuple, config, model, temperature, max_tokens, memory, work_dir
- )
+ # Executor cache: key = toolkit_config_path, value = (agent_executor, memory, mcp_session_manager)
+ executor_cache = {}
 
- # Build bulk execution prompt
- bulk_all_prompt = _build_bulk_execution_prompt(parsed_test_cases)
-
- console.print(f"Executing the prompt: {bulk_all_prompt}\n")
+ # Validation executor cache: separate isolated executors for validation
+ # key = toolkit_config_path, value = (agent_executor, memory, mcp_session_manager)
+ validation_executor_cache = {}
 
- # Execute all test cases in bulk
+ # Execute each test case sequentially
  test_results = []
- all_execution_output = ""
+ total_tests = len(parsed_test_cases)
 
- try:
- if agent_executor:
- with console.status(f"[yellow]Executing {len(parsed_test_cases)} test cases in bulk...[/yellow]", spinner="dots"):
- bulk_result = agent_executor.invoke({
- "input": bulk_all_prompt,
- "chat_history": chat_history
- })
- all_execution_output = extract_output_from_result(bulk_result)
-
- console.print(f"[green]✓ All test cases executed[/green]")
- console.print(f"[dim]{all_execution_output}...[/dim]\n")
+ for idx, tc_info in enumerate(parsed_test_cases, 1):
+ test_case = tc_info['data']
+ test_file = tc_info['file']
+ test_name = test_case['name']
+
+ # Display progress
+ console.print(f"[bold cyan]Test Case {idx}/{total_tests} - {test_name}[/bold cyan]")
+
+ try:
+ # Resolve toolkit config path for this test case
+ toolkit_config_path = resolve_toolkit_config_path(
+ test_case.get('config_path', ''),
+ test_file,
+ test_cases_path
+ )
 
- # Update chat history
- chat_history.append({"role": "user", "content": bulk_all_prompt})
- chat_history.append({"role": "assistant", "content": all_execution_output})
+ # Use cache key (None if no config)
+ cache_key = toolkit_config_path if toolkit_config_path else '__no_config__'
+ thread_id = f"test_case_{idx}_{uuid.uuid4().hex[:8]}"
 
- # Now validate ALL test cases in bulk
- console.print(f"[bold yellow]✅ Validating all test cases...[/bold yellow]\n")
+ # Get or create executor from cache
+ agent_executor, memory, mcp_session_manager = _create_executor_from_cache(
+ executor_cache, cache_key, client, agent_def, toolkit_config_path,
+ config, model, temperature, max_tokens, work_dir
+ )
 
- validation_prompt = _build_validation_prompt(parsed_test_cases, all_execution_output)
-
- console.print(f"[dim]{validation_prompt}[/dim]\n")
+ # Build execution prompt for single test case
+ execution_prompt = _build_single_test_execution_prompt(tc_info, idx)
+ console.print(f"[dim]Executing with {len(bulk_gen_chat_history)} history messages[/dim]")
+ console.print(f"[dim]Executing test case with the prompt {execution_prompt}[/dim]")
 
- with console.status("[yellow]Validating all results...[/yellow]", spinner="dots"):
- validation_result = agent_executor.invoke({
- "input": validation_prompt,
- "chat_history": chat_history
+ # Execute test case
+ execution_output = ""
+ if agent_executor:
+ with console.status(f"[yellow]Executing test case...[/yellow]", spinner="dots"):
+ exec_result = agent_executor.invoke({
+ "input": execution_prompt,
+ "chat_history": bulk_gen_chat_history # ONLY data gen history, no accumulation
+ }, config={"configurable": {"thread_id": thread_id}})
+ execution_output = extract_output_from_result(exec_result)
+
+ console.print(f"[green]✓ Test case executed[/green]")
+ console.print(f"[dim]{execution_output}[/dim]\n")
+
+ # Append execution to bulk gen chat history for validation
+ test_case_history_start = len(bulk_gen_chat_history)
+ bulk_gen_chat_history.extend([
+ {"role": "user", "content": execution_prompt},
+ {"role": "assistant", "content": execution_output}
+ ])
+
+ # No history accumulation - each test case is independent
+ else:
+ console.print(f"[red]✗ No agent executor available[/red]")
+ # Create fallback result for this test
+ test_results.append({
+ 'title': test_name,
+ 'passed': False,
+ 'file': test_file.name,
+ 'step_results': []
  })
+ continue
+
+ # Validate test case using validation executor with accumulated history
+ validation_prompt = _build_single_test_validation_prompt(tc_info, idx, execution_output)
+
+ console.print(f"[bold yellow]🔍 Validating test case (with execution history)...[/bold yellow]")
+ console.print(f"[dim]{validation_prompt}[/dim]\n")
 
- validation_output = extract_output_from_result(validation_result)
+ # Create or retrieve isolated validation executor
+ validation_cache_key = f"{cache_key}_validation"
+ validation_agent_def = validator_def if validator_def else agent_def
+
+ validation_executor, validation_memory, validation_mcp_session = _create_executor_from_cache(
+ validation_executor_cache, validation_cache_key, client, validation_agent_def,
+ toolkit_config_path, config, model, temperature, max_tokens, work_dir
+ )
+
+ if validation_cache_key not in validation_executor_cache:
+ console.print(f"[dim]Created new isolated validation executor[/dim]")
+ else:
+ console.print(f"[dim]Using cached validation executor[/dim]")
 
- console.print(f"[dim]Validation Response: {validation_output}...[/dim]\n")
+ # For validation, use a separate thread with accumulated chat history (data gen + execution)
+ # This provides context to the validator about the test execution
+ validation_thread_id = f"validation_{idx}_{uuid.uuid4().hex[:8]}"
+
+ validation_output = ""
+ if validation_executor:
+ with console.status(f"[yellow]Validating test case...[/yellow]", spinner="dots"):
+ validation_result = validation_executor.invoke({
+ "input": validation_prompt,
+ "chat_history": bulk_gen_chat_history # Includes data gen and execution history
+ }, {"configurable": {"thread_id": validation_thread_id}})
+
+ validation_output = extract_output_from_result(validation_result)
+ else:
+ console.print(f"[red]✗ No validation executor available[/red]")
+ validation_output = "{}"
+
+ # No further history update - validation completes the cycle
 
  # Parse validation JSON
  try:
  validation_json = _extract_json_from_text(validation_output)
- test_cases_results = validation_json.get('test_cases', [])
+ step_results = validation_json.get('steps', [])
 
- # Process results for each test case
- total_tests = 0
- passed_tests = 0
- failed_tests = 0
+ # Determine if test passed (all steps must pass)
+ test_passed = all(step.get('passed', False) for step in step_results) if step_results else False
 
- for tc_result in test_cases_results:
- test_name = tc_result.get('test_name', f"Test #{tc_result.get('test_number', '?')}")
- step_results = tc_result.get('steps', [])
-
- # Determine if test passed (all steps must pass)
- test_passed = all(step.get('passed', False) for step in step_results) if step_results else False
+ if test_passed:
+ console.print(f"[bold green]✅ Test PASSED: {test_name}[/bold green]")
+ else:
+ console.print(f"[bold red]❌ Test FAILED: {test_name}[/bold red]")
+
+ # Display individual step results
+ for step_result in step_results:
+ step_num = step_result.get('step_number')
+ step_title = step_result.get('title', '')
+ passed = step_result.get('passed', False)
+ details = step_result.get('details', '')
 
- total_tests += 1
- if test_passed:
- passed_tests += 1
- console.print(f"[bold green]✅ Test PASSED: {test_name}[/bold green]")
+ if passed:
+ console.print(f" [green]✓ Step {step_num}: {step_title}[/green]")
+ console.print(f" [dim]{details}[/dim]")
  else:
- failed_tests += 1
- console.print(f"[bold red]❌ Test FAILED: {test_name}[/bold red]")
-
- # Display individual step results
- for step_result in step_results:
- step_num = step_result.get('step_number')
- step_title = step_result.get('title', '')
- passed = step_result.get('passed', False)
- details = step_result.get('details', '')
-
- if passed:
- console.print(f" [green]✓ Step {step_num}: {step_title}[/green]")
- console.print(f" [dim]{details}[/dim]")
- else:
- console.print(f" [red]✗ Step {step_num}: {step_title}[/red]")
- console.print(f" [dim]{details}[/dim]")
-
- console.print()
-
- # Store result
- test_results.append({
- 'title': test_name,
- 'passed': test_passed,
- 'file': parsed_test_cases[tc_result.get('test_number', 1) - 1]['file'].name if tc_result.get('test_number', 1) - 1 < len(parsed_test_cases) else 'unknown',
- 'step_results': step_results
- })
+ console.print(f" [red]✗ Step {step_num}: {step_title}[/red]")
+ console.print(f" [dim]{details}[/dim]")
+
+ console.print()
+
+ # Store result
+ test_results.append({
+ 'title': test_name,
+ 'passed': test_passed,
+ 'file': test_file.name,
+ 'step_results': step_results
+ })
 
  except Exception as e:
- logger.debug(f"Validation parsing failed: {e}")
- console.print(f"[yellow]⚠ Warning: Could not parse validation results: {e}[/yellow]\n")
- test_results, total_tests, passed_tests, failed_tests = _create_fallback_results(parsed_test_cases)
- else:
- console.print(f"[red]✗ No agent executor available[/red]\n")
- test_results, total_tests, passed_tests, failed_tests = _create_fallback_results(parsed_test_cases)
+ logger.debug(f"Validation parsing failed for {test_name}: {e}", exc_info=True)
+ console.print(f"[yellow]⚠ Warning: Could not parse validation results for {test_name}[/yellow]")
+ console.print(f"[yellow]Error: {str(e)}[/yellow]")
 
- except Exception as e:
- console.print(f"[red]✗ Bulk execution failed: {e}[/red]\n")
- logger.debug(f"Bulk execution error: {e}", exc_info=True)
- test_results, total_tests, passed_tests, failed_tests = _create_fallback_results(parsed_test_cases)
+ # Enhanced diagnostic output
+ _print_validation_diagnostics(validation_output)
+
+ # Generate fallback result using helper function
+ console.print(f"\n[yellow]🔄 Generating fallback validation result...[/yellow]")
+ fallback_result = _create_fallback_result_for_test(
+ test_case,
+ test_file,
+ f'Validation failed - could not parse validator output: {str(e)}'
+ )
+ console.print(f"[dim]Created {len(fallback_result['step_results'])} fallback step results[/dim]\n")
+
+ test_results.append(fallback_result)
+ console.print()
+
+ # After validation, remove the test case execution from history to prevent accumulation
+ # Remove the entries added for this test case
+ del bulk_gen_chat_history[test_case_history_start:]
+
+ except Exception as e:
+ logger.debug(f"Test execution failed for {test_name}: {e}", exc_info=True)
+ console.print(f"[red]✗ Test execution failed: {e}[/red]")
+
+ # Create fallback result using helper function
+ fallback_result = _create_fallback_result_for_test(
+ test_case,
+ test_file,
+ f'Test execution failed: {str(e)}'
+ )
+ test_results.append(fallback_result)
+ console.print()
+
+ # Cleanup: Close executor cache resources
+ _cleanup_executor_cache(executor_cache, "executor")
+ _cleanup_executor_cache(validation_executor_cache, "validation executor")
+
+ # Calculate totals
+ total_tests = len(test_results)
+ passed_tests = sum(1 for r in test_results if r['passed'])
+ failed_tests = total_tests - passed_tests
 
  # Generate summary report
  console.print(f"\n[bold]{'='*60}[/bold]")
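
Since the bulk validation JSON is gone, totals are now derived from the per-test results list, and a test passes only when every reported step passed. A small sketch with made-up step data to show the aggregation:

# Illustrative step results in the shape the validator is asked to return
step_results = [
    {"step_number": 1, "title": "Open login page", "passed": True, "details": "OK"},
    {"step_number": 2, "title": "Submit credentials", "passed": False, "details": "Timeout"},
]

# A test passes only if every step passed; an empty step list counts as a failure
test_passed = all(step.get("passed", False) for step in step_results) if step_results else False

test_results = [{"title": "Login works", "passed": test_passed, "step_results": step_results}]
total_tests = len(test_results)
passed_tests = sum(1 for r in test_results if r["passed"])
failed_tests = total_tests - passed_tests
print(total_tests, passed_tests, failed_tests)  # 1 0 1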