alita-sdk 0.3.562__py3-none-any.whl → 0.3.584__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release.
This version of alita-sdk might be problematic; see the registry advisory for details.
- alita_sdk/cli/agents.py +358 -165
- alita_sdk/configurations/openapi.py +227 -15
- alita_sdk/runtime/langchain/langraph_agent.py +93 -20
- alita_sdk/runtime/langchain/utils.py +30 -14
- alita_sdk/runtime/toolkits/artifact.py +2 -1
- alita_sdk/runtime/toolkits/mcp.py +4 -2
- alita_sdk/runtime/toolkits/skill_router.py +1 -1
- alita_sdk/runtime/toolkits/vectorstore.py +1 -1
- alita_sdk/runtime/tools/data_analysis.py +1 -1
- alita_sdk/runtime/tools/llm.py +30 -11
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/tools/ado/repos/__init__.py +2 -1
- alita_sdk/tools/ado/test_plan/__init__.py +2 -1
- alita_sdk/tools/ado/wiki/__init__.py +2 -1
- alita_sdk/tools/ado/work_item/__init__.py +2 -1
- alita_sdk/tools/advanced_jira_mining/__init__.py +2 -1
- alita_sdk/tools/aws/delta_lake/__init__.py +2 -1
- alita_sdk/tools/azure_ai/search/__init__.py +2 -1
- alita_sdk/tools/bitbucket/__init__.py +2 -1
- alita_sdk/tools/browser/__init__.py +1 -1
- alita_sdk/tools/carrier/__init__.py +1 -1
- alita_sdk/tools/cloud/aws/__init__.py +2 -1
- alita_sdk/tools/cloud/azure/__init__.py +2 -1
- alita_sdk/tools/cloud/gcp/__init__.py +2 -1
- alita_sdk/tools/cloud/k8s/__init__.py +2 -1
- alita_sdk/tools/code/linter/__init__.py +2 -1
- alita_sdk/tools/code/sonar/__init__.py +2 -1
- alita_sdk/tools/confluence/__init__.py +2 -1
- alita_sdk/tools/custom_open_api/__init__.py +2 -1
- alita_sdk/tools/elastic/__init__.py +2 -1
- alita_sdk/tools/figma/__init__.py +51 -5
- alita_sdk/tools/figma/api_wrapper.py +1157 -123
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +2 -1
- alita_sdk/tools/gitlab/__init__.py +2 -1
- alita_sdk/tools/gitlab/api_wrapper.py +32 -0
- alita_sdk/tools/gitlab_org/__init__.py +2 -1
- alita_sdk/tools/google/bigquery/__init__.py +2 -1
- alita_sdk/tools/google_places/__init__.py +2 -1
- alita_sdk/tools/jira/__init__.py +2 -1
- alita_sdk/tools/keycloak/__init__.py +2 -1
- alita_sdk/tools/localgit/__init__.py +2 -1
- alita_sdk/tools/memory/__init__.py +1 -1
- alita_sdk/tools/ocr/__init__.py +2 -1
- alita_sdk/tools/openapi/__init__.py +227 -15
- alita_sdk/tools/openapi/api_wrapper.py +1276 -802
- alita_sdk/tools/pandas/__init__.py +3 -2
- alita_sdk/tools/postman/__init__.py +2 -1
- alita_sdk/tools/pptx/__init__.py +2 -1
- alita_sdk/tools/qtest/__init__.py +2 -1
- alita_sdk/tools/rally/__init__.py +2 -1
- alita_sdk/tools/report_portal/__init__.py +2 -1
- alita_sdk/tools/salesforce/__init__.py +2 -1
- alita_sdk/tools/servicenow/__init__.py +2 -1
- alita_sdk/tools/sharepoint/__init__.py +2 -1
- alita_sdk/tools/slack/__init__.py +3 -2
- alita_sdk/tools/sql/__init__.py +2 -1
- alita_sdk/tools/testio/__init__.py +2 -1
- alita_sdk/tools/testrail/__init__.py +2 -1
- alita_sdk/tools/utils/content_parser.py +68 -2
- alita_sdk/tools/xray/__init__.py +2 -1
- alita_sdk/tools/yagmail/__init__.py +2 -1
- alita_sdk/tools/zephyr/__init__.py +2 -1
- alita_sdk/tools/zephyr_enterprise/__init__.py +2 -1
- alita_sdk/tools/zephyr_essential/__init__.py +2 -1
- alita_sdk/tools/zephyr_scale/__init__.py +2 -1
- alita_sdk/tools/zephyr_squad/__init__.py +2 -1
- {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/METADATA +1 -1
- {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/RECORD +74 -72
- {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/entry_points.txt +0 -0
- {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.562.dist-info → alita_sdk-0.3.584.dist-info}/top_level.txt +0 -0
alita_sdk/cli/agents.py
CHANGED
```diff
@@ -275,63 +275,62 @@ def _build_bulk_data_gen_prompt(parsed_test_cases: list) -> str:
 {'='*60}"""
 
 
-def
-    """Build
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    parts.append(
+def _build_single_test_execution_prompt(test_case_info: dict, test_number: int) -> str:
+    """Build execution prompt for a single test case."""
+    test_case = test_case_info['data']
+    test_file = test_case_info['file']
+
+    parts = [
+        f"\n{'='*80}",
+        f"TEST CASE #{test_number}: {test_case['name']}",
+        f"File: {test_file.name}",
+        f"{'='*80}",
+        "\nList all the tools you have in your environment. Execute the following steps in sequential order and report results:"
+    ]
+
+    if test_case['steps']:
+        for step in test_case['steps']:
+            parts.append(f"\nStep {step['number']}: {step['title']}")
+            parts.append(step['instruction'])
+    else:
+        parts.append("\n(No steps defined)")
 
     return "\n".join(parts)
 
 
-def
-    """Build prompt for
-
+def _build_single_test_validation_prompt(test_case_info: dict, test_number: int, execution_output: str) -> str:
+    """Build validation prompt for a single test case."""
+    test_case = test_case_info['data']
+
+    parts = [
+        "Review the test execution results and validate this test case and provide the output in JSON format.\n",
+        f"\nTest Case #{test_number}: {test_case['name']}"
+    ]
 
-
-
-
-
-
-    parts.append(f" Step {step['number']}: {step['title']}")
-    if step['expectation']:
-        parts.append(f" Expected: {step['expectation']}")
+    if test_case['steps']:
+        for step in test_case['steps']:
+            parts.append(f" Step {step['number']}: {step['title']}")
+            if step['expectation']:
+                parts.append(f" Expected: {step['expectation']}")
 
     parts.append(f"\n\nActual Execution Results:\n{execution_output}\n")
-
+
+    # Escape quotes in test name for valid JSON in prompt
+    escaped_test_name = test_case['name'].replace('"', '\\"')
+
+    parts.append(f"""\nBased on the execution results above, validate this test case.
 
-Respond with valid JSON in this EXACT format:
+Respond ONLY with valid JSON in this EXACT format (no additional text before or after):
 {{
-  "
-
-
-
-
-    {{"step_number": 1, "title": "<step title>", "passed": true/false, "details": "<brief explanation>"}},
-    {{"step_number": 2, "title": "<step title>", "passed": true/false, "details": "<brief explanation>"}}
-    ]
-  }},
-  {{
-    "test_number": 2,
-    "test_name": "<test case name>",
-    "steps": [...]
-  }}
+  "test_number": {test_number},
+  "test_name": "{escaped_test_name}",
+  "steps": [
+    {{"step_number": 1, "title": "<step title>", "passed": true/false, "details": "<brief explanation>"}},
+    {{"step_number": 2, "title": "<step title>", "passed": true/false, "details": "<brief explanation>"}}
   ]
 }}
 
-
+IMPORTANT: Return ONLY the JSON object. Do not include any explanatory text before or after the JSON.""")
 
     return "\n".join(parts)
 
@@ -359,17 +358,119 @@ def _extract_json_from_text(text: str) -> dict:
     return json.loads(text[start_idx:end_idx])
 
 
-def
-    """Create fallback
-
-
-
-
+def _create_fallback_result_for_test(test_case: dict, test_file: Path, reason: str = 'Validation failed') -> dict:
+    """Create a fallback result for a single test case with detailed step information.
+
+    Args:
+        test_case: Parsed test case data
+        test_file: Path to test case file
+        reason: Reason for fallback
+
+    Returns:
+        Fallback test result dict with step details
+    """
+    fallback_steps = []
+    for step_info in test_case.get('steps', []):
+        fallback_steps.append({
+            'step_number': step_info['number'],
+            'title': step_info['title'],
             'passed': False,
-            '
-            'step_results': []
+            'details': reason
         })
-
+
+    return {
+        'title': test_case['name'],
+        'passed': False,
+        'file': test_file.name,
+        'step_results': fallback_steps,
+        'validation_error': reason
+    }
+
+
+def _cleanup_executor_cache(cache: Dict[str, tuple], cache_name: str = "executor") -> None:
+    """Clean up executor cache resources.
+
+    Args:
+        cache: Dictionary of cached executors
+        cache_name: Name of cache for logging
+    """
+    console.print(f"[dim]Cleaning up {cache_name} cache...[/dim]")
+    for cache_key, cached_items in cache.items():
+        try:
+            # Extract memory from tuple (second element)
+            memory = cached_items[1] if len(cached_items) > 1 else None
+
+            # Close SQLite memory connection
+            if memory and hasattr(memory, 'conn') and memory.conn:
+                memory.conn.close()
+        except Exception as e:
+            logger.debug(f"Error cleaning up {cache_name} cache for {cache_key}: {e}")
+
+
+def _create_executor_from_cache(cache: Dict[str, tuple], cache_key: str,
+                                client, agent_def: Dict, toolkit_config_path: Optional[str],
+                                config, model: Optional[str], temperature: Optional[float],
+                                max_tokens: Optional[int], work_dir: Optional[str]) -> tuple:
+    """Get or create executor from cache.
+
+    Args:
+        cache: Executor cache dictionary
+        cache_key: Key for caching
+        client: API client
+        agent_def: Agent definition
+        toolkit_config_path: Path to toolkit config
+        config: CLI configuration
+        model: Model override
+        temperature: Temperature override
+        max_tokens: Max tokens override
+        work_dir: Working directory
+
+    Returns:
+        Tuple of (agent_executor, memory, mcp_session_manager)
+    """
+    if cache_key in cache:
+        return cache[cache_key]
+
+    # Create new executor
+    from langgraph.checkpoint.sqlite import SqliteSaver
+    import sqlite3
+
+    memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
+    toolkit_config_tuple = (toolkit_config_path,) if toolkit_config_path else ()
+
+    agent_executor, mcp_session_manager, _, _, _, _, _ = _setup_local_agent_executor(
+        client, agent_def, toolkit_config_tuple, config, model, temperature,
+        max_tokens, memory, work_dir
+    )
+
+    # Cache the executor
+    cached_tuple = (agent_executor, memory, mcp_session_manager)
+    cache[cache_key] = cached_tuple
+    return cached_tuple
+
+
+def _print_validation_diagnostics(validation_output: str) -> None:
+    """Print diagnostic information for validation output.
+
+    Args:
+        validation_output: The validation output to diagnose
+    """
+    console.print(f"\n[bold red]🔍 Diagnostic Information:[/bold red]")
+    console.print(f"[dim]Output length: {len(validation_output)} characters[/dim]")
+
+    # Check for key JSON elements
+    has_json = '{' in validation_output and '}' in validation_output
+    has_fields = 'test_number' in validation_output and 'steps' in validation_output
+
+    console.print(f"[dim]Has JSON structure: {has_json}[/dim]")
+    console.print(f"[dim]Has required fields: {has_fields}[/dim]")
+
+    # Show relevant excerpt
+    if len(validation_output) > 400:
+        console.print(f"\n[red]First 200 chars:[/red] [dim]{validation_output[:200]}[/dim]")
+        console.print(f"[red]Last 200 chars:[/red] [dim]{validation_output[-200:]}[/dim]")
+    else:
+        console.print(f"\n[red]Full output:[/red] [dim]{validation_output}[/dim]")
 
 
 def _get_alita_system_prompt(config) -> str:
@@ -3226,13 +3327,16 @@ def agent_run(ctx, agent_source: str, message: str, version: Optional[str],
               help='Grant agent filesystem access to this directory')
 @click.option('--data-generator', type=click.Path(exists=True),
               help='Path to test data generator agent definition file')
+@click.option('--validator', type=click.Path(exists=True),
+              help='Path to test validator agent definition file (default: .alita/agents/test-validator.agent.md)')
 @click.option('--skip-data-generation', is_flag=True,
               help='Skip test data generation step')
 @click.pass_context
 def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir: str,
                        test_case_files: tuple, model: Optional[str], temperature: Optional[float],
                        max_tokens: Optional[int], work_dir: Optional[str],
-                       data_generator: Optional[str],
+                       data_generator: Optional[str], validator: Optional[str],
+                       skip_data_generation: bool):
     """
     Execute test cases from a directory and save results.
 
@@ -3259,6 +3363,11 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
         alita execute-test-cases ./agent.json --test-cases-dir ./tests --results-dir ./results \
             --skip-data-generation --model gpt-4o
     """
+    # Import dependencies at function start
+    import sqlite3
+    import uuid
+    from langgraph.checkpoint.sqlite import SqliteSaver
+
     config = ctx.obj['config']
     client = get_client(ctx)
 
@@ -3317,11 +3426,30 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
             console.print("[yellow]Continuing with test execution...[/yellow]\n")
             logger.debug(f"Data generator setup error: {e}", exc_info=True)
 
-    #
-
-
-
-
+    # Load validator agent definition
+    validator_def = None
+    validator_agent_name = "Default Validator"
+
+    # Try to load validator from specified path or default location
+    validator_path = validator
+    if not validator_path:
+        # Default to .alita/agents/test-validator.agent.md
+        default_validator = Path.cwd() / '.alita' / 'agents' / 'test-validator.agent.md'
+        if default_validator.exists():
+            validator_path = str(default_validator)
+
+    if validator_path and Path(validator_path).exists():
+        try:
+            validator_def = load_agent_definition(validator_path)
+            validator_agent_name = validator_def.get('name', Path(validator_path).stem)
+            console.print(f"Validator Agent: [bold]{validator_agent_name}[/bold]")
+            console.print(f"[dim]Using: {validator_path}[/dim]\n")
+        except Exception as e:
+            console.print(f"[yellow]⚠ Warning: Failed to load validator agent: {e}[/yellow]")
+            console.print(f"[yellow]Will use test runner agent for validation[/yellow]\n")
+            logger.debug(f"Validator load error: {e}", exc_info=True)
+    else:
+        console.print(f"[dim]No validator agent specified, using test runner agent for validation[/dim]\n")
 
     # Store bulk data generation chat history to pass to test executors
     bulk_gen_chat_history = []
@@ -3357,7 +3485,6 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
 
         try:
             # Setup data generator agent
-            from langgraph.checkpoint.sqlite import SqliteSaver
             bulk_memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
 
             # Use first test case's config or empty tuple
@@ -3398,138 +3525,204 @@ def execute_test_cases(ctx, agent_source: str, test_cases_dir: str, results_dir:
             console.print("[yellow]Continuing with test execution...[/yellow]\n")
             logger.debug(f"Bulk data generation error: {e}", exc_info=True)
 
-    # Execute
+    # Execute test cases sequentially with executor caching
     if not parsed_test_cases:
         console.print("[yellow]No test cases to execute[/yellow]")
         return
 
-    console.print(f"\n[bold yellow]📋 Executing
-
-    # Use first test case's config for agent setup
-    first_tc = parsed_test_cases[0]
-    first_test_file = first_tc['file']
-    toolkit_config_path = resolve_toolkit_config_path(
-        first_tc['data'].get('config_path', ''),
-        first_test_file,
-        test_cases_path
-    )
-    toolkit_config_tuple = (toolkit_config_path,) if toolkit_config_path else ()
-
-    # Create memory for bulk execution
-    from langgraph.checkpoint.sqlite import SqliteSaver
-    memory = SqliteSaver(sqlite3.connect(":memory:", check_same_thread=False))
+    console.print(f"\n[bold yellow]📋 Executing test cases sequentially...[/bold yellow]\n")
 
-    #
-
+    # Show data generation context availability
+    if bulk_gen_chat_history:
+        console.print(f"[dim]✓ Data generation history available ({len(bulk_gen_chat_history)} messages) - shared with all test cases[/dim]\n")
+    else:
+        console.print(f"[dim]ℹ No data generation history (skipped or disabled)[/dim]\n")
 
-    #
-
-        client, agent_def, toolkit_config_tuple, config, model, temperature, max_tokens, memory, work_dir
-    )
+    # Executor cache: key = toolkit_config_path, value = (agent_executor, memory, mcp_session_manager)
+    executor_cache = {}
 
-    #
-
-
-    console.print(f"Executing the prompt: {bulk_all_prompt}\n")
+    # Validation executor cache: separate isolated executors for validation
+    # key = toolkit_config_path, value = (agent_executor, memory, mcp_session_manager)
+    validation_executor_cache = {}
 
-    # Execute
+    # Execute each test case sequentially
     test_results = []
-
+    total_tests = len(parsed_test_cases)
 
-
-
-
-
-
+    for idx, tc_info in enumerate(parsed_test_cases, 1):
+        test_case = tc_info['data']
+        test_file = tc_info['file']
+        test_name = test_case['name']
+
+        # Display progress
+        console.print(f"[bold cyan]Test Case {idx}/{total_tests} - {test_name}[/bold cyan]")
+
+        try:
+            # Resolve toolkit config path for this test case
+            toolkit_config_path = resolve_toolkit_config_path(
+                test_case.get('config_path', ''),
+                test_file,
+                test_cases_path
+            )
+
+            # Use cache key (None if no config)
+            cache_key = toolkit_config_path if toolkit_config_path else '__no_config__'
+            thread_id = f"test_case_{idx}_{uuid.uuid4().hex[:8]}"
+
+            # Get or create executor from cache
+            agent_executor, memory, mcp_session_manager = _create_executor_from_cache(
+                executor_cache, cache_key, client, agent_def, toolkit_config_path,
+                config, model, temperature, max_tokens, work_dir
+            )
+
+            # Build execution prompt for single test case
+            execution_prompt = _build_single_test_execution_prompt(tc_info, idx)
+            console.print(f"[dim]Executing with {len(bulk_gen_chat_history)} history messages[/dim]")
+
+            # Execute test case
+            execution_output = ""
+            if agent_executor:
+                with console.status(f"[yellow]Executing test case...[/yellow]", spinner="dots"):
+                    exec_result = agent_executor.invoke({
+                        "input": execution_prompt,
+                        "chat_history": bulk_gen_chat_history  # ONLY data gen history, no accumulation
+                    }, config={"configurable": {"thread_id": thread_id}})
+                    execution_output = extract_output_from_result(exec_result)
+
+                console.print(f"[green]✓ Test case executed[/green]")
+                console.print(f"[dim]{execution_output}[/dim]\n")
+
+                # No history accumulation - each test case is independent
+            else:
+                console.print(f"[red]✗ No agent executor available[/red]")
+                # Create fallback result for this test
+                test_results.append({
+                    'title': test_name,
+                    'passed': False,
+                    'file': test_file.name,
+                    'step_results': []
                 })
-
+                continue
 
-
-
+            # Validate test case using ISOLATED validation executor
+            validation_prompt = _build_single_test_validation_prompt(tc_info, idx, execution_output)
 
-
-            chat_history.append({"role": "user", "content": bulk_all_prompt})
-            chat_history.append({"role": "assistant", "content": all_execution_output})
+            console.print(f"[bold yellow]🔍 Validating test case (isolated context)...[/bold yellow]")
 
-            #
-
+            # Create or retrieve isolated validation executor
+            validation_cache_key = f"{cache_key}_validation"
+            validation_agent_def = validator_def if validator_def else agent_def
 
-
-
-
+            validation_executor, validation_memory, validation_mcp_session = _create_executor_from_cache(
+                validation_executor_cache, validation_cache_key, client, validation_agent_def,
+                toolkit_config_path, config, model, temperature, max_tokens, work_dir
+            )
 
-
-
-
-
-
-
-
+            if validation_cache_key not in validation_executor_cache:
+                console.print(f"[dim]Created new isolated validation executor[/dim]")
+            else:
+                console.print(f"[dim]Using cached validation executor[/dim]")
+
+            # For validation, use a separate thread with NO chat history (isolated from data gen)
+            # This prevents the agent from using tools and encourages direct JSON output
+            validation_thread_id = f"validation_{idx}_{uuid.uuid4().hex[:8]}"
+
+            validation_output = ""
+            if validation_executor:
+                with console.status(f"[yellow]Validating test case...[/yellow]", spinner="dots"):
+                    validation_result = validation_executor.invoke({
+                        "input": validation_prompt,
+                        "chat_history": []  # ISOLATED: No data gen history for validation
+                    }, {"configurable": {"thread_id": validation_thread_id}})
+
+                validation_output = extract_output_from_result(validation_result)
+            else:
+                console.print(f"[red]✗ No validation executor available[/red]")
+                validation_output = "{}"
+
+            console.print(f"[bold cyan]Full LLM Validation Response:[/bold cyan]")
+            console.print(f"[dim]{validation_output}[/dim]\n")
 
-
+            # No history update - validation is isolated from test execution
 
             # Parse validation JSON
             try:
                 validation_json = _extract_json_from_text(validation_output)
-
+                step_results = validation_json.get('steps', [])
 
-                #
-
-                passed_tests = 0
-                failed_tests = 0
+                # Determine if test passed (all steps must pass)
+                test_passed = all(step.get('passed', False) for step in step_results) if step_results else False
 
-
-
-
-
-
-
+                if test_passed:
+                    console.print(f"[bold green]✅ Test PASSED: {test_name}[/bold green]")
+                else:
+                    console.print(f"[bold red]❌ Test FAILED: {test_name}[/bold red]")
+
+                # Display individual step results
+                for step_result in step_results:
+                    step_num = step_result.get('step_number')
+                    step_title = step_result.get('title', '')
+                    passed = step_result.get('passed', False)
+                    details = step_result.get('details', '')
 
-
-
-
-                console.print(f"[bold green]✅ Test PASSED: {test_name}[/bold green]")
+                    if passed:
+                        console.print(f" [green]✓ Step {step_num}: {step_title}[/green]")
+                        console.print(f" [dim]{details}[/dim]")
                     else:
-
-                console.print(f"[
-
-
-
-
-
-
-
-
-
-
-                console.print(f" [dim]{details}[/dim]")
-            else:
-                console.print(f" [red]✗ Step {step_num}: {step_title}[/red]")
-                console.print(f" [dim]{details}[/dim]")
-
-            console.print()
-
-            # Store result
-            test_results.append({
-                'title': test_name,
-                'passed': test_passed,
-                'file': parsed_test_cases[tc_result.get('test_number', 1) - 1]['file'].name if tc_result.get('test_number', 1) - 1 < len(parsed_test_cases) else 'unknown',
-                'step_results': step_results
-            })
+                        console.print(f" [red]✗ Step {step_num}: {step_title}[/red]")
+                        console.print(f" [dim]{details}[/dim]")
+
+                console.print()
+
+                # Store result
+                test_results.append({
+                    'title': test_name,
+                    'passed': test_passed,
+                    'file': test_file.name,
+                    'step_results': step_results
+                })
 
             except Exception as e:
-                logger.debug(f"Validation parsing failed: {e}")
-                console.print(f"[yellow]⚠ Warning: Could not parse validation results
-
-            else:
-                console.print(f"[red]✗ No agent executor available[/red]\n")
-                test_results, total_tests, passed_tests, failed_tests = _create_fallback_results(parsed_test_cases)
+                logger.debug(f"Validation parsing failed for {test_name}: {e}", exc_info=True)
+                console.print(f"[yellow]⚠ Warning: Could not parse validation results for {test_name}[/yellow]")
+                console.print(f"[yellow]Error: {str(e)}[/yellow]")
 
-
-
-
-
+                # Enhanced diagnostic output
+                _print_validation_diagnostics(validation_output)
+
+                # Generate fallback result using helper function
+                console.print(f"\n[yellow]🔄 Generating fallback validation result...[/yellow]")
+                fallback_result = _create_fallback_result_for_test(
+                    test_case,
+                    test_file,
+                    f'Validation failed - could not parse validator output: {str(e)}'
+                )
+                console.print(f"[dim]Created {len(fallback_result['step_results'])} fallback step results[/dim]\n")
+
+                test_results.append(fallback_result)
+                console.print()
+
+        except Exception as e:
+            logger.debug(f"Test execution failed for {test_name}: {e}", exc_info=True)
+            console.print(f"[red]✗ Test execution failed: {e}[/red]")
+
+            # Create fallback result using helper function
+            fallback_result = _create_fallback_result_for_test(
+                test_case,
+                test_file,
+                f'Test execution failed: {str(e)}'
+            )
+            test_results.append(fallback_result)
+            console.print()
+
+    # Cleanup: Close executor cache resources
+    _cleanup_executor_cache(executor_cache, "executor")
+    _cleanup_executor_cache(validation_executor_cache, "validation executor")
+
+    # Calculate totals
+    total_tests = len(test_results)
+    passed_tests = sum(1 for r in test_results if r['passed'])
+    failed_tests = total_tests - passed_tests
 
     # Generate summary report
     console.print(f"\n[bold]{'='*60}[/bold]")
```