@aj-archipelago/cortex 1.4.2 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/README.md +1 -0
  2. package/config.js +1 -1
  3. package/helper-apps/cortex-autogen2/.dockerignore +1 -0
  4. package/helper-apps/cortex-autogen2/Dockerfile +6 -10
  5. package/helper-apps/cortex-autogen2/Dockerfile.worker +2 -0
  6. package/helper-apps/cortex-autogen2/agents.py +203 -2
  7. package/helper-apps/cortex-autogen2/main.py +1 -1
  8. package/helper-apps/cortex-autogen2/pyproject.toml +12 -0
  9. package/helper-apps/cortex-autogen2/requirements.txt +14 -0
  10. package/helper-apps/cortex-autogen2/services/redis_publisher.py +1 -1
  11. package/helper-apps/cortex-autogen2/services/run_analyzer.py +1 -1
  12. package/helper-apps/cortex-autogen2/task_processor.py +431 -229
  13. package/helper-apps/cortex-autogen2/test_entity_fetcher.py +305 -0
  14. package/helper-apps/cortex-autogen2/tests/README.md +240 -0
  15. package/helper-apps/cortex-autogen2/tests/TEST_REPORT.md +342 -0
  16. package/helper-apps/cortex-autogen2/tests/__init__.py +8 -0
  17. package/helper-apps/cortex-autogen2/tests/analysis/__init__.py +1 -0
  18. package/helper-apps/cortex-autogen2/tests/analysis/improvement_suggester.py +224 -0
  19. package/helper-apps/cortex-autogen2/tests/analysis/trend_analyzer.py +211 -0
  20. package/helper-apps/cortex-autogen2/tests/cli/__init__.py +1 -0
  21. package/helper-apps/cortex-autogen2/tests/cli/run_tests.py +296 -0
  22. package/helper-apps/cortex-autogen2/tests/collectors/__init__.py +1 -0
  23. package/helper-apps/cortex-autogen2/tests/collectors/log_collector.py +252 -0
  24. package/helper-apps/cortex-autogen2/tests/collectors/progress_collector.py +182 -0
  25. package/helper-apps/cortex-autogen2/tests/conftest.py +15 -0
  26. package/helper-apps/cortex-autogen2/tests/database/__init__.py +1 -0
  27. package/helper-apps/cortex-autogen2/tests/database/repository.py +501 -0
  28. package/helper-apps/cortex-autogen2/tests/database/schema.sql +108 -0
  29. package/helper-apps/cortex-autogen2/tests/evaluators/__init__.py +1 -0
  30. package/helper-apps/cortex-autogen2/tests/evaluators/llm_scorer.py +294 -0
  31. package/helper-apps/cortex-autogen2/tests/evaluators/prompts.py +250 -0
  32. package/helper-apps/cortex-autogen2/tests/evaluators/wordcloud_validator.py +168 -0
  33. package/helper-apps/cortex-autogen2/tests/metrics/__init__.py +1 -0
  34. package/helper-apps/cortex-autogen2/tests/metrics/collector.py +155 -0
  35. package/helper-apps/cortex-autogen2/tests/orchestrator.py +576 -0
  36. package/helper-apps/cortex-autogen2/tests/test_cases.yaml +279 -0
  37. package/helper-apps/cortex-autogen2/tests/test_data.db +0 -0
  38. package/helper-apps/cortex-autogen2/tests/utils/__init__.py +3 -0
  39. package/helper-apps/cortex-autogen2/tests/utils/connectivity.py +112 -0
  40. package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +74 -24
  41. package/helper-apps/cortex-autogen2/tools/entity_api_registry.json +38 -0
  42. package/helper-apps/cortex-autogen2/tools/file_tools.py +1 -1
  43. package/helper-apps/cortex-autogen2/tools/search_tools.py +436 -238
  44. package/helper-apps/cortex-file-handler/package-lock.json +2 -2
  45. package/helper-apps/cortex-file-handler/package.json +1 -1
  46. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +4 -5
  47. package/helper-apps/cortex-file-handler/src/blobHandler.js +36 -144
  48. package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +5 -3
  49. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +34 -1
  50. package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +22 -0
  51. package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +28 -1
  52. package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +29 -4
  53. package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +11 -0
  54. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +1 -1
  55. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +3 -2
  56. package/helper-apps/cortex-file-handler/tests/checkHashShortLived.test.js +8 -1
  57. package/helper-apps/cortex-file-handler/tests/containerConversionFlow.test.js +5 -2
  58. package/helper-apps/cortex-file-handler/tests/containerNameParsing.test.js +14 -7
  59. package/helper-apps/cortex-file-handler/tests/containerParameterFlow.test.js +5 -2
  60. package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +31 -19
  61. package/package.json +1 -1
  62. package/server/modelExecutor.js +4 -0
  63. package/server/plugins/claude4VertexPlugin.js +540 -0
  64. package/server/plugins/openAiWhisperPlugin.js +43 -2
  65. package/tests/integration/rest/vendors/claude_streaming.test.js +121 -0
  66. package/tests/unit/plugins/claude4VertexPlugin.test.js +462 -0
  67. package/tests/unit/plugins/claude4VertexToolConversion.test.js +413 -0
  68. package/helper-apps/cortex-autogen/.funcignore +0 -8
  69. package/helper-apps/cortex-autogen/Dockerfile +0 -10
  70. package/helper-apps/cortex-autogen/OAI_CONFIG_LIST +0 -6
  71. package/helper-apps/cortex-autogen/agents.py +0 -493
  72. package/helper-apps/cortex-autogen/agents_extra.py +0 -14
  73. package/helper-apps/cortex-autogen/config.py +0 -18
  74. package/helper-apps/cortex-autogen/data_operations.py +0 -29
  75. package/helper-apps/cortex-autogen/function_app.py +0 -44
  76. package/helper-apps/cortex-autogen/host.json +0 -15
  77. package/helper-apps/cortex-autogen/main.py +0 -38
  78. package/helper-apps/cortex-autogen/prompts.py +0 -196
  79. package/helper-apps/cortex-autogen/prompts_extra.py +0 -5
  80. package/helper-apps/cortex-autogen/requirements.txt +0 -9
  81. package/helper-apps/cortex-autogen/search.py +0 -85
  82. package/helper-apps/cortex-autogen/test.sh +0 -40
  83. package/helper-apps/cortex-autogen/tools/sasfileuploader.py +0 -66
  84. package/helper-apps/cortex-autogen/utils.py +0 -88
  85. package/helper-apps/cortex-autogen2/DigiCertGlobalRootCA.crt.pem +0 -22
  86. package/helper-apps/cortex-autogen2/poetry.lock +0 -3652
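
The bulk of this release is a new self-testing harness under tests/ (items 14-39), a Claude 4 Vertex plugin with tests (items 63 and 65-67), and the removal of the legacy cortex-autogen helper app (items 68-84). The largest new file, tests/orchestrator.py (item 35), is shown in full below. It submits tasks to Azure Queue storage as base64-encoded JSON; the following is a minimal sketch of the payload its _submit_task method builds, using only the Python standard library (the task text is an invented example):

import base64
import json
import uuid

# Field names mirror _submit_task below: "request_id", "message_id", "content".
payload = {
    "request_id": f"test_demo_{uuid.uuid4().hex[:8]}",
    "message_id": str(uuid.uuid4()),
    "content": "Summarize yesterday's top stories",  # hypothetical task text
}
encoded = base64.b64encode(json.dumps(payload).encode("utf-8")).decode("utf-8")
# `encoded` is what queue_client.send_message() receives.

Note that the orchestrator then tracks progress by the Azure Queue message ID returned from send_message, not by its own request_id.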
package/helper-apps/cortex-autogen2/tests/orchestrator.py
@@ -0,0 +1,576 @@
+ """
+ Test orchestrator for automating Cortex AutoGen2 test execution.
+
+ Coordinates task submission, data collection, evaluation, and storage.
+ """
+
+ import os
+ import sys
+ import yaml
+ import uuid
+ import json
+ import base64
+ import asyncio
+ import logging
+ from datetime import datetime
+ from typing import Dict, List, Optional
+ from pathlib import Path
+
+ # Add parent directory to path to import project modules
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+
+ from azure.storage.queue import QueueClient
+ from tests.database.repository import TestRepository
+ from tests.collectors.progress_collector import ProgressCollector
+ from tests.collectors.log_collector import LogCollector
+ from tests.evaluators.llm_scorer import LLMEvaluator
+ from tests.metrics.collector import MetricsCollector
+ from tests.utils.connectivity import check_ajsql_connectivity
+
+ logger = logging.getLogger(__name__)
+
+
+ class TestOrchestrator:
+     """Orchestrates end-to-end test execution and evaluation."""
+
+     def __init__(
+         self,
+         db_path: Optional[str] = None,
+         redis_url: Optional[str] = None,
+         redis_channel: Optional[str] = None
+     ):
+         """
+         Initialize the test orchestrator.
+
+         Args:
+             db_path: Path to SQLite database (defaults to tests/database/test_results.db)
+             redis_url: Redis connection URL (defaults to env var)
+             redis_channel: Redis channel name (defaults to env var)
+         """
+         self.db = TestRepository(db_path)
+         self.evaluator = LLMEvaluator()
+
+         self.redis_url = redis_url or os.getenv("REDIS_CONNECTION_STRING", "redis://localhost:6379")
+         self.redis_channel = redis_channel or os.getenv("REDIS_CHANNEL", "cortex_progress")
+
+         self.azure_queue_conn_str = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
+         self.azure_queue_name = os.getenv("AZURE_QUEUE_NAME", "cortex-tasks")
+
+         if not self.azure_queue_conn_str:
+             raise ValueError("AZURE_STORAGE_CONNECTION_STRING environment variable must be set")
+
+         logger.info("🎬 Test Orchestrator initialized")
+         logger.info(f" Redis: {self.redis_url}")
+         logger.info(f" Queue: {self.azure_queue_name}")
+
+     def load_test_cases(self, test_cases_path: Optional[str] = None) -> List[Dict]:
+         """
+         Load test cases from YAML file.
+
+         Args:
+             test_cases_path: Path to test_cases.yaml (defaults to tests/test_cases.yaml)
+
+         Returns:
+             List of test case dictionaries
+         """
+         if test_cases_path is None:
+             test_cases_path = Path(__file__).parent / "test_cases.yaml"
+
+         with open(test_cases_path, 'r') as f:
+             data = yaml.safe_load(f)
+
+         test_cases = data.get('test_cases', [])
+         logger.info(f"📋 Loaded {len(test_cases)} test cases")
+
+         return test_cases
+
+     async def run_test(self, test_case: Dict) -> Dict:
+         """
+         Run a single test case end-to-end.
+
+         Args:
+             test_case: Test case dictionary from YAML
+
+         Returns:
+             Complete test results including scores and metrics
+         """
+         test_case_id = test_case['id']
+         task_description = test_case['task']
+         timeout = test_case.get('timeout_seconds', 300)
+         requires_ajsql = test_case.get('requires_ajsql', False)
+
+         logger.info(f"\n{'='*80}")
+         logger.info(f"🧪 Running Test: {test_case['name']}")
+         logger.info(f" ID: {test_case_id}")
+         logger.info(f" Timeout: {timeout}s")
+         if requires_ajsql:
+             logger.info(f" Requires AJ SQL: Yes")
+         logger.info(f"{'='*80}\n")
+
+         # Check AJ SQL connectivity if required
+         if requires_ajsql:
+             logger.info("🔍 Checking AJ SQL database connectivity...")
+             is_accessible, message = check_ajsql_connectivity()
+
+             if not is_accessible:
+                 logger.warning(f"⚠️ AJ SQL database not accessible: {message}")
+                 logger.warning(f"⏭️ SKIPPING test {test_case_id} - requires AJ SQL database access")
+
+                 return {
+                     'test_case_id': test_case_id,
+                     'status': 'skipped',
+                     'skip_reason': f'AJ SQL database not accessible: {message}',
+                     'message': 'Test skipped due to missing database access (likely IP restriction)'
+                 }
+             else:
+                 # Note: Success message already logged by check_ajsql_connectivity()
+                 pass
+
+         # Generate unique request ID
+         request_id = f"test_{test_case_id}_{uuid.uuid4().hex[:8]}"
+
+         # Create test run record in database
+         test_run_id = self.db.create_test_run(
+             test_case_id=test_case_id,
+             task_description=task_description,
+             request_id=request_id
+         )
+
+         logger.info(f"📝 Test run created: ID={test_run_id}, Request={request_id}")
+
+         # Start collectors
+         progress_collector = ProgressCollector(self.redis_url, self.redis_channel)
+         log_collector = LogCollector()
+
+         # Submit task to Azure Queue
+         try:
+             azure_message_id = await self._submit_task(request_id, task_description)
+             logger.info(f"✅ Task submitted to queue (Azure message ID: {azure_message_id})")
+         except Exception as e:
+             logger.error(f"❌ Failed to submit task: {e}")
+             self.db.update_test_run_status(test_run_id, 'failed', error_message=str(e))
+             return {'test_run_id': test_run_id, 'status': 'failed', 'error': str(e)}
+
+         # Collect data concurrently
+         # NOTE: Use Azure Queue message ID, not our custom request_id!
+         # The system publishes progress updates with the Azure Queue message ID.
+         try:
+             logger.info(f"📡 Starting data collection...")
+
+             # Run collectors concurrently - use Azure message ID for progress tracking!
+             progress_task = asyncio.create_task(
+                 progress_collector.start_collecting(azure_message_id, timeout=timeout)
+             )
+             log_task = asyncio.create_task(
+                 log_collector.start_collecting(azure_message_id, timeout=timeout)
+             )
+
+             # Wait for both to complete
+             progress_updates, logs = await asyncio.gather(progress_task, log_task)
+
+             logger.info(f"✅ Data collection complete")
+             logger.info(f" Progress updates: {len(progress_updates)}")
+             logger.info(f" Log entries: {len(logs)}")
+
+         except Exception as e:
+             logger.error(f"❌ Data collection error: {e}", exc_info=True)
+             self.db.update_test_run_status(test_run_id, 'failed', error_message=str(e))
+             return {'test_run_id': test_run_id, 'status': 'failed', 'error': str(e)}
+
+         # Store progress updates and logs in database
+         for update in progress_updates:
+             self.db.add_progress_update(
+                 test_run_id=test_run_id,
+                 timestamp=datetime.fromisoformat(update['timestamp']),
+                 progress=update.get('progress', 0.0),
+                 info=update.get('info', ''),
+                 is_final=update.get('data') is not None
+             )
+
+         for log_entry in logs:
+             self.db.add_log(
+                 test_run_id=test_run_id,
+                 timestamp=datetime.fromisoformat(log_entry['timestamp']),
+                 level=log_entry.get('level', 'INFO'),
+                 agent=log_entry.get('agent'),
+                 message=log_entry.get('message', '')
+             )
+
+         # Get final result
+         final_result = progress_collector.get_final_result()
+         final_response_text = ""  # Initialize to ensure it's always defined
+
+         # Save final response to database if available
+         if final_result:
+             try:
+                 # final_result can be either a string or a dict
+                 if isinstance(final_result, str):
+                     final_response_text = final_result
+                 elif isinstance(final_result, dict):
+                     # Try to extract text from dict (could have 'message', 'text', or other fields)
+                     final_response_text = final_result.get('message') or final_result.get('text') or str(final_result)
+                 else:
+                     final_response_text = str(final_result)
+
+                 self.db.save_final_response(test_run_id, final_response_text)
+                 logger.info(f"💾 Saved final response to database ({len(final_response_text)} chars)")
+
+                 # Log the final response content for visibility during test runs
+                 logger.info(f"\n📝 Final Response:")
+                 logger.info(f"{final_response_text}")
+             except Exception as e:
+                 logger.warning(f"⚠️ Failed to save final response to database: {e}")
+                 final_response_text = f"Error saving final response: {str(e)}"
+
+         # Update test run status
+         status = 'completed' if len(progress_updates) > 0 else 'timeout'
+         self.db.update_test_run_status(test_run_id, status)
+
+         test_run_data = self.db.get_test_run(test_run_id)
+
+         # Extract files from final result if available
+         files_created = []
+         if final_result and isinstance(final_result, dict):
+             deliverables = final_result.get('deliverables', [])
+             for item in deliverables:
+                 if isinstance(item, dict):
+                     self.db.add_file(
+                         test_run_id=test_run_id,
+                         file_path=item.get('path', 'unknown'),
+                         file_type=item.get('type', 'unknown'),
+                         sas_url=item.get('sas_url')
+                     )
+             files_created = self.db.get_files(test_run_id)
+
+         # Calculate metrics
+         logger.info(f"\n📊 Calculating metrics...")
+         metrics = MetricsCollector.calculate_metrics(
+             test_run_data,
+             progress_updates,
+             logs,
+             files_created
+         )
+
+         self.db.save_metrics(test_run_id, **metrics)
+
+         # Run LLM evaluation
+         logger.info(f"\n🤖 Running LLM evaluation...")
+
+         test_summary = {
+             'duration_seconds': test_run_data.get('duration_seconds', 0),
+             'total_progress_updates': metrics.get('total_progress_updates', 0),
+             'errors_count': metrics.get('errors_count', 0),
+             'warnings_count': metrics.get('warnings_count', 0)
+         }
+
+         try:
+             progress_eval, output_eval = await self.evaluator.evaluate_test_run(
+                 task=task_description,
+                 progress_updates=progress_updates,
+                 final_result=final_result,
+                 files_created=files_created,
+                 test_summary=test_summary
+             )
+
+             # Store evaluation in database
+             self.db.save_evaluation(
+                 test_run_id=test_run_id,
+                 progress_score=progress_eval['score'],
+                 output_score=output_eval['score'],
+                 progress_reasoning=progress_eval['reasoning'],
+                 output_reasoning=output_eval['reasoning'],
+                 progress_issues=progress_eval.get('issues', []),
+                 output_strengths=output_eval.get('strengths', []),
+                 output_weaknesses=output_eval.get('weaknesses', [])
+             )
+
+             # Weighted: 80% output quality, 20% progress reporting (output matters most!)
+             overall = int((output_eval['score'] * 0.8) + (progress_eval['score'] * 0.2))
+
+             # Make evaluation results highly visible during test runs
+             logger.info(f"\n**Overall Score:** {overall}/100 ✅")
+             logger.info(f"**Progress Score:** {progress_eval['score']}/100")
+             logger.info(f"**Output Score:** {output_eval['score']}/100")
+             logger.info(f"**Duration:** {test_run_data.get('duration_seconds', 0):.1f}s")
+
+             logger.info(f"\n**Progress Evaluation:**")
+             logger.info(f"{progress_eval['reasoning']}")
+
+             logger.info(f"\n**Output Evaluation:**")
+             logger.info(f"{output_eval['reasoning']}")
+
+         except Exception as e:
+             logger.error(f"❌ Evaluation error: {e}", exc_info=True)
+             progress_eval = {'score': 0, 'reasoning': f"Evaluation failed: {str(e)}", 'issues': []}
+             output_eval = {'score': 0, 'reasoning': f"Evaluation failed: {str(e)}", 'strengths': [], 'weaknesses': []}
+
+         # Compile results
+         results = {
+             'test_run_id': test_run_id,
+             'test_case_id': test_case_id,
+             'request_id': request_id,
+             'status': status,
+             'duration_seconds': test_run_data.get('duration_seconds', 0),
+             'progress_updates_count': len(progress_updates),
+             'logs_count': len(logs),
+             'files_created_count': len(files_created),
+             'final_response': final_response_text,
+             'metrics': metrics,
+             'progress_evaluation': progress_eval,
+             'output_evaluation': output_eval,
+             'progress_score': progress_eval['score'],
+             'output_score': output_eval['score'],
+             'overall_score': int((output_eval['score'] * 0.8) + (progress_eval['score'] * 0.2))
+         }
+
+         logger.info(f"\n{'='*80}")
+         logger.info(f"✅ Test Complete: {test_case['name']}")
+         logger.info(f"{'='*80}\n")
+
+         return results
+
+     async def run_all_tests(self, test_cases_path: Optional[str] = None) -> List[Dict]:
+         """
+         Run all test cases sequentially.
+
+         Args:
+             test_cases_path: Path to test_cases.yaml
+
+         Returns:
+             List of test results
+         """
+         test_cases = self.load_test_cases(test_cases_path)
+         results = []
+
+         logger.info(f"\n🚀 Running {len(test_cases)} test cases...\n")
+
+         for i, test_case in enumerate(test_cases, 1):
+             logger.info(f"\n{'#'*80}")
+
+             # Show progress summary for completed tests
+             if results:
+                 completed_count = len(results)
+                 passed = sum(1 for r in results if r.get('overall_score', 0) > 80)
+                 avg_score = sum(r.get('overall_score', 0) for r in results) / completed_count
+
+                 # Calculate average progress and output scores
+                 avg_progress = sum(r.get('progress_score', 0) for r in results) / completed_count
+                 avg_output = sum(r.get('output_score', 0) for r in results) / completed_count
+
+                 logger.info(f"# Progress: {completed_count} completed | {passed} passed (>80) | Avg: {avg_score:.1f}/100")
+                 logger.info(f"# Scores - Progress: {avg_progress:.1f}/100 | Output: {avg_output:.1f}/100 | Overall: {avg_score:.1f}/100")
+
+             logger.info(f"# Test {i}/{len(test_cases)}: {test_case['name']}")
+             logger.info(f"{'#'*80}\n")
+
+             result = await self.run_test(test_case)
+             results.append(result)
+
+         # Print summary
+         self._print_summary(results)
+
+         # Generate and save report
+         logger.info(f"📄 Generating test report...")
+         report = self._generate_test_report(results, test_cases)
+
+         # Save report to file with timestamp
+         from datetime import datetime
+         timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+         report_path = Path(__file__).parent.parent / f"TEST_RUN_RESULTS_{timestamp}.md"
+
+         with open(report_path, 'w') as f:
+             f.write(report)
+
+         logger.info(f"📄 Test report saved to: {report_path}")
+         logger.info(f" You can review detailed results and final messages in this file.\n")
+
+         return results
+
+     async def _submit_task(self, request_id: str, task: str) -> str:
+         """Submit task to Azure Queue and return the Azure Queue message ID."""
+         queue_client = QueueClient.from_connection_string(
+             self.azure_queue_conn_str,
+             self.azure_queue_name
+         )
+
+         # Match send_task.py format exactly: "content" and "request_id"
+         message_data = {
+             "request_id": request_id,
+             "message_id": str(uuid.uuid4()),
+             "content": task
+         }
+
+         message_json = json.dumps(message_data)
+         message_b64 = base64.b64encode(message_json.encode('utf-8')).decode('utf-8')
+
+         result = queue_client.send_message(message_b64)
+         queue_client.close()
+
+         # Return the Azure Queue message ID - this is what the system uses for progress updates!
+         return result.id
+
+     def _print_summary(self, results: List[Dict]):
+         """Print summary of all test results."""
+         logger.info(f"\n\n{'='*80}")
+         logger.info(f"📊 TEST SUMMARY")
+         logger.info(f"{'='*80}\n")
+
+         total_tests = len(results)
+         skipped = sum(1 for r in results if r.get('status') == 'skipped')
+         completed_results = [r for r in results if r.get('status') != 'skipped']
+         completed_count = len(completed_results)
+
+         passed = sum(1 for r in completed_results if r.get('overall_score', 0) >= 70)
+         failed = completed_count - passed
+
+         total_progress_score = sum(r.get('progress_evaluation', {}).get('score', 0) for r in completed_results)
+         total_output_score = sum(r.get('output_evaluation', {}).get('score', 0) for r in completed_results)
+         total_overall_score = sum(r.get('overall_score', 0) for r in completed_results)
+
+         avg_progress = total_progress_score / completed_count if completed_count > 0 else 0
+         avg_output = total_output_score / completed_count if completed_count > 0 else 0
+         avg_overall = total_overall_score / completed_count if completed_count > 0 else 0
+
+         logger.info(f"Total Tests: {total_tests}")
+         logger.info(f"Completed: {completed_count}")
+         if skipped > 0:
+             logger.info(f"Skipped: {skipped} (AJ SQL database not accessible)")
+         logger.info(f"Passed (≥70): {passed}")
+         logger.info(f"Failed (<70): {failed}")
+         logger.info(f"")
+
+         if completed_count > 0:
+             logger.info(f"Average Scores:")
+             logger.info(f" Progress: {avg_progress:.1f}/100")
+             logger.info(f" Output: {avg_output:.1f}/100")
+             logger.info(f" Overall: {avg_overall:.1f}/100")
+
+         logger.info(f"\n{'='*80}\n")
+
+     def _generate_test_report(self, results: List[Dict], test_cases: List[Dict]) -> str:
+         """Generate a comprehensive markdown test report."""
+         from datetime import datetime
+
+         # Calculate summary stats
+         total_tests = len(results)
+         skipped = sum(1 for r in results if r.get('status') == 'skipped')
+         completed_results = [r for r in results if r.get('status') != 'skipped']
+         completed_count = len(completed_results)
+
+         passed = sum(1 for r in completed_results if r.get('overall_score', 0) >= 70)
+         failed = completed_count - passed
+
+         total_progress_score = sum(r.get('progress_evaluation', {}).get('score', 0) for r in completed_results)
+         total_output_score = sum(r.get('output_evaluation', {}).get('score', 0) for r in completed_results)
+         total_overall_score = sum(r.get('overall_score', 0) for r in completed_results)
+
+         avg_progress = total_progress_score / completed_count if completed_count > 0 else 0
+         avg_output = total_output_score / completed_count if completed_count > 0 else 0
+         avg_overall = total_overall_score / completed_count if completed_count > 0 else 0
+
+         # Build markdown report
+         report = f"""# Test Run Results
+ Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+ ## Executive Summary
+
+ **{'✅ ALL TESTS PASSED' if failed == 0 and completed_count == total_tests else '⚠️ SOME TESTS FAILED'}! Average Overall Score: {avg_overall:.1f}/100**
+
+ ## Test Results Summary
+
+ | Test # | Test Name | Score | Status | Notes |
+ |--------|-----------|-------|--------|-------|
+ """
+
+         # Add table rows for each test
+         for i, result in enumerate(results, 1):
+             test_case_id = result.get('test_case_id', 'unknown')
+             test_case = next((tc for tc in test_cases if tc['id'] == test_case_id), {})
+             test_name = test_case.get('name', 'Unknown Test')
+
+             if result.get('status') == 'skipped':
+                 report += f"| {i} | {test_name} | N/A | ⏭️ SKIPPED | {result.get('skip_reason', 'Unknown')} |\n"
+             else:
+                 score = result.get('overall_score', 0)
+                 status = '✅ PASS' if score >= 70 else '❌ FAIL'
+                 report += f"| {i} | {test_name} | **{score}/100** | {status} | |\n"
+
+         report += f"""
+ **Average Score: {avg_overall:.1f}/100** (Target: ≥70/100) {'✅' if avg_overall >= 70 else '❌'}
+
+ ## Detailed Results
+
+ """
+
+         # Add detailed results for each test
+         for i, result in enumerate(results, 1):
+             test_case_id = result.get('test_case_id', 'unknown')
+             test_case = next((tc for tc in test_cases if tc['id'] == test_case_id), {})
+             test_name = test_case.get('name', 'Unknown Test')
+
+             if result.get('status') == 'skipped':
+                 report += f"""### Test {i}: {test_name} ⏭️ SKIPPED
+
+ **Reason:** {result.get('skip_reason', 'Unknown')}
+
+ ---
+
+ """
+                 continue
+
+             score = result.get('overall_score', 0)
+             progress_score = result.get('progress_evaluation', {}).get('score', 0)
+             output_score = result.get('output_evaluation', {}).get('score', 0)
+             duration = result.get('duration_seconds', 0)
+
+             report += f"""### Test {i}: {test_name}
+
+ **Overall Score:** {score}/100 {'✅' if score >= 70 else '❌'}
+ **Progress Score:** {progress_score}/100
+ **Output Score:** {output_score}/100
+ **Duration:** {duration:.1f}s
+
+ **Progress Evaluation:**
+ {result.get('progress_evaluation', {}).get('reasoning', 'N/A')}
+
+ **Output Evaluation:**
+ {result.get('output_evaluation', {}).get('reasoning', 'N/A')}
+
+ """
+
+             # Add final response if available
+             test_run_id = result.get('test_run_id')
+             if test_run_id:
+                 test_run = self.db.get_test_run(test_run_id)
+                 final_response = test_run.get('final_response') if test_run else None
+
+                 if final_response:
+                     report += f"""**Final Response:**
+ ```
+ {final_response}
+ ```
+
+ """
+
+             report += "---\n\n"
+
+         # Add summary metrics
+         report += f"""## Performance Summary
+
+ - **Total Tests:** {total_tests}
+ - **Completed:** {completed_count}
+ - **Skipped:** {skipped}
+ - **Passed (≥70):** {passed}
+ - **Failed (<70):** {failed}
+
+ **Average Scores:**
+ - Progress: {avg_progress:.1f}/100
+ - Output: {avg_output:.1f}/100
+ - Overall: {avg_overall:.1f}/100
+
+ ---
+
+ *Report generated automatically by Cortex AutoGen2 Test Orchestrator*
+ """
+
+         return report
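
For reference, a minimal sketch of driving the orchestrator end-to-end; this assumes AZURE_STORAGE_CONNECTION_STRING is set (REDIS_CONNECTION_STRING, REDIS_CHANNEL, and AZURE_QUEUE_NAME fall back to defaults) and that it runs from the cortex-autogen2 directory so the tests package is importable. The shipped entry point is tests/cli/run_tests.py; this standalone driver is only illustrative:

import asyncio
import logging

from tests.orchestrator import TestOrchestrator

logging.basicConfig(level=logging.INFO)


async def main():
    # Connection settings are read from environment variables (see __init__ above).
    orchestrator = TestOrchestrator()

    # Loads tests/test_cases.yaml, runs every case sequentially, prints a summary,
    # and writes TEST_RUN_RESULTS_<timestamp>.md to the cortex-autogen2 root.
    results = await orchestrator.run_all_tests()

    # Each completed result carries progress/output scores and an overall score
    # weighted 80% output, 20% progress.
    completed = [r for r in results if r.get('status') != 'skipped']
    passed = sum(1 for r in completed if r.get('overall_score', 0) >= 70)
    print(f"{passed}/{len(completed)} completed runs scored >= 70")


if __name__ == "__main__":
    asyncio.run(main())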