@aj-archipelago/cortex 1.4.2 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/README.md +1 -0
  2. package/config.js +1 -1
  3. package/helper-apps/cortex-autogen2/.dockerignore +1 -0
  4. package/helper-apps/cortex-autogen2/Dockerfile +6 -10
  5. package/helper-apps/cortex-autogen2/Dockerfile.worker +2 -0
  6. package/helper-apps/cortex-autogen2/agents.py +203 -2
  7. package/helper-apps/cortex-autogen2/main.py +1 -1
  8. package/helper-apps/cortex-autogen2/pyproject.toml +12 -0
  9. package/helper-apps/cortex-autogen2/requirements.txt +14 -0
  10. package/helper-apps/cortex-autogen2/services/redis_publisher.py +1 -1
  11. package/helper-apps/cortex-autogen2/services/run_analyzer.py +1 -1
  12. package/helper-apps/cortex-autogen2/task_processor.py +431 -229
  13. package/helper-apps/cortex-autogen2/test_entity_fetcher.py +305 -0
  14. package/helper-apps/cortex-autogen2/tests/README.md +240 -0
  15. package/helper-apps/cortex-autogen2/tests/TEST_REPORT.md +342 -0
  16. package/helper-apps/cortex-autogen2/tests/__init__.py +8 -0
  17. package/helper-apps/cortex-autogen2/tests/analysis/__init__.py +1 -0
  18. package/helper-apps/cortex-autogen2/tests/analysis/improvement_suggester.py +224 -0
  19. package/helper-apps/cortex-autogen2/tests/analysis/trend_analyzer.py +211 -0
  20. package/helper-apps/cortex-autogen2/tests/cli/__init__.py +1 -0
  21. package/helper-apps/cortex-autogen2/tests/cli/run_tests.py +296 -0
  22. package/helper-apps/cortex-autogen2/tests/collectors/__init__.py +1 -0
  23. package/helper-apps/cortex-autogen2/tests/collectors/log_collector.py +252 -0
  24. package/helper-apps/cortex-autogen2/tests/collectors/progress_collector.py +182 -0
  25. package/helper-apps/cortex-autogen2/tests/conftest.py +15 -0
  26. package/helper-apps/cortex-autogen2/tests/database/__init__.py +1 -0
  27. package/helper-apps/cortex-autogen2/tests/database/repository.py +501 -0
  28. package/helper-apps/cortex-autogen2/tests/database/schema.sql +108 -0
  29. package/helper-apps/cortex-autogen2/tests/evaluators/__init__.py +1 -0
  30. package/helper-apps/cortex-autogen2/tests/evaluators/llm_scorer.py +294 -0
  31. package/helper-apps/cortex-autogen2/tests/evaluators/prompts.py +250 -0
  32. package/helper-apps/cortex-autogen2/tests/evaluators/wordcloud_validator.py +168 -0
  33. package/helper-apps/cortex-autogen2/tests/metrics/__init__.py +1 -0
  34. package/helper-apps/cortex-autogen2/tests/metrics/collector.py +155 -0
  35. package/helper-apps/cortex-autogen2/tests/orchestrator.py +576 -0
  36. package/helper-apps/cortex-autogen2/tests/test_cases.yaml +279 -0
  37. package/helper-apps/cortex-autogen2/tests/test_data.db +0 -0
  38. package/helper-apps/cortex-autogen2/tests/utils/__init__.py +3 -0
  39. package/helper-apps/cortex-autogen2/tests/utils/connectivity.py +112 -0
  40. package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +74 -24
  41. package/helper-apps/cortex-autogen2/tools/entity_api_registry.json +38 -0
  42. package/helper-apps/cortex-autogen2/tools/file_tools.py +1 -1
  43. package/helper-apps/cortex-autogen2/tools/search_tools.py +436 -238
  44. package/helper-apps/cortex-file-handler/package-lock.json +2 -2
  45. package/helper-apps/cortex-file-handler/package.json +1 -1
  46. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +4 -5
  47. package/helper-apps/cortex-file-handler/src/blobHandler.js +36 -144
  48. package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +5 -3
  49. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +34 -1
  50. package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +22 -0
  51. package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +28 -1
  52. package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +29 -4
  53. package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +11 -0
  54. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +1 -1
  55. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +3 -2
  56. package/helper-apps/cortex-file-handler/tests/checkHashShortLived.test.js +8 -1
  57. package/helper-apps/cortex-file-handler/tests/containerConversionFlow.test.js +5 -2
  58. package/helper-apps/cortex-file-handler/tests/containerNameParsing.test.js +14 -7
  59. package/helper-apps/cortex-file-handler/tests/containerParameterFlow.test.js +5 -2
  60. package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +31 -19
  61. package/package.json +1 -1
  62. package/server/modelExecutor.js +4 -0
  63. package/server/plugins/claude4VertexPlugin.js +540 -0
  64. package/server/plugins/openAiWhisperPlugin.js +43 -2
  65. package/tests/integration/rest/vendors/claude_streaming.test.js +121 -0
  66. package/tests/unit/plugins/claude4VertexPlugin.test.js +462 -0
  67. package/tests/unit/plugins/claude4VertexToolConversion.test.js +413 -0
  68. package/helper-apps/cortex-autogen/.funcignore +0 -8
  69. package/helper-apps/cortex-autogen/Dockerfile +0 -10
  70. package/helper-apps/cortex-autogen/OAI_CONFIG_LIST +0 -6
  71. package/helper-apps/cortex-autogen/agents.py +0 -493
  72. package/helper-apps/cortex-autogen/agents_extra.py +0 -14
  73. package/helper-apps/cortex-autogen/config.py +0 -18
  74. package/helper-apps/cortex-autogen/data_operations.py +0 -29
  75. package/helper-apps/cortex-autogen/function_app.py +0 -44
  76. package/helper-apps/cortex-autogen/host.json +0 -15
  77. package/helper-apps/cortex-autogen/main.py +0 -38
  78. package/helper-apps/cortex-autogen/prompts.py +0 -196
  79. package/helper-apps/cortex-autogen/prompts_extra.py +0 -5
  80. package/helper-apps/cortex-autogen/requirements.txt +0 -9
  81. package/helper-apps/cortex-autogen/search.py +0 -85
  82. package/helper-apps/cortex-autogen/test.sh +0 -40
  83. package/helper-apps/cortex-autogen/tools/sasfileuploader.py +0 -66
  84. package/helper-apps/cortex-autogen/utils.py +0 -88
  85. package/helper-apps/cortex-autogen2/DigiCertGlobalRootCA.crt.pem +0 -22
  86. package/helper-apps/cortex-autogen2/poetry.lock +0 -3652
@@ -0,0 +1,305 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Isolation test script for fetch_entity_images() function
4
+ Tests multi-tier entity image fetcher with various scenarios
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import os
10
+ import sys
11
+ import tempfile
12
+ import shutil
13
+ from pathlib import Path
14
+
15
+ # Add project root to path
16
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
17
+
18
+ from tools.search_tools import fetch_entity_images
19
+
20
+
21
class TestRunner:
    """Drive the isolation scenarios for ``fetch_entity_images``.

    Every scenario runs against one throw-away working directory created in
    ``__init__``. Outcomes are accumulated in ``self.results`` (a list of
    dicts with ``name``, ``success`` and either ``result`` or ``error``) and
    reported by ``print_summary``.
    """

    # This class lives in a ``test_*.py`` module and its name starts with
    # ``Test``, so pytest would try to collect it and warn about the
    # ``__init__`` constructor. Opt out of collection explicitly.
    __test__ = False

    def __init__(self):
        self.work_dir = tempfile.mkdtemp(prefix="entity_test_")
        self.results = []
        print(f"🔬 Test work directory: {self.work_dir}\n")

    def cleanup(self):
        """Remove the temporary working directory if it still exists."""
        if os.path.exists(self.work_dir):
            shutil.rmtree(self.work_dir)
            print("\n🧹 Cleaned up test directory")

    @staticmethod
    def _print_result(result):
        """Print the aggregate stats and the per-entity details of one result."""
        stats = result.get('stats', {})
        print("📊 RESULT:")
        print(f" Success: {result.get('success', False)}")
        print(f" Total images: {stats.get('total_images', 0)}")
        print(f" API success: {stats.get('api_success', 0)}")
        print(f" API failed: {stats.get('api_failed', 0)}")
        print(f" Web search used: {stats.get('web_search_used', 0)}")

        if 'results' in result:
            per_entity = result['results']

            print("\n Per-entity results:")
            for entity_data in per_entity:
                entity_name = entity_data.get('entity', 'unknown')
                img_count = len(entity_data.get('images', []))
                method = entity_data.get('method', 'unknown')
                api_used = entity_data.get('api_used', 'N/A')
                print(f" {entity_name}: {img_count} images via {method} ({api_used})")

            print("\n Sample images:")
            for entity_data in per_entity:
                images = entity_data.get('images', [])
                if images:
                    entity_name = entity_data.get('entity', 'unknown')
                    filename = os.path.basename(images[0].get('local_path', 'N/A'))
                    print(f" {entity_name}: {filename}")

    async def run_test(self, name: str, entities: list, entity_type: str, count_per_entity: int = 1, force_web_search: bool = False):
        """Run a single scenario and record its outcome in ``self.results``.

        Args:
            name: Human-readable scenario name; ``print_summary`` matches
                substrings of it when checking the success criteria.
            entities: Entity names forwarded to ``fetch_entity_images``.
            entity_type: Entity category forwarded to ``fetch_entity_images``.
            count_per_entity: Images requested per entity.
            force_web_search: Forwarded unchanged to ``fetch_entity_images``.

        Returns:
            The decoded JSON result, or ``None`` when the call or the
            reporting raised an exception.
        """
        print("=" * 80)
        print(f"🧪 TEST: {name}")
        print("=" * 80)
        print(f" Entities: {entities}")
        print(f" Type: {entity_type}")
        print(f" Count per entity: {count_per_entity}")
        print(f" Force web search: {force_web_search}")
        print()

        try:
            result_json = await fetch_entity_images(
                entities=entities,
                entity_type=entity_type,
                count_per_entity=count_per_entity,
                work_dir=self.work_dir,
                force_web_search=force_web_search,
            )
            result = json.loads(result_json)

            self._print_result(result)

            succeeded = result.get('success', False)
            if not succeeded:
                print(f"\n ⚠️ Error: {result.get('error', 'Unknown error')}")

            self.results.append({
                'name': name,
                'success': succeeded,
                'result': result,
            })

            # Report the verdict the tool actually returned, so this banner
            # agrees with the PASS/FAIL shown for the same run in the summary.
            if succeeded:
                print("\n✅ TEST PASSED\n")
            else:
                print("\n❌ TEST FAILED\n")
            return result

        except Exception as e:
            # Top-level boundary of one scenario: record the failure and keep
            # going so the remaining scenarios still run.
            print(f"\n❌ TEST FAILED: {str(e)}\n")
            import traceback
            traceback.print_exc()
            self.results.append({
                'name': name,
                'success': False,
                'error': str(e),
            })
            return None

    async def run_all_tests(self, pause_seconds: float = 2):
        """Run every scenario in order, then print the summary.

        Args:
            pause_seconds: Delay inserted between consecutive scenarios.
        """
        print("\n" + "=" * 80)
        print("🚀 STARTING ISOLATION TESTS FOR fetch_entity_images()")
        print("=" * 80)
        print()

        scenarios = [
            # TEST 1: Pokemon entities (primary use case - should use PokeAPI)
            {
                'name': "Pokemon - PokeAPI Integration",
                'entities': ["pikachu", "charizard", "mewtwo"],
                'entity_type': "pokemon",
                'count_per_entity': 1,
            },
            # TEST 2: Single Pokemon with multiple images
            {
                'name': "Pokemon - Multiple Images per Entity",
                'entities': ["gengar"],
                'entity_type': "pokemon",
                'count_per_entity': 3,
            },
            # TEST 3: Country entities (should use REST Countries API)
            {
                'name': "Countries - REST Countries API",
                'entities': ["france", "japan", "brazil"],
                'entity_type': "country",
                'count_per_entity': 1,
            },
            # TEST 4: Invalid Pokemon name (should fallback to web search)
            {
                'name': "Pokemon - Invalid Name Fallback",
                'entities': ["invalidpokemonxyz123"],
                'entity_type': "pokemon",
                'count_per_entity': 1,
            },
            # TEST 5: Unknown entity type (should fallback to web search)
            {
                'name': "Unknown Entity Type - Web Search Fallback",
                'entities': ["tesla model 3"],
                'entity_type': "car",
                'count_per_entity': 2,
            },
            # TEST 6: Force web search even for known entity type
            {
                'name': "Pokemon - Force Web Search Override",
                'entities': ["dragonite"],
                'entity_type': "pokemon",
                'count_per_entity': 1,
                'force_web_search': True,
            },
            # TEST 7: Mixed valid/invalid Pokemon (test fallback handling)
            {
                'name': "Pokemon - Mixed Valid/Invalid Entities",
                'entities': ["bulbasaur", "notarealpokemon999", "squirtle"],
                'entity_type': "pokemon",
                'count_per_entity': 1,
            },
            # TEST 8: Empty entity list (edge case)
            {
                'name': "Edge Case - Empty Entity List",
                'entities': [],
                'entity_type': "pokemon",
                'count_per_entity': 1,
            },
            # TEST 9: Larger batch of well-known Gen 1 Pokemon
            {
                'name': "Pokemon - Gen 1 Powerful Pokemon",
                'entities': ["mewtwo", "dragonite", "alakazam", "gengar", "zapdos"],
                'entity_type': "pokemon",
                'count_per_entity': 1,
            },
        ]

        for index, scenario in enumerate(scenarios):
            if index:
                # Pause between scenarios only, not before the first or after the last.
                await asyncio.sleep(pause_seconds)
            await self.run_test(**scenario)

        # Print summary
        self.print_summary()

    def _criterion_met(self, name_fragment: str, stat_key: str, minimum: int = 1) -> bool:
        """Return True if a successful run whose name contains *name_fragment*
        reported at least *minimum* for ``stats[stat_key]``."""
        return any(
            r.get('success')
            and name_fragment in r['name']
            and r.get('result', {}).get('stats', {}).get(stat_key, 0) >= minimum
            for r in self.results
        )

    def print_summary(self):
        """Print pass/fail totals, per-test details and the success-criteria check."""
        print("\n" + "=" * 80)
        print("📊 TEST SUMMARY")
        print("=" * 80)

        passed = sum(1 for r in self.results if r.get('success'))
        total = len(self.results)
        success_rate = (passed / total * 100) if total > 0 else 0

        print(f"\nTotal Tests: {total}")
        print(f"Passed: {passed}")
        print(f"Failed: {total - passed}")
        print(f"Success Rate: {success_rate:.1f}%")

        print("\n📝 Individual Results:")
        for i, result in enumerate(self.results, 1):
            status = "✅ PASS" if result.get('success') else "❌ FAIL"
            print(f" {i}. {status} - {result['name']}")

            if result.get('success') and 'result' in result:
                stats = result['result'].get('stats', {})
                total_images = stats.get('total_images', 0)
                api_success = stats.get('api_success', 0)
                web_search = stats.get('web_search_used', 0)

                methods = []
                if api_success > 0:
                    methods.append(f"{api_success} via API")
                if web_search > 0:
                    methods.append(f"{web_search} via web search")

                method_str = ", ".join(methods) if methods else "no images"
                print(f" → {total_images} total images ({method_str})")

        print("\n" + "=" * 80)

        # Check for specific success criteria
        print("\n🎯 SUCCESS CRITERIA CHECK:")

        criteria = [
            ("Pokemon API integration works",
             self._criterion_met('Pokemon - PokeAPI', 'api_success')),
            ("Country API integration works",
             self._criterion_met('Countries - REST', 'api_success')),
            ("Web search fallback works for invalid entities",
             self._criterion_met('Invalid Name Fallback', 'web_search_used')),
            ("Web search fallback works for unknown types",
             self._criterion_met('Unknown Entity Type', 'web_search_used')),
            ("Force web search override works",
             self._criterion_met('Force Web Search', 'web_search_used')),
            ("Multiple images per entity works",
             self._criterion_met('Multiple Images', 'total_images', minimum=3)),
        ]

        for criterion, met in criteria:
            status = "✅ PASS" if met else "❌ FAIL"
            print(f" {status} - {criterion}")

        all_criteria_passed = all(met for _, met in criteria)

        print("\n" + "=" * 80)
        if all_criteria_passed:
            print("🎉 ALL CRITERIA PASSED - Tool is ready for integration!")
        else:
            print("⚠️ Some criteria failed - review results above")
        print("=" * 80)
292
+
293
+
294
async def main():
    """Create a test runner, execute every scenario, and always clean up.

    The runner's temporary directory is removed in a ``finally`` clause, so
    it is deleted even when a scenario run raises.
    """
    harness = TestRunner()

    try:
        await harness.run_all_tests()
    finally:
        harness.cleanup()


# Script entry point: run the async driver on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,240 @@
1
+ # Cortex AutoGen2 Automated Testing Suite
2
+
3
+ Comprehensive automated testing framework for evaluating and improving the AutoGen2 system quality.
4
+
5
+ ## Features
6
+
7
+ - ✅ **Automated Test Execution**: Run predefined test cases with zero manual intervention
8
+ - 📊 **LLM-Based Evaluation**: Scores progress updates (0-100) and final outputs (0-100) using the Cortex API
9
+ - 📈 **Performance Metrics**: Track latency, update frequency, error rates, and more
10
+ - 🗄️ **SQLite Storage**: All test results, scores, and metrics stored locally
11
+ - 💡 **Improvement Suggestions**: LLM analyzes failures and suggests code improvements
12
+ - 📉 **Trend Analysis**: Detect quality regressions over time
13
+ - 🖥️ **CLI Interface**: Easy-to-use command-line tool
14
+
15
+ ## Quick Start
16
+
17
+ ### 1. Prerequisites
18
+
19
+ Ensure you have:
20
+ - Docker running (for cortex-autogen-function container)
21
+ - Redis running (for progress updates)
22
+ - An Azure Queue set up for task submission
23
+ - Environment variables configured (.env file)
24
+
25
+ Required environment variables:
26
+ ```bash
27
+ CORTEX_API_KEY=your_key_here
28
+ CORTEX_API_BASE_URL=http://localhost:4000/v1
29
+ REDIS_CONNECTION_STRING=redis://localhost:6379
30
+ REDIS_CHANNEL=cortex_progress
31
+ AZURE_STORAGE_CONNECTION_STRING=your_connection_string
32
+ AZURE_QUEUE_NAME=cortex-tasks
33
+ ```
34
+
35
+ ### 2. Install Dependencies
36
+
37
+ The testing suite uses the same dependencies as the main project. No additional installation needed.
38
+
39
+ ### 3. Run Tests
40
+
41
+ ```bash
42
+ # Run all test cases
43
+ python tests/cli/run_tests.py --all
44
+
45
+ # Run specific test
46
+ python tests/cli/run_tests.py --test tc001_pokemon_pptx
47
+
48
+ # View test history
49
+ python tests/cli/run_tests.py --history --limit 20
50
+
51
+ # View score trend for a test case
52
+ python tests/cli/run_tests.py --trend tc001_pokemon_pptx
53
+ ```
54
+
55
+ ## Test Cases
56
+
57
+ The suite includes 3 predefined test cases:
58
+
59
+ ### TC001: Pokemon PPTX Presentation
60
+ Creates a professional PowerPoint with Pokemon images, tests:
61
+ - Image collection (10+ images)
62
+ - Professional slide design
63
+ - Preview image generation
64
+ - File upload with SAS URLs
65
+
66
+ ### TC002: PDF Report with Images
67
+ Generates a renewable energy PDF report, tests:
68
+ - Web research and image collection
69
+ - Chart/graph generation
70
+ - PDF formatting
71
+ - Document quality
72
+
73
+ ### TC003: Random CSV Generation
74
+ Creates realistic sales data CSVs, tests:
75
+ - Data generation
76
+ - Statistical calculations
77
+ - CSV formatting
78
+ - Quick task execution
79
+
80
+ ## Architecture
81
+
82
+ ```
83
+ tests/
84
+ ├── orchestrator.py # Main test execution engine
85
+ ├── test_cases.yaml # Test case definitions
86
+ ├── database/
87
+ │ ├── schema.sql # SQLite database schema
88
+ │ ├── repository.py # Data access layer
89
+ │ └── test_results.db # SQLite database (gitignored)
90
+ ├── collectors/
91
+ │ ├── progress_collector.py # Redis subscriber for progress updates
92
+ │ └── log_collector.py # Docker log parser
93
+ ├── evaluators/
94
+ │ ├── llm_scorer.py # LLM-based evaluation
95
+ │ └── prompts.py # Evaluation prompts and rubrics
96
+ ├── metrics/
97
+ │ └── collector.py # Performance metrics calculation
98
+ ├── analysis/
99
+ │ ├── improvement_suggester.py # LLM-powered suggestions
100
+ │ └── trend_analyzer.py # Trend and regression detection
101
+ └── cli/
102
+ └── run_tests.py # CLI interface
103
+ ```
104
+
105
+ ## How It Works
106
+
107
+ 1. **Test Submission**: Test orchestrator submits task to Azure Queue
108
+ 2. **Data Collection**:
109
+ - Progress collector subscribes to Redis for real-time updates
110
+ - Log collector streams Docker container logs
111
+ 3. **Execution Monitoring**: Wait for task completion or timeout
112
+ 4. **Data Storage**: Store progress updates, logs, files in SQLite
113
+ 5. **Metrics Calculation**: Calculate latency, frequency, error counts
114
+ 6. **LLM Evaluation**:
115
+ - Score progress updates (frequency, clarity, accuracy)
116
+ - Score final output (completeness, quality, correctness)
117
+ 7. **Analysis**: Generate improvement suggestions and track trends
118
+
119
+ ## Evaluation Criteria
120
+
121
+ ### Progress Updates (0-100)
122
+ - **Frequency** (25 pts): Ideally one update every 2-5 seconds
123
+ - **Clarity** (25 pts): Emojis, concise, informative
124
+ - **Accuracy** (25 pts): Progress % matches work done
125
+ - **Coverage** (25 pts): All important steps communicated
126
+
127
+ ### Final Output (0-100)
128
+ - **Completeness** (25 pts): All deliverables present
129
+ - **Quality** (25 pts): Professional, polished, no placeholders
130
+ - **Correctness** (25 pts): Accurate data, no hallucinations
131
+ - **Presentation** (25 pts): SAS URLs, previews, clear results
132
+
133
+ ## Database Schema
134
+
135
+ Test results are stored in `tests/database/test_results.db`:
136
+
137
+ - **test_runs**: Test execution records
138
+ - **progress_updates**: Real-time progress data
139
+ - **logs**: Docker log entries
140
+ - **files_created**: Generated files with SAS URLs
141
+ - **evaluations**: LLM scores and reasoning
142
+ - **metrics**: Performance metrics
143
+ - **suggestions**: Improvement recommendations
144
+
145
+ ## Example Output
146
+
147
+ ```
148
+ 🧪 Running Test: Pokemon PowerPoint Presentation with Images
149
+ ID: tc001_pokemon_pptx
150
+ Timeout: 300s
151
+
152
+ 📝 Test run created: ID=1, Request=test_tc001_pokemon_pptx_a3f9b12e
153
+ ✅ Task submitted to queue
154
+ 📡 Starting data collection...
155
+ Progress: 10% - 📋 Planning task execution...
156
+ Progress: 25% - 🌐 Collecting Pokemon images...
157
+ Progress: 50% - 💻 Creating PowerPoint presentation...
158
+ Progress: 75% - 📸 Generating slide previews...
159
+ Progress: 100% - ✅ Task completed successfully!
160
+ ✅ Data collection complete
161
+ Progress updates: 12
162
+ Log entries: 45
163
+
164
+ 📊 Calculating metrics...
165
+ Time to completion: 142.3s
166
+ Progress updates: 12
167
+ Files created: 15
168
+ Errors: 0
169
+
170
+ 🤖 Running LLM evaluation...
171
+ Progress Score: 88/100
172
+ Output Score: 92/100
173
+
174
+ ✨ Evaluation complete:
175
+ Progress Score: 88/100
176
+ Output Score: 92/100
177
+ Overall Score: 90/100
178
+
179
+ ✅ Test Complete: Pokemon PowerPoint Presentation with Images
180
+ ```
181
+
182
+ ## Extending the Suite
183
+
184
+ ### Add New Test Cases
185
+
186
+ Edit `tests/test_cases.yaml`:
187
+
188
+ ```yaml
189
+ test_cases:
190
+ - id: tc004_my_new_test
191
+ name: "My New Test"
192
+ task: "Test task description..."
193
+ timeout_seconds: 300
194
+ expected_deliverables:
195
+ - type: pdf
196
+ pattern: "*.pdf"
197
+ min_count: 1
198
+ min_progress_updates: 5
199
+ quality_criteria:
200
+ - "Criterion 1"
201
+ - "Criterion 2"
202
+ ```
203
+
204
+ ### Customize Evaluation
205
+
206
+ Modify prompts in `tests/evaluators/prompts.py` to change scoring criteria.
207
+
208
+ ### Add New Metrics
209
+
210
+ Extend `tests/metrics/collector.py` with additional metrics calculation logic.
211
+
212
+ ## Troubleshooting
213
+
214
+ ### No progress updates collected
215
+ - Check Redis is running: `redis-cli ping`
216
+ - Verify REDIS_CONNECTION_STRING in .env
217
+ - Check Docker container is running: `docker ps`
218
+
219
+ ### Database errors
220
+ - Delete the database file: `rm tests/database/test_results.db`
220
+ - The schema is recreated automatically on the next run
222
+
223
+ ### LLM evaluation fails
224
+ - Verify CORTEX_API_KEY is set
225
+ - Check CORTEX_API_BASE_URL is accessible
226
+ - Review logs for API errors
227
+
228
+ ## Future Enhancements
229
+
230
+ - [ ] Web dashboard for viewing results
231
+ - [ ] CI/CD integration (GitHub Actions)
232
+ - [ ] Parallel test execution
233
+ - [ ] Screenshot comparison for visual regression
234
+ - [ ] Custom test case generator
235
+ - [ ] Export reports (PDF, HTML)
236
+ - [ ] Slack/email notifications
237
+
238
+ ## License
239
+
240
+ Part of the Cortex AutoGen2 project.