@aj-archipelago/cortex 1.4.2 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/config.js +1 -1
- package/helper-apps/cortex-autogen2/.dockerignore +1 -0
- package/helper-apps/cortex-autogen2/Dockerfile +6 -10
- package/helper-apps/cortex-autogen2/Dockerfile.worker +2 -0
- package/helper-apps/cortex-autogen2/agents.py +203 -2
- package/helper-apps/cortex-autogen2/main.py +1 -1
- package/helper-apps/cortex-autogen2/pyproject.toml +12 -0
- package/helper-apps/cortex-autogen2/requirements.txt +14 -0
- package/helper-apps/cortex-autogen2/services/redis_publisher.py +1 -1
- package/helper-apps/cortex-autogen2/services/run_analyzer.py +1 -1
- package/helper-apps/cortex-autogen2/task_processor.py +431 -229
- package/helper-apps/cortex-autogen2/test_entity_fetcher.py +305 -0
- package/helper-apps/cortex-autogen2/tests/README.md +240 -0
- package/helper-apps/cortex-autogen2/tests/TEST_REPORT.md +342 -0
- package/helper-apps/cortex-autogen2/tests/__init__.py +8 -0
- package/helper-apps/cortex-autogen2/tests/analysis/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/analysis/improvement_suggester.py +224 -0
- package/helper-apps/cortex-autogen2/tests/analysis/trend_analyzer.py +211 -0
- package/helper-apps/cortex-autogen2/tests/cli/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/cli/run_tests.py +296 -0
- package/helper-apps/cortex-autogen2/tests/collectors/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/collectors/log_collector.py +252 -0
- package/helper-apps/cortex-autogen2/tests/collectors/progress_collector.py +182 -0
- package/helper-apps/cortex-autogen2/tests/conftest.py +15 -0
- package/helper-apps/cortex-autogen2/tests/database/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/database/repository.py +501 -0
- package/helper-apps/cortex-autogen2/tests/database/schema.sql +108 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/llm_scorer.py +294 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/prompts.py +250 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/wordcloud_validator.py +168 -0
- package/helper-apps/cortex-autogen2/tests/metrics/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/metrics/collector.py +155 -0
- package/helper-apps/cortex-autogen2/tests/orchestrator.py +576 -0
- package/helper-apps/cortex-autogen2/tests/test_cases.yaml +279 -0
- package/helper-apps/cortex-autogen2/tests/test_data.db +0 -0
- package/helper-apps/cortex-autogen2/tests/utils/__init__.py +3 -0
- package/helper-apps/cortex-autogen2/tests/utils/connectivity.py +112 -0
- package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +74 -24
- package/helper-apps/cortex-autogen2/tools/entity_api_registry.json +38 -0
- package/helper-apps/cortex-autogen2/tools/file_tools.py +1 -1
- package/helper-apps/cortex-autogen2/tools/search_tools.py +436 -238
- package/helper-apps/cortex-file-handler/package-lock.json +2 -2
- package/helper-apps/cortex-file-handler/package.json +1 -1
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +4 -5
- package/helper-apps/cortex-file-handler/src/blobHandler.js +36 -144
- package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +5 -3
- package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +34 -1
- package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +22 -0
- package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +28 -1
- package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +29 -4
- package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +11 -0
- package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +1 -1
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +3 -2
- package/helper-apps/cortex-file-handler/tests/checkHashShortLived.test.js +8 -1
- package/helper-apps/cortex-file-handler/tests/containerConversionFlow.test.js +5 -2
- package/helper-apps/cortex-file-handler/tests/containerNameParsing.test.js +14 -7
- package/helper-apps/cortex-file-handler/tests/containerParameterFlow.test.js +5 -2
- package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +31 -19
- package/package.json +1 -1
- package/server/modelExecutor.js +4 -0
- package/server/plugins/claude4VertexPlugin.js +540 -0
- package/server/plugins/openAiWhisperPlugin.js +43 -2
- package/tests/integration/rest/vendors/claude_streaming.test.js +121 -0
- package/tests/unit/plugins/claude4VertexPlugin.test.js +462 -0
- package/tests/unit/plugins/claude4VertexToolConversion.test.js +413 -0
- package/helper-apps/cortex-autogen/.funcignore +0 -8
- package/helper-apps/cortex-autogen/Dockerfile +0 -10
- package/helper-apps/cortex-autogen/OAI_CONFIG_LIST +0 -6
- package/helper-apps/cortex-autogen/agents.py +0 -493
- package/helper-apps/cortex-autogen/agents_extra.py +0 -14
- package/helper-apps/cortex-autogen/config.py +0 -18
- package/helper-apps/cortex-autogen/data_operations.py +0 -29
- package/helper-apps/cortex-autogen/function_app.py +0 -44
- package/helper-apps/cortex-autogen/host.json +0 -15
- package/helper-apps/cortex-autogen/main.py +0 -38
- package/helper-apps/cortex-autogen/prompts.py +0 -196
- package/helper-apps/cortex-autogen/prompts_extra.py +0 -5
- package/helper-apps/cortex-autogen/requirements.txt +0 -9
- package/helper-apps/cortex-autogen/search.py +0 -85
- package/helper-apps/cortex-autogen/test.sh +0 -40
- package/helper-apps/cortex-autogen/tools/sasfileuploader.py +0 -66
- package/helper-apps/cortex-autogen/utils.py +0 -88
- package/helper-apps/cortex-autogen2/DigiCertGlobalRootCA.crt.pem +0 -22
- package/helper-apps/cortex-autogen2/poetry.lock +0 -3652
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Isolation test script for fetch_entity_images() function
|
|
4
|
+
Tests multi-tier entity image fetcher with various scenarios
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
import tempfile
|
|
12
|
+
import shutil
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
# Add project root to path
|
|
16
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
17
|
+
|
|
18
|
+
from tools.search_tools import fetch_entity_images
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TestRunner:
    """Run isolation scenarios against ``fetch_entity_images`` and report them.

    Each scenario calls ``fetch_entity_images`` with a shared temporary work
    directory, parses the JSON string it returns, prints a human-readable
    breakdown and records the outcome in ``self.results``.  A recorded entry
    is a dict with ``name`` and ``success`` plus either ``result`` (the parsed
    payload) or ``error`` (the exception text).
    """

    # The class name matches pytest's default ``Test*`` collection pattern but
    # this is a script helper with an ``__init__``, not a test case.  Opting
    # out avoids a PytestCollectionWarning when the file is picked up.
    __test__ = False

    # Seconds to wait between consecutive scenarios in ``run_all_tests``.
    _PAUSE_SECONDS = 2

    def __init__(self):
        """Create the temporary work directory and an empty result list."""
        self.work_dir = tempfile.mkdtemp(prefix="entity_test_")
        self.results = []
        print(f"🔬 Test work directory: {self.work_dir}\n")

    def cleanup(self):
        """Clean up test directory"""
        if os.path.exists(self.work_dir):
            shutil.rmtree(self.work_dir)
            print("\n🧹 Cleaned up test directory")

    async def run_test(self, name: str, entities: list, entity_type: str, count_per_entity: int = 1, force_web_search: bool = False):
        """Run a single test case and record its outcome.

        Args:
            name: Label used in the output and in the success-criteria checks.
            entities: Entity names forwarded to ``fetch_entity_images``.
            entity_type: Entity category forwarded to ``fetch_entity_images``.
            count_per_entity: Number of images requested per entity.
            force_web_search: Forwarded unchanged to ``fetch_entity_images``.

        Returns:
            The parsed result dict, or ``None`` when any exception was raised
            while fetching, parsing or printing the result.
        """
        banner = "=" * 80
        print(banner)
        print(f"🧪 TEST: {name}")
        print(banner)
        print(f" Entities: {entities}")
        print(f" Type: {entity_type}")
        print(f" Count per entity: {count_per_entity}")
        print(f" Force web search: {force_web_search}")
        print()

        try:
            result_json = await fetch_entity_images(
                entities=entities,
                entity_type=entity_type,
                count_per_entity=count_per_entity,
                work_dir=self.work_dir,
                force_web_search=force_web_search,
            )
            result = json.loads(result_json)
            succeeded = result.get('success', False)
            stats = result.get('stats', {})

            # Print summary
            print("📊 RESULT:")
            print(f" Success: {succeeded}")
            print(f" Total images: {stats.get('total_images', 0)}")
            print(f" API success: {stats.get('api_success', 0)}")
            print(f" API failed: {stats.get('api_failed', 0)}")
            print(f" Web search used: {stats.get('web_search_used', 0)}")

            if 'results' in result:
                entity_results = result['results']

                # Show per-entity breakdown
                print("\n Per-entity results:")
                for entity_data in entity_results:
                    entity_name = entity_data.get('entity', 'unknown')
                    img_count = len(entity_data.get('images', []))
                    method = entity_data.get('method', 'unknown')
                    api_used = entity_data.get('api_used', 'N/A')
                    print(f" {entity_name}: {img_count} images via {method} ({api_used})")

                # Show sample image paths (first image of each entity only)
                print("\n Sample images:")
                for entity_data in entity_results:
                    images = entity_data.get('images', [])
                    if not images:
                        continue
                    entity_name = entity_data.get('entity', 'unknown')
                    filename = os.path.basename(images[0].get('local_path', 'N/A'))
                    print(f" {entity_name}: {filename}")

            if not succeeded:
                print(f"\n ⚠️ Error: {result.get('error', 'Unknown error')}")

            self.results.append({
                'name': name,
                'success': succeeded,
                'result': result,
            })

            # The banner must agree with the status recorded above; the
            # summary counts a payload with a falsy 'success' as a failure.
            if succeeded:
                print("\n✅ TEST PASSED\n")
            else:
                print("\n❌ TEST FAILED\n")
            return result

        except Exception as e:
            # Deliberately broad: one broken scenario must not abort the run.
            print(f"\n❌ TEST FAILED: {str(e)}\n")
            import traceback
            traceback.print_exc()
            self.results.append({
                'name': name,
                'success': False,
                'error': str(e),
            })
            return None

    async def run_all_tests(self):
        """Run all test scenarios in order, then print the summary."""
        print("\n" + "=" * 80)
        print("🚀 STARTING ISOLATION TESTS FOR fetch_entity_images()")
        print("=" * 80)
        print()

        scenarios = [
            # TEST 1: Pokemon entities (primary use case - should use PokeAPI)
            {
                'name': "Pokemon - PokeAPI Integration",
                'entities': ["pikachu", "charizard", "mewtwo"],
                'entity_type': "pokemon",
                'count_per_entity': 1,
            },
            # TEST 2: Single Pokemon with multiple images
            {
                'name': "Pokemon - Multiple Images per Entity",
                'entities': ["gengar"],
                'entity_type': "pokemon",
                'count_per_entity': 3,
            },
            # TEST 3: Country entities (should use REST Countries API)
            {
                'name': "Countries - REST Countries API",
                'entities': ["france", "japan", "brazil"],
                'entity_type': "country",
                'count_per_entity': 1,
            },
            # TEST 4: Invalid Pokemon name (should fallback to web search)
            {
                'name': "Pokemon - Invalid Name Fallback",
                'entities': ["invalidpokemonxyz123"],
                'entity_type': "pokemon",
                'count_per_entity': 1,
            },
            # TEST 5: Unknown entity type (should fallback to web search)
            {
                'name': "Unknown Entity Type - Web Search Fallback",
                'entities': ["tesla model 3"],
                'entity_type': "car",
                'count_per_entity': 2,
            },
            # TEST 6: Force web search even for known entity type
            {
                'name': "Pokemon - Force Web Search Override",
                'entities': ["dragonite"],
                'entity_type': "pokemon",
                'count_per_entity': 1,
                'force_web_search': True,
            },
            # TEST 7: Mixed valid/invalid Pokemon (test fallback handling)
            {
                'name': "Pokemon - Mixed Valid/Invalid Entities",
                'entities': ["bulbasaur", "notarealpokemon999", "squirtle"],
                'entity_type': "pokemon",
                'count_per_entity': 1,
            },
            # TEST 8: Empty entity list (edge case)
            {
                'name': "Edge Case - Empty Entity List",
                'entities': [],
                'entity_type': "pokemon",
                'count_per_entity': 1,
            },
            # TEST 9: Several Gen 1 powerful Pokemon in a single request
            {
                'name': "Pokemon - Gen 1 Powerful Pokemon",
                'entities': ["mewtwo", "dragonite", "alakazam", "gengar", "zapdos"],
                'entity_type': "pokemon",
                'count_per_entity': 1,
            },
        ]

        for index, scenario in enumerate(scenarios):
            # Pause between scenarios only, not before the first one.
            if index:
                await asyncio.sleep(self._PAUSE_SECONDS)
            await self.run_test(**scenario)

        # Print summary
        self.print_summary()

    def _best_stat(self, name_fragment, stat_key):
        """Return the largest ``stat_key`` among matching successful results.

        A result matches when its ``success`` is truthy and ``name_fragment``
        occurs in its ``name``.  Returns 0 when nothing matches.
        """
        return max(
            (
                r.get('result', {}).get('stats', {}).get(stat_key, 0)
                for r in self.results
                if r.get('success') and name_fragment in r['name']
            ),
            default=0,
        )

    def print_summary(self):
        """Print pass/fail totals, per-test lines and the success criteria."""
        banner = "=" * 80
        print("\n" + banner)
        print("📊 TEST SUMMARY")
        print(banner)

        passed = sum(1 for r in self.results if r.get('success'))
        total = len(self.results)
        success_rate = (passed / total * 100) if total > 0 else 0

        print(f"\nTotal Tests: {total}")
        print(f"Passed: {passed}")
        print(f"Failed: {total - passed}")
        print(f"Success Rate: {success_rate:.1f}%")

        print("\n📝 Individual Results:")
        for i, entry in enumerate(self.results, 1):
            status = "✅ PASS" if entry.get('success') else "❌ FAIL"
            print(f" {i}. {status} - {entry['name']}")

            if entry.get('success') and 'result' in entry:
                stats = entry['result'].get('stats', {})
                total_images = stats.get('total_images', 0)
                api_success = stats.get('api_success', 0)
                web_search = stats.get('web_search_used', 0)

                methods = []
                if api_success > 0:
                    methods.append(f"{api_success} via API")
                if web_search > 0:
                    methods.append(f"{web_search} via web search")

                method_str = ", ".join(methods) if methods else "no images"
                print(f" → {total_images} total images ({method_str})")

        print("\n" + banner)

        # Check for specific success criteria
        print("\n🎯 SUCCESS CRITERIA CHECK:")

        criteria = [
            ("Pokemon API integration works",
             self._best_stat('Pokemon - PokeAPI', 'api_success') > 0),
            ("Country API integration works",
             self._best_stat('Countries - REST', 'api_success') > 0),
            ("Web search fallback works for invalid entities",
             self._best_stat('Invalid Name Fallback', 'web_search_used') > 0),
            ("Web search fallback works for unknown types",
             self._best_stat('Unknown Entity Type', 'web_search_used') > 0),
            ("Force web search override works",
             self._best_stat('Force Web Search', 'web_search_used') > 0),
            ("Multiple images per entity works",
             self._best_stat('Multiple Images', 'total_images') >= 3),
        ]

        for criterion, met in criteria:
            status = "✅ PASS" if met else "❌ FAIL"
            print(f" {status} - {criterion}")

        all_criteria_passed = all(met for _, met in criteria)

        print("\n" + banner)
        if all_criteria_passed:
            print("🎉 ALL CRITERIA PASSED - Tool is ready for integration!")
        else:
            print("⚠️ Some criteria failed - review results above")
        print(banner)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
async def main():
    """Run every scenario, removing the temporary work directory afterwards.

    Cleanup runs in ``finally`` so the directory is removed even when the
    test run raises.
    """
    test_runner = TestRunner()

    try:
        await test_runner.run_all_tests()
    finally:
        test_runner.cleanup()


# Script entry point: nothing runs when the module is merely imported.
if __name__ == "__main__":
    asyncio.run(main())
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# Cortex AutoGen2 Automated Testing Suite
|
|
2
|
+
|
|
3
|
+
Comprehensive automated testing framework for evaluating and improving the AutoGen2 system quality.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- ✅ **Automated Test Execution**: Run predefined test cases with zero manual intervention
|
|
8
|
+
- 📊 **LLM-Based Evaluation**: Scores progress updates (0-100) and final outputs (0-100) using Cortex API
|
|
9
|
+
- 📈 **Performance Metrics**: Track latency, update frequency, error rates, and more
|
|
10
|
+
- 🗄️ **SQLite Storage**: All test results, scores, and metrics stored locally
|
|
11
|
+
- 💡 **Improvement Suggestions**: LLM analyzes failures and suggests code improvements
|
|
12
|
+
- 📉 **Trend Analysis**: Detect quality regressions over time
|
|
13
|
+
- 🖥️ **CLI Interface**: Easy-to-use command-line tool
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
### 1. Prerequisites
|
|
18
|
+
|
|
19
|
+
Ensure you have:
|
|
20
|
+
- Docker running (for cortex-autogen-function container)
|
|
21
|
+
- Redis running (for progress updates)
|
|
22
|
+
- Azure Queue set up
|
|
23
|
+
- Environment variables configured (.env file)
|
|
24
|
+
|
|
25
|
+
Required environment variables:
|
|
26
|
+
```bash
|
|
27
|
+
CORTEX_API_KEY=your_key_here
|
|
28
|
+
CORTEX_API_BASE_URL=http://localhost:4000/v1
|
|
29
|
+
REDIS_CONNECTION_STRING=redis://localhost:6379
|
|
30
|
+
REDIS_CHANNEL=cortex_progress
|
|
31
|
+
AZURE_STORAGE_CONNECTION_STRING=your_connection_string
|
|
32
|
+
AZURE_QUEUE_NAME=cortex-tasks
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### 2. Install Dependencies
|
|
36
|
+
|
|
37
|
+
The testing suite uses the same dependencies as the main project, so no additional installation is needed.
|
|
38
|
+
|
|
39
|
+
### 3. Run Tests
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Run all test cases
|
|
43
|
+
python tests/cli/run_tests.py --all
|
|
44
|
+
|
|
45
|
+
# Run specific test
|
|
46
|
+
python tests/cli/run_tests.py --test tc001_pokemon_pptx
|
|
47
|
+
|
|
48
|
+
# View test history
|
|
49
|
+
python tests/cli/run_tests.py --history --limit 20
|
|
50
|
+
|
|
51
|
+
# View score trend for a test case
|
|
52
|
+
python tests/cli/run_tests.py --trend tc001_pokemon_pptx
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Test Cases
|
|
56
|
+
|
|
57
|
+
The suite includes 3 predefined test cases:
|
|
58
|
+
|
|
59
|
+
### TC001: Pokemon PPTX Presentation
|
|
60
|
+
Creates a professional PowerPoint with Pokemon images, tests:
|
|
61
|
+
- Image collection (10+ images)
|
|
62
|
+
- Professional slide design
|
|
63
|
+
- Preview image generation
|
|
64
|
+
- File upload with SAS URLs
|
|
65
|
+
|
|
66
|
+
### TC002: PDF Report with Images
|
|
67
|
+
Generates a renewable energy PDF report, tests:
|
|
68
|
+
- Web research and image collection
|
|
69
|
+
- Chart/graph generation
|
|
70
|
+
- PDF formatting
|
|
71
|
+
- Document quality
|
|
72
|
+
|
|
73
|
+
### TC003: Random CSV Generation
|
|
74
|
+
Creates realistic sales data CSVs, tests:
|
|
75
|
+
- Data generation
|
|
76
|
+
- Statistical calculations
|
|
77
|
+
- CSV formatting
|
|
78
|
+
- Quick task execution
|
|
79
|
+
|
|
80
|
+
## Architecture
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
tests/
|
|
84
|
+
├── orchestrator.py # Main test execution engine
|
|
85
|
+
├── test_cases.yaml # Test case definitions
|
|
86
|
+
├── database/
|
|
87
|
+
│ ├── schema.sql # SQLite database schema
|
|
88
|
+
│ ├── repository.py # Data access layer
|
|
89
|
+
│ └── test_results.db # SQLite database (gitignored)
|
|
90
|
+
├── collectors/
|
|
91
|
+
│ ├── progress_collector.py # Redis subscriber for progress updates
|
|
92
|
+
│ └── log_collector.py # Docker log parser
|
|
93
|
+
├── evaluators/
|
|
94
|
+
│ ├── llm_scorer.py # LLM-based evaluation
|
|
95
|
+
│ └── prompts.py # Evaluation prompts and rubrics
|
|
96
|
+
├── metrics/
|
|
97
|
+
│ └── collector.py # Performance metrics calculation
|
|
98
|
+
├── analysis/
|
|
99
|
+
│ ├── improvement_suggester.py # LLM-powered suggestions
|
|
100
|
+
│ └── trend_analyzer.py # Trend and regression detection
|
|
101
|
+
└── cli/
|
|
102
|
+
└── run_tests.py # CLI interface
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## How It Works
|
|
106
|
+
|
|
107
|
+
1. **Test Submission**: Test orchestrator submits task to Azure Queue
|
|
108
|
+
2. **Data Collection**:
|
|
109
|
+
- Progress collector subscribes to Redis for real-time updates
|
|
110
|
+
- Log collector streams Docker container logs
|
|
111
|
+
3. **Execution Monitoring**: Wait for task completion or timeout
|
|
112
|
+
4. **Data Storage**: Store progress updates, logs, files in SQLite
|
|
113
|
+
5. **Metrics Calculation**: Calculate latency, frequency, error counts
|
|
114
|
+
6. **LLM Evaluation**:
|
|
115
|
+
- Score progress updates (frequency, clarity, accuracy)
|
|
116
|
+
- Score final output (completeness, quality, correctness)
|
|
117
|
+
7. **Analysis**: Generate improvement suggestions and track trends
|
|
118
|
+
|
|
119
|
+
## Evaluation Criteria
|
|
120
|
+
|
|
121
|
+
### Progress Updates (0-100)
|
|
122
|
+
- **Frequency** (25 pts): Updates ideally arrive every 2-5 seconds
|
|
123
|
+
- **Clarity** (25 pts): Emojis, concise, informative
|
|
124
|
+
- **Accuracy** (25 pts): Progress % matches work done
|
|
125
|
+
- **Coverage** (25 pts): All important steps communicated
|
|
126
|
+
|
|
127
|
+
### Final Output (0-100)
|
|
128
|
+
- **Completeness** (25 pts): All deliverables present
|
|
129
|
+
- **Quality** (25 pts): Professional, polished, no placeholders
|
|
130
|
+
- **Correctness** (25 pts): Accurate data, no hallucinations
|
|
131
|
+
- **Presentation** (25 pts): SAS URLs, previews, clear results
|
|
132
|
+
|
|
133
|
+
## Database Schema
|
|
134
|
+
|
|
135
|
+
Test results are stored in `tests/database/test_results.db`:
|
|
136
|
+
|
|
137
|
+
- **test_runs**: Test execution records
|
|
138
|
+
- **progress_updates**: Real-time progress data
|
|
139
|
+
- **logs**: Docker log entries
|
|
140
|
+
- **files_created**: Generated files with SAS URLs
|
|
141
|
+
- **evaluations**: LLM scores and reasoning
|
|
142
|
+
- **metrics**: Performance metrics
|
|
143
|
+
- **suggestions**: Improvement recommendations
|
|
144
|
+
|
|
145
|
+
## Example Output
|
|
146
|
+
|
|
147
|
+
```
|
|
148
|
+
🧪 Running Test: Pokemon PowerPoint Presentation with Images
|
|
149
|
+
ID: tc001_pokemon_pptx
|
|
150
|
+
Timeout: 300s
|
|
151
|
+
|
|
152
|
+
📝 Test run created: ID=1, Request=test_tc001_pokemon_pptx_a3f9b12e
|
|
153
|
+
✅ Task submitted to queue
|
|
154
|
+
📡 Starting data collection...
|
|
155
|
+
Progress: 10% - 📋 Planning task execution...
|
|
156
|
+
Progress: 25% - 🌐 Collecting Pokemon images...
|
|
157
|
+
Progress: 50% - 💻 Creating PowerPoint presentation...
|
|
158
|
+
Progress: 75% - 📸 Generating slide previews...
|
|
159
|
+
Progress: 100% - ✅ Task completed successfully!
|
|
160
|
+
✅ Data collection complete
|
|
161
|
+
Progress updates: 12
|
|
162
|
+
Log entries: 45
|
|
163
|
+
|
|
164
|
+
📊 Calculating metrics...
|
|
165
|
+
Time to completion: 142.3s
|
|
166
|
+
Progress updates: 12
|
|
167
|
+
Files created: 15
|
|
168
|
+
Errors: 0
|
|
169
|
+
|
|
170
|
+
🤖 Running LLM evaluation...
|
|
171
|
+
Progress Score: 88/100
|
|
172
|
+
Output Score: 92/100
|
|
173
|
+
|
|
174
|
+
✨ Evaluation complete:
|
|
175
|
+
Progress Score: 88/100
|
|
176
|
+
Output Score: 92/100
|
|
177
|
+
Overall Score: 90/100
|
|
178
|
+
|
|
179
|
+
✅ Test Complete: Pokemon PowerPoint Presentation with Images
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## Extending the Suite
|
|
183
|
+
|
|
184
|
+
### Add New Test Cases
|
|
185
|
+
|
|
186
|
+
Edit `tests/test_cases.yaml`:
|
|
187
|
+
|
|
188
|
+
```yaml
|
|
189
|
+
test_cases:
|
|
190
|
+
- id: tc004_my_new_test
|
|
191
|
+
name: "My New Test"
|
|
192
|
+
task: "Test task description..."
|
|
193
|
+
timeout_seconds: 300
|
|
194
|
+
expected_deliverables:
|
|
195
|
+
- type: pdf
|
|
196
|
+
pattern: "*.pdf"
|
|
197
|
+
min_count: 1
|
|
198
|
+
min_progress_updates: 5
|
|
199
|
+
quality_criteria:
|
|
200
|
+
- "Criterion 1"
|
|
201
|
+
- "Criterion 2"
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### Customize Evaluation
|
|
205
|
+
|
|
206
|
+
Modify prompts in `tests/evaluators/prompts.py` to change scoring criteria.
|
|
207
|
+
|
|
208
|
+
### Add New Metrics
|
|
209
|
+
|
|
210
|
+
Extend `tests/metrics/collector.py` with additional metrics calculation logic.
|
|
211
|
+
|
|
212
|
+
## Troubleshooting
|
|
213
|
+
|
|
214
|
+
### No progress updates collected
|
|
215
|
+
- Check Redis is running: `redis-cli ping`
|
|
216
|
+
- Verify REDIS_CONNECTION_STRING in .env
|
|
217
|
+
- Check Docker container is running: `docker ps`
|
|
218
|
+
|
|
219
|
+
### Database errors
|
|
220
|
+
- Delete and recreate: `rm tests/database/test_results.db`
|
|
221
|
+
- Schema will auto-recreate on next run
|
|
222
|
+
|
|
223
|
+
### LLM evaluation fails
|
|
224
|
+
- Verify CORTEX_API_KEY is set
|
|
225
|
+
- Check CORTEX_API_BASE_URL is accessible
|
|
226
|
+
- Review logs for API errors
|
|
227
|
+
|
|
228
|
+
## Future Enhancements
|
|
229
|
+
|
|
230
|
+
- [ ] Web dashboard for viewing results
|
|
231
|
+
- [ ] CI/CD integration (GitHub Actions)
|
|
232
|
+
- [ ] Parallel test execution
|
|
233
|
+
- [ ] Screenshot comparison for visual regression
|
|
234
|
+
- [ ] Custom test case generator
|
|
235
|
+
- [ ] Export reports (PDF, HTML)
|
|
236
|
+
- [ ] Slack/email notifications
|
|
237
|
+
|
|
238
|
+
## License
|
|
239
|
+
|
|
240
|
+
Part of the Cortex AutoGen2 project.
|