@aj-archipelago/cortex 1.4.2 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/config.js +1 -1
- package/helper-apps/cortex-autogen2/.dockerignore +1 -0
- package/helper-apps/cortex-autogen2/Dockerfile +6 -10
- package/helper-apps/cortex-autogen2/Dockerfile.worker +2 -0
- package/helper-apps/cortex-autogen2/agents.py +203 -2
- package/helper-apps/cortex-autogen2/main.py +1 -1
- package/helper-apps/cortex-autogen2/pyproject.toml +12 -0
- package/helper-apps/cortex-autogen2/requirements.txt +14 -0
- package/helper-apps/cortex-autogen2/services/redis_publisher.py +1 -1
- package/helper-apps/cortex-autogen2/services/run_analyzer.py +1 -1
- package/helper-apps/cortex-autogen2/task_processor.py +431 -229
- package/helper-apps/cortex-autogen2/test_entity_fetcher.py +305 -0
- package/helper-apps/cortex-autogen2/tests/README.md +240 -0
- package/helper-apps/cortex-autogen2/tests/TEST_REPORT.md +342 -0
- package/helper-apps/cortex-autogen2/tests/__init__.py +8 -0
- package/helper-apps/cortex-autogen2/tests/analysis/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/analysis/improvement_suggester.py +224 -0
- package/helper-apps/cortex-autogen2/tests/analysis/trend_analyzer.py +211 -0
- package/helper-apps/cortex-autogen2/tests/cli/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/cli/run_tests.py +296 -0
- package/helper-apps/cortex-autogen2/tests/collectors/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/collectors/log_collector.py +252 -0
- package/helper-apps/cortex-autogen2/tests/collectors/progress_collector.py +182 -0
- package/helper-apps/cortex-autogen2/tests/conftest.py +15 -0
- package/helper-apps/cortex-autogen2/tests/database/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/database/repository.py +501 -0
- package/helper-apps/cortex-autogen2/tests/database/schema.sql +108 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/llm_scorer.py +294 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/prompts.py +250 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/wordcloud_validator.py +168 -0
- package/helper-apps/cortex-autogen2/tests/metrics/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/metrics/collector.py +155 -0
- package/helper-apps/cortex-autogen2/tests/orchestrator.py +576 -0
- package/helper-apps/cortex-autogen2/tests/test_cases.yaml +279 -0
- package/helper-apps/cortex-autogen2/tests/test_data.db +0 -0
- package/helper-apps/cortex-autogen2/tests/utils/__init__.py +3 -0
- package/helper-apps/cortex-autogen2/tests/utils/connectivity.py +112 -0
- package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +74 -24
- package/helper-apps/cortex-autogen2/tools/entity_api_registry.json +38 -0
- package/helper-apps/cortex-autogen2/tools/file_tools.py +1 -1
- package/helper-apps/cortex-autogen2/tools/search_tools.py +436 -238
- package/helper-apps/cortex-file-handler/package-lock.json +2 -2
- package/helper-apps/cortex-file-handler/package.json +1 -1
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +4 -5
- package/helper-apps/cortex-file-handler/src/blobHandler.js +36 -144
- package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +5 -3
- package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +34 -1
- package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +22 -0
- package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +28 -1
- package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +29 -4
- package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +11 -0
- package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +1 -1
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +3 -2
- package/helper-apps/cortex-file-handler/tests/checkHashShortLived.test.js +8 -1
- package/helper-apps/cortex-file-handler/tests/containerConversionFlow.test.js +5 -2
- package/helper-apps/cortex-file-handler/tests/containerNameParsing.test.js +14 -7
- package/helper-apps/cortex-file-handler/tests/containerParameterFlow.test.js +5 -2
- package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +31 -19
- package/package.json +1 -1
- package/server/modelExecutor.js +4 -0
- package/server/plugins/claude4VertexPlugin.js +540 -0
- package/server/plugins/openAiWhisperPlugin.js +43 -2
- package/tests/integration/rest/vendors/claude_streaming.test.js +121 -0
- package/tests/unit/plugins/claude4VertexPlugin.test.js +462 -0
- package/tests/unit/plugins/claude4VertexToolConversion.test.js +413 -0
- package/helper-apps/cortex-autogen/.funcignore +0 -8
- package/helper-apps/cortex-autogen/Dockerfile +0 -10
- package/helper-apps/cortex-autogen/OAI_CONFIG_LIST +0 -6
- package/helper-apps/cortex-autogen/agents.py +0 -493
- package/helper-apps/cortex-autogen/agents_extra.py +0 -14
- package/helper-apps/cortex-autogen/config.py +0 -18
- package/helper-apps/cortex-autogen/data_operations.py +0 -29
- package/helper-apps/cortex-autogen/function_app.py +0 -44
- package/helper-apps/cortex-autogen/host.json +0 -15
- package/helper-apps/cortex-autogen/main.py +0 -38
- package/helper-apps/cortex-autogen/prompts.py +0 -196
- package/helper-apps/cortex-autogen/prompts_extra.py +0 -5
- package/helper-apps/cortex-autogen/requirements.txt +0 -9
- package/helper-apps/cortex-autogen/search.py +0 -85
- package/helper-apps/cortex-autogen/test.sh +0 -40
- package/helper-apps/cortex-autogen/tools/sasfileuploader.py +0 -66
- package/helper-apps/cortex-autogen/utils.py +0 -88
- package/helper-apps/cortex-autogen2/DigiCertGlobalRootCA.crt.pem +0 -22
- package/helper-apps/cortex-autogen2/poetry.lock +0 -3652
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Database repository for test results storage and retrieval.
|
|
3
|
+
|
|
4
|
+
This module provides a clean interface for storing and querying test data
|
|
5
|
+
from the SQLite database.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sqlite3
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import List, Dict, Optional, Any
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TestRepository:
    """Repository for managing test results in SQLite database.

    Every public method opens a short-lived connection and always closes it
    (even on error), so the repository is safe to use from long-running
    processes without leaking file handles.
    """

    def __init__(self, db_path: Optional[str] = None):
        """
        Initialize the repository.

        Args:
            db_path: Path to SQLite database file. If None, uses default location
                (``test_results.db`` next to this module).
        """
        if db_path is None:
            db_dir = Path(__file__).parent
            db_path = db_dir / "test_results.db"

        self.db_path = str(db_path)
        self._initialize_database()

    def _initialize_database(self):
        """Create database and tables if they don't exist (schema is idempotent)."""
        schema_path = Path(__file__).parent / "schema.sql"

        with open(schema_path, 'r') as f:
            schema = f.read()

        conn = sqlite3.connect(self.db_path)
        try:
            conn.executescript(schema)
            conn.commit()
        finally:
            # Close even if executescript raises, so we don't leak the handle.
            conn.close()

    def _get_connection(self) -> sqlite3.Connection:
        """Get a database connection with row factory for dict-like row access."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        return conn

    # ==================== Internal helpers ====================
    # These centralize the connect/execute/commit/close dance so a raised
    # exception can never leave a connection open.

    def _execute_write(self, sql: str, params: tuple = ()) -> Optional[int]:
        """Run a single INSERT/UPDATE, commit, and return cursor.lastrowid."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(sql, params)
            conn.commit()
            return cursor.lastrowid
        finally:
            conn.close()

    def _fetch_all(self, sql: str, params: tuple = ()) -> List[Dict[str, Any]]:
        """Run a SELECT and return all rows as plain dicts."""
        conn = self._get_connection()
        try:
            rows = conn.execute(sql, params).fetchall()
            return [dict(row) for row in rows]
        finally:
            conn.close()

    def _fetch_one(self, sql: str, params: tuple = ()) -> Optional[Dict[str, Any]]:
        """Run a SELECT and return the first row as a dict, or None."""
        conn = self._get_connection()
        try:
            row = conn.execute(sql, params).fetchone()
            return dict(row) if row else None
        finally:
            conn.close()

    # ==================== Test Runs ====================

    def create_test_run(
        self,
        test_case_id: str,
        task_description: str,
        request_id: str
    ) -> int:
        """
        Create a new test run record with status 'running'.

        Returns:
            test_run_id: The ID of the created test run
        """
        return self._execute_write("""
            INSERT INTO test_runs (test_case_id, task_description, request_id, started_at, status)
            VALUES (?, ?, ?, ?, 'running')
        """, (test_case_id, task_description, request_id, datetime.now()))

    def update_test_run_status(
        self,
        test_run_id: int,
        status: str,
        completed_at: Optional[datetime] = None,
        error_message: Optional[str] = None
    ):
        """Update test run status and completion time.

        If *status* is terminal ('completed'/'failed'/'timeout') and no
        completion time is given, the current time is used. Duration is
        derived from the stored ``started_at`` timestamp when available.
        """
        if completed_at is None and status in ('completed', 'failed', 'timeout'):
            completed_at = datetime.now()

        conn = self._get_connection()
        try:
            cursor = conn.cursor()

            # Calculate duration if completed
            duration_seconds = None
            if completed_at:
                cursor.execute("SELECT started_at FROM test_runs WHERE id = ?", (test_run_id,))
                row = cursor.fetchone()
                if row:
                    # started_at was stored via datetime.now(); sqlite3 keeps it
                    # as an ISO-format string, which fromisoformat parses back.
                    started_at = datetime.fromisoformat(row['started_at'])
                    duration_seconds = (completed_at - started_at).total_seconds()

            cursor.execute("""
                UPDATE test_runs
                SET status = ?, completed_at = ?, duration_seconds = ?, error_message = ?
                WHERE id = ?
            """, (status, completed_at, duration_seconds, error_message, test_run_id))

            conn.commit()
        finally:
            conn.close()

    def save_final_response(self, test_run_id: int, final_response: str):
        """
        Save the final response message sent to the user.

        This stores the complete final message including file URLs,
        making it easy to retrieve outputs from any test run.

        Args:
            test_run_id: ID of the test run
            final_response: The complete final message text with URLs
        """
        self._execute_write("""
            UPDATE test_runs
            SET final_response = ?
            WHERE id = ?
        """, (final_response, test_run_id))

    def get_test_run(self, test_run_id: int) -> Optional[Dict[str, Any]]:
        """Get test run by ID, or None if it does not exist."""
        return self._fetch_one("SELECT * FROM test_runs WHERE id = ?", (test_run_id,))

    def get_recent_runs(self, test_case_id: Optional[str] = None, limit: int = 10) -> List[Dict[str, Any]]:
        """Get recent test runs (newest first), optionally filtered by test case."""
        if test_case_id:
            return self._fetch_all("""
                SELECT * FROM test_runs
                WHERE test_case_id = ?
                ORDER BY created_at DESC
                LIMIT ?
            """, (test_case_id, limit))
        return self._fetch_all("""
            SELECT * FROM test_runs
            ORDER BY created_at DESC
            LIMIT ?
        """, (limit,))

    # ==================== Progress Updates ====================

    def add_progress_update(
        self,
        test_run_id: int,
        timestamp: datetime,
        progress: float,
        info: str,
        is_final: bool = False
    ):
        """Add a progress update to the database."""
        self._execute_write("""
            INSERT INTO progress_updates (test_run_id, timestamp, progress, info, is_final)
            VALUES (?, ?, ?, ?, ?)
        """, (test_run_id, timestamp, progress, info, is_final))

    def get_progress_updates(self, test_run_id: int) -> List[Dict[str, Any]]:
        """Get all progress updates for a test run, oldest first."""
        return self._fetch_all("""
            SELECT * FROM progress_updates
            WHERE test_run_id = ?
            ORDER BY timestamp ASC
        """, (test_run_id,))

    # ==================== Logs ====================

    def add_log(
        self,
        test_run_id: int,
        timestamp: datetime,
        level: str,
        agent: Optional[str],
        message: str
    ):
        """Add a log entry."""
        self._execute_write("""
            INSERT INTO logs (test_run_id, timestamp, level, agent, message)
            VALUES (?, ?, ?, ?, ?)
        """, (test_run_id, timestamp, level, agent, message))

    def get_logs(self, test_run_id: int, level: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get logs for a test run, optionally filtered by level, oldest first."""
        if level:
            return self._fetch_all("""
                SELECT * FROM logs
                WHERE test_run_id = ? AND level = ?
                ORDER BY timestamp ASC
            """, (test_run_id, level))
        return self._fetch_all("""
            SELECT * FROM logs
            WHERE test_run_id = ?
            ORDER BY timestamp ASC
        """, (test_run_id,))

    # ==================== Files ====================

    def add_file(
        self,
        test_run_id: int,
        file_path: str,
        file_type: str,
        file_size_bytes: Optional[int] = None,
        sas_url: Optional[str] = None
    ):
        """Add a file created during test execution."""
        self._execute_write("""
            INSERT INTO files_created (test_run_id, file_path, file_type, file_size_bytes, sas_url)
            VALUES (?, ?, ?, ?, ?)
        """, (test_run_id, file_path, file_type, file_size_bytes, sas_url))

    def get_files(self, test_run_id: int) -> List[Dict[str, Any]]:
        """Get all files created during a test run, oldest first."""
        return self._fetch_all("""
            SELECT * FROM files_created
            WHERE test_run_id = ?
            ORDER BY created_at ASC
        """, (test_run_id,))

    # ==================== Evaluations ====================

    def save_evaluation(
        self,
        test_run_id: int,
        progress_score: int,
        output_score: int,
        progress_reasoning: str,
        output_reasoning: str,
        progress_issues: Optional[List[str]] = None,
        output_strengths: Optional[List[str]] = None,
        output_weaknesses: Optional[List[str]] = None
    ):
        """Save evaluation scores and reasoning.

        The overall score is a weighted blend: 80% output quality,
        20% progress quality. Re-saving replaces the previous evaluation
        (the table has a UNIQUE constraint on test_run_id).
        """
        overall_score = int((output_score * 0.8) + (progress_score * 0.2))

        self._execute_write("""
            INSERT OR REPLACE INTO evaluations (
                test_run_id, progress_score, output_score, overall_score,
                progress_reasoning, output_reasoning,
                progress_issues, output_strengths, output_weaknesses
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            test_run_id, progress_score, output_score, overall_score,
            progress_reasoning, output_reasoning,
            json.dumps(progress_issues or []),
            json.dumps(output_strengths or []),
            json.dumps(output_weaknesses or [])
        ))

    def get_evaluation(self, test_run_id: int) -> Optional[Dict[str, Any]]:
        """Get evaluation for a test run with JSON list fields decoded."""
        result = self._fetch_one(
            "SELECT * FROM evaluations WHERE test_run_id = ?", (test_run_id,)
        )

        if result:
            # Parse JSON fields back into Python lists
            result['progress_issues'] = json.loads(result['progress_issues'])
            result['output_strengths'] = json.loads(result['output_strengths'])
            result['output_weaknesses'] = json.loads(result['output_weaknesses'])

        return result

    # ==================== Metrics ====================

    def save_metrics(
        self,
        test_run_id: int,
        time_to_first_progress: float,
        time_to_completion: float,
        total_progress_updates: int,
        avg_update_interval: float,
        min_update_interval: float,
        max_update_interval: float,
        files_created: int,
        sas_urls_provided: int,
        errors_count: int,
        warnings_count: int
    ):
        """Save performance metrics (replaces any previous metrics for the run)."""
        self._execute_write("""
            INSERT OR REPLACE INTO metrics (
                test_run_id, time_to_first_progress, time_to_completion,
                total_progress_updates, avg_update_interval,
                min_update_interval, max_update_interval,
                files_created, sas_urls_provided,
                errors_count, warnings_count
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            test_run_id, time_to_first_progress, time_to_completion,
            total_progress_updates, avg_update_interval,
            min_update_interval, max_update_interval,
            files_created, sas_urls_provided,
            errors_count, warnings_count
        ))

    def get_metrics(self, test_run_id: int) -> Optional[Dict[str, Any]]:
        """Get metrics for a test run, or None if not recorded."""
        return self._fetch_one("SELECT * FROM metrics WHERE test_run_id = ?", (test_run_id,))

    # ==================== Suggestions ====================

    def add_suggestion(
        self,
        test_run_id: int,
        suggestion: str,
        category: str = 'other',
        priority: str = 'medium',
        code_reference: Optional[str] = None
    ):
        """Add an improvement suggestion."""
        self._execute_write("""
            INSERT INTO suggestions (test_run_id, suggestion, category, priority, code_reference)
            VALUES (?, ?, ?, ?, ?)
        """, (test_run_id, suggestion, category, priority, code_reference))

    def get_suggestions(self, test_run_id: int) -> List[Dict[str, Any]]:
        """Get all suggestions for a test run, highest priority first.

        BUGFIX: `ORDER BY priority DESC` sorted the TEXT column lexically,
        which put 'medium' before 'low' before 'high'. A CASE expression maps
        the labels to their semantic rank so 'high' sorts first.
        """
        return self._fetch_all("""
            SELECT * FROM suggestions
            WHERE test_run_id = ?
            ORDER BY CASE priority
                         WHEN 'high' THEN 0
                         WHEN 'medium' THEN 1
                         ELSE 2
                     END,
                     created_at ASC
        """, (test_run_id,))

    # ==================== Analytics ====================

    def get_average_scores(self, test_case_id: Optional[str] = None, limit: int = 10) -> Dict[str, float]:
        """Get average scores over the most recent completed test runs."""
        if test_case_id:
            row = self._fetch_one("""
                SELECT AVG(e.progress_score) as avg_progress,
                       AVG(e.output_score) as avg_output,
                       AVG(e.overall_score) as avg_overall
                FROM evaluations e
                JOIN test_runs tr ON e.test_run_id = tr.id
                WHERE tr.test_case_id = ? AND tr.status = 'completed'
                  AND tr.id IN (
                      SELECT id FROM test_runs
                      WHERE test_case_id = ?
                      ORDER BY created_at DESC
                      LIMIT ?
                  )
            """, (test_case_id, test_case_id, limit))
        else:
            row = self._fetch_one("""
                SELECT AVG(e.progress_score) as avg_progress,
                       AVG(e.output_score) as avg_output,
                       AVG(e.overall_score) as avg_overall
                FROM evaluations e
                JOIN test_runs tr ON e.test_run_id = tr.id
                WHERE tr.status = 'completed'
                  AND tr.id IN (
                      SELECT id FROM test_runs
                      ORDER BY created_at DESC
                      LIMIT ?
                  )
            """, (limit,))

        if row:
            # AVG over zero rows yields NULL; coalesce to 0 for callers.
            return {
                'avg_progress_score': row['avg_progress'] or 0,
                'avg_output_score': row['avg_output'] or 0,
                'avg_overall_score': row['avg_overall'] or 0
            }

        return {'avg_progress_score': 0, 'avg_output_score': 0, 'avg_overall_score': 0}

    def get_score_trend(self, test_case_id: str, limit: int = 20) -> List[Dict[str, Any]]:
        """Get score trend over time for a specific test case (oldest first)."""
        return self._fetch_all("""
            SELECT tr.created_at, e.progress_score, e.output_score, e.overall_score
            FROM test_runs tr
            JOIN evaluations e ON tr.id = e.test_run_id
            WHERE tr.test_case_id = ? AND tr.status = 'completed'
            ORDER BY tr.created_at ASC
            LIMIT ?
        """, (test_case_id, limit))
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
-- Cortex AutoGen2 Test Results Database Schema
-- SQLite database for storing test runs, evaluations, and metrics
--
-- NOTE(review): the ON DELETE CASCADE clauses below only take effect if the
-- connection runs `PRAGMA foreign_keys = ON` (SQLite defaults to OFF) --
-- confirm callers enable it before relying on cascading deletes.

-- Test runs table - stores information about each test execution
CREATE TABLE IF NOT EXISTS test_runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    test_case_id TEXT NOT NULL,
    task_description TEXT NOT NULL,
    request_id TEXT UNIQUE,
    started_at TIMESTAMP NOT NULL,
    completed_at TIMESTAMP,
    duration_seconds REAL,
    status TEXT CHECK(status IN ('running', 'completed', 'failed', 'timeout')) NOT NULL DEFAULT 'running',
    error_message TEXT,
    final_response TEXT, -- Stores the complete final message sent to user with file URLs
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Progress updates collected during test execution
CREATE TABLE IF NOT EXISTS progress_updates (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    test_run_id INTEGER NOT NULL,
    timestamp TIMESTAMP NOT NULL,
    progress REAL,
    info TEXT,
    is_final BOOLEAN DEFAULT 0,
    FOREIGN KEY (test_run_id) REFERENCES test_runs(id) ON DELETE CASCADE
);

-- Docker logs collected during test execution
CREATE TABLE IF NOT EXISTS logs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    test_run_id INTEGER NOT NULL,
    timestamp TIMESTAMP NOT NULL,
    level TEXT,
    agent TEXT,
    message TEXT,
    FOREIGN KEY (test_run_id) REFERENCES test_runs(id) ON DELETE CASCADE
);

-- Files created during test execution
CREATE TABLE IF NOT EXISTS files_created (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    test_run_id INTEGER NOT NULL,
    file_path TEXT,
    file_type TEXT,
    file_size_bytes INTEGER,
    sas_url TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (test_run_id) REFERENCES test_runs(id) ON DELETE CASCADE
);

-- LLM-based evaluation scores (one row per test run; UNIQUE enables upsert
-- via INSERT OR REPLACE from the repository layer)
CREATE TABLE IF NOT EXISTS evaluations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    test_run_id INTEGER UNIQUE NOT NULL,
    progress_score INTEGER CHECK(progress_score BETWEEN 0 AND 100),
    output_score INTEGER CHECK(output_score BETWEEN 0 AND 100),
    overall_score INTEGER CHECK(overall_score BETWEEN 0 AND 100),
    progress_reasoning TEXT,
    output_reasoning TEXT,
    progress_issues TEXT, -- JSON array of issues found
    output_strengths TEXT, -- JSON array of strengths
    output_weaknesses TEXT, -- JSON array of weaknesses
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (test_run_id) REFERENCES test_runs(id) ON DELETE CASCADE
);

-- Performance metrics (one row per test run; UNIQUE enables upsert)
CREATE TABLE IF NOT EXISTS metrics (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    test_run_id INTEGER UNIQUE NOT NULL,
    time_to_first_progress REAL,
    time_to_completion REAL,
    total_progress_updates INTEGER,
    avg_update_interval REAL,
    min_update_interval REAL,
    max_update_interval REAL,
    files_created INTEGER,
    sas_urls_provided INTEGER,
    errors_count INTEGER,
    warnings_count INTEGER,
    FOREIGN KEY (test_run_id) REFERENCES test_runs(id) ON DELETE CASCADE
);

-- Improvement suggestions from LLM analysis
-- NOTE(review): `priority` is stored as text, so a plain ORDER BY on this
-- column sorts lexically ('high' < 'low' < 'medium'), not by semantic rank;
-- queries that want high-first ordering must map labels explicitly.
CREATE TABLE IF NOT EXISTS suggestions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    test_run_id INTEGER NOT NULL,
    suggestion TEXT NOT NULL,
    category TEXT CHECK(category IN ('performance', 'quality', 'reliability', 'other')) DEFAULT 'other',
    priority TEXT CHECK(priority IN ('high', 'medium', 'low')) DEFAULT 'medium',
    code_reference TEXT, -- File path or agent name related to suggestion
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (test_run_id) REFERENCES test_runs(id) ON DELETE CASCADE
);

-- Create indexes for better query performance
CREATE INDEX IF NOT EXISTS idx_test_runs_test_case ON test_runs(test_case_id);
CREATE INDEX IF NOT EXISTS idx_test_runs_status ON test_runs(status);
CREATE INDEX IF NOT EXISTS idx_test_runs_created_at ON test_runs(created_at);
CREATE INDEX IF NOT EXISTS idx_progress_updates_run ON progress_updates(test_run_id);
CREATE INDEX IF NOT EXISTS idx_progress_updates_timestamp ON progress_updates(timestamp);
CREATE INDEX IF NOT EXISTS idx_logs_run ON logs(test_run_id);
CREATE INDEX IF NOT EXISTS idx_logs_level ON logs(level);
CREATE INDEX IF NOT EXISTS idx_files_created_run ON files_created(test_run_id);
CREATE INDEX IF NOT EXISTS idx_suggestions_run ON suggestions(test_run_id);
CREATE INDEX IF NOT EXISTS idx_suggestions_priority ON suggestions(priority);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""LLM-based evaluators for scoring test results."""
|