@aj-archipelago/cortex 1.4.2 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/README.md +1 -0
  2. package/config.js +1 -1
  3. package/helper-apps/cortex-autogen2/.dockerignore +1 -0
  4. package/helper-apps/cortex-autogen2/Dockerfile +6 -10
  5. package/helper-apps/cortex-autogen2/Dockerfile.worker +2 -0
  6. package/helper-apps/cortex-autogen2/agents.py +203 -2
  7. package/helper-apps/cortex-autogen2/main.py +1 -1
  8. package/helper-apps/cortex-autogen2/pyproject.toml +12 -0
  9. package/helper-apps/cortex-autogen2/requirements.txt +14 -0
  10. package/helper-apps/cortex-autogen2/services/redis_publisher.py +1 -1
  11. package/helper-apps/cortex-autogen2/services/run_analyzer.py +1 -1
  12. package/helper-apps/cortex-autogen2/task_processor.py +431 -229
  13. package/helper-apps/cortex-autogen2/test_entity_fetcher.py +305 -0
  14. package/helper-apps/cortex-autogen2/tests/README.md +240 -0
  15. package/helper-apps/cortex-autogen2/tests/TEST_REPORT.md +342 -0
  16. package/helper-apps/cortex-autogen2/tests/__init__.py +8 -0
  17. package/helper-apps/cortex-autogen2/tests/analysis/__init__.py +1 -0
  18. package/helper-apps/cortex-autogen2/tests/analysis/improvement_suggester.py +224 -0
  19. package/helper-apps/cortex-autogen2/tests/analysis/trend_analyzer.py +211 -0
  20. package/helper-apps/cortex-autogen2/tests/cli/__init__.py +1 -0
  21. package/helper-apps/cortex-autogen2/tests/cli/run_tests.py +296 -0
  22. package/helper-apps/cortex-autogen2/tests/collectors/__init__.py +1 -0
  23. package/helper-apps/cortex-autogen2/tests/collectors/log_collector.py +252 -0
  24. package/helper-apps/cortex-autogen2/tests/collectors/progress_collector.py +182 -0
  25. package/helper-apps/cortex-autogen2/tests/conftest.py +15 -0
  26. package/helper-apps/cortex-autogen2/tests/database/__init__.py +1 -0
  27. package/helper-apps/cortex-autogen2/tests/database/repository.py +501 -0
  28. package/helper-apps/cortex-autogen2/tests/database/schema.sql +108 -0
  29. package/helper-apps/cortex-autogen2/tests/evaluators/__init__.py +1 -0
  30. package/helper-apps/cortex-autogen2/tests/evaluators/llm_scorer.py +294 -0
  31. package/helper-apps/cortex-autogen2/tests/evaluators/prompts.py +250 -0
  32. package/helper-apps/cortex-autogen2/tests/evaluators/wordcloud_validator.py +168 -0
  33. package/helper-apps/cortex-autogen2/tests/metrics/__init__.py +1 -0
  34. package/helper-apps/cortex-autogen2/tests/metrics/collector.py +155 -0
  35. package/helper-apps/cortex-autogen2/tests/orchestrator.py +576 -0
  36. package/helper-apps/cortex-autogen2/tests/test_cases.yaml +279 -0
  37. package/helper-apps/cortex-autogen2/tests/test_data.db +0 -0
  38. package/helper-apps/cortex-autogen2/tests/utils/__init__.py +3 -0
  39. package/helper-apps/cortex-autogen2/tests/utils/connectivity.py +112 -0
  40. package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +74 -24
  41. package/helper-apps/cortex-autogen2/tools/entity_api_registry.json +38 -0
  42. package/helper-apps/cortex-autogen2/tools/file_tools.py +1 -1
  43. package/helper-apps/cortex-autogen2/tools/search_tools.py +436 -238
  44. package/helper-apps/cortex-file-handler/package-lock.json +2 -2
  45. package/helper-apps/cortex-file-handler/package.json +1 -1
  46. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +4 -5
  47. package/helper-apps/cortex-file-handler/src/blobHandler.js +36 -144
  48. package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +5 -3
  49. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +34 -1
  50. package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +22 -0
  51. package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +28 -1
  52. package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +29 -4
  53. package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +11 -0
  54. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +1 -1
  55. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +3 -2
  56. package/helper-apps/cortex-file-handler/tests/checkHashShortLived.test.js +8 -1
  57. package/helper-apps/cortex-file-handler/tests/containerConversionFlow.test.js +5 -2
  58. package/helper-apps/cortex-file-handler/tests/containerNameParsing.test.js +14 -7
  59. package/helper-apps/cortex-file-handler/tests/containerParameterFlow.test.js +5 -2
  60. package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +31 -19
  61. package/package.json +1 -1
  62. package/server/modelExecutor.js +4 -0
  63. package/server/plugins/claude4VertexPlugin.js +540 -0
  64. package/server/plugins/openAiWhisperPlugin.js +43 -2
  65. package/tests/integration/rest/vendors/claude_streaming.test.js +121 -0
  66. package/tests/unit/plugins/claude4VertexPlugin.test.js +462 -0
  67. package/tests/unit/plugins/claude4VertexToolConversion.test.js +413 -0
  68. package/helper-apps/cortex-autogen/.funcignore +0 -8
  69. package/helper-apps/cortex-autogen/Dockerfile +0 -10
  70. package/helper-apps/cortex-autogen/OAI_CONFIG_LIST +0 -6
  71. package/helper-apps/cortex-autogen/agents.py +0 -493
  72. package/helper-apps/cortex-autogen/agents_extra.py +0 -14
  73. package/helper-apps/cortex-autogen/config.py +0 -18
  74. package/helper-apps/cortex-autogen/data_operations.py +0 -29
  75. package/helper-apps/cortex-autogen/function_app.py +0 -44
  76. package/helper-apps/cortex-autogen/host.json +0 -15
  77. package/helper-apps/cortex-autogen/main.py +0 -38
  78. package/helper-apps/cortex-autogen/prompts.py +0 -196
  79. package/helper-apps/cortex-autogen/prompts_extra.py +0 -5
  80. package/helper-apps/cortex-autogen/requirements.txt +0 -9
  81. package/helper-apps/cortex-autogen/search.py +0 -85
  82. package/helper-apps/cortex-autogen/test.sh +0 -40
  83. package/helper-apps/cortex-autogen/tools/sasfileuploader.py +0 -66
  84. package/helper-apps/cortex-autogen/utils.py +0 -88
  85. package/helper-apps/cortex-autogen2/DigiCertGlobalRootCA.crt.pem +0 -22
  86. package/helper-apps/cortex-autogen2/poetry.lock +0 -3652
@@ -0,0 +1,294 @@
1
+ """
2
+ LLM-based evaluator for scoring test results using Cortex API.
3
+
4
+ Uses Cortex LLM API to evaluate progress updates and final outputs,
5
+ providing scores (0-100) and detailed reasoning.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import logging
11
+ import asyncio
12
+ import httpx
13
+ from typing import Dict, List, Optional, Tuple
14
+ from .prompts import (
15
+ PROGRESS_EVALUATION_PROMPT,
16
+ OUTPUT_EVALUATION_PROMPT,
17
+ format_progress_updates_for_evaluation,
18
+ format_files_for_evaluation,
19
+ format_test_summary_for_evaluation
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class LLMEvaluator:
26
+ """Evaluates test results using LLM (Cortex API)."""
27
+
28
+ def __init__(
29
+ self,
30
+ api_base_url: Optional[str] = None,
31
+ api_key: Optional[str] = None,
32
+ model: str = "gpt-4.1" # Use fast model for evaluation
33
+ ):
34
+ """
35
+ Initialize the LLM evaluator.
36
+
37
+ Args:
38
+ api_base_url: Cortex API base URL (defaults to env var CORTEX_API_BASE_URL)
39
+ api_key: Cortex API key (defaults to env var CORTEX_API_KEY)
40
+ model: Model to use for evaluation
41
+ """
42
+ self.api_base_url = api_base_url or os.getenv("CORTEX_API_BASE_URL", "http://localhost:4000/v1")
43
+ self.api_key = api_key or os.getenv("CORTEX_API_KEY")
44
+ self.model = model
45
+
46
+ if not self.api_key:
47
+ raise ValueError("CORTEX_API_KEY environment variable must be set")
48
+
49
+ logger.info(f"🤖 LLM Evaluator initialized")
50
+ logger.info(f" API URL: {self.api_base_url}")
51
+ logger.info(f" Model: {self.model}")
52
+
53
+ async def score_progress_updates(
54
+ self,
55
+ progress_updates: List[Dict],
56
+ task: str
57
+ ) -> Dict:
58
+ """
59
+ Score progress updates (0-100).
60
+
61
+ Args:
62
+ progress_updates: List of progress update dictionaries
63
+ task: The original task description
64
+
65
+ Returns:
66
+ Dictionary with score, reasoning, issues, and strengths
67
+ """
68
+ if not progress_updates:
69
+ logger.warning("No progress updates to evaluate")
70
+ return {
71
+ 'score': 0,
72
+ 'reasoning': "No progress updates were received during task execution.",
73
+ 'issues': ["Zero progress updates received"],
74
+ 'strengths': []
75
+ }
76
+
77
+ logger.info(f"📊 Evaluating {len(progress_updates)} progress updates...")
78
+
79
+ # Format updates for prompt
80
+ updates_formatted = format_progress_updates_for_evaluation(progress_updates)
81
+
82
+ # Build prompt
83
+ prompt = PROGRESS_EVALUATION_PROMPT.format(
84
+ progress_updates=updates_formatted,
85
+ task=task
86
+ )
87
+
88
+ # Call LLM
89
+ try:
90
+ result = await self._call_llm(prompt)
91
+
92
+ # Parse JSON response
93
+ evaluation = json.loads(result)
94
+
95
+ logger.info(f" Progress Score: {evaluation['score']}/100")
96
+ return evaluation
97
+
98
+ except json.JSONDecodeError as e:
99
+ logger.error(f"Failed to parse LLM response as JSON: {e}")
100
+ logger.debug(f"Raw response: {result}")
101
+
102
+ return {
103
+ 'score': 50,
104
+ 'reasoning': "LLM response could not be parsed. Manual review required.",
105
+ 'issues': ["Failed to parse LLM evaluation response"],
106
+ 'strengths': []
107
+ }
108
+ except Exception as e:
109
+ logger.error(f"Error scoring progress updates: {e}", exc_info=True)
110
+
111
+ return {
112
+ 'score': 0,
113
+ 'reasoning': f"Evaluation failed: {str(e)}",
114
+ 'issues': [str(e)],
115
+ 'strengths': []
116
+ }
117
+
118
+ async def score_final_output(
119
+ self,
120
+ task: str,
121
+ final_result: Optional[Dict],
122
+ files_created: List[Dict],
123
+ test_summary: Dict
124
+ ) -> Dict:
125
+ """
126
+ Score final output (0-100).
127
+
128
+ Args:
129
+ task: The original task description
130
+ final_result: Final result data from progress updates
131
+ files_created: List of files created during execution
132
+ test_summary: Summary of test run (duration, errors, etc.)
133
+
134
+ Returns:
135
+ Dictionary with score, reasoning, strengths, and weaknesses
136
+ """
137
+ logger.info(f"📊 Evaluating final output...")
138
+
139
+ # Format data for prompt
140
+ final_result_str = json.dumps(final_result, indent=2) if final_result else "No final result data"
141
+ files_str = format_files_for_evaluation(files_created)
142
+ summary_str = format_test_summary_for_evaluation(test_summary)
143
+
144
+ # Build prompt
145
+ prompt = OUTPUT_EVALUATION_PROMPT.format(
146
+ task=task,
147
+ final_result=final_result_str,
148
+ files_created=files_str,
149
+ test_summary=summary_str
150
+ )
151
+
152
+ # Call LLM
153
+ try:
154
+ result = await self._call_llm(prompt)
155
+
156
+ # Parse JSON response
157
+ evaluation = json.loads(result)
158
+
159
+ logger.info(f" Output Score: {evaluation['score']}/100")
160
+ return evaluation
161
+
162
+ except json.JSONDecodeError as e:
163
+ logger.error(f"Failed to parse LLM response as JSON: {e}")
164
+ logger.debug(f"Raw response: {result}")
165
+
166
+ return {
167
+ 'score': 50,
168
+ 'reasoning': "LLM response could not be parsed. Manual review required.",
169
+ 'strengths': [],
170
+ 'weaknesses': ["Failed to parse LLM evaluation response"]
171
+ }
172
+ except Exception as e:
173
+ logger.error(f"Error scoring final output: {e}", exc_info=True)
174
+
175
+ return {
176
+ 'score': 0,
177
+ 'reasoning': f"Evaluation failed: {str(e)}",
178
+ 'strengths': [],
179
+ 'weaknesses': [str(e)]
180
+ }
181
+
182
+ async def evaluate_test_run(
183
+ self,
184
+ task: str,
185
+ progress_updates: List[Dict],
186
+ final_result: Optional[Dict],
187
+ files_created: List[Dict],
188
+ test_summary: Dict
189
+ ) -> Tuple[Dict, Dict]:
190
+ """
191
+ Evaluate both progress updates and final output.
192
+
193
+ Args:
194
+ task: The original task description
195
+ progress_updates: List of progress updates
196
+ final_result: Final result data
197
+ files_created: List of files created
198
+ test_summary: Test run summary
199
+
200
+ Returns:
201
+ Tuple of (progress_evaluation, output_evaluation)
202
+ """
203
+ logger.info("🎯 Starting complete test run evaluation")
204
+
205
+ # Score progress updates
206
+ progress_eval = await self.score_progress_updates(progress_updates, task)
207
+
208
+ # Score final output
209
+ output_eval = await self.score_final_output(
210
+ task,
211
+ final_result,
212
+ files_created,
213
+ test_summary
214
+ )
215
+
216
+ # Calculate overall score
217
+ overall_score = int((progress_eval['score'] + output_eval['score']) / 2)
218
+
219
+ logger.info(f"✅ Evaluation complete:")
220
+ logger.info(f" Progress: {progress_eval['score']}/100")
221
+ logger.info(f" Output: {output_eval['score']}/100")
222
+ logger.info(f" Overall: {overall_score}/100")
223
+
224
+ return progress_eval, output_eval
225
+
226
+ async def _call_llm(self, prompt: str) -> str:
227
+ """
228
+ Call the Cortex LLM API.
229
+
230
+ Args:
231
+ prompt: The prompt to send
232
+
233
+ Returns:
234
+ LLM response text
235
+ """
236
+ url = f"{self.api_base_url}/chat/completions"
237
+
238
+ headers = {
239
+ "Authorization": f"Bearer {self.api_key}",
240
+ "Content-Type": "application/json"
241
+ }
242
+
243
+ payload = {
244
+ "model": self.model,
245
+ "messages": [
246
+ {
247
+ "role": "system",
248
+ "content": "You are an expert evaluator. Always respond with valid JSON only, no markdown formatting or extra text."
249
+ },
250
+ {
251
+ "role": "user",
252
+ "content": prompt
253
+ }
254
+ ],
255
+ "temperature": 0.3, # Low temperature for consistent evaluation
256
+ "max_tokens": 2000
257
+ }
258
+
259
+ max_retries = 3
260
+ base_delay = 2.0
261
+
262
+ for attempt in range(max_retries):
263
+ try:
264
+ async with httpx.AsyncClient(timeout=180.0) as client:
265
+ response = await client.post(url, headers=headers, json=payload)
266
+ response.raise_for_status()
267
+
268
+ data = response.json()
269
+
270
+ # Extract content from OpenAI-format response
271
+ content = data['choices'][0]['message']['content']
272
+
273
+ # Remove markdown code fences if present
274
+ content = content.strip()
275
+ if content.startswith('```json'):
276
+ content = content[7:]
277
+ if content.startswith('```'):
278
+ content = content[3:]
279
+ if content.endswith('```'):
280
+ content = content[:-3]
281
+
282
+ return content.strip()
283
+
284
+ except (httpx.TimeoutException, httpx.ReadTimeout, httpx.ConnectTimeout) as e:
285
+ if attempt < max_retries - 1:
286
+ delay = base_delay * (2 ** attempt) # Exponential backoff
287
+ logger.warning(f"LLM call timeout (attempt {attempt + 1}/{max_retries}), retrying in {delay}s: {e}")
288
+ await asyncio.sleep(delay)
289
+ else:
290
+ logger.error(f"LLM call failed after {max_retries} attempts: {e}")
291
+ raise
292
+ except Exception as e:
293
+ # Re-raise non-timeout exceptions immediately
294
+ raise
@@ -0,0 +1,250 @@
1
+ """
2
+ Evaluation prompts for LLM-based scoring.
3
+
4
+ These prompts define the criteria and rubrics for scoring
5
+ progress updates and final outputs.
6
+ """
7
+
8
+ PROGRESS_EVALUATION_PROMPT = """You are an expert evaluator assessing the quality of progress updates from an AI agent system.
9
+
10
+ **Progress Updates to Evaluate:**
11
+ {progress_updates}
12
+
13
+ **Task Being Executed:**
14
+ {task}
15
+
16
+ **Evaluation Criteria (0-100 points):**
17
+
18
+ 1. **Frequency & Timing (25 points)**
19
+ - Excellent: Frequent updates (1-5 seconds) acting as heartbeat - EVEN IF at same percentage
20
+ - Good: Regular updates every 5-10 seconds
21
+ - Fair: Updates >10 seconds apart but no major gaps
22
+ - Poor: Large gaps (>30s) with no updates indicating system may be stuck
23
+ - NOTE: Repeated updates at the same percentage are INTENTIONAL heartbeats to show the system is alive
24
+
25
+ 2. **Clarity & Informativeness (25 points)**
26
+ - Excellent: Uses emojis, concise descriptions, tells what's happening
27
+ - Good: Clear messages but lacks emojis or detail
28
+ - Fair: Vague messages like "Processing..." without specifics
29
+ - Poor: Confusing or misleading messages
30
+
31
+ 3. **Progress Accuracy (25 points)**
32
+ - Excellent: Progress % increases logically when tasks complete
33
+ - Good: Progress advances steadily through major phases
34
+ - Fair: Some irregular jumps (e.g., 17% → 95%) but reaches completion
35
+ - Poor: Progress goes backwards or never reaches completion
36
+ - NOTE: Progress staying at same % for extended periods is ACCEPTABLE (heartbeat behavior)
37
+
38
+ 4. **Coverage (25 points)**
39
+ - Excellent: All important steps communicated (planning, data fetching, processing, uploading)
40
+ - Good: Most steps covered
41
+ - Fair: Missing some key steps
42
+ - Poor: Very sparse updates, missing most steps
43
+
44
+ **Instructions:**
45
+ 1. Analyze the progress updates carefully
46
+ 2. Calculate a score from 0-100 based on the criteria above
47
+ 3. Provide specific reasoning for your score
48
+ 4. List any ACTUAL issues found (NOT frequent updates at same percentage - those are heartbeats!)
49
+ 5. Only flag gaps >30 seconds as issues, not frequent heartbeat updates
50
+
51
+ **Return JSON format:**
52
+ ```json
53
+ {{
54
+ "score": 85,
55
+ "reasoning": "Updates were frequent (avg 2.1s interval) acting as heartbeats, which is excellent. Progress percentage advanced logically through major phases. All major steps were communicated clearly with good emoji usage.",
56
+ "issues": [
57
+ "One gap of 35 seconds between updates during image download phase"
58
+ ],
59
+ "strengths": [
60
+ "Excellent heartbeat frequency (1-3 second intervals)",
61
+ "Excellent use of emojis for clarity",
62
+ "Clear descriptions of what's happening at each step",
63
+ "Progress advanced logically when tasks completed"
64
+ ]
65
+ }}
66
+ ```
67
+
68
+ Now evaluate the progress updates above and return ONLY the JSON response."""
69
+
70
+
71
+ OUTPUT_EVALUATION_PROMPT = """You are an expert evaluator assessing the quality of outputs from an AI agent system that creates professional, insightful presentations and deliverables.
72
+
73
+ **Original Task:**
74
+ {task}
75
+
76
+ **Final Result Data:**
77
+ {final_result}
78
+
79
+ **Files Created:**
80
+ {files_created}
81
+
82
+ **Test Run Summary:**
83
+ {test_summary}
84
+
85
+ **Evaluation Criteria (0-100 points):**
86
+
87
+ 1. **Answer Quality (25 points)**
88
+ - Excellent: Directly answers user's question with clear insights, no file dumps
89
+ - Good: Provides useful information but could be more focused on the question
90
+ - Fair: Includes some answer but mostly lists files
91
+ - Poor: Just dumps files without answering the question
92
+
93
+ 2. **Insight & Analysis (25 points)**
94
+ - Excellent: Extracts key findings, trends, surprises; explains "why it matters"
95
+ - Good: Provides some analysis but could go deeper
96
+ - Fair: Basic facts without interpretation
97
+ - Poor: No analysis, just raw data or file lists
98
+
99
+ 3. **Professional Presentation (25 points)**
100
+ - Excellent: Structured like great article (hook→insights→evidence→next steps), strategic emojis, engaging tone
101
+ - Good: Well-organized but could be more engaging
102
+ - Fair: Basic structure, functional but not compelling
103
+ - Poor: Disorganized, unprofessional, hard to read
104
+
105
+ 4. **Deliverable Integration (25 points)**
106
+ - Excellent: Primary deliverable prominently highlighted with hero treatment; preview images are clickable and link to main file; supporting files clearly separated; professional visual styling (borders, formatting)
107
+ - Good: Primary deliverable identified but could be more prominent; preview images shown but not interactive; files somewhat organized
108
+ - Fair: Files listed but primary deliverable not clearly distinguished from supporting files; preview images shown as regular images without download links
109
+ - Poor: Files dumped without organization; no distinction between primary and supporting deliverables; preview images missing or not utilized
110
+
111
+ **Special Considerations:**
112
+ - **Answer First**: Prioritize how well it answers the original question over file completeness
113
+ - **Insight Focus**: Reward analysis, trends, surprises over raw data dumps
114
+ - **Professional Structure**: Executive summary → Key insights → Visual evidence → Clean deliverables → Next steps
115
+ - **Engagement**: Strategic use of formatting, emojis, clear confident language (avoid "I think", "maybe")
116
+ - **Chart Integration**: Charts should illustrate insights, not just be separate dumps
117
+ - **SAS URLs**: All files must have working SAS URLs for download
118
+ - **PRIMARY DELIVERABLE PROMINENCE**: When task requests specific file type (PPTX, PDF, Excel), that file must be prominently featured with hero treatment, clear download link, and preview images that link to the main file
119
+ - **PREVIEW IMAGE INTERACTIVITY**: Preview images for PPTX/PDF should be clickable and link to the main deliverable file with visual styling (borders, hover indication)
120
+ - **BONUS +5-10 points**: Award extra for proactive helpful visualizations or analysis not explicitly requested
121
+
122
+ **Instructions:**
123
+ 1. Analyze all aspects of the output
124
+ 2. Calculate a score from 0-100 based on criteria above
125
+ 3. Provide specific reasoning
126
+ 4. List specific strengths and weaknesses
127
+
128
+ **Return JSON format:**
129
+ ```json
130
+ {{
131
+ "score": 95,
132
+ "reasoning": "Outstanding Pokemon presentation that directly answers the question with professional insights. Starts with executive summary highlighting 12 Pokemon collected, then provides specific design highlights and visual evidence woven throughout. Files are presented as supporting evidence, not the main event. Professional structure with strategic emojis and engaging tone.",
133
+ "strengths": [
134
+ "Directly answers user's request for 'Most Powerful Gen 1 Pokemon PowerPoint'",
135
+ "Professional structure: summary → insights → evidence → deliverables",
136
+ "Charts/images integrated into narrative (described before shown)",
137
+ "Clear insights about design choices and image quality",
138
+ "All SAS URLs provided with descriptive names",
139
+ "Engaging, confident tone throughout"
140
+ ],
141
+ "weaknesses": []
142
+ }}
143
+ ```
144
+
145
+ **Example of EXCELLENT new presentation style:**
146
+ ```json
147
+ {{
148
+ "score": 98,
149
+ "reasoning": "Perfect example of insight-focused presentation with excellent primary deliverable highlighting. Task requested PowerPoint, and response features it prominently with hero treatment: dedicated section, clickable preview images linking to PPTX download, clear download button with file size. Preview images have professional styling (borders, rounded corners). Supporting files (PDF, data) clearly separated in 'Additional Resources' section. Provides meaningful insights before showing deliverables. Professional structure with executive summary, insights, evidence, and next steps. Bonus +5 points for proactive PDF version and additional charts.",
150
+ "strengths": [
151
+ "Primary deliverable (PPTX) prominently featured with hero treatment in dedicated section",
152
+ "Preview images are clickable and link to main PPTX file for instant download",
153
+ "Professional visual styling on previews (borders, rounded corners, cursor indication)",
154
+ "Clear download button with file size (2.1 MB) for transparency",
155
+ "Supporting files clearly separated in 'Additional Resources' section",
156
+ "Answers question immediately with executive summary",
157
+ "Provides meaningful insights before showing deliverables",
158
+ "Professional structure: hook → insights → primary deliverable → supporting files → next steps",
159
+ "Strategic use of emojis and formatting",
160
+ "All files with working SAS URLs"
161
+ ],
162
+ "weaknesses": []
163
+ }}
164
+ ```
165
+
166
+ **Example of POOR old-style file dump:**
167
+ ```json
168
+ {{
169
+ "score": 45,
170
+ "reasoning": "Traditional file dump approach that doesn't answer the user's question. Just lists deliverables without insights or analysis. No attempt to explain what the data shows or why it matters. User asked for comparison but got file inventory instead.",
171
+ "strengths": [
172
+ "All requested files were created",
173
+ "SAS URLs provided for downloads"
174
+ ],
175
+ "weaknesses": [
176
+ "No answer to user's question about AJE vs AJA comparison",
177
+ "No insights or analysis of the data",
178
+ "Just dumps files without context or explanation",
179
+ "No professional structure or engagement",
180
+ "Missing opportunity to explain trends and findings"
181
+ ]
182
+ }}
183
+ ```
184
+
185
+ **Example of MEDIOCRE insight attempt:**
186
+ ```json
187
+ {{
188
+ "score": 72,
189
+ "reasoning": "Makes some attempt at insights but lacks professional structure. Starts with basic facts but doesn't provide deep analysis or explain significance. Files are listed rather than integrated into narrative. Could be much more engaging and comprehensive.",
190
+ "strengths": [
191
+ "Provides some basic insights about article counts",
192
+ "Files are uploaded with SAS URLs",
193
+ "Attempts to answer the comparison question"
194
+ ],
195
+ "weaknesses": [
196
+ "Insights are surface-level without deep analysis",
197
+ "No professional structure (no executive summary, poor flow)",
198
+ "Files dumped at end without integration into story",
199
+ "Lacks engaging tone and strategic formatting",
200
+ "Missing explanation of why findings matter"
201
+ ]
202
+ }}
203
+ ```
204
+
205
+ Now evaluate the output above and return ONLY the JSON response."""
206
+
207
+
208
+ def format_progress_updates_for_evaluation(updates: list) -> str:
209
+ """Format progress updates for inclusion in evaluation prompt."""
210
+ if not updates:
211
+ return "No progress updates received"
212
+
213
+ formatted = []
214
+ for i, update in enumerate(updates, 1):
215
+ timestamp = update.get('timestamp', 'unknown')
216
+ progress = update.get('progress', 0)
217
+ info = update.get('info', '')
218
+ progress_pct = int(progress * 100) if isinstance(progress, float) else progress
219
+
220
+ formatted.append(f"{i}. [{timestamp}] {progress_pct}% - {info}")
221
+
222
+ return "\n".join(formatted)
223
+
224
+
225
+ def format_files_for_evaluation(files: list) -> str:
226
+ """Format file list for inclusion in evaluation prompt."""
227
+ if not files:
228
+ return "No files created"
229
+
230
+ formatted = []
231
+ for file in files:
232
+ file_path = file.get('file_path', 'unknown')
233
+ file_type = file.get('file_type', 'unknown')
234
+ sas_url = file.get('sas_url', 'none')
235
+
236
+ formatted.append(f"- {file_path} (type: {file_type}, SAS URL: {'yes' if sas_url else 'no'})")
237
+
238
+ return "\n".join(formatted)
239
+
240
+
241
+ def format_test_summary_for_evaluation(summary: dict) -> str:
242
+ """Format test run summary for evaluation."""
243
+ lines = [
244
+ f"Duration: {summary.get('duration_seconds', 0):.1f} seconds",
245
+ f"Progress Updates: {summary.get('total_progress_updates', 0)}",
246
+ f"Errors: {summary.get('errors_count', 0)}",
247
+ f"Warnings: {summary.get('warnings_count', 0)}",
248
+ ]
249
+
250
+ return "\n".join(lines)