@aj-archipelago/cortex 1.4.2 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/config.js +1 -1
- package/helper-apps/cortex-autogen2/.dockerignore +1 -0
- package/helper-apps/cortex-autogen2/Dockerfile +6 -10
- package/helper-apps/cortex-autogen2/Dockerfile.worker +2 -0
- package/helper-apps/cortex-autogen2/agents.py +203 -2
- package/helper-apps/cortex-autogen2/main.py +1 -1
- package/helper-apps/cortex-autogen2/pyproject.toml +12 -0
- package/helper-apps/cortex-autogen2/requirements.txt +14 -0
- package/helper-apps/cortex-autogen2/services/redis_publisher.py +1 -1
- package/helper-apps/cortex-autogen2/services/run_analyzer.py +1 -1
- package/helper-apps/cortex-autogen2/task_processor.py +431 -229
- package/helper-apps/cortex-autogen2/test_entity_fetcher.py +305 -0
- package/helper-apps/cortex-autogen2/tests/README.md +240 -0
- package/helper-apps/cortex-autogen2/tests/TEST_REPORT.md +342 -0
- package/helper-apps/cortex-autogen2/tests/__init__.py +8 -0
- package/helper-apps/cortex-autogen2/tests/analysis/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/analysis/improvement_suggester.py +224 -0
- package/helper-apps/cortex-autogen2/tests/analysis/trend_analyzer.py +211 -0
- package/helper-apps/cortex-autogen2/tests/cli/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/cli/run_tests.py +296 -0
- package/helper-apps/cortex-autogen2/tests/collectors/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/collectors/log_collector.py +252 -0
- package/helper-apps/cortex-autogen2/tests/collectors/progress_collector.py +182 -0
- package/helper-apps/cortex-autogen2/tests/conftest.py +15 -0
- package/helper-apps/cortex-autogen2/tests/database/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/database/repository.py +501 -0
- package/helper-apps/cortex-autogen2/tests/database/schema.sql +108 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/llm_scorer.py +294 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/prompts.py +250 -0
- package/helper-apps/cortex-autogen2/tests/evaluators/wordcloud_validator.py +168 -0
- package/helper-apps/cortex-autogen2/tests/metrics/__init__.py +1 -0
- package/helper-apps/cortex-autogen2/tests/metrics/collector.py +155 -0
- package/helper-apps/cortex-autogen2/tests/orchestrator.py +576 -0
- package/helper-apps/cortex-autogen2/tests/test_cases.yaml +279 -0
- package/helper-apps/cortex-autogen2/tests/test_data.db +0 -0
- package/helper-apps/cortex-autogen2/tests/utils/__init__.py +3 -0
- package/helper-apps/cortex-autogen2/tests/utils/connectivity.py +112 -0
- package/helper-apps/cortex-autogen2/tools/azure_blob_tools.py +74 -24
- package/helper-apps/cortex-autogen2/tools/entity_api_registry.json +38 -0
- package/helper-apps/cortex-autogen2/tools/file_tools.py +1 -1
- package/helper-apps/cortex-autogen2/tools/search_tools.py +436 -238
- package/helper-apps/cortex-file-handler/package-lock.json +2 -2
- package/helper-apps/cortex-file-handler/package.json +1 -1
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +4 -5
- package/helper-apps/cortex-file-handler/src/blobHandler.js +36 -144
- package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +5 -3
- package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +34 -1
- package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +22 -0
- package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +28 -1
- package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +29 -4
- package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +11 -0
- package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +1 -1
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +3 -2
- package/helper-apps/cortex-file-handler/tests/checkHashShortLived.test.js +8 -1
- package/helper-apps/cortex-file-handler/tests/containerConversionFlow.test.js +5 -2
- package/helper-apps/cortex-file-handler/tests/containerNameParsing.test.js +14 -7
- package/helper-apps/cortex-file-handler/tests/containerParameterFlow.test.js +5 -2
- package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +31 -19
- package/package.json +1 -1
- package/server/modelExecutor.js +4 -0
- package/server/plugins/claude4VertexPlugin.js +540 -0
- package/server/plugins/openAiWhisperPlugin.js +43 -2
- package/tests/integration/rest/vendors/claude_streaming.test.js +121 -0
- package/tests/unit/plugins/claude4VertexPlugin.test.js +462 -0
- package/tests/unit/plugins/claude4VertexToolConversion.test.js +413 -0
- package/helper-apps/cortex-autogen/.funcignore +0 -8
- package/helper-apps/cortex-autogen/Dockerfile +0 -10
- package/helper-apps/cortex-autogen/OAI_CONFIG_LIST +0 -6
- package/helper-apps/cortex-autogen/agents.py +0 -493
- package/helper-apps/cortex-autogen/agents_extra.py +0 -14
- package/helper-apps/cortex-autogen/config.py +0 -18
- package/helper-apps/cortex-autogen/data_operations.py +0 -29
- package/helper-apps/cortex-autogen/function_app.py +0 -44
- package/helper-apps/cortex-autogen/host.json +0 -15
- package/helper-apps/cortex-autogen/main.py +0 -38
- package/helper-apps/cortex-autogen/prompts.py +0 -196
- package/helper-apps/cortex-autogen/prompts_extra.py +0 -5
- package/helper-apps/cortex-autogen/requirements.txt +0 -9
- package/helper-apps/cortex-autogen/search.py +0 -85
- package/helper-apps/cortex-autogen/test.sh +0 -40
- package/helper-apps/cortex-autogen/tools/sasfileuploader.py +0 -66
- package/helper-apps/cortex-autogen/utils.py +0 -88
- package/helper-apps/cortex-autogen2/DigiCertGlobalRootCA.crt.pem +0 -22
- package/helper-apps/cortex-autogen2/poetry.lock +0 -3652
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM-based evaluator for scoring test results using Cortex API.
|
|
3
|
+
|
|
4
|
+
Uses Cortex LLM API to evaluate progress updates and final outputs,
|
|
5
|
+
providing scores (0-100) and detailed reasoning.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import json
|
|
10
|
+
import logging
|
|
11
|
+
import asyncio
|
|
12
|
+
import httpx
|
|
13
|
+
from typing import Dict, List, Optional, Tuple
|
|
14
|
+
from .prompts import (
|
|
15
|
+
PROGRESS_EVALUATION_PROMPT,
|
|
16
|
+
OUTPUT_EVALUATION_PROMPT,
|
|
17
|
+
format_progress_updates_for_evaluation,
|
|
18
|
+
format_files_for_evaluation,
|
|
19
|
+
format_test_summary_for_evaluation
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class LLMEvaluator:
    """Evaluates test results by delegating scoring to a Cortex LLM API.

    Two aspects of a test run are scored independently on a 0-100 scale:
    the stream of progress updates and the final output. Each scorer fills
    a rubric prompt, sends it to the Cortex chat-completions endpoint, and
    parses the JSON verdict the model returns. Scoring methods never raise
    on evaluation failure; they return a fallback score dictionary instead.
    """

    def __init__(
        self,
        api_base_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: str = "gpt-4.1"  # Use fast model for evaluation
    ):
        """
        Initialize the LLM evaluator.

        Args:
            api_base_url: Cortex API base URL (defaults to env var CORTEX_API_BASE_URL)
            api_key: Cortex API key (defaults to env var CORTEX_API_KEY)
            model: Model to use for evaluation

        Raises:
            ValueError: If no API key is supplied and CORTEX_API_KEY is unset.
        """
        self.api_base_url = api_base_url or os.getenv("CORTEX_API_BASE_URL", "http://localhost:4000/v1")
        self.api_key = api_key or os.getenv("CORTEX_API_KEY")
        self.model = model

        if not self.api_key:
            raise ValueError("CORTEX_API_KEY environment variable must be set")

        # Lazy %-style args: strings are only built if INFO is enabled.
        logger.info("🤖 LLM Evaluator initialized")
        logger.info("   API URL: %s", self.api_base_url)
        logger.info("   Model: %s", self.model)

    async def score_progress_updates(
        self,
        progress_updates: List[Dict],
        task: str
    ) -> Dict:
        """
        Score progress updates (0-100).

        Args:
            progress_updates: List of progress update dictionaries
            task: The original task description

        Returns:
            Dictionary with score, reasoning, issues, and strengths.
            On LLM or parse failure a fallback dictionary is returned
            instead of raising.
        """
        if not progress_updates:
            logger.warning("No progress updates to evaluate")
            return {
                'score': 0,
                'reasoning': "No progress updates were received during task execution.",
                'issues': ["Zero progress updates received"],
                'strengths': []
            }

        logger.info("📊 Evaluating %d progress updates...", len(progress_updates))

        # Fill the rubric prompt with the formatted update log and the task.
        prompt = PROGRESS_EVALUATION_PROMPT.format(
            progress_updates=format_progress_updates_for_evaluation(progress_updates),
            task=task
        )

        result = None  # defined up front so the except blocks can log it safely
        try:
            result = await self._call_llm(prompt)
            evaluation = json.loads(result)
            logger.info("   Progress Score: %s/100", evaluation['score'])
            return evaluation

        except json.JSONDecodeError as e:
            logger.error("Failed to parse LLM response as JSON: %s", e)
            logger.debug("Raw response: %s", result)
            # Neutral midpoint score flags the run for manual review.
            return {
                'score': 50,
                'reasoning': "LLM response could not be parsed. Manual review required.",
                'issues': ["Failed to parse LLM evaluation response"],
                'strengths': []
            }
        except Exception as e:
            logger.error("Error scoring progress updates: %s", e, exc_info=True)
            return {
                'score': 0,
                'reasoning': f"Evaluation failed: {str(e)}",
                'issues': [str(e)],
                'strengths': []
            }

    async def score_final_output(
        self,
        task: str,
        final_result: Optional[Dict],
        files_created: List[Dict],
        test_summary: Dict
    ) -> Dict:
        """
        Score final output (0-100).

        Args:
            task: The original task description
            final_result: Final result data from progress updates
            files_created: List of files created during execution
            test_summary: Summary of test run (duration, errors, etc.)

        Returns:
            Dictionary with score, reasoning, strengths, and weaknesses.
            On LLM or parse failure a fallback dictionary is returned
            instead of raising.
        """
        logger.info("📊 Evaluating final output...")

        prompt = OUTPUT_EVALUATION_PROMPT.format(
            task=task,
            final_result=json.dumps(final_result, indent=2) if final_result else "No final result data",
            files_created=format_files_for_evaluation(files_created),
            test_summary=format_test_summary_for_evaluation(test_summary)
        )

        result = None  # defined up front so the except blocks can log it safely
        try:
            result = await self._call_llm(prompt)
            evaluation = json.loads(result)
            logger.info("   Output Score: %s/100", evaluation['score'])
            return evaluation

        except json.JSONDecodeError as e:
            logger.error("Failed to parse LLM response as JSON: %s", e)
            logger.debug("Raw response: %s", result)
            return {
                'score': 50,
                'reasoning': "LLM response could not be parsed. Manual review required.",
                'strengths': [],
                'weaknesses': ["Failed to parse LLM evaluation response"]
            }
        except Exception as e:
            logger.error("Error scoring final output: %s", e, exc_info=True)
            return {
                'score': 0,
                'reasoning': f"Evaluation failed: {str(e)}",
                'strengths': [],
                'weaknesses': [str(e)]
            }

    async def evaluate_test_run(
        self,
        task: str,
        progress_updates: List[Dict],
        final_result: Optional[Dict],
        files_created: List[Dict],
        test_summary: Dict
    ) -> Tuple[Dict, Dict]:
        """
        Evaluate both progress updates and final output.

        Args:
            task: The original task description
            progress_updates: List of progress updates
            final_result: Final result data
            files_created: List of files created
            test_summary: Test run summary

        Returns:
            Tuple of (progress_evaluation, output_evaluation)
        """
        logger.info("🎯 Starting complete test run evaluation")

        progress_eval = await self.score_progress_updates(progress_updates, task)
        output_eval = await self.score_final_output(
            task,
            final_result,
            files_created,
            test_summary
        )

        # Overall score is computed for logging only; callers receive the
        # two component evaluations.
        overall_score = int((progress_eval['score'] + output_eval['score']) / 2)

        logger.info("✅ Evaluation complete:")
        logger.info("   Progress: %s/100", progress_eval['score'])
        logger.info("   Output: %s/100", output_eval['score'])
        logger.info("   Overall: %s/100", overall_score)

        return progress_eval, output_eval

    async def _call_llm(self, prompt: str) -> str:
        """
        Call the Cortex LLM API chat-completions endpoint.

        Retries up to 3 times with exponential backoff on timeouts only;
        any other exception propagates to the caller immediately.

        Args:
            prompt: The prompt to send

        Returns:
            LLM response text, with any surrounding markdown code fences removed.

        Raises:
            httpx.TimeoutException: After every retry attempt times out.
            httpx.HTTPStatusError: On a non-2xx API response.
        """
        url = f"{self.api_base_url}/chat/completions"

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are an expert evaluator. Always respond with valid JSON only, no markdown formatting or extra text."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "temperature": 0.3,  # Low temperature for consistent evaluation
            "max_tokens": 2000
        }

        max_retries = 3
        base_delay = 2.0

        for attempt in range(max_retries):
            try:
                async with httpx.AsyncClient(timeout=180.0) as client:
                    response = await client.post(url, headers=headers, json=payload)
                    response.raise_for_status()

                    data = response.json()
                    # Extract content from OpenAI-format response
                    content = data['choices'][0]['message']['content']
                    return self._strip_code_fences(content)

            # httpx.ReadTimeout and httpx.ConnectTimeout are subclasses of
            # httpx.TimeoutException, so one catch covers every timeout kind.
            except httpx.TimeoutException as e:
                if attempt < max_retries - 1:
                    delay = base_delay * (2 ** attempt)  # Exponential backoff
                    logger.warning(
                        "LLM call timeout (attempt %d/%d), retrying in %ss: %s",
                        attempt + 1, max_retries, delay, e
                    )
                    await asyncio.sleep(delay)
                else:
                    logger.error("LLM call failed after %d attempts: %s", max_retries, e)
                    raise

    @staticmethod
    def _strip_code_fences(content: str) -> str:
        """Remove surrounding markdown ``` fences a model may add despite instructions."""
        content = content.strip()
        if content.startswith('```json'):
            content = content[7:]
        if content.startswith('```'):
            content = content[3:]
        if content.endswith('```'):
            content = content[:-3]
        return content.strip()
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evaluation prompts for LLM-based scoring.
|
|
3
|
+
|
|
4
|
+
These prompts define the criteria and rubrics for scoring
|
|
5
|
+
progress updates and final outputs.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
PROGRESS_EVALUATION_PROMPT = """You are an expert evaluator assessing the quality of progress updates from an AI agent system.
|
|
9
|
+
|
|
10
|
+
**Progress Updates to Evaluate:**
|
|
11
|
+
{progress_updates}
|
|
12
|
+
|
|
13
|
+
**Task Being Executed:**
|
|
14
|
+
{task}
|
|
15
|
+
|
|
16
|
+
**Evaluation Criteria (0-100 points):**
|
|
17
|
+
|
|
18
|
+
1. **Frequency & Timing (25 points)**
|
|
19
|
+
- Excellent: Frequent updates (1-5 seconds) acting as heartbeat - EVEN IF at same percentage
|
|
20
|
+
- Good: Regular updates every 5-10 seconds
|
|
21
|
+
- Fair: Updates >10 seconds apart but no major gaps
|
|
22
|
+
- Poor: Large gaps (>30s) with no updates indicating system may be stuck
|
|
23
|
+
- NOTE: Repeated updates at the same percentage are INTENTIONAL heartbeats to show the system is alive
|
|
24
|
+
|
|
25
|
+
2. **Clarity & Informativeness (25 points)**
|
|
26
|
+
- Excellent: Uses emojis, concise descriptions, tells what's happening
|
|
27
|
+
- Good: Clear messages but lacks emojis or detail
|
|
28
|
+
- Fair: Vague messages like "Processing..." without specifics
|
|
29
|
+
- Poor: Confusing or misleading messages
|
|
30
|
+
|
|
31
|
+
3. **Progress Accuracy (25 points)**
|
|
32
|
+
- Excellent: Progress % increases logically when tasks complete
|
|
33
|
+
- Good: Progress advances steadily through major phases
|
|
34
|
+
- Fair: Some irregular jumps (e.g., 17% → 95%) but reaches completion
|
|
35
|
+
- Poor: Progress goes backwards or never reaches completion
|
|
36
|
+
- NOTE: Progress staying at same % for extended periods is ACCEPTABLE (heartbeat behavior)
|
|
37
|
+
|
|
38
|
+
4. **Coverage (25 points)**
|
|
39
|
+
- Excellent: All important steps communicated (planning, data fetching, processing, uploading)
|
|
40
|
+
- Good: Most steps covered
|
|
41
|
+
- Fair: Missing some key steps
|
|
42
|
+
- Poor: Very sparse updates, missing most steps
|
|
43
|
+
|
|
44
|
+
**Instructions:**
|
|
45
|
+
1. Analyze the progress updates carefully
|
|
46
|
+
2. Calculate a score from 0-100 based on the criteria above
|
|
47
|
+
3. Provide specific reasoning for your score
|
|
48
|
+
4. List any ACTUAL issues found (NOT frequent updates at same percentage - those are heartbeats!)
|
|
49
|
+
5. Only flag gaps >30 seconds as issues, not frequent heartbeat updates
|
|
50
|
+
|
|
51
|
+
**Return JSON format:**
|
|
52
|
+
```json
|
|
53
|
+
{{
|
|
54
|
+
"score": 85,
|
|
55
|
+
"reasoning": "Updates were frequent (avg 2.1s interval) acting as heartbeats, which is excellent. Progress percentage advanced logically through major phases. All major steps were communicated clearly with good emoji usage.",
|
|
56
|
+
"issues": [
|
|
57
|
+
"One gap of 35 seconds between updates during image download phase"
|
|
58
|
+
],
|
|
59
|
+
"strengths": [
|
|
60
|
+
"Excellent heartbeat frequency (1-3 second intervals)",
|
|
61
|
+
"Excellent use of emojis for clarity",
|
|
62
|
+
"Clear descriptions of what's happening at each step",
|
|
63
|
+
"Progress advanced logically when tasks completed"
|
|
64
|
+
]
|
|
65
|
+
}}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Now evaluate the progress updates above and return ONLY the JSON response."""
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
OUTPUT_EVALUATION_PROMPT = """You are an expert evaluator assessing the quality of outputs from an AI agent system that creates professional, insightful presentations and deliverables.
|
|
72
|
+
|
|
73
|
+
**Original Task:**
|
|
74
|
+
{task}
|
|
75
|
+
|
|
76
|
+
**Final Result Data:**
|
|
77
|
+
{final_result}
|
|
78
|
+
|
|
79
|
+
**Files Created:**
|
|
80
|
+
{files_created}
|
|
81
|
+
|
|
82
|
+
**Test Run Summary:**
|
|
83
|
+
{test_summary}
|
|
84
|
+
|
|
85
|
+
**Evaluation Criteria (0-100 points):**
|
|
86
|
+
|
|
87
|
+
1. **Answer Quality (25 points)**
|
|
88
|
+
- Excellent: Directly answers user's question with clear insights, no file dumps
|
|
89
|
+
- Good: Provides useful information but could be more focused on the question
|
|
90
|
+
- Fair: Includes some answer but mostly lists files
|
|
91
|
+
- Poor: Just dumps files without answering the question
|
|
92
|
+
|
|
93
|
+
2. **Insight & Analysis (25 points)**
|
|
94
|
+
- Excellent: Extracts key findings, trends, surprises; explains "why it matters"
|
|
95
|
+
- Good: Provides some analysis but could go deeper
|
|
96
|
+
- Fair: Basic facts without interpretation
|
|
97
|
+
- Poor: No analysis, just raw data or file lists
|
|
98
|
+
|
|
99
|
+
3. **Professional Presentation (25 points)**
|
|
100
|
+
- Excellent: Structured like great article (hook→insights→evidence→next steps), strategic emojis, engaging tone
|
|
101
|
+
- Good: Well-organized but could be more engaging
|
|
102
|
+
- Fair: Basic structure, functional but not compelling
|
|
103
|
+
- Poor: Disorganized, unprofessional, hard to read
|
|
104
|
+
|
|
105
|
+
4. **Deliverable Integration (25 points)**
|
|
106
|
+
- Excellent: Primary deliverable prominently highlighted with hero treatment; preview images are clickable and link to main file; supporting files clearly separated; professional visual styling (borders, formatting)
|
|
107
|
+
- Good: Primary deliverable identified but could be more prominent; preview images shown but not interactive; files somewhat organized
|
|
108
|
+
- Fair: Files listed but primary deliverable not clearly distinguished from supporting files; preview images shown as regular images without download links
|
|
109
|
+
- Poor: Files dumped without organization; no distinction between primary and supporting deliverables; preview images missing or not utilized
|
|
110
|
+
|
|
111
|
+
**Special Considerations:**
|
|
112
|
+
- **Answer First**: Prioritize how well it answers the original question over file completeness
|
|
113
|
+
- **Insight Focus**: Reward analysis, trends, surprises over raw data dumps
|
|
114
|
+
- **Professional Structure**: Executive summary → Key insights → Visual evidence → Clean deliverables → Next steps
|
|
115
|
+
- **Engagement**: Strategic use of formatting, emojis, clear confident language (avoid "I think", "maybe")
|
|
116
|
+
- **Chart Integration**: Charts should illustrate insights, not just be separate dumps
|
|
117
|
+
- **SAS URLs**: All files must have working SAS URLs for download
|
|
118
|
+
- **PRIMARY DELIVERABLE PROMINENCE**: When task requests specific file type (PPTX, PDF, Excel), that file must be prominently featured with hero treatment, clear download link, and preview images that link to the main file
|
|
119
|
+
- **PREVIEW IMAGE INTERACTIVITY**: Preview images for PPTX/PDF should be clickable and link to the main deliverable file with visual styling (borders, hover indication)
|
|
120
|
+
- **BONUS +5-10 points**: Award extra for proactive helpful visualizations or analysis not explicitly requested
|
|
121
|
+
|
|
122
|
+
**Instructions:**
|
|
123
|
+
1. Analyze all aspects of the output
|
|
124
|
+
2. Calculate a score from 0-100 based on criteria above
|
|
125
|
+
3. Provide specific reasoning
|
|
126
|
+
4. List specific strengths and weaknesses
|
|
127
|
+
|
|
128
|
+
**Return JSON format:**
|
|
129
|
+
```json
|
|
130
|
+
{{
|
|
131
|
+
"score": 95,
|
|
132
|
+
"reasoning": "Outstanding Pokemon presentation that directly answers the question with professional insights. Starts with executive summary highlighting 12 Pokemon collected, then provides specific design highlights and visual evidence woven throughout. Files are presented as supporting evidence, not the main event. Professional structure with strategic emojis and engaging tone.",
|
|
133
|
+
"strengths": [
|
|
134
|
+
"Directly answers user's request for 'Most Powerful Gen 1 Pokemon PowerPoint'",
|
|
135
|
+
"Professional structure: summary → insights → evidence → deliverables",
|
|
136
|
+
"Charts/images integrated into narrative (described before shown)",
|
|
137
|
+
"Clear insights about design choices and image quality",
|
|
138
|
+
"All SAS URLs provided with descriptive names",
|
|
139
|
+
"Engaging, confident tone throughout"
|
|
140
|
+
],
|
|
141
|
+
"weaknesses": []
|
|
142
|
+
}}
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
**Example of EXCELLENT new presentation style:**
|
|
146
|
+
```json
|
|
147
|
+
{{
|
|
148
|
+
"score": 98,
|
|
149
|
+
"reasoning": "Perfect example of insight-focused presentation with excellent primary deliverable highlighting. Task requested PowerPoint, and response features it prominently with hero treatment: dedicated section, clickable preview images linking to PPTX download, clear download button with file size. Preview images have professional styling (borders, rounded corners). Supporting files (PDF, data) clearly separated in 'Additional Resources' section. Provides meaningful insights before showing deliverables. Professional structure with executive summary, insights, evidence, and next steps. Bonus +5 points for proactive PDF version and additional charts.",
|
|
150
|
+
"strengths": [
|
|
151
|
+
"Primary deliverable (PPTX) prominently featured with hero treatment in dedicated section",
|
|
152
|
+
"Preview images are clickable and link to main PPTX file for instant download",
|
|
153
|
+
"Professional visual styling on previews (borders, rounded corners, cursor indication)",
|
|
154
|
+
"Clear download button with file size (2.1 MB) for transparency",
|
|
155
|
+
"Supporting files clearly separated in 'Additional Resources' section",
|
|
156
|
+
"Answers question immediately with executive summary",
|
|
157
|
+
"Provides meaningful insights before showing deliverables",
|
|
158
|
+
"Professional structure: hook → insights → primary deliverable → supporting files → next steps",
|
|
159
|
+
"Strategic use of emojis and formatting",
|
|
160
|
+
"All files with working SAS URLs"
|
|
161
|
+
],
|
|
162
|
+
"weaknesses": []
|
|
163
|
+
}}
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**Example of POOR old-style file dump:**
|
|
167
|
+
```json
|
|
168
|
+
{{
|
|
169
|
+
"score": 45,
|
|
170
|
+
"reasoning": "Traditional file dump approach that doesn't answer the user's question. Just lists deliverables without insights or analysis. No attempt to explain what the data shows or why it matters. User asked for comparison but got file inventory instead.",
|
|
171
|
+
"strengths": [
|
|
172
|
+
"All requested files were created",
|
|
173
|
+
"SAS URLs provided for downloads"
|
|
174
|
+
],
|
|
175
|
+
"weaknesses": [
|
|
176
|
+
"No answer to user's question about AJE vs AJA comparison",
|
|
177
|
+
"No insights or analysis of the data",
|
|
178
|
+
"Just dumps files without context or explanation",
|
|
179
|
+
"No professional structure or engagement",
|
|
180
|
+
"Missing opportunity to explain trends and findings"
|
|
181
|
+
]
|
|
182
|
+
}}
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
**Example of MEDIOCRE insight attempt:**
|
|
186
|
+
```json
|
|
187
|
+
{{
|
|
188
|
+
"score": 72,
|
|
189
|
+
"reasoning": "Makes some attempt at insights but lacks professional structure. Starts with basic facts but doesn't provide deep analysis or explain significance. Files are listed rather than integrated into narrative. Could be much more engaging and comprehensive.",
|
|
190
|
+
"strengths": [
|
|
191
|
+
"Provides some basic insights about article counts",
|
|
192
|
+
"Files are uploaded with SAS URLs",
|
|
193
|
+
"Attempts to answer the comparison question"
|
|
194
|
+
],
|
|
195
|
+
"weaknesses": [
|
|
196
|
+
"Insights are surface-level without deep analysis",
|
|
197
|
+
"No professional structure (no executive summary, poor flow)",
|
|
198
|
+
"Files dumped at end without integration into story",
|
|
199
|
+
"Lacks engaging tone and strategic formatting",
|
|
200
|
+
"Missing explanation of why findings matter"
|
|
201
|
+
]
|
|
202
|
+
}}
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Now evaluate the output above and return ONLY the JSON response."""
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def format_progress_updates_for_evaluation(updates: list) -> str:
    """Render progress updates as a numbered, one-per-line listing.

    Each line has the shape ``N. [timestamp] P% - info``. A float
    ``progress`` value is multiplied by 100 and truncated to an int
    (fraction-of-one convention); any other value is used as-is.

    Args:
        updates: Progress-update dicts with optional 'timestamp',
            'progress' and 'info' keys.

    Returns:
        The formatted listing, or "No progress updates received" when
        the list is empty.
    """
    if not updates:
        return "No progress updates received"

    def render(index: int, entry: dict) -> str:
        ts = entry.get('timestamp', 'unknown')
        raw = entry.get('progress', 0)
        pct = int(raw * 100) if isinstance(raw, float) else raw
        return f"{index}. [{ts}] {pct}% - {entry.get('info', '')}"

    return "\n".join(render(i, u) for i, u in enumerate(updates, 1))
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def format_files_for_evaluation(files: list) -> str:
    """Format file list for inclusion in evaluation prompt.

    Args:
        files: File dicts with optional 'file_path', 'file_type' and
            'sas_url' keys.

    Returns:
        One "- <path> (type: <type>, SAS URL: yes|no)" line per file,
        or "No files created" for an empty list.
    """
    if not files:
        return "No files created"

    formatted = []
    for entry in files:
        file_path = entry.get('file_path', 'unknown')
        file_type = entry.get('file_type', 'unknown')
        sas_url = entry.get('sas_url')
        # Bug fix: the previous default of the string 'none' was truthy, so
        # files with NO SAS URL were reported as "SAS URL: yes". Treat a
        # missing, empty, or literal 'none' value as absent.
        has_url = bool(sas_url) and sas_url != 'none'

        formatted.append(f"- {file_path} (type: {file_type}, SAS URL: {'yes' if has_url else 'no'})")

    return "\n".join(formatted)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def format_test_summary_for_evaluation(summary: dict) -> str:
    """Render key test-run metrics as one labelled value per line.

    Args:
        summary: Test-run summary dict; missing metrics default to 0.

    Returns:
        Newline-joined lines for duration (1 decimal place), progress
        update count, error count, and warning count.
    """
    duration = summary.get('duration_seconds', 0)
    parts = [f"Duration: {duration:.1f} seconds"]
    for label, key in (
        ("Progress Updates", 'total_progress_updates'),
        ("Errors", 'errors_count'),
        ("Warnings", 'warnings_count'),
    ):
        parts.append(f"{label}: {summary.get(key, 0)}")
    return "\n".join(parts)
|