@microsoft/m365-copilot-eval 1.1.1-preview.1 → 1.2.0-preview.1
- package/README.md +64 -18
- package/package.json +4 -2
- package/schema/CHANGELOG.md +21 -0
- package/schema/v1/eval-document.schema.json +236 -0
- package/schema/v1/examples/invalid/empty-items.json +4 -0
- package/schema/v1/examples/invalid/invalid-semver.json +8 -0
- package/schema/v1/examples/invalid/missing-schema-version.json +7 -0
- package/schema/v1/examples/invalid/wrong-type.json +6 -0
- package/schema/v1/examples/valid/comprehensive.json +92 -0
- package/schema/v1/examples/valid/minimal.json +8 -0
- package/schema/version.json +6 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +77 -33
- package/src/clients/cli/main.py +197 -30
- package/src/clients/cli/readme.md +5 -5
- package/src/clients/cli/requirements.txt +2 -0
- package/src/clients/cli/samples/starter.json +13 -10
- package/src/clients/cli/schema_handler.py +349 -0
- package/src/clients/cli/version_check.py +139 -0
- package/src/clients/node-js/bin/runevals.js +34 -103
- package/src/clients/node-js/config/default.js +1 -1
- package/src/clients/node-js/lib/env-loader.js +126 -0
- package/src/clients/node-js/lib/progress.js +36 -36
- package/src/clients/node-js/lib/python-runtime.js +4 -6
- package/src/clients/node-js/lib/venv-manager.js +60 -18
package/src/clients/cli/custom_evaluators/CitationsEvaluator.py CHANGED

@@ -1,9 +1,10 @@
 """
 CitationsEvaluator - A custom evaluator for analyzing citations in M365 Copilot responses.

-This evaluator uses regex-based pattern matching to detect citations in
-1. New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
-2. Old format: [^i^] where i is the citation index
+This evaluator uses regex-based pattern matching to detect citations in three modes:
+1. OAI_UNICODE: New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
+2. LEGACY_BRACKET: Old format: [^i^] where i is the citation index
+3. AUTO: Automatically detects both formats simultaneously

 Where X, Y, and i are natural numbers representing conversation turn, search result index, or citation index.
 """
@@ -17,48 +18,60 @@ class CitationFormat(Enum):
     """Enum for different citation formats supported by the evaluator."""
     OAI_UNICODE = "oai_unicode"  # New format: \ue200cite\ue202turn{X}search{Y}\ue201
     LEGACY_BRACKET = "legacy_bracket"  # Old format: [^i^]
+    AUTO = "auto"  # Automatically detect both formats


 class CitationsEvaluator:
     """
     A custom evaluator that analyzes citations in response text without using an LLM.
-
+
     This evaluator detects citation patterns and returns:
     - Whether at least one citation is present
     - The number of unique citations found
-
-    Supports
+
+    Supports three modes:
+    - OAI_UNICODE: Detects only OAI unicode format citations
+    - LEGACY_BRACKET: Detects only legacy bracket format citations
+    - AUTO: Automatically detects both formats simultaneously
     """

     def __init__(self, citation_format: CitationFormat = CitationFormat.OAI_UNICODE):
         """
         Initialize the CitationsEvaluator with the specified citation format.
-
+
         Args:
             citation_format (CitationFormat): The format of citations to detect.
                 Defaults to OAI_UNICODE format.
         """
         self.citation_format = citation_format
-
+
+        oai_pattern = r'\ue200cite\ue202turn\d+search\d+\ue201'
+        legacy_pattern = r'\[\^\d+\^\]'
+
         if citation_format == CitationFormat.OAI_UNICODE:
             # Pattern to match citations: \ue200cite\ue202turn{number}search{number}\ue201
-            self.citation_pattern =
+            self.citation_pattern = oai_pattern
         elif citation_format == CitationFormat.LEGACY_BRACKET:
             # Pattern to match citations: [^number^]
-            self.citation_pattern =
+            self.citation_pattern = legacy_pattern
+        elif citation_format == CitationFormat.AUTO:
+            # Auto-detect both formats using alternation (|)
+            # Matches either OAI unicode OR legacy bracket format
+            self.citation_pattern = rf'(?:{oai_pattern})|(?:{legacy_pattern})'
         else:
             raise ValueError(f"Unsupported citation format: {citation_format}")
-
+
+        # Compile the pattern once after determining which format to use
         self.compiled_pattern = re.compile(self.citation_pattern)

     def __call__(self, *, response: str, **kwargs) -> Dict[str, Any]:
         """
         Evaluate the response text for citations.
-
+
         Args:
             response (str): The response text from the M365 Copilot agent
             **kwargs: Additional keyword arguments (not used but kept for compatibility)
-
+
         Returns:
             Dict[str, Any]: Evaluation results containing:
                 - citation_format (str): The format used for detection
@@ -69,40 +82,71 @@ class CitationsEvaluator:
         """
         if not isinstance(response, str):
             response = str(response) if response is not None else ""
-
-        # Find all
+
+        # Find all citations and get unique ones (same for all modes)
         citation_matches = self.compiled_pattern.findall(response)
-
-        # Get unique citations (remove duplicates)
         unique_citations = list(set(citation_matches))
-
-        #
+
+        # Initialize citation details list (used by all modes)
         citation_details = []
+
+        # Initialize counters only for AUTO mode
+        if self.citation_format == CitationFormat.AUTO:
+            oai_count = 0
+            legacy_count = 0
+
+        # Process all citations (unified extraction logic)
         for citation in unique_citations:
-
-
+            # Determine citation type and extract details
+            if '\ue200' in citation:
+                # OAI format (contains start marker)
                 turn_search_match = re.search(r'turn(\d+)search(\d+)', citation)
                 if turn_search_match:
                     turn_num = turn_search_match.group(1)
                     search_num = turn_search_match.group(2)
-
-
-
+
+                    # Add appropriate prefix based on mode
+                    if self.citation_format == CitationFormat.AUTO:
+                        citation_details.append(f"oai:turn{turn_num}search{search_num}")
+                        oai_count += 1
+                    else:  # OAI_UNICODE mode
+                        citation_details.append(f"turn{turn_num}search{search_num}")
+            else:
+                # Legacy bracket format
                 bracket_match = re.search(r'\[\^(\d+)\^\]', citation)
                 if bracket_match:
                     citation_num = bracket_match.group(1)
-
-
-
+
+                    # Add appropriate prefix based on mode
+                    if self.citation_format == CitationFormat.AUTO:
+                        citation_details.append(f"legacy:citation{citation_num}")
+                        legacy_count += 1
+                    else:  # LEGACY_BRACKET mode
+                        citation_details.append(f"citation{citation_num}")
+
+        format_info = None
+        if self.citation_format == CitationFormat.AUTO:
+            format_info = f"OAI: {oai_count}, Legacy: {legacy_count}"
+
+        # Build results (common for all modes)
+        total_citations = len(unique_citations)
+
+        # Construct reason string with optional format info
+        reason_parts = [f"Found {total_citations} unique citation(s)"]
+        if format_info:
+            reason_parts.append(f"[{format_info}]:")
+        else:
+            reason_parts.append(":")
+        reason_parts.append(', '.join(citation_details) if citation_details else 'None')
+
         results = {
             "citation_format": self.citation_format.value,
-
-            "
-            "
-            "
-            "reason": f"Found {len(unique_citations)} unique citation(s): {', '.join(citation_details) if citation_details else 'None'}"
+            "score": total_citations,
+            "result": "pass" if total_citations > 0 else "fail",
+            "threshold": 1,
+            "reason": " ".join(reason_parts)
         }
-
+
         return results

     def get_name(self) -> str:
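The new AUTO mode is the notable behavioral addition in this file. A minimal sketch of how the updated evaluator might be invoked (the response string is invented for illustration; the import path matches the one used in main.py):

```python
from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat

evaluator = CitationsEvaluator(citation_format=CitationFormat.AUTO)
result = evaluator(
    response="See the admin guide [^1^] and \ue200cite\ue202turn0search2\ue201 for details."
)
# Expected shape (ordering of the reason details can vary, since unique
# citations come from a set):
# {"citation_format": "auto", "score": 2, "result": "pass", "threshold": 1,
#  "reason": "Found 2 unique citation(s) [OAI: 1, Legacy: 1]: ..."}
```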
package/src/clients/cli/main.py CHANGED

@@ -24,7 +24,10 @@ from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
 #from custom_evaluators.PII.PII import PIIEvaluator
 from generate_report import generate_html_report, calculate_aggregate_statistics
 from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
+from schema_handler import DocumentUpgrader, SchemaVersionManager
+from version_check import check_min_version, get_cli_version
 from datetime import datetime, timezone
+from pathlib import Path

 # Allowed endpoints for URL validation
 ALLOWED_ENDPOINTS = [
@@ -36,6 +39,18 @@ class CallPath(Enum):
     ACCESS_TOKEN = "access_token"
     COPILOT_AUTH = "copilot_auth"

+
+# Flags that should bypass remote min-version enforcement.
+# --help is not needed here because argparse exits before runtime checks.
+VERSION_CHECK_BYPASS_FLAGS = (
+    "signout",
+)
+
+
+def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
+    """Return True if the current invocation should skip min-version checks."""
+    return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
+
 def write_results_to_html(results: List[Dict], output_file: str):
     """Write results to HTML file using generate_html_report from generate_report.py."""
     try:
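The bypass tuple keeps sign-out usable even when the installed CLI is older than the published minimum version. A small sketch of how the new helper behaves; the Namespace fields here are illustrative, not part of the package:

```python
import argparse

# Hypothetical invocations: --signout skips the remote min-version check,
# while a normal evaluation run does not.
signout_args = argparse.Namespace(signout=True, quiet=False)
eval_args = argparse.Namespace(signout=False, quiet=False)

print(should_bypass_min_version_check(signout_args))  # True
print(should_bypass_min_version_check(eval_args))     # False
```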
@@ -58,23 +73,68 @@ def get_default_prompts_and_responses():
     return prompts, expected_responses

 def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
-    """Load prompts and expected responses from a JSON file.
+    """Load prompts and expected responses from a JSON file.
+
+    Supports three formats:
+    1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
+    2. Array format: [{"prompt": "...", "expected_response": "..."}]
+    3. Dict format: {"prompts": [...], "expected_responses": [...]}
+
+    For eval documents (format 1) and array format (format 2), schema validation
+    and auto-upgrade are applied via DocumentUpgrader.
+    """
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
             data = json.load(f)
-
+
+        # Detect if this is an eval document (has "items" key) or could be upgraded
+        is_eval_document = (
+            isinstance(data, dict) and "items" in data
+        ) or isinstance(data, list)
+
+        # Run schema validation and auto-upgrade for eval documents
+        if is_eval_document:
+            try:
+                upgrader = DocumentUpgrader()
+            except Exception as e:
+                # Schema infrastructure not available (missing files, etc.) — skip
+                print(f"Warning: Unable to initialize document upgrader: {e}")
+                upgrader = None
+
+            if upgrader is not None:
+                result = upgrader.upgrade(Path(file_path))
+
+                if result.error:
+                    print(f"Schema validation error: {result.error}")
+                    sys.exit(1)
+
+                if result.upgraded and result.message:
+                    print(result.message)
+
+                # Use the parsed document from the upgrade result
+                if result.document is not None:
+                    data = result.document
+
         if isinstance(data, list):
             # Format: [{"prompt": "...", "expected_response": "..."}, ...]
             prompts = [item.get("prompt", "") for item in data]
             expected_responses = [item.get("expected_response", "") for item in data]
         elif isinstance(data, dict):
-
-
-
+            if "items" in data:
+                # Eval document format: {"schemaVersion": "...", "items": [...]}
+                items = data["items"]
+                prompts = [item.get("prompt", "") for item in items]
+                expected_responses = [item.get("expected_response", "") for item in items]
+            else:
+                # Format: {"prompts": [...], "expected_responses": [...]}
+                prompts = data.get("prompts", [])
+                expected_responses = data.get("expected_responses", [])
         else:
             raise ValueError("Invalid file format")
-
+
         return prompts, expected_responses
+    except SystemExit:
+        raise
     except Exception as e:
         print(f"Error loading prompts from file: {e}")
         sys.exit(1)
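As a quick illustration of the new input shape, the loader now accepts a schema-versioned eval document alongside the two legacy shapes. The file contents and prompt text below are made up for the example:

```python
import json
import tempfile

# A minimal eval document in the new schema shape (illustrative values).
doc = {
    "schemaVersion": "1.0.0",
    "items": [
        {"prompt": "What is Microsoft Graph?",
         "expected_response": "Microsoft Graph is a gateway to data in Microsoft 365..."},
    ],
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(doc, f)

prompts, expected_responses = load_prompts_from_file(f.name)
# prompts == ["What is Microsoft Graph?"]
```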
@@ -181,7 +241,7 @@ def run_evaluations(args, responses: dict, expected_responses: list) -> list:
         )

         tool_call_accuracy = None
-        if args.
+        if args.m365_agent_id and enhanced_response.get("tool_definitions"):
             tool_call_accuracy = tool_call_accuracy_evaluator(
                 query=prompt,
                 response=enhanced_response.get("response", actual_response_text),
@@ -267,18 +327,120 @@ def write_results_to_console(results):
         print(f"{BOLD}{color}{name}:{RESET} {v}")
     print(f"{BLUE}{'-' * 30}{RESET}")

-def
-    """
+def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
+    """Extract an EvalScore object from a decorated metric dict.
+
+    Maps internal decorated-metric format to schema EvalScore:
+    {score, result, threshold} (required) + reason, evaluator (optional).
+    """
+    DEFAULT_THRESHOLD = 3  # fallback; decorate_metric should always set this
+
+    score_val = None
+    for k in (metric_id, f"{metric_id}_score", "score", "value"):
+        if k in data and isinstance(data[k], (int, float)):
+            score_val = data[k]
+            break
+    if score_val is None:
+        return None
+
+    result = data.get("result")
+    if result not in ("pass", "fail"):
+        result = "pass" if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else "fail"
+
+    eval_score: Dict[str, Any] = {
+        "score": score_val,
+        "result": result,
+        "threshold": data.get("threshold", DEFAULT_THRESHOLD),
+    }
+    reason = data.get(f"{metric_id}_reason") or data.get("reason")
+    if reason:
+        eval_score["reason"] = reason
+    return eval_score
+
+
+def convert_result_to_eval_item(result: Dict) -> Dict:
+    """Convert an internal evaluation result dict to a schema-compliant EvalItem.
+
+    Internal format (from run_evaluations):
+        {prompt, response, expected_response, results: {relevance_score: "JSON", ...}}
+    Schema EvalItem format:
+        {prompt, response, expected_response, scores: {relevance: EvalScore, ...}}
+    """
+    item: Dict[str, Any] = {
+        "prompt": result["prompt"],
+        "response": result["response"],
+        "expected_response": result["expected_response"],
+    }
+
+    scores: Dict[str, Any] = {}
+    results_dict = result.get("results", {})
+
+    # EvalScore metrics (all share the same schema shape: {score, result, threshold})
+    # Tuple: (internal results key, metric ID for score lookup, schema output key)
+    for internal_key, metric_id, schema_key in [
+        ("relevance_score", "relevance", "relevance"),
+        ("coherence_score", "coherence", "coherence"),
+        ("groundedness_score", "groundedness", "groundedness"),
+        ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+    ]:
+        raw = results_dict.get(internal_key)
+        if not raw:
+            continue
+        data = json.loads(raw) if isinstance(raw, str) else raw
+        eval_score = extract_eval_score(data, metric_id)
+        if eval_score:
+            scores[schema_key] = eval_score
+
+    # Citations → CitationScore (different schema shape: {count, result, threshold} + format)
+    raw_citations = results_dict.get("citations_score")
+    if raw_citations:
+        data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
+        count = data.get("score", 0)
+        cit_result = data.get("result")
+        if cit_result not in ("pass", "fail"):
+            cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
+
+        citation_score: Dict[str, Any] = {
+            "count": count,
+            "result": cit_result,
+            "threshold": data.get("threshold", 1),
+        }
+        if "citation_format" in data:
+            citation_score["format"] = data["citation_format"]
+        scores["citations"] = citation_score
+
+    if scores:
+        item["scores"] = scores
+
+    return item
+
+
+def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None):
+    """Write results to a schema-compliant eval document JSON file.
+
+    Output follows the eval-document.schema.json format:
+        {schemaVersion, metadata, items: [EvalItem]}
+    """
     try:
-
-
+        try:
+            current_version = SchemaVersionManager().get_current_version()
+        except Exception:
+            current_version = "1.0.0"
+
+        items = [convert_result_to_eval_item(r) for r in results]
+
+        metadata: Dict[str, Any] = {
+            "evaluatedAt": datetime.now(timezone.utc).isoformat(),
         }
-
-
-
-
-
-
+        if agent_id:
+            metadata["agentId"] = agent_id
+
+        output_data: Dict[str, Any] = {
+            "schemaVersion": current_version,
+            "metadata": metadata,
+            "items": items,
+        }
+
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(output_data, f, indent=2, ensure_ascii=False)
             print(f"Results saved to {output_file}")
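To make the metric mapping concrete, here is a small illustration of how a decorated relevance metric would be converted; the values are invented, only the key names come from the code above:

```python
decorated_relevance = {
    "relevance": 4.0,
    "threshold": 3,
    "relevance_reason": "The response directly addresses the question.",
}

extract_eval_score(decorated_relevance, "relevance")
# -> {"score": 4.0, "result": "pass", "threshold": 3,
#     "reason": "The response directly addresses the question."}
```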
@@ -372,12 +534,12 @@ Examples:
         help='List of expected responses (must match number of prompts)'
     )

-    # Agent ID
+    # Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
     parser.add_argument(
-        '--agent-id',
-        type=str,
-        default=os.environ.get("AGENT_ID"),
-        help='
+        '--m365-agent-id', '--agent-id',
+        type=str,
+        default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
+        help='Agent ID (default from M365_AGENT_ID environment variable)'
     )

     # Output options
@@ -607,7 +769,7 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
         print(f"Processing prompt {i}/{len(prompts)}...")

         # Build the payload
-        payload = build_chat_payload(prompt, user_oid, args.
+        payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
         if args.verbose:
             print(f"[Sydney] Sending payload: {payload.decode('utf-8')}")

@@ -644,7 +806,7 @@ def output_results(results: List[Dict], args):
     if args.output:
         output_lower = args.output.lower()
         if output_lower.endswith('.json'):
-            write_results_to_json(results, args.output)
+            write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
         elif output_lower.endswith('.csv'):
             write_results_to_csv(results, args.output)
         elif output_lower.endswith('.html'):
@@ -652,7 +814,7 @@ def output_results(results: List[Dict], args):
             abs_path = os.path.abspath(args.output)
             webbrowser.open(f'file://{abs_path}')
         else:
-            write_results_to_json(results, args.output)
+            write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
     else:
         write_results_to_console(results)

@@ -661,6 +823,11 @@ def main():
     load_dotenv()
     args = parse_arguments()

+    # Check minimum version before proceeding
+    cli_version = get_cli_version(quiet=args.quiet)
+    if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=args.quiet):
+        sys.exit(1)
+
     # Validate environment variables required for evaluation
     call_path = validate_environment()
     copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
@@ -722,23 +889,23 @@ def main():

     try:
         # 3. Agent selection - if no agent ID provided, prompt user to select
-        if not args.
+        if not args.m365_agent_id:
             if not args.quiet:
                 print("No agent ID provided. Fetching available agents...")

             available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
             if not available_agents:
-                print("No agents are available for interactive selection. Please re-run with --agent-id or set the
+                print("No agents are available for interactive selection. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
                 sys.exit(1)

             if available_agents:
                 selected_agent_id = select_agent_interactively(available_agents)
                 if selected_agent_id:
-                    args.
+                    args.m365_agent_id = selected_agent_id
                     if not args.quiet:
-                        print(f"Selected agent: {args.
+                        print(f"Selected agent: {args.m365_agent_id}")
                 else:
-                    print("No agent selected. Please re-run with --agent-id or set the
+                    print("No agent selected. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
                     sys.exit(1)

     # 4. Send prompts to chat API
@@ -49,8 +49,8 @@ M365_EVAL_CLIENT_ID="<app-registration-client-id>"
|
|
|
49
49
|
TENANT_ID="<aad-tenant-id>"
|
|
50
50
|
COPILOT_SCOPES="https://substrate.office.com/sydney/.default"
|
|
51
51
|
|
|
52
|
-
# Optional: default agent id (overridable via --agent-id)
|
|
53
|
-
|
|
52
|
+
# Optional: default agent id (overridable via --m365-agent-id)
|
|
53
|
+
M365_AGENT_ID="00000000-0000-0000-0000-000000000000"
|
|
54
54
|
```
|
|
55
55
|
|
|
56
56
|
### 3. Run the Agent Evaluation
|
|
@@ -71,7 +71,7 @@ python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph
|
|
|
71
71
|
python main.py --prompts "What is Microsoft Graph?" "How does authentication work?" --expected "Microsoft Graph is a gateway..." "Authentication works by..."
|
|
72
72
|
|
|
73
73
|
# Override the agent configured in environment variables
|
|
74
|
-
python main.py --agent-id "00000000-0000-0000-0000-000000000000" --prompts "What is Microsoft Graph?"
|
|
74
|
+
python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000" --prompts "What is Microsoft Graph?"
|
|
75
75
|
```
|
|
76
76
|
|
|
77
77
|
#### Using Prompts from File
|
|
@@ -106,8 +106,8 @@ python main.py --quiet
|
|
|
106
106
|
# Get help and see all options
|
|
107
107
|
python main.py --help
|
|
108
108
|
|
|
109
|
-
# Specify / override the Agent ID (takes precedence over
|
|
110
|
-
python main.py --agent-id "00000000-0000-0000-0000-000000000000"
|
|
109
|
+
# Specify / override the Agent ID (takes precedence over M365_AGENT_ID env var)
|
|
110
|
+
python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
|
|
111
111
|
|
|
112
112
|
# Citation format options
|
|
113
113
|
python main.py --citation-format oai_unicode # Default: New OAI format
|
|
package/src/clients/cli/samples/starter.json CHANGED

@@ -1,10 +1,13 @@
-
-
-
-
-
-
-
-
-
-
+{
+  "schemaVersion": "1.0.0",
+  "items": [
+    {
+      "prompt": "What is Microsoft 365?",
+      "expected_response": "Microsoft 365 is a cloud-based productivity suite that includes applications like Word, Excel, PowerPoint, Teams, and other collaboration tools."
+    },
+    {
+      "prompt": "How can I share a file in Teams?",
+      "expected_response": "You can share a file in Teams by uploading it to a channel or chat, or by sharing a link from OneDrive or SharePoint."
+    }
+  ]
+}