@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/README.md +135 -100
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +7 -0
  4. package/schema/v1/eval-document.schema.json +143 -11
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  11. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  12. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  13. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  14. package/schema/version.json +2 -2
  15. package/src/clients/cli/agent_selector.py +74 -0
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
  18. package/src/clients/cli/api_clients/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/base_agent_client.py +77 -0
  20. package/src/clients/cli/cli_args.py +136 -0
  21. package/src/clients/cli/cli_logging/cli_logger.py +33 -0
  22. package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
  23. package/src/clients/cli/cli_logging/logging_utils.py +0 -1
  24. package/src/clients/cli/common.py +64 -0
  25. package/src/clients/cli/env_validator.py +73 -0
  26. package/src/clients/cli/evaluation_runner.py +653 -0
  27. package/src/clients/cli/evaluator_resolver.py +9 -6
  28. package/src/clients/cli/generate_report.py +272 -129
  29. package/src/clients/cli/main.py +157 -1174
  30. package/src/clients/cli/parallel_executor.py +57 -0
  31. package/src/clients/cli/prompt_loader.py +148 -0
  32. package/src/clients/cli/readme.md +9 -53
  33. package/src/clients/cli/requirements.txt +1 -1
  34. package/src/clients/cli/response_extractor.py +4 -603
  35. package/src/clients/cli/result_writer.py +488 -0
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +82 -20
  40. package/src/clients/node-js/config/default.js +12 -11
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +14 -20
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
@@ -0,0 +1,33 @@
1
+ """Shared CLI logger instance and structured-log convenience wrapper.
2
+
3
+ Every module in the CLI layer that needs to emit diagnostics imports from here
4
+ instead of main.py, which avoids circular-import issues.
5
+ """
6
+
7
+ import logging
8
+ import sys
9
+ from typing import Any, Dict, List
10
+
11
+ from cli_logging.console_diagnostics import emit_structured_log as _emit_structured_log
12
+ from cli_logging.logging_utils import LOG_LEVEL_MAP, Operation
13
+
14
+ CLI_LOGGER_NAME = "m365.eval.cli"
15
+ CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
16
+ DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
17
+
18
+
19
def configure_cli_logging(effective_log_level: str) -> None:
    """Prepare the shared CLI logger for the current run.

    A stdout handler with a bare ``%(message)s`` format is attached only when
    the logger has no handlers yet, so calling this more than once does not
    stack duplicate handlers. Propagation to ancestor loggers is switched off
    and the logger level is taken from ``LOG_LEVEL_MAP``.

    Args:
        effective_log_level: Key used to look up the level in
            ``LOG_LEVEL_MAP``; the lookup is a plain subscript, so an
            unknown key fails there.
    """
    needs_handler = len(CLI_LOGGER.handlers) == 0
    if needs_handler:
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.setFormatter(logging.Formatter("%(message)s"))
        CLI_LOGGER.addHandler(stdout_handler)

    CLI_LOGGER.propagate = False
    CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
26
+
27
+
28
def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
    """Emit a structured log entry through the shared CLI logger.

    Thin convenience wrapper: forwards to the ``emit_structured_log``
    implementation imported from ``cli_logging.console_diagnostics``, always
    supplying the module-level ``CLI_LOGGER`` and ``DIAGNOSTIC_RECORDS``.

    Args:
        level: Log level name passed through unchanged.
        message: Log message passed through unchanged.
        operation: CLI operation label; defaults to ``Operation.EVALUATE``.
    """
    shared_sinks = {
        "logger": CLI_LOGGER,
        "diagnostic_records": DIAGNOSTIC_RECORDS,
    }
    _emit_structured_log(level, message, operation, **shared_sinks)
@@ -1,9 +1,15 @@
1
1
  import json
2
+ import logging
2
3
  import sys
3
4
  from collections import OrderedDict
4
- from typing import Any, Dict
5
+ from typing import Any, Dict, List, Optional
5
6
 
6
- from cli_logging.logging_utils import STRUCTURED_LOG_FIELDS
7
+ from cli_logging.logging_utils import (
8
+ STRUCTURED_LOG_FIELDS,
9
+ Operation,
10
+ format_structured_log_entry,
11
+ redact_sensitive_content,
12
+ )
7
13
 
8
14
  _ANSI_COLORS = {
9
15
  "debug": "\033[2m", # dim
@@ -53,3 +59,51 @@ def render_diagnostic(record: Dict[str, Any]) -> str:
53
59
  if sys.stdout.isatty():
54
60
  return format_console_record(record)
55
61
  return serialize_diagnostic_record(record)
62
+
63
+
64
def emit_structured_log(
    level: str,
    message: str,
    operation: str = Operation.EVALUATE,
    *,
    logger: logging.Logger,
    diagnostic_records: Optional[List[Dict[str, Any]]] = None,
    run_context: Optional[Dict[str, Any]] = None,
) -> None:
    """Build one structured log entry, record it, and log its rendering.

    The entry is produced by ``format_structured_log_entry``. When a
    ``diagnostic_records`` list is supplied the entry is appended to it. The
    entry is then rendered with ``render_diagnostic``, passed through
    ``redact_sensitive_content``, and handed to ``logger``.

    Args:
        level: Level name such as "debug", "info", "warning" or "error".
            It is upper-cased and looked up on the ``logging`` module; names
            with no matching attribute fall back to ``logging.INFO``.
        message: Human-readable log message.
        operation: CLI operation step, used only when ``run_context`` is not
            supplied (or is empty).
        logger: Logger to emit through; its name is stored in the entry.
        diagnostic_records: Optional list that receives the structured entry.
        run_context: Optional full context mapping. When falsy, a context
            with null request/conversation/message ids and the given
            ``operation`` is used instead.
    """
    numeric_level = getattr(logging, level.upper(), logging.INFO)

    # Nothing would be recorded or printed, so skip formatting entirely.
    nothing_to_record = diagnostic_records is None
    if nothing_to_record and not logger.isEnabledFor(numeric_level):
        return

    if run_context:
        context = run_context
    else:
        context = {
            "request-id": None,
            "conversation-id": None,
            "message-id": None,
            "operation": operation,
        }

    entry = format_structured_log_entry(
        level=level,
        message=message,
        logger_name=logger.name,
        run_context=context,
    )

    if not nothing_to_record:
        diagnostic_records.append(entry)

    try:
        redaction_result = redact_sensitive_content(render_diagnostic(entry))
        rendered, _ = redaction_result
        logger.log(numeric_level, rendered)
    except Exception:
        # Best-effort output: a failure while rendering, redacting or
        # logging is deliberately not allowed to interrupt the caller.
        pass
@@ -52,7 +52,6 @@ def normalize_log_level(value: Optional[str]) -> Optional[str]:
52
52
  return value.strip().lower()
53
53
 
54
54
 
55
-
56
55
  def resolve_log_level(
57
56
  log_level_values: Optional[List[str]],
58
57
  ) -> Tuple[Optional[str], Optional[str]]:
@@ -4,6 +4,22 @@ import re
4
4
  from dataclasses import dataclass
5
5
  from typing import List, Optional
6
6
 
7
+ MAX_CONCURRENCY = 5
8
+ MAX_ATTEMPTS = 4 # Initial attempt + 3 retries
9
+ MAX_TURNS_PER_THREAD = 20
10
+ LONG_THREAD_WARNING_THRESHOLD = 10
11
+ DEFAULT_PASS_THRESHOLD = 3
12
+
13
+ # ── Environment variable name constants ──────────────────────────────
14
+ ENV_AZURE_AI_OPENAI_ENDPOINT = "AZURE_AI_OPENAI_ENDPOINT"
15
+ ENV_AZURE_AI_API_KEY = "AZURE_AI_API_KEY"
16
+ ENV_AZURE_AI_API_VERSION = "AZURE_AI_API_VERSION"
17
+ ENV_AZURE_AI_MODEL_NAME = "AZURE_AI_MODEL_NAME"
18
+ ENV_TENANT_ID = "TENANT_ID"
19
+ ENV_WORK_IQ_A2A_ENDPOINT = "WORK_IQ_A2A_ENDPOINT"
20
+ ENV_WORK_IQ_A2A_CLIENT_ID = "WORK_IQ_A2A_CLIENT_ID"
21
+ ENV_WORK_IQ_A2A_SCOPES = "WORK_IQ_A2A_SCOPES"
22
+
7
23
 
8
24
  def pascal_case_to_title(eval_name: str) -> str:
9
25
  """Convert PascalCase evaluator name to space-separated display name.
@@ -12,10 +28,12 @@ def pascal_case_to_title(eval_name: str) -> str:
12
28
  """
13
29
  return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', eval_name)
14
30
 
31
+
15
32
  # Canonical evaluator name constants
16
33
  RELEVANCE = "Relevance"
17
34
  COHERENCE = "Coherence"
18
35
  GROUNDEDNESS = "Groundedness"
36
+ SIMILARITY = "Similarity"
19
37
  TOOL_CALL_ACCURACY = "ToolCallAccuracy"
20
38
  CITATIONS = "Citations"
21
39
  EXACT_MATCH = "ExactMatch"
@@ -25,6 +43,17 @@ PARTIAL_MATCH = "PartialMatch"
25
43
  REQUIRES_AZURE_OPENAI = "azure_openai"
26
44
  REQUIRES_TOOL_DEFINITIONS = "tool_definitions"
27
45
 
46
+ # Evaluation status constants
47
+ # Outcome statuses (agent responded, evaluators ran):
48
+ STATUS_PASS = "pass" # All evaluators scored above threshold
49
+ STATUS_FAIL = "fail" # At least one evaluator scored below threshold
50
+ # Error state (evaluation couldn't complete):
51
+ STATUS_ERROR = "error" # API call failed / response couldn't be obtained
52
+ # Thread-level aggregate status (multi-turn only):
53
+ STATUS_PARTIAL = "partial" # Some turns passed, some did not
54
+ # Fallback for missing status:
55
+ STATUS_UNKNOWN = "unknown"
56
+
28
57
  # System defaults when no file-level or env-level defaults are configured
29
58
  SYSTEM_DEFAULT_EVALUATORS = [
30
59
  RELEVANCE,
@@ -37,6 +66,7 @@ METRIC_IDS = {
37
66
  RELEVANCE: "relevance",
38
67
  COHERENCE: "coherence",
39
68
  GROUNDEDNESS: "groundedness",
69
+ SIMILARITY: "similarity",
40
70
  TOOL_CALL_ACCURACY: "tool_call_accuracy",
41
71
  CITATIONS: "citations",
42
72
  EXACT_MATCH: "exact_match",
@@ -49,3 +79,37 @@ class RegistryEntry:
49
79
  type: str # "llm", "tool", or "non-llm"
50
80
  requires: List[str]
51
81
  default_threshold: Optional[float]
82
+
83
+
84
@dataclass(frozen=True)
class RunConfig:
    """Frozen, typed bundle of runtime options shared between CLI modules.

    Build one from parsed command-line arguments with
    ``RunConfig.from_namespace(args)``. Instances are immutable; derive a
    modified copy with ``dataclasses.replace(config, field=value)``.
    """
    prompts: Optional[List[str]] = None
    expected: Optional[List[str]] = None
    prompts_file: Optional[str] = None
    interactive: bool = False
    m365_agent_id: Optional[str] = None
    output: Optional[str] = None
    log_level: Optional[List[str]] = None
    # Not copied by from_namespace(); keeps this default unless replaced.
    effective_log_level: str = "info"
    signout: bool = False
    concurrency: int = MAX_CONCURRENCY

    @classmethod
    def from_namespace(cls, args) -> "RunConfig":
        """Create a RunConfig by copying attributes off an argparse.Namespace.

        Every name listed below must exist on ``args``; a missing attribute
        raises ``AttributeError``. ``effective_log_level`` is not read from
        ``args`` and therefore keeps its default.
        """
        copied_fields = (
            "prompts",
            "expected",
            "prompts_file",
            "interactive",
            "m365_agent_id",
            "output",
            "log_level",
            "signout",
            "concurrency",
        )
        values = {name: getattr(args, name) for name in copied_fields}
        return cls(**values)
@@ -0,0 +1,73 @@
1
+ """Environment validation and URL security checks."""
2
+
3
+ import os
4
+ import sys
5
+ import urllib.parse
6
+ from typing import List
7
+
8
+ from cli_logging.cli_logger import emit_structured_log
9
+ from cli_logging.logging_utils import Operation
10
+ from common import (
11
+ ENV_AZURE_AI_OPENAI_ENDPOINT,
12
+ ENV_AZURE_AI_API_KEY,
13
+ ENV_AZURE_AI_API_VERSION,
14
+ ENV_AZURE_AI_MODEL_NAME,
15
+ ENV_WORK_IQ_A2A_ENDPOINT,
16
+ ENV_WORK_IQ_A2A_CLIENT_ID,
17
+ ENV_WORK_IQ_A2A_SCOPES,
18
+ ENV_TENANT_ID,
19
+ )
20
+
21
+
22
+ # Allowed endpoints for URL validation
23
+ ALLOWED_ENDPOINTS = [
24
+ 'substrate.office.com',
25
+ 'graph.microsoft.com',
26
+ ]
27
+
28
+
29
def validate_environment() -> None:
    """Check that every required environment variable is set and non-empty.

    A variable counts as missing when it is unset or set to an empty string.
    If any are missing, a single structured error naming all of them is
    emitted under ``Operation.VALIDATE_ENV`` and the process exits with
    status 1. Returns normally when nothing is missing.
    """
    required_names = (
        ENV_AZURE_AI_OPENAI_ENDPOINT,
        ENV_AZURE_AI_API_KEY,
        ENV_AZURE_AI_API_VERSION,
        ENV_AZURE_AI_MODEL_NAME,
        ENV_WORK_IQ_A2A_ENDPOINT,
        ENV_WORK_IQ_A2A_CLIENT_ID,
        ENV_WORK_IQ_A2A_SCOPES,
        ENV_TENANT_ID,
    )

    absent = []
    for name in required_names:
        if not os.environ.get(name):
            absent.append(name)

    if not absent:
        return

    joined_names = ', '.join(absent)
    emit_structured_log(
        "error",
        f"Missing required environment variables: {joined_names}."
        " Please ensure your .env file contains all required configuration.",
        operation=Operation.VALIDATE_ENV,
    )
    sys.exit(1)
54
+
55
+
56
def validate_endpoint_url(url: str, allowed_domains: List[str]) -> None:
    """Validate an endpoint URL against the CLI's security requirements.

    The URL must use the ``https`` scheme, target a host in
    ``allowed_domains``, carry no embedded credentials, use either no
    explicit port or the HTTPS default (443), and have no fragment.

    Host names are compared case-insensitively via the parsed ``hostname``
    rather than the raw ``netloc``, so ``https://Graph.Microsoft.com/`` and
    ``https://graph.microsoft.com:443/`` are treated the same as
    ``https://graph.microsoft.com/``.

    Args:
        url: The URL to validate.
        allowed_domains: Host names the URL is permitted to target.

    Raises:
        ValueError: If the URL cannot be parsed or violates any rule above.
    """
    try:
        parsed = urllib.parse.urlparse(url)
    except Exception as e:
        raise ValueError(f"Invalid URL format: {url}") from e

    if parsed.scheme in ['javascript', 'data']:
        raise ValueError(f"Dangerous URL scheme detected: {parsed.scheme}")

    if parsed.scheme != 'https':
        raise ValueError(f"Only HTTPS URLs are allowed, got: {parsed.scheme}")

    try:
        # ``port`` is parsed lazily and raises ValueError when malformed.
        host = parsed.hostname
        port = parsed.port
    except ValueError as e:
        raise ValueError(f"Domain not in allowed list: {parsed.netloc}") from e

    permitted_hosts = {domain.lower() for domain in allowed_domains}
    if (
        "@" in parsed.netloc  # embedded credentials are never accepted
        or host not in permitted_hosts  # ``hostname`` is already lower-cased
        or port not in (None, 443)
    ):
        raise ValueError(f"Domain not in allowed list: {parsed.netloc}")

    if parsed.fragment:
        raise ValueError("Fragment URLs are not allowed")