@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.4.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/README.md +140 -101
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +8 -0
  4. package/schema/v1/eval-document.schema.json +256 -8
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/comprehensive.json +27 -2
  11. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  12. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  13. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  14. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  15. package/schema/version.json +2 -2
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
  18. package/src/clients/cli/api_clients/REST/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
  20. package/src/clients/cli/api_clients/__init__.py +3 -0
  21. package/src/clients/cli/api_clients/base_agent_client.py +78 -0
  22. package/src/clients/cli/cli_logging/__init__.py +0 -0
  23. package/src/clients/cli/cli_logging/console_diagnostics.py +107 -0
  24. package/src/clients/cli/cli_logging/logging_utils.py +144 -0
  25. package/src/clients/cli/common.py +62 -0
  26. package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
  27. package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
  28. package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
  29. package/src/clients/cli/evaluator_resolver.py +150 -0
  30. package/src/clients/cli/generate_report.py +347 -184
  31. package/src/clients/cli/main.py +1288 -481
  32. package/src/clients/cli/parallel_executor.py +57 -0
  33. package/src/clients/cli/readme.md +14 -7
  34. package/src/clients/cli/requirements.txt +1 -1
  35. package/src/clients/cli/response_extractor.py +30 -14
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +134 -41
  40. package/src/clients/node-js/config/default.js +5 -1
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +11 -16
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
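
The hunks below correspond to package/src/clients/cli/main.py (entry 31 above), which gains multi-turn thread support, parallel execution, retry/throttling, and structured logging in this release. The new detect_item_type helper shown in the diff distinguishes single-turn items, which carry a prompt, from multi-turn items, which carry a turns array. As a rough illustration (the field values below are made up, not taken from the package; only the prompt/turns structure mirrors the diff), the two shapes it accepts look like:

    # Hypothetical eval items illustrating the two shapes detect_item_type accepts.
    single_turn_item = {
        "prompt": "What is the capital of France?",
        "expected_response": "Paris",
    }
    multi_turn_item = {
        "name": "trip planning",  # optional thread name
        "turns": [
            {"prompt": "Plan a three-day trip to Paris."},
            {"prompt": "Now shorten it to two days.", "expected_response": "A two-day itinerary"},
        ],
    }
    # detect_item_type(single_turn_item) -> ItemType.SINGLE_TURN
    # detect_item_type(multi_turn_item)  -> ItemType.MULTI_TURN
    # An item with both 'prompt' and 'turns', or with an empty 'turns' list, raises ValueError
    # (compare the invalid multi-turn examples added under schema/v1/examples/invalid/).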
@@ -3,14 +3,19 @@ import os
  import argparse
  import sys
  import csv
- import functools
+ import logging
+ import time
  import webbrowser
- import urllib.request
- import urllib.error
  import urllib.parse
  import questionary
+ from dataclasses import dataclass, field
  from enum import Enum
  from typing import List, Dict, Tuple, Optional, Any
+
+ from api_clients.A2A import A2AClient
+ from api_clients.REST import SydneyClient
+ from api_clients.base_agent_client import BaseAgentClient
+
  from azure.ai.evaluation import (
  AzureOpenAIModelConfiguration,
  RelevanceEvaluator,
@@ -21,25 +26,119 @@ from azure.ai.evaluation import (
  from dotenv import load_dotenv
  from auth.auth_handler import AuthHandler
  from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
- #from custom_evaluators.ConcisenessNonLLMEvaluator import ConcisenessNonLLMEvaluator
- #from custom_evaluators.PII.PII import PIIEvaluator
+ from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
+ from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
  from generate_report import generate_html_report, calculate_aggregate_statistics
- from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
+ from response_extractor import get_response_text_for_evaluation
  from schema_handler import DocumentUpgrader, SchemaVersionManager
+ from common import (
+ RELEVANCE,
+ COHERENCE,
+ GROUNDEDNESS,
+ TOOL_CALL_ACCURACY,
+ CITATIONS,
+ EXACT_MATCH,
+ PARTIAL_MATCH,
+ REQUIRES_AZURE_OPENAI,
+ REQUIRES_TOOL_DEFINITIONS,
+ METRIC_IDS,
+ STATUS_PASS,
+ STATUS_FAIL,
+ STATUS_ERROR,
+ STATUS_PARTIAL,
+ STATUS_UNKNOWN,
+ pascal_case_to_title,
+ )
+ from evaluator_resolver import (
+ validate_evaluator_names,
+ check_prerequisites,
+ resolve_default_evaluators,
+ resolve_evaluators_for_prompt,
+ get_evaluator_threshold,
+ )
  from version_check import check_min_version, get_cli_version
  from datetime import datetime, timezone
  from pathlib import Path
- import tzlocal
+
+
+ from cli_logging.console_diagnostics import emit_structured_log as _emit_structured_log
+ from cli_logging.logging_utils import LOG_LEVEL_MAP, Operation, resolve_log_level
+ from parallel_executor import execute_in_parallel
+ from throttle_gate import ThrottleGate
+ from retry_policy import (
+ is_retryable_status,
+ get_backoff_seconds,
+ get_retry_after_seconds,
+ )

  # Allowed endpoints for URL validation
  ALLOWED_ENDPOINTS = [
- 'substrate.office.com'
+ 'substrate.office.com',
+ 'graph.microsoft.com',
  ]

+ MAX_CONCURRENCY = 5
+ MAX_ATTEMPTS = 4 # Initial attempt + 3 retries
+ MAX_TURNS_PER_THREAD = 20
+ LONG_THREAD_WARNING_THRESHOLD = 10
+
+
+ @dataclass
+ class PipelineConfig:
+ """Runtime configuration for the evaluation pipeline."""
+ agent_client: BaseAgentClient
+ model_config: AzureOpenAIModelConfiguration
+ has_azure_openai: bool
+ default_evaluators: Dict[str, Any]
+ chat_gate: ThrottleGate = field(default_factory=lambda: ThrottleGate("chat_api"))
+ is_retryable_status: Any = field(default=is_retryable_status)
+ get_backoff_seconds: Any = field(default=get_backoff_seconds)
+ get_retry_after_seconds: Any = field(default=get_retry_after_seconds)
+
  class CallPath(Enum):
  """ Enum to indicate which call path to use. """
  ACCESS_TOKEN = "access_token"
  COPILOT_AUTH = "copilot_auth"
+ A2A = "a2a"
+
+
+ class ItemType(Enum):
+ SINGLE_TURN = "single_turn"
+ MULTI_TURN = "multi_turn"
+
+
+ def detect_item_type(item: dict) -> ItemType:
+ """Determine if an evaluation item is single-turn or multi-turn.
+
+ Returns ItemType.SINGLE_TURN if item has 'prompt' without 'turns',
+ ItemType.MULTI_TURN if item has 'turns' array.
+
+ Raises ValueError for invalid items (both, neither, or invalid turns).
+ """
+ has_turns = "turns" in item
+ has_prompt = "prompt" in item
+
+ if has_turns and has_prompt:
+ raise ValueError(
+ "Invalid evaluation item: cannot have both 'turns' and 'prompt'. "
+ "Use 'turns' for multi-turn threads or 'prompt' for single-turn."
+ )
+
+ if has_turns and not isinstance(item["turns"], list):
+ raise ValueError("Invalid evaluation item: 'turns' must be a list")
+
+ if has_turns:
+ if len(item["turns"]) == 0:
+ raise ValueError("Invalid multi-turn thread: 'turns' array cannot be empty")
+ return ItemType.MULTI_TURN
+
+ if has_prompt:
+ return ItemType.SINGLE_TURN
+
+ raise ValueError(
+ "Invalid evaluation item: must have either 'turns' array (multi-turn) "
+ "or 'prompt' field (single-turn)"
+ )


  # Flags that should bypass remote min-version enforcement.
@@ -48,20 +147,44 @@ VERSION_CHECK_BYPASS_FLAGS = (
  "signout",
  )

+ CLI_LOGGER_NAME = "m365.eval.cli"
+ CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
+ DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
+
+
+ def configure_cli_logging(effective_log_level: str) -> None:
+ if not CLI_LOGGER.handlers:
+ handler = logging.StreamHandler(sys.stdout)
+ handler.setFormatter(logging.Formatter("%(message)s"))
+ CLI_LOGGER.addHandler(handler)
+ CLI_LOGGER.propagate = False
+ CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
+
+
+ def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
+ _emit_structured_log(
+ level, message, operation,
+ logger=CLI_LOGGER,
+ diagnostic_records=DIAGNOSTIC_RECORDS,
+ )
+

  def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
  """Return True if the current invocation should skip min-version checks."""
  return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)

- def write_results_to_html(results: List[Dict], output_file: str):
+ def write_results_to_html(results: List[Dict], output_file: str,
+ agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+ cli_version: Optional[str] = None):
  """Write results to HTML file using generate_html_report from generate_report.py."""
  try:
- html = generate_html_report(results)
+ html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
+ cli_version=cli_version)
  with open(output_file, 'w', encoding='utf-8') as f:
  f.write(html)
- print(f"HTML report saved to {output_file}")
+ emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
  except Exception as e:
- print(f"Error writing to HTML file: {e}")
+ emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
  sys.exit(1)

  def get_default_prompts_and_responses():
@@ -74,7 +197,7 @@ def get_default_prompts_and_responses():
  ]
  return prompts, expected_responses

- def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
+ def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
  """Load prompts and expected responses from a JSON file.

  Supports three formats:
@@ -84,6 +207,10 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:

  For eval documents (format 1) and array format (format 2), schema validation
  and auto-upgrade are applied via DocumentUpgrader.
+
+ Returns:
+ Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
+ expected_response, and optional evaluators/evaluators_mode fields.
  """
  try:
  with open(file_path, 'r', encoding='utf-8') as f:
@@ -100,18 +227,18 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
  upgrader = DocumentUpgrader()
  except Exception as e:
  # Schema infrastructure not available (missing files, etc.) — skip
- print(f"Warning: Unable to initialize document upgrader: {e}")
+ emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
  upgrader = None

  if upgrader is not None:
  result = upgrader.upgrade(Path(file_path))

  if result.error:
- print(f"Schema validation error: {result.error}")
+ emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
  sys.exit(1)

  if result.upgraded and result.message:
- print(result.message)
+ emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)

  # Use the parsed document from the upgrade result
  if result.document is not None:
@@ -119,26 +246,26 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:

  if isinstance(data, list):
  # Format: [{"prompt": "...", "expected_response": "..."}, ...]
- prompts = [item.get("prompt", "") for item in data]
- expected_responses = [item.get("expected_response", "") for item in data]
+ return data, None
  elif isinstance(data, dict):
  if "items" in data:
  # Eval document format: {"schemaVersion": "...", "items": [...]}
- items = data["items"]
- prompts = [item.get("prompt", "") for item in items]
- expected_responses = [item.get("expected_response", "") for item in items]
+ return data["items"], data.get("default_evaluators")
  else:
  # Format: {"prompts": [...], "expected_responses": [...]}
  prompts = data.get("prompts", [])
  expected_responses = data.get("expected_responses", [])
+ eval_items = [
+ {"prompt": p, "expected_response": e}
+ for p, e in zip(prompts, expected_responses)
+ ]
+ return eval_items, None
  else:
  raise ValueError("Invalid file format")
-
- return prompts, expected_responses
  except SystemExit:
  raise
  except Exception as e:
- print(f"Error loading prompts from file: {e}")
+ emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
  sys.exit(1)

  def get_interactive_prompts() -> Tuple[List[str], List[str]]:
@@ -165,116 +292,549 @@ def get_interactive_prompts() -> Tuple[List[str], List[str]]:

  return prompts, expected_responses

- def run_evaluations(args, responses: dict, expected_responses: list) -> list:
- """Run evaluations against the responses."""
- model_config = AzureOpenAIModelConfiguration(
- azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
- api_key=os.environ.get("AZURE_AI_API_KEY"),
- api_version=os.environ.get("AZURE_AI_API_VERSION"),
- azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
+
+ _DEFAULT_PASS_THRESHOLD = 3
+
+
+ def _decorate_metric(metric_id: str, data, threshold: Optional[int] = None) -> Dict[str, Any]:
+ """Augment raw evaluator output with standardized threshold + pass/fail result."""
+ pass_threshold = threshold if threshold is not None else _DEFAULT_PASS_THRESHOLD
+ payload = {}
+ if isinstance(data, dict):
+ payload.update(data)
+ else:
+ payload['raw'] = data
+
+ score_val = None
+ if isinstance(data, dict):
+ if metric_id in data:
+ score_val = data[metric_id]
+ if isinstance(score_val, (int, float)):
+ payload['threshold'] = pass_threshold
+ payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
+ else:
+ payload['threshold'] = pass_threshold
+ payload.setdefault('result', STATUS_UNKNOWN)
+ return payload
+
+
+ def _run_evaluators_for_item(
+ prompt: str,
+ actual_response: str,
+ expected_response: str,
+ enhanced_response: Dict[str, Any],
+ resolved_evaluators: Dict[str, Any],
+ model_config: AzureOpenAIModelConfiguration,
+ has_azure_openai: bool,
+ args,
+ ) -> Tuple[Dict[str, Optional[str]], List[str]]:
+ """Run resolved evaluators against a single item/turn.
+
+ Returns (results_dict, evaluators_ran).
+ """
+ has_tool_defs = bool(
+ args.m365_agent_id and enhanced_response.get("tool_definitions")
  )
-
- # Initialize evaluators
- relevance_evaluator = RelevanceEvaluator(model_config=model_config) # Evaluate relevance for a given response. Range is 1 - 5.
- coherence_evaluator = CoherenceEvaluator(model_config=model_config) # Measures the coherence (human-like quality) of the response. Range is 1 - 5.
- groundedness_evaluator = GroundednessEvaluator(model_config=model_config) # Evaluates the response for for factuality and groundedness against provided context. Range is 1 - 5.
- #concisenessnonllm_evaluator = ConcisenessNonLLMEvaluator() # Evaluates the response for conciseness. Range is 1 - 5.
- #pii_evaluator = PIIEvaluator(model_config=model_config) # Evaluates the response for presence of PII. Range
- # Parse citation format from args
- citation_format = CitationFormat.OAI_UNICODE if args.citation_format == 'oai_unicode' else CitationFormat.LEGACY_BRACKET
- citations_evaluator = CitationsEvaluator(citation_format=citation_format) # Evaluates citations present in the response using regex pattern matching
-
- tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config) # Evaluate tool call accuracy if tool definitions are present in response
-
+ available_context = {
+ REQUIRES_AZURE_OPENAI: has_azure_openai,
+ REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
+ }

- PASS_THRESHOLD = 3 # All evaluators must meet or exceed this value (out of 5) to pass
+ results_dict: Dict[str, Optional[str]] = {}
+ evaluators_ran: List[str] = []
+
+ for eval_name, eval_options in resolved_evaluators.items():
+ can_run, warn_msg = check_prerequisites(eval_name, available_context)
+ if not can_run:
+ if warn_msg:
+ emit_structured_log(
+ "warning",
+ f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
+ operation=Operation.EVALUATE,
+ )
+ results_dict[eval_name] = None
+ continue

- def decorate_metric(metric_id: str, data):
- """Augment raw evaluator output with standardized threshold + pass/fail result."""
- payload = {}
- # Preserve original structure if dict
- if isinstance(data, dict):
- payload.update(data)
- else:
- payload['raw'] = data
-
- # Try to extract a numeric score
- score_val = None
- if isinstance(data, dict):
- for k in (metric_id, f"{metric_id}_score", 'score', 'value'):
- if k in data:
- score_val = data[k]
- break
- if isinstance(score_val, (int, float)):
- payload['threshold'] = PASS_THRESHOLD
- payload['result'] = 'pass' if score_val >= PASS_THRESHOLD else 'fail'
- else:
- # If we cannot determine score, mark unknown (no pass/fail)
- payload['threshold'] = PASS_THRESHOLD
- payload.setdefault('result', 'unknown')
- return json.dumps(payload, indent=4)
-
- evaluation_results = []
- for prompt, expected_response in zip(responses.keys(), expected_responses):
- # Extract text response for evaluation (backward compatibility)
- enhanced_response = responses[prompt]
- actual_response_text = get_response_text_for_evaluation(enhanced_response)
-
- # Run evaluations using text response
- relevance_score = relevance_evaluator(
- query=prompt,
- response=actual_response_text
+ threshold = get_evaluator_threshold(eval_name, eval_options)
+
+ try:
+ if eval_name == RELEVANCE:
+ raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
+ results_dict[RELEVANCE] = _decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
+ elif eval_name == COHERENCE:
+ raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
+ results_dict[COHERENCE] = _decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
+ elif eval_name == GROUNDEDNESS:
+ raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response, context=expected_response)
+ results_dict[GROUNDEDNESS] = _decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
+ elif eval_name == TOOL_CALL_ACCURACY:
+ raw_score = ToolCallAccuracyEvaluator(model_config)(
+ query=prompt,
+ response=enhanced_response.get("response", actual_response),
+ tool_definitions=enhanced_response.get("tool_definitions", []),
+ )
+ results_dict[TOOL_CALL_ACCURACY] = _decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
+ elif eval_name == CITATIONS:
+ fmt_str = eval_options.get("citation_format", "oai_unicode")
+ fmt_map = {
+ "oai_unicode": CitationFormat.OAI_UNICODE,
+ "bracket": CitationFormat.LEGACY_BRACKET,
+ "mixed": CitationFormat.AUTO,
+ }
+ raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response)
+ results_dict[CITATIONS] = _decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
+ elif eval_name == EXACT_MATCH:
+ case_sensitive = eval_options.get("case_sensitive", False)
+ raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
+ # ExactMatch is binary the evaluator already sets 'result'
+ # so _decorate_metric (which computes result from score vs threshold) is not needed.
+ results_dict[EXACT_MATCH] = raw_score
+ elif eval_name == PARTIAL_MATCH:
+ case_sensitive = eval_options.get("case_sensitive", False)
+ raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
+ results_dict[PARTIAL_MATCH] = _decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
+
+ evaluators_ran.append(eval_name)
+ except Exception as e:
+ emit_structured_log(
+ "error",
+ f"Evaluator '{eval_name}' crashed and will be omitted from results: {e}",
+ operation=Operation.EVALUATE,
+ )
+ results_dict[eval_name] = None
+
+ return results_dict, evaluators_ran
+
+
+ def _evaluate_single_response(
+ enhanced_response: Dict[str, Any],
+ eval_item: Dict,
+ args,
+ model_config: AzureOpenAIModelConfiguration,
+ has_azure_openai: bool,
+ default_evaluators: Dict[str, Any],
+ ) -> Dict[str, Any]:
+ """Run all evaluators for a single prompt/response pair and return the result dict."""
+ actual_response_text = get_response_text_for_evaluation(enhanced_response)
+ prompt = eval_item.get("prompt", "")
+ expected_response = eval_item.get("expected_response", "")
+
+ resolved = resolve_evaluators_for_prompt(
+ eval_item.get("evaluators"), eval_item.get("evaluators_mode", "extend"),
+ prompt, default_evaluators,
+ )
+
+ results_dict, evaluators_ran = _run_evaluators_for_item(
+ prompt, actual_response_text, expected_response, enhanced_response,
+ resolved, model_config, has_azure_openai, args,
+ )
+
+ evaluation_result = {
+ "prompt": prompt,
+ "response": enhanced_response.get(
+ "display_response_text", actual_response_text
+ ),
+ "expected_response": expected_response,
+ "evaluators_ran": evaluators_ran,
+ "results": results_dict,
+ }
+
+ if "evaluators" in eval_item:
+ evaluation_result["evaluators"] = eval_item["evaluators"]
+ if "evaluators_mode" in eval_item:
+ evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
+
+ if getattr(args, "effective_log_level", "info") == "debug":
+ emit_structured_log(
+ "debug",
+ f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
+ f"Evaluators: {', '.join(evaluators_ran)}. "
+ f"Scores: {evaluation_result['results']}",
+ operation=Operation.EVALUATE,
+ )
+
+ return evaluation_result
+
+
+ def _check_all_passed(results_dict: Dict[str, Optional[Dict[str, Any]]]) -> bool:
+ """Check if all evaluator results passed. Skipped evaluators (None) are ignored."""
+ for result_data in results_dict.values():
+ if result_data is None:
+ continue
+ if result_data.get("result") == STATUS_FAIL:
+ return False
+ return True
+
+ def _evaluate_multi_turn_responses(
+ turns: List[Dict],
+ args,
+ default_evaluators: Dict[str, Any],
+ model_config: AzureOpenAIModelConfiguration,
+ has_azure_openai: bool,
+ ) -> Tuple[List[Dict], Dict]:
+ """Run per-turn evaluations and build evaluated turn results with summary.
+
+ Returns:
+ Tuple of (evaluated_turns, summary). Each evaluated turn contains
+ prompt, response, expected_response, status, evaluators_ran, results,
+ and optionally error. Does not mutate the input turns.
+ """
+ evaluated_turns: List[Dict] = []
+ turns_passed = 0
+ turns_failed = 0
+
+ for i, turn in enumerate(turns):
+ evaluated_turn: Dict[str, Any] = {
+ "prompt": turn.get("prompt", ""),
+ }
+ if "expected_response" in turn:
+ evaluated_turn["expected_response"] = turn["expected_response"]
+ if "response" in turn:
+ evaluated_turn["response"] = turn["response"]
+ if "evaluators" in turn:
+ evaluated_turn["evaluators"] = turn["evaluators"]
+ if "evaluators_mode" in turn:
+ evaluated_turn["evaluators_mode"] = turn["evaluators_mode"]
+
+ if turn.get("status") == STATUS_ERROR:
+ evaluated_turn["status"] = STATUS_ERROR
+ evaluated_turn["error"] = turn.get("error", "")
+ turns_failed += 1
+ evaluated_turns.append(evaluated_turn)
+ continue
+
+ enhanced_response = turn.get("_enhanced_response", {})
+ actual_response = get_response_text_for_evaluation(enhanced_response)
+
+ resolved = resolve_evaluators_for_prompt(
+ turn.get("evaluators"), turn.get("evaluators_mode", "extend"),
+ turn.get("prompt", ""), default_evaluators,
  )
- coherence_score = coherence_evaluator(
- query=prompt,
- response=actual_response_text
+
+ results_dict, evaluators_ran = _run_evaluators_for_item(
+ turn.get("prompt", ""), actual_response, turn.get("expected_response", ""),
+ enhanced_response, resolved, model_config, has_azure_openai, args,
  )

- groundedness_score = groundedness_evaluator(
- response=actual_response_text,
- context=expected_response
+ all_passed = _check_all_passed(results_dict)
+
+ evaluated_turn["results"] = results_dict
+ evaluated_turn["evaluators_ran"] = evaluators_ran
+ evaluated_turn["status"] = STATUS_PASS if all_passed else STATUS_FAIL
+
+ if getattr(args, "effective_log_level", "info") == "debug":
+ emit_structured_log(
+ "debug",
+ f"Evaluation completed for turn {i + 1} prompt='{turn.get('prompt', '')}'. "
+ f"Evaluators: {', '.join(evaluators_ran)}. "
+ f"Scores: {results_dict}",
+ operation=Operation.EVALUATE,
+ )
+
+ if all_passed:
+ turns_passed += 1
+ else:
+ turns_failed += 1
+
+ evaluated_turns.append(evaluated_turn)
+
+ turns_total = len(turns)
+ if turns_passed == turns_total:
+ overall_status = STATUS_PASS
+ elif turns_failed == turns_total:
+ overall_status = STATUS_FAIL
+ else:
+ overall_status = STATUS_PARTIAL
+
+ summary = {
+ "turns_total": turns_total,
+ "turns_passed": turns_passed,
+ "turns_failed": turns_failed,
+ "overall_status": overall_status,
+ }
+
+ return evaluated_turns, summary
+
+
+ def get_effective_worker_count(prompt_count: int, args) -> int:
+ """Compute safe worker count for prompt processing."""
+ if prompt_count <= 0:
+ return 1
+
+ requested = getattr(args, "concurrency", 5)
+ try:
+ requested_int = int(requested)
+ except (TypeError, ValueError):
+ requested_int = 5
+
+ bounded = max(1, min(requested_int, MAX_CONCURRENCY))
+ return min(bounded, prompt_count)
+
+
+ def run_pipeline(
+ pipeline: PipelineConfig,
+ eval_items: List[Dict],
+ args,
+ ) -> List[Dict[str, Any]]:
+ """Run the full evaluation pipeline: send prompts and evaluate responses in parallel.
+
+ Each worker processes one prompt end-to-end: send → evaluate.
+ Results are returned in original prompt order (FR-006).
+ """
+ # Validate all evaluator names upfront before dispatching workers
+ all_evaluator_maps = [pipeline.default_evaluators]
+ for eval_item in eval_items:
+ if "evaluators" in eval_item:
+ all_evaluator_maps.append(eval_item["evaluators"])
+ for turn in eval_item.get("turns", []):
+ if "evaluators" in turn:
+ all_evaluator_maps.append(turn["evaluators"])
+ for emap in all_evaluator_maps:
+ validate_evaluator_names(emap)
+
+ # Validate all items upfront and classify types before dispatching workers
+ item_types: List[ItemType] = []
+ for idx, eval_item in enumerate(eval_items):
+ try:
+ item_type = detect_item_type(eval_item)
+ except ValueError as e:
+ raise ValueError(f"Invalid evaluation item at index {idx}: {e}") from e
+ if item_type == ItemType.MULTI_TURN:
+ turn_count = len(eval_item["turns"])
+ if turn_count > MAX_TURNS_PER_THREAD:
+ raise ValueError(
+ f"Invalid evaluation item at index {idx}: 'turns' array has "
+ f"{turn_count} items (max {MAX_TURNS_PER_THREAD})"
+ )
+ item_types.append(item_type)
+
+ total = len(eval_items)
+ worker_count = get_effective_worker_count(total, args)
+
+ multi_turn_count = sum(1 for t in item_types if t == ItemType.MULTI_TURN)
+ single_turn_count = total - multi_turn_count
+
+ emit_structured_log(
+ "info",
+ f"Running pipeline with {worker_count} worker(s) for {total} item(s) "
+ f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
+ operation=Operation.EVALUATE,
+ )
+
+ def _process_item(eval_item: Dict, index: int) -> Dict[str, Any]:
+ if item_types[index] == ItemType.MULTI_TURN:
+ return _process_multi_turn(eval_item, index)
+ return _process_single_turn(eval_item, index)
+
+ def _process_single_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
+ prompt = eval_item.get("prompt", "")
+ emit_structured_log(
+ "info",
+ f"Processing item {index + 1}/{total} (single-turn).",
+ operation=Operation.SEND_PROMPT,
  )

- #PII_score = pii_evaluator(response=actual_response_text)
- #concisenessNonLLM_score = concisenessnonllm_evaluator(response=actual_response_text)
+ # Phase A: Send prompt to agent (with retry + throttle gate)
+ response = None
+ for attempt in range(1, MAX_ATTEMPTS + 1):
+ pipeline.chat_gate.wait_if_blocked()
+ try:
+ response, _ = pipeline.agent_client.send_prompt(prompt, agent_id=args.m365_agent_id)
+ break
+ except Exception as exc:
+ cause = exc.__cause__
+ status = int(getattr(cause, "code", 0) or 0) or None if cause else None
+ retry_after = get_retry_after_seconds(
+ cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
+ )
+
+ if retry_after is not None and pipeline.is_retryable_status(status):
+ pipeline.chat_gate.apply_retry_after(retry_after)
+
+ if not pipeline.is_retryable_status(status) or attempt >= MAX_ATTEMPTS:
+ emit_structured_log(
+ "error",
+ f"Item {index + 1}/{total} failed after {attempt} attempt(s): {exc}",
+ operation=Operation.SEND_PROMPT,
+ )
+ return {
+ "prompt": prompt,
+ "response": "",
+ "expected_response": eval_item.get("expected_response", ""),
+ "evaluators_ran": [],
+ "results": {},
+ "status": STATUS_ERROR,
+ "errorDetails": str(exc),
+ }
+
+ delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
+ time.sleep(delay)
+
+ # Phase B: Evaluate response
+ return _evaluate_single_response(
+ response, eval_item, args,
+ pipeline.model_config, pipeline.has_azure_openai,
+ pipeline.default_evaluators,
+ )

- citations_score = citations_evaluator(
- response=actual_response_text
+ def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
+ turns = eval_item["turns"]
+ thread_name = eval_item.get("name", "Unnamed thread")
+ emit_structured_log(
+ "info",
+ f"Processing item {index + 1}/{total} (multi-turn: '{thread_name}').",
+ operation=Operation.SEND_PROMPT,
  )

- tool_call_accuracy = None
- if args.m365_agent_id and enhanced_response.get("tool_definitions"):
- tool_call_accuracy = tool_call_accuracy_evaluator(
- query=prompt,
- response=enhanced_response.get("response", actual_response_text),
- tool_definitions=enhanced_response["tool_definitions"]
+ if len(turns) > LONG_THREAD_WARNING_THRESHOLD:
+ emit_structured_log(
+ "warning",
+ f"Thread '{thread_name}' has {len(turns)} turns (>{LONG_THREAD_WARNING_THRESHOLD}). This may take a while.",
+ operation=Operation.SEND_PROMPT,
  )

- evaluation_result = {
- "prompt": prompt,
- "response": actual_response_text, # Keep simple text for backward compatibility
- "expected_response": expected_response,
- "results": {
- "relevance_score": decorate_metric("relevance", relevance_score),
- "coherence_score": decorate_metric("coherence", coherence_score),
- "groundedness_score": decorate_metric("groundedness", groundedness_score),
- #"concisenessnonllm_score": decorate_metric("concisenessnonllm", concisenessNonLLM_score),
- #"pii_score": decorate_metric("pii", PII_score),
- "citations_score": json.dumps(citations_score, indent=4),
- "tool_call_accuracy_score": json.dumps(tool_call_accuracy, indent=4) if tool_call_accuracy else None
- }
+ # Phase A: Send each turn with throttle gate + 429-only retry
+ # Multi-turn only retries on 429 (server confirmed it didn't process
+ # the request). Other transient errors (503, 504) are ambiguous about
+ # whether the server processed the turn, risking duplicate turns in
+ # the conversation if retried.
+ conversation_context = None
+ conversation_id = None
+ enriched_turns: List[Dict[str, Any]] = []
+ failed = False
+
+ for i, turn in enumerate(turns):
+ prompt = turn["prompt"]
+ emit_structured_log(
+ "debug",
+ f"Sending turn {i + 1}/{len(turns)} of '{thread_name}'.",
+ operation=Operation.SEND_PROMPT,
+ )
+
+ response = None
+ for attempt in range(1, MAX_ATTEMPTS + 1):
+ pipeline.chat_gate.wait_if_blocked()
+ try:
+ response, conversation_context = pipeline.agent_client.send_prompt(
+ prompt, agent_id=args.m365_agent_id,
+ conversation_context=conversation_context,
+ )
+ break
+ except Exception as exc:
+ cause = exc.__cause__
+ status = int(getattr(cause, "code", 0) or 0) or None if cause else None
+ retry_after = get_retry_after_seconds(
+ cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
+ )
+
+ # Only retry on 429 — server confirmed it didn't process the request
+ if status == 429 and attempt < MAX_ATTEMPTS:
+ if retry_after is not None:
+ pipeline.chat_gate.apply_retry_after(retry_after)
+ delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
+ time.sleep(delay)
+ continue
+
+ # All other errors: stop the thread
+ emit_structured_log(
+ "error",
+ f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
+ operation=Operation.SEND_PROMPT,
+ )
+ failed = True
+ break
+
+ if failed:
+ # Mark this turn and all remaining turns as error
+ enriched_turns.append({
+ **turn,
+ "response": "",
+ "status": STATUS_ERROR,
+ "error": "Failed to get response from agent",
+ })
+ for j in range(i + 1, len(turns)):
+ enriched_turns.append({
+ **turns[j],
+ "response": "",
+ "status": STATUS_ERROR,
+ "error": "Skipped: preceding turn failed",
+ })
+ break
+
+ # Enrich turn with response
+ response_text = get_response_text_for_evaluation(response)
+ enriched_turns.append({
+ **turn,
+ "response": response.get("display_response_text", response_text),
+ "_enhanced_response": response,
+ })
+
+ # Capture conversation_id from first response
+ if conversation_id is None:
+ conversation_id = response.get("metadata", {}).get("conversation_id")
+
+ # Phase B: Run per-turn evaluations
+ evaluated_turns, summary = _evaluate_multi_turn_responses(
+ enriched_turns, args, pipeline.default_evaluators,
+ model_config=pipeline.model_config,
+ has_azure_openai=pipeline.has_azure_openai,
+ )
+
+ return {
+ "type": "multi_turn",
+ "name": eval_item.get("name", ""),
+ "description": eval_item.get("description", ""),
+ "conversation_id": conversation_id or "",
+ "turns": evaluated_turns,
+ "summary": summary,
  }

- if args.verbose:
- print(f".................................. Evaluation for prompt: {evaluation_result['prompt']} ..................................")
- print(f"Scores: {evaluation_result['results']}")
- print("...........................................................................................................................")
+ execution_results = execute_in_parallel(
+ eval_items, _process_item, max_workers=worker_count,
+ )
+
+ # Unwrap WorkerResult objects into plain dicts, with error fallback
+ ordered_results: List[Dict[str, Any]] = []
+ for wr in execution_results:
+ if wr.error:
+ idx = wr.index
+ item = eval_items[idx]
+ if item_types[idx] == ItemType.MULTI_TURN:
+ ordered_results.append({
+ "type": "multi_turn",
+ "name": item.get("name", ""),
+ "turns": [
+ {**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
+ for t in item.get("turns", [])
+ ],
+ "summary": {
+ "turns_total": len(item.get("turns", [])),
+ "turns_passed": 0,
+ "turns_failed": len(item.get("turns", [])),
+ "overall_status": STATUS_FAIL,
+ },
+ "error": str(wr.error),
+ })
+ else:
+ ordered_results.append({
+ "prompt": item.get("prompt", ""),
+ "response": "",
+ "expected_response": item.get("expected_response", ""),
+ "evaluators_ran": [],
+ "results": {},
+ "status": STATUS_ERROR,
+ "errorDetails": str(wr.error),
+ })
+ else:
+ ordered_results.append(wr.value)
+
+ return ordered_results

- evaluation_results.append(evaluation_result)
-
- return evaluation_results

- def write_results_to_console(results):
+
+ def write_results_to_console(results, agent_name: Optional[str] = None,
+ agent_id: Optional[str] = None,
+ cli_version: Optional[str] = None):
  """Write the response to console."""
  # ANSI color codes
  BOLD = '\033[1m'
@@ -286,48 +846,105 @@ def write_results_to_console(results):
  ORANGE = '\033[38;5;208m'
  RED = '\033[91m'
  RESET = '\033[0m'
-
- # Show aggregate statistics if multiple results
- if len(results) > 1:
- aggregates = calculate_aggregate_statistics(results)
- if aggregates:
- print(f"{BOLD}{BLUE}📊 Aggregate Statistics ({len(results)} prompts):{RESET}")
+
+ def _print_evaluated_item(response: str, expected_response: str,
+ evaluators_ran: List[str], item_results: Dict[str, Any],
+ error: Optional[str] = None) -> None:
+ """Print the body of a single evaluated item (single-turn prompt or multi-turn turn).
+
+ The item header (Prompt X / Turn X) is printed by the caller; this helper
+ prints evaluators, response, expected response, error, and metrics.
+ """
+ if evaluators_ran:
+ print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
+ if response:
+ print(f"{BOLD}{CYAN}Response:{RESET} {response}")
+ if expected_response:
+ print(f"{BOLD}{YELLOW}Expected Response:{RESET} {expected_response}")
+ if error:
+ print(f"{BOLD}{RED}Error:{RESET} {error}")
+
+ for eval_name, v in item_results.items():
+ if v is None:
+ continue
+ display_name = pascal_case_to_title(eval_name)
+ if eval_name == RELEVANCE:
+ color = MAGENTA
+ elif eval_name == COHERENCE:
+ color = ORANGE
+ else:
+ color = BLUE
+ print(f"{BOLD}{color}{display_name}:{RESET} {json.dumps(v, indent=4)}")
+
+ # Show metadata
+ metadata_parts = []
+ if agent_name:
+ metadata_parts.append(f"Agent Name: {agent_name}")
+ if agent_id:
+ metadata_parts.append(f"Agent ID: {agent_id}")
+ if cli_version:
+ metadata_parts.append(f"CLI Version: {cli_version}")
+ if metadata_parts:
+ print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
+ print()
+
+ aggregates = calculate_aggregate_statistics(results)
+ if aggregates:
+ total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
+ if total_items > 1:
+ print(f"{BOLD}{BLUE}Aggregate Statistics ({total_items} prompts):{RESET}")
  print(f"{BLUE}{'=' * 60}{RESET}")
-
+
  for metric_name, stats in aggregates.items():
  pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
- print(f"{BOLD}{CYAN}{metric_name}:{RESET}")
+ prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+ total_prompts = stats.get('total_prompts', total_items)
+ print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
  print(f" Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
  print(f" Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
  if stats.get('threshold') is not None:
  print(f" Threshold: {YELLOW}{stats['threshold']}{RESET}")
  print()
-
+
  print(f"{BLUE}{'=' * 60}{RESET}")
  print()
-
- print(f"{BOLD}{BLUE}📝 Individual Results:{RESET}")
+
+ print(f"{BOLD}{BLUE}Individual Results:{RESET}")
  print(f"{BLUE}{'=' * 50}{RESET}")
  for i, result in enumerate(results, 1):
- print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
- print(f"{BOLD}{CYAN}Response:{RESET} {result['response']}")
- print(f"{BOLD}{YELLOW}Expected Response:{RESET} {result['expected_response']}")
-
- # Print metric scores generically from nested results (fallback to flat keys for back-compat)
- metrics = result.get('results') or {k: v for k, v in result.items() if isinstance(k, str) and k.endswith('_score')}
- if metrics:
- for k, v in metrics.items():
- name = k.replace('_', ' ')
- if 'relevance' in k:
- color = MAGENTA
- elif 'coherence' in k:
- color = ORANGE
- elif 'fluency' in k:
- color = GREEN
- else:
- color = BLUE
- print(f"{BOLD}{color}{name}:{RESET} {v}")
- print(f"{BLUE}{'-' * 30}{RESET}")
+ if result.get("type") == "multi_turn":
+ thread_name = result.get("name", "Unnamed Thread")
+ summary = result.get("summary", {})
+ status = summary.get("overall_status", STATUS_UNKNOWN)
+ status_color = GREEN if status == STATUS_PASS else YELLOW if status == STATUS_PARTIAL else RED
+
+ print(f"{BOLD}{MAGENTA}Thread {i}: {thread_name}{RESET}")
+ for t_idx, turn in enumerate(result.get("turns", []), 1):
+ turn_status = turn.get("status", STATUS_UNKNOWN)
+ turn_color = GREEN if turn_status == STATUS_PASS else RED if turn_status in (STATUS_FAIL, STATUS_ERROR) else YELLOW
+ print(f"{BOLD}{turn_color}Turn {t_idx}:{RESET} [{turn_status}] {turn.get('prompt', '')}")
+ _print_evaluated_item(
+ response=turn.get("response", ""),
+ expected_response=turn.get("expected_response", ""),
+ evaluators_ran=turn.get("evaluators_ran", []),
+ item_results=turn.get("results", {}),
+ error=turn.get("error"),
+ )
+ print()
+ print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
+ print(f" Status: {status_color}{status.upper()}{RESET}")
+ print(f" Turns passed: {status_color}{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)}{RESET}")
+ print(f"{BLUE}{'-' * 30}{RESET}")
+ else:
+ print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
+ _print_evaluated_item(
+ response=result.get('response', ''),
+ expected_response=result.get('expected_response', ''),
+ evaluators_ran=result.get('evaluators_ran', []),
+ item_results=result.get('results', {}),
+ error=result.get('errorDetails'),
+ )
+ print(f"{BLUE}{'-' * 30}{RESET}")

  def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
  """Extract an EvalScore object from a decorated metric dict.
@@ -338,16 +955,14 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
  DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this

  score_val = None
- for k in (metric_id, f"{metric_id}_score", "score", "value"):
- if k in data and isinstance(data[k], (int, float)):
- score_val = data[k]
- break
+ if metric_id in data and isinstance(data[metric_id], (int, float)):
+ score_val = data[metric_id]
  if score_val is None:
  return None

  result = data.get("result")
- if result not in ("pass", "fail"):
- result = "pass" if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else "fail"
+ if result not in (STATUS_PASS, STATUS_FAIL):
+ result = STATUS_PASS if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else STATUS_FAIL

  eval_score: Dict[str, Any] = {
  "score": score_val,
@@ -360,48 +975,33 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
  return eval_score


- def convert_result_to_eval_item(result: Dict) -> Dict:
- """Convert an internal evaluation result dict to a schema-compliant EvalItem.
+ def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
+ """Convert raw evaluator results to schema-compliant score objects.

- Internal format (from run_evaluations):
- {prompt, response, expected_response, results: {relevance_score: "JSON", ...}}
- Schema EvalItem format:
- {prompt, response, expected_response, scores: {relevance: EvalScore, ...}}
+ Evaluator results in results_dict are dicts (from _decorate_metric) or
+ None when skipped/crashed. None values are omitted from output.
  """
- item: Dict[str, Any] = {
- "prompt": result["prompt"],
- "response": result["response"],
- "expected_response": result["expected_response"],
- }
-
  scores: Dict[str, Any] = {}
- results_dict = result.get("results", {})
-
- # EvalScore metrics (all share the same schema shape: {score, result, threshold})
- # Tuple: (internal results key, metric ID for score lookup, schema output key)
- for internal_key, metric_id, schema_key in [
- ("relevance_score", "relevance", "relevance"),
- ("coherence_score", "coherence", "coherence"),
- ("groundedness_score", "groundedness", "groundedness"),
- ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+
+ for eval_key, schema_key in [
+ (RELEVANCE, "relevance"),
+ (COHERENCE, "coherence"),
+ (GROUNDEDNESS, "groundedness"),
+ (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
  ]:
- raw = results_dict.get(internal_key)
- if not raw:
+ data = results_dict.get(eval_key)
+ if data is None:
  continue
- data = json.loads(raw) if isinstance(raw, str) else raw
- eval_score = extract_eval_score(data, metric_id)
+ eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
  if eval_score:
  scores[schema_key] = eval_score

- # Citations → CitationScore (different schema shape: {count, result, threshold} + format)
- raw_citations = results_dict.get("citations_score")
- if raw_citations:
- data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
- count = data.get("score", 0)
+ data = results_dict.get(CITATIONS)
+ if data is not None:
+ count = data.get("citations", 0)
  cit_result = data.get("result")
- if cit_result not in ("pass", "fail"):
- cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
-
+ if cit_result not in (STATUS_PASS, STATUS_FAIL):
+ cit_result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
  citation_score: Dict[str, Any] = {
  "count": count,
  "result": cit_result,
@@ -411,17 +1011,100 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
  citation_score["format"] = data["citation_format"]
  scores["citations"] = citation_score

+ data = results_dict.get(EXACT_MATCH)
+ if data is not None:
+ is_match = data.get("exact_match", 0.0) == 1.0
+ scores["exactMatch"] = {
+ "match": is_match,
+ "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
+ "reason": data.get("exact_match_reason", ""),
+ }
+
+ data = results_dict.get(PARTIAL_MATCH)
+ if data is not None:
+ scores["partialMatch"] = {
+ "score": data.get("partial_match", 0.0),
+ "result": data.get("result", STATUS_FAIL),
+ "threshold": data.get("threshold", 0.5),
+ "reason": data.get("partial_match_reason", ""),
+ }
+
+ return scores
+
+
+ def convert_result_to_eval_item(result: Dict) -> Dict:
+ """Convert an internal evaluation result dict to a schema-compliant EvalItem."""
+ item: Dict[str, Any] = {
+ "prompt": result["prompt"],
+ "response": result["response"],
+ "expected_response": result["expected_response"],
+ }
+
+ if "evaluators" in result:
+ item["evaluators"] = result["evaluators"]
+ if "evaluators_mode" in result:
+ item["evaluators_mode"] = result["evaluators_mode"]
+
+ scores = _convert_scores_to_schema(result.get("results", {}))
  if scores:
  item["scores"] = scores

  return item


- def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None):
+ def convert_thread_result_to_output(thread_result: Dict) -> Dict:
+ """Convert a multi-turn thread result to the output format."""
+ output_turns = []
+ for turn in thread_result.get("turns", []):
+ output_turn: Dict[str, Any] = {"prompt": turn.get("prompt", "")}
+ if "expected_response" in turn:
+ output_turn["expected_response"] = turn["expected_response"]
+ if "response" in turn:
+ output_turn["response"] = turn["response"]
+ if "status" in turn:
+ output_turn["status"] = turn["status"]
+ if "error" in turn:
+ output_turn["error"] = turn["error"]
+ if "evaluators" in turn:
+ output_turn["evaluators"] = turn["evaluators"]
+ if "evaluators_mode" in turn:
+ output_turn["evaluators_mode"] = turn["evaluators_mode"]
+
+ scores = _convert_scores_to_schema(turn.get("results", {}))
+ if scores:
+ output_turn["scores"] = scores
+
+ output_turns.append(output_turn)
+
+ output: Dict[str, Any] = {}
+ if thread_result.get("name"):
+ output["name"] = thread_result["name"]
+ if thread_result.get("description"):
+ output["description"] = thread_result["description"]
+ if thread_result.get("conversation_id"):
+ output["conversation_id"] = thread_result["conversation_id"]
+ output["turns"] = output_turns
+ if thread_result.get("summary"):
+ output["summary"] = thread_result["summary"]
+
+ return output
+
+
+ def convert_result_to_output_item(result: Dict) -> Dict:
+ """Convert an internal result dict to an output item. Routes by type."""
+ if result.get("type") == "multi_turn":
+ return convert_thread_result_to_output(result)
+ return convert_result_to_eval_item(result)
+
+
+ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
+ default_evaluators: Optional[Dict[str, Any]] = None,
+ agent_name: Optional[str] = None,
+ cli_version: Optional[str] = None):
  """Write results to a schema-compliant eval document JSON file.

  Output follows the eval-document.schema.json format:
- {schemaVersion, metadata, items: [EvalItem]}
+ {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
  """
  try:
  try:
@@ -429,52 +1112,145 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
429
1112
  except Exception:
430
1113
  current_version = "1.0.0"
431
1114
 
432
- items = [convert_result_to_eval_item(r) for r in results]
1115
+ items = [convert_result_to_output_item(r) for r in results]
433
1116
 
434
1117
  metadata: Dict[str, Any] = {
435
1118
  "evaluatedAt": datetime.now(timezone.utc).isoformat(),
436
1119
  }
437
1120
  if agent_id:
438
1121
  metadata["agentId"] = agent_id
1122
+ if agent_name:
1123
+ metadata["agentName"] = agent_name
1124
+ if cli_version:
1125
+ metadata["cliVersion"] = cli_version
439
1126
 
440
1127
  output_data: Dict[str, Any] = {
441
1128
  "schemaVersion": current_version,
442
1129
  "metadata": metadata,
443
- "items": items,
444
1130
  }
445
1131
 
1132
+ if default_evaluators is not None:
1133
+ output_data["default_evaluators"] = default_evaluators
1134
+
1135
+ output_data["items"] = items
1136
+
446
1137
  with open(output_file, 'w', encoding='utf-8') as f:
447
1138
  json.dump(output_data, f, indent=2, ensure_ascii=False)
448
- print(f"Results saved to {output_file}")
1139
+ emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
449
1140
  except Exception as e:
450
- print(f"Error writing to JSON file: {e}")
1141
+ emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
451
1142
  sys.exit(1)
452
1143
 
453
- def write_results_to_csv(results: List[Dict], output_file: str):
1144
+ def _results_to_csv_json(results_dict: Dict) -> str:
1145
+ """Serialize evaluator results dict to a CSV-safe JSON string.
1146
+
1147
+ Skips None (crashed/skipped evaluators). Results are dicts produced
1148
+ by _decorate_metric.
1149
+ """
1150
+ if not results_dict:
1151
+ return ""
1152
+ non_null = {k: v for k, v in results_dict.items() if v is not None}
1153
+ return json.dumps(non_null) if non_null else ""
1154
+
1155
+
1156
+ def write_results_to_csv(results: List[Dict], output_file: str,
1157
+ agent_name: Optional[str] = None, agent_id: Optional[str] = None,
1158
+ cli_version: Optional[str] = None):
454
1159
  """Write results to CSV file."""
455
1160
  try:
456
1161
  with open(output_file, 'w', newline='', encoding='utf-8') as f:
457
1162
  if results:
458
- # Write aggregate statistics first if multiple results
459
- if len(results) > 1:
460
- aggregates = calculate_aggregate_statistics(results)
461
- if aggregates:
1163
+ metadata_parts = []
1164
+ if agent_name:
1165
+ metadata_parts.append(f"Agent Name: {agent_name}")
1166
+ if agent_id:
1167
+ metadata_parts.append(f"Agent ID: {agent_id}")
1168
+ if cli_version:
1169
+ metadata_parts.append(f"CLI Version: {cli_version}")
1170
+ if metadata_parts:
1171
+ f.write(f"# {' | '.join(metadata_parts)}\n")
1172
+
1173
+ aggregates = calculate_aggregate_statistics(results)
1174
+ if aggregates:
1175
+ total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
1176
+ if total_items > 1:
462
1177
  f.write("# AGGREGATE STATISTICS\n")
463
- f.write("Metric,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
1178
+ f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
464
1179
  for metric_name, stats in aggregates.items():
465
1180
  threshold_str = str(stats.get('threshold', 'N/A'))
466
- f.write(f"{metric_name},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
1181
+ prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
1182
+ total_prompts = stats.get('total_prompts', total_items)
1183
+ f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
467
1184
  f.write("\n# INDIVIDUAL RESULTS\n")
468
-
469
- # Write individual results
470
- writer = csv.DictWriter(f, fieldnames=results[0].keys())
471
- writer.writeheader()
472
- writer.writerows(results)
473
- print(f"Results saved to {output_file}")
1185
+
1186
+ single_turn_rows = []
1187
+ multi_turn_rows = []
1188
+ for result in results:
1189
+ if result.get("type") == "multi_turn":
1190
+ thread_name = result.get("name", "")
1191
+ for turn_idx, turn in enumerate(result.get("turns", [])):
1192
+ multi_turn_rows.append({
1193
+ "thread_name": thread_name,
1194
+ "turn_index": turn_idx + 1,
1195
+ "prompt": turn.get("prompt", ""),
1196
+ "response": turn.get("response", ""),
1197
+ "expected_response": turn.get("expected_response", ""),
1198
+ "status": turn.get("status", ""),
1199
+ "error": turn.get("error", ""),
1200
+ "scores": _results_to_csv_json(turn.get("results", {})),
1201
+ })
1202
+ summary = result.get("summary", {})
1203
+ multi_turn_rows.append({
1204
+ "thread_name": thread_name,
1205
+ "turn_index": "summary",
1206
+ "prompt": "",
1207
+ "response": "",
1208
+ "expected_response": "",
1209
+ "status": summary.get("overall_status", ""),
1210
+ "scores": f"{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)} turns passed",
1211
+ })
1212
+ else:
1213
+ exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
1214
+ row = {k: v for k, v in result.items() if k not in exclude_keys}
1215
+ if "results" in result:
1216
+ row["scores"] = _results_to_csv_json(result["results"])
1217
+ single_turn_rows.append(row)
1218
+
1219
+ if single_turn_rows:
1220
+ if multi_turn_rows:
1221
+ f.write("# SINGLE-TURN RESULTS\n")
1222
+ fieldnames = list(single_turn_rows[0].keys())
1223
+ for row in single_turn_rows:
1224
+ for k in row:
1225
+ if k not in fieldnames:
1226
+ fieldnames.append(k)
1227
+ writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
1228
+ writer.writeheader()
1229
+ writer.writerows(single_turn_rows)
1230
+
1231
+ if multi_turn_rows:
1232
+ if single_turn_rows:
1233
+ f.write("\n")
1234
+ f.write("# MULTI-TURN RESULTS\n")
1235
+ fieldnames = ["thread_name", "turn_index", "prompt", "response", "expected_response", "status", "error", "scores"]
1236
+ writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
1237
+ writer.writeheader()
1238
+ writer.writerows(multi_turn_rows)
1239
+ emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
474
1240
  except Exception as e:
475
- print(f"Error writing to CSV file: {e}")
1241
+ emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
476
1242
  sys.exit(1)
477
1243
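Putting the branches above together, the written CSV is laid out roughly as follows for a run that mixes single-turn and multi-turn items (values are illustrative; single-turn columns come from each result dict plus the JSON "scores" column):

    # Agent Name: Contoso Helper | Agent ID: contoso-helper.declarativeAgent | CLI Version: 0.0.0
    # AGGREGATE STATISTICS
    Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold
    exact_match,4,4,75.0,3,1,0.80,N/A

    # INDIVIDUAL RESULTS
    # SINGLE-TURN RESULTS
    prompt,response,expected_response,...,scores
    ...

    # MULTI-TURN RESULTS
    thread_name,turn_index,prompt,response,expected_response,status,error,scores
    onboarding,1,...,...,...,passed,,"{""exact_match"": ...}"
    onboarding,summary,,,,passed,,2/2 turns passed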
 
1244
+ def normalize_agent_id(agent_id):
1245
+ """Append '.declarativeAgent' if agent_id has no '.', else return unchanged.
1246
+
1247
+ Returns the input unchanged when it is None/empty or already contains a dot.
1248
+ """
1249
+ if not agent_id:
1250
+ return agent_id
1251
+ return agent_id if '.' in agent_id else f"{agent_id}.declarativeAgent"
1252
+
1253
+
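Behavior sketch for the helper above (IDs are illustrative):

    normalize_agent_id("contoso-helper")                   # -> "contoso-helper.declarativeAgent"
    normalize_agent_id("contoso-helper.declarativeAgent")  # unchanged: already contains a dot
    normalize_agent_id(None)                               # unchanged: falsy input is returned as-is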
478
1254
  def parse_arguments():
479
1255
  """Parse command line arguments."""
480
1256
  parser = argparse.ArgumentParser(
@@ -503,8 +1279,8 @@ Examples:
503
1279
  # Save results to HTML and open in browser
504
1280
  python main.py --output report.html
505
1281
 
506
- # Verbose output
507
- python main.py --verbose
1282
+ # Debug-level diagnostics
1283
+ python main.py --log-level debug
508
1284
 
509
1285
  # Sign out and clear cached authentication tokens
510
1286
  python main.py --signout
@@ -553,28 +1329,41 @@ Examples:
553
1329
 
554
1330
  # Behavior options
555
1331
  parser.add_argument(
556
- '--verbose',
557
- action='store_true',
558
- help='Enable verbose output'
559
- )
560
- parser.add_argument(
561
- '--quiet',
562
- action='store_true',
563
- help='Suppress non-essential output'
564
- )
565
- parser.add_argument(
566
- '--citation-format',
567
- choices=['oai_unicode', 'legacy_bracket'],
568
- default='oai_unicode',
569
- help='Citation format to detect. "oai_unicode" for new OAI format (default), "legacy_bracket" for old [^i^] format'
1332
+ '--log-level',
1333
+ nargs='?',
1334
+ const='info',
1335
+ action='append',
1336
+ help='Set log verbosity: debug, info, warning, or error. Passing --log-level with no value defaults to info.'
570
1337
  )
1338
+
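With nargs='?', const='info' and action='append', argparse collects the flag into a list; resolve_log_level (called later in main()) is what reduces that list to a single effective level. The raw parse results look like:

    # flag absent          -> args.log_level is None
    # --log-level          -> ['info']   (const is appended)
    # --log-level debug    -> ['debug']
    # repeated flags       -> accumulate, e.g. ['info', 'debug']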
571
1339
  parser.add_argument(
572
1340
  '--signout',
573
1341
  action='store_true',
574
1342
  help='Sign out and clear cached authentication tokens'
575
1343
  )
1344
+
1345
+ parser.add_argument(
1346
+ '--concurrency',
1347
+ type=int,
1348
+ default=5,
1349
+ help='Number of parallel workers for prompt processing (1-5, default: 5)'
1350
+ )
576
1351
 
577
- return parser.parse_args()
1352
+ args = parser.parse_args()
1353
+
1354
+ args.m365_agent_id = normalize_agent_id(args.m365_agent_id)
1355
+
1356
+ if args.concurrency < 1:
1357
+ parser.error('--concurrency must be an integer >= 1.')
1358
+ if args.concurrency > MAX_CONCURRENCY:
1359
+ emit_structured_log(
1360
+ "warning",
1361
+ f"--concurrency {args.concurrency} exceeds max {MAX_CONCURRENCY}; clamping to {MAX_CONCURRENCY}.",
1362
+ operation=Operation.SETUP,
1363
+ )
1364
+ args.concurrency = MAX_CONCURRENCY
1365
+
1366
+ return args
578
1367
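Post-parse validation sketch for --concurrency (MAX_CONCURRENCY is a module-level constant not shown in this hunk):

    # --concurrency 0   -> parser.error(...), argparse exits with status 2
    # --concurrency 3   -> args.concurrency == 3
    # --concurrency 99  -> warning logged, value clamped to MAX_CONCURRENCY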
 
579
1368
  def validate_environment() -> CallPath:
580
1369
  """Validate required environment variables."""
@@ -583,25 +1372,40 @@ def validate_environment() -> CallPath:
583
1372
  "AZURE_AI_API_KEY",
584
1373
  "AZURE_AI_API_VERSION",
585
1374
  "AZURE_AI_MODEL_NAME",
586
- # Chat API specific
587
- "COPILOT_API_ENDPOINT",
588
- "X_SCENARIO_HEADER"
589
1375
  ]
590
1376
 
591
1377
  if os.environ.get("COPILOT_API_ACCESS_TOKEN"):
592
1378
  call_path = CallPath.ACCESS_TOKEN
593
- required_env_vars.append("COPILOT_API_ACCESS_TOKEN")
1379
+ required_env_vars.extend([
1380
+ "COPILOT_API_ACCESS_TOKEN",
1381
+ "COPILOT_API_ENDPOINT",
1382
+ "X_SCENARIO_HEADER",
1383
+ ])
1384
+ elif os.environ.get("WORK_IQ_A2A_ENDPOINT"):
1385
+ call_path = CallPath.A2A
1386
+ required_env_vars.extend([
1387
+ "WORK_IQ_A2A_ENDPOINT",
1388
+ "WORK_IQ_A2A_CLIENT_ID",
1389
+ "TENANT_ID",
1390
+ ])
594
1391
  else:
595
1392
  call_path = CallPath.COPILOT_AUTH
596
1393
  required_env_vars.extend([
1394
+ "COPILOT_API_ENDPOINT",
1395
+ "X_SCENARIO_HEADER",
597
1396
  "M365_EVAL_CLIENT_ID",
598
- "TENANT_ID"
1397
+ "TENANT_ID",
599
1398
  ])
600
1399
 
601
1400
  missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
602
1401
  if missing_vars:
603
- print(f"Error: Missing required environment variables: {', '.join(missing_vars)}")
604
- print("Please ensure your .env file contains all required Azure configuration.")
1402
+ emit_structured_log(
1403
+ "error",
1404
+ "Missing required environment variables: "
1405
+ f"{', '.join(missing_vars)}. Please ensure your .env file contains "
1406
+ "all required Azure configuration.",
1407
+ operation=Operation.VALIDATE_ENV,
1408
+ )
605
1409
  sys.exit(1)
606
1410
  return call_path
607
1411
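Summarizing the branches above, the required variables per call path are (in addition to the AZURE_AI_* variables listed at the top of the function):

    # CallPath.ACCESS_TOKEN  (COPILOT_API_ACCESS_TOKEN set):
    #     COPILOT_API_ACCESS_TOKEN, COPILOT_API_ENDPOINT, X_SCENARIO_HEADER
    # CallPath.A2A           (no access token, WORK_IQ_A2A_ENDPOINT set):
    #     WORK_IQ_A2A_ENDPOINT, WORK_IQ_A2A_CLIENT_ID, TENANT_ID
    # CallPath.COPILOT_AUTH  (neither set):
    #     COPILOT_API_ENDPOINT, X_SCENARIO_HEADER, M365_EVAL_CLIENT_ID, TENANT_ID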
 
@@ -635,78 +1439,58 @@ def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
635
1439
  # Convert other parsing errors to ValueError
636
1440
  raise ValueError(f"Invalid URL format: {url}") from e
637
1441
 
638
- def get_prompt_datasets(args) -> Tuple[List[str], List[str]]:
639
- """Get prompts and expected responses based on command line arguments."""
1442
+ def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
1443
+ """Get prompts and expected responses based on command line arguments.
1444
+
1445
+ Returns:
1446
+ Tuple of (eval_items, default_evaluators).
1447
+ """
640
1448
  if args.prompts:
641
1449
  if args.expected and len(args.prompts) != len(args.expected):
642
- print("Error: Number of prompts must match number of expected responses")
1450
+ emit_structured_log(
1451
+ "error",
1452
+ "Number of prompts must match number of expected responses. "
1453
+ "Update --expected values to match the prompt count.",
1454
+ )
643
1455
  sys.exit(1)
644
- prompts = args.prompts
645
- expected_responses = args.expected or [""] * len(prompts)
1456
+ expected_responses = args.expected or [""] * len(args.prompts)
1457
+ eval_items = [
1458
+ {"prompt": p, "expected_response": e}
1459
+ for p, e in zip(args.prompts, expected_responses)
1460
+ ]
1461
+ return eval_items, None
646
1462
  elif args.prompts_file:
647
- prompts, expected_responses = load_prompts_from_file(args.prompts_file)
1463
+ return load_prompts_from_file(args.prompts_file)
648
1464
  elif args.interactive:
649
1465
  prompts, expected_responses = get_interactive_prompts()
1466
+ eval_items = [
1467
+ {"prompt": p, "expected_response": e}
1468
+ for p, e in zip(prompts, expected_responses)
1469
+ ]
1470
+ return eval_items, None
650
1471
  else:
651
- # Use default prompts
652
1472
  prompts, expected_responses = get_default_prompts_and_responses()
653
-
654
- return prompts, expected_responses
1473
+ eval_items = [
1474
+ {"prompt": p, "expected_response": e}
1475
+ for p, e in zip(prompts, expected_responses)
1476
+ ]
1477
+ return eval_items, None
655
1478
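Shape sketch of the return value for the --prompts path above; values are illustrative. Items loaded from a prompts file can additionally carry a "turns" list for multi-turn threads, and only that path can supply default_evaluators (see load_prompts_from_file):

    eval_items, default_evaluators = get_prompt_datasets(args)
    # eval_items          -> [{"prompt": "What is our PTO policy?", "expected_response": "..."}]
    # default_evaluators  -> None for --prompts, --interactive and the built-in defaults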
 
656
- def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
657
- """
658
- Fetch available agents for the user from the Copilot API.
659
-
660
- Args:
661
- access_token: Bearer token for API authentication
662
- user_oid: User object ID for agent filtering
663
-
664
- Returns:
665
- List of agent dictionaries.
666
- """
667
- request_headers = {
668
- "Content-Type": "application/json",
669
- "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
670
- "Authorization": f"Bearer {access_token}"
671
- }
672
-
673
- try:
674
- # Build the query parameter with participant info
675
- request_data = json.dumps({"participant": {"id": user_oid}})
676
- query_param = urllib.parse.quote(request_data)
677
-
678
- # Try to fetch agents from /GetGptList endpoint
679
- req = urllib.request.Request(
680
- f"{copilot_api_endpoint}/GetGptList?request={query_param}",
681
- headers=request_headers,
682
- method="GET"
683
- )
684
- with urllib.request.urlopen(req, timeout=120) as resp:
685
- data = json.loads(resp.read().decode("utf-8"))
686
- agents = data.get("gptList", [])
687
- return agents
688
- except urllib.error.HTTPError as e:
689
- # If endpoint doesn't exist or returns error, return empty list
690
- print(f"Warning: Unable to fetch agents list (HTTP {e.code}).")
691
- return []
692
- except Exception as e:
693
- print(f"Warning: Error fetching agents: {e}")
694
- return []
695
-
696
- def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
1479
+ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
697
1480
  """
698
1481
  Display an interactive agent selector using questionary.
699
-
1482
+
700
1483
  Args:
701
1484
  agents: List of agent dictionaries.
702
-
1485
+
703
1486
  Returns:
704
- Selected agent ID or None if cancelled/skipped
1487
+ Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
705
1488
  """
706
1489
  if not agents:
707
- return None
708
-
709
- # Create choices for questionary
1490
+ return None, None
1491
+
1492
+ # Build id→name lookup and choices
1493
+ id_to_name: Dict[str, str] = {}
710
1494
  choices = []
711
1495
  sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
712
1496
  for agent in sorted_agents:
@@ -714,12 +1498,13 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
714
1498
  agent_name = agent.get("name", "Unknown")
715
1499
  agent_description = agent.get("description", "Unknown")
716
1500
  agent_is_owner = agent.get('isOwner')
717
-
1501
+ id_to_name[agent_id] = agent_name
1502
+
718
1503
  # Format the display text
719
1504
  display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
720
-
1505
+
721
1506
  choices.append(questionary.Choice(title=display_text, value=agent_id))
722
-
1507
+
723
1508
  # Display the selection prompt
724
1509
  selected_agent = questionary.select(
725
1510
  "Select an agent to evaluate:",
@@ -727,238 +1512,260 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
727
1512
  use_shortcuts=True,
728
1513
  use_arrow_keys=True
729
1514
  ).ask()
730
-
731
- return selected_agent
732
-
733
- @functools.lru_cache(maxsize=1)
734
- def _get_iana_timezone_name() -> str:
735
- """Get the IANA timezone name from the system using tzlocal.
736
-
737
- Tries get_localzone_name() first; falls back to str(get_localzone()) when the
738
- former raises (e.g. no zone configured on some Unix systems). Result is cached
739
- after the first call so tzlocal is only invoked once per session.
740
- """
741
- try:
742
- return tzlocal.get_localzone_name()
743
- except Exception:
744
- return str(tzlocal.get_localzone())
745
-
746
-
747
- @functools.lru_cache(maxsize=1)
748
- def _get_location_info() -> Dict[str, Any]:
749
- """Return a locationInfo dict containing the local UTC offset and IANA timezone name.
750
1515
 
751
- Result is cached after the first call so the computation runs only once per session.
752
- """
753
- now = datetime.now().astimezone()
754
- utc_offset = now.utcoffset()
755
- offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
756
- return {
757
- "timeZoneOffset": offset_hours,
758
- "timeZone": _get_iana_timezone_name(),
759
- }
1516
+ return selected_agent, (id_to_name.get(selected_agent) if selected_agent else None)
760
1517
 
761
1518
 
762
- def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
763
- message = {
764
- "message": {
765
- "text": prompt,
766
- "author": "user",
767
- "messageType": "chat",
768
- "timestamp": datetime.now(timezone.utc).isoformat(),
769
- "locationInfo": _get_location_info(),
770
- "from": {
771
- "id": user_oid,
772
- }
773
- },
774
- "verbosity": "verbose", # To enable detailed telemetry in response (to extract tool usage, etc.)
775
- }
776
-
777
- if agent_id:
778
- message["gpts"] = [
779
- {
780
- "id": agent_id.strip(),
781
- "source": "MOS3"
782
- }
783
- ]
784
- message["optionsSets"] = [
785
- "disable_action_confirmation" # Disable 3P action confirmation prompts for agents while scraping
786
- ]
787
-
788
- return json.dumps(message).encode("utf-8")
789
-
790
- def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> Dict[str, Dict[str, any]]:
791
- """ Send prompts to the chat API and return enhanced responses. """
792
-
793
- request_headers = {
794
- "Content-Type": "application/json",
795
- "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
796
- "Authorization": f"Bearer {access_token}"
797
- }
798
- raw_responses: Dict[str, str] = {}
799
- for i, prompt in enumerate(prompts, 1):
800
- if not args.quiet:
801
- print(f"Processing prompt {i}/{len(prompts)}...")
802
-
803
- # Build the payload
804
- payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
805
- if args.verbose:
806
- print(f"[Sydney] Sending payload: {payload.decode('utf-8')}")
807
-
808
- # Send the request to /chat
809
- req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
810
- try:
811
- with urllib.request.urlopen(req, timeout=120) as resp:
812
- raw = resp.read().decode("utf-8", errors="replace")
813
- except urllib.error.HTTPError as e:
814
- error_body = None
815
- try:
816
- error_body = e.read().decode("utf-8", errors="replace")
817
- except Exception:
818
- pass
819
- msg = f"Chat API request failed (HTTP {e.code} {e.reason})."
820
- if error_body:
821
- msg += f" Body: {error_body[:500]}"
822
- raise RuntimeError(msg) from e
823
- except urllib.error.URLError as e:
824
- raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
825
-
826
- if args.verbose:
827
- print(f"[Sydney] Raw response: {raw}")
828
-
829
- # Store raw response for enhancement
830
- raw_responses[prompt] = raw.strip()
831
-
832
- # Extract enhanced responses using the new extractor
833
- enhanced_responses = extract_enhanced_responses(raw_responses)
834
- return enhanced_responses
835
-
836
- def output_results(results: List[Dict], args):
1519
+ def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
1520
+ agent_name: Optional[str] = None, cli_version: Optional[str] = None):
837
1521
  """Output results based on specified format."""
1522
+ metadata_kwargs = dict(
1523
+ agent_name=agent_name,
1524
+ agent_id=getattr(args, 'm365_agent_id', None),
1525
+ cli_version=cli_version,
1526
+ )
838
1527
  if args.output:
839
1528
  output_lower = args.output.lower()
840
1529
  if output_lower.endswith('.json'):
841
- write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
1530
+ write_results_to_json(results, args.output, default_evaluators=default_evaluators,
1531
+ **metadata_kwargs)
842
1532
  elif output_lower.endswith('.csv'):
843
- write_results_to_csv(results, args.output)
1533
+ write_results_to_csv(results, args.output, **metadata_kwargs)
844
1534
  elif output_lower.endswith('.html'):
845
- write_results_to_html(results, args.output)
1535
+ write_results_to_html(results, args.output, **metadata_kwargs)
846
1536
  abs_path = os.path.abspath(args.output)
847
1537
  webbrowser.open(f'file://{abs_path}')
848
1538
  else:
849
- write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
1539
+ write_results_to_json(results, args.output, default_evaluators=default_evaluators,
1540
+ **metadata_kwargs)
850
1541
  else:
851
- write_results_to_console(results)
1542
+ write_results_to_console(results, **metadata_kwargs)
852
1543
 
853
1544
  def main():
854
1545
  """Main function to orchestrate the evaluation process."""
855
1546
  load_dotenv()
856
1547
  args = parse_arguments()
857
1548
 
1549
+ effective_log_level, error_message = resolve_log_level(args.log_level)
1550
+ if error_message:
1551
+ print(error_message)
1552
+ print(
1553
+ "Next step: rerun with --log-level {debug|info|warning|error}. "
1554
+ "For support, share the console diagnostics output from this run."
1555
+ )
1556
+ sys.exit(2)
1557
+
1558
+ args.effective_log_level = effective_log_level
1559
+ configure_cli_logging(effective_log_level)
1560
+ emit_structured_log("info", f"Log level set to '{effective_log_level}'.", operation=Operation.SETUP)
1561
+
858
1562
  # Check minimum version before proceeding
859
- cli_version = get_cli_version(quiet=args.quiet)
860
- if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=args.quiet):
1563
+ quiet_for_version = effective_log_level in ("warning", "error")
1564
+ cli_version = get_cli_version(quiet=quiet_for_version)
1565
+ if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=quiet_for_version):
861
1566
  sys.exit(1)
862
1567
 
863
1568
  # Validate environment variables required for evaluation
864
1569
  call_path = validate_environment()
865
- copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
866
- validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
867
1570
 
868
1571
  user_oid = ""
869
1572
 
870
- if call_path == CallPath.ACCESS_TOKEN:
871
- access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
872
- else:
873
- scopes_str = os.environ.get(
874
- "COPILOT_SCOPES", "https://substrate.office.com/sydney/.default"
875
- )
876
-
877
- auth_handler = AuthHandler(
878
- client_id=os.environ["M365_EVAL_CLIENT_ID"],
879
- tenant_id=os.environ["TENANT_ID"],
880
- scopes_str=scopes_str
881
- )
882
-
883
- # Signout user
884
- if args.signout:
885
- try:
886
- auth_handler.clear_cache()
887
- except Exception as e:
888
- print(f"Error during signout: {e}")
889
- sys.exit(1)
890
- sys.exit(0)
891
-
892
- # Authenticate before loading prompts
893
- try:
894
- auth_result = auth_handler.acquire_token_interactive() or {}
895
- access_token = auth_result.get("access_token") or ""
896
- if not access_token:
897
- raise RuntimeError("Failed to acquire access token from authentication result")
898
-
899
- id_token_claims = auth_result.get("id_token_claims")
900
- if not isinstance(id_token_claims, dict):
901
- print("id_token_claims is missing or invalid in authentication result")
902
- else:
903
- user_oid = id_token_claims.get("oid") or ""
904
-
905
- except Exception as e:
906
- print(f"\033[91mError during authentication: {e}\033[0m")
907
- if args.verbose:
908
- import traceback
909
- traceback.print_exc()
910
- sys.exit(1)
1573
+ match call_path:
1574
+ case CallPath.ACCESS_TOKEN:
1575
+ access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
1576
+ user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
1577
+ copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
1578
+ validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
1579
+ agent_client = SydneyClient(
1580
+ copilot_api_endpoint=copilot_api_endpoint,
1581
+ access_token=access_token,
1582
+ user_oid=user_oid,
1583
+ logger=CLI_LOGGER,
1584
+ diagnostic_records=DIAGNOSTIC_RECORDS,
1585
+ )
911
1586
 
912
- if not user_oid and access_token:
913
- # Fallback: extract from access token.
914
- user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
1587
+ case CallPath.A2A:
1588
+ emit_structured_log(
1589
+ "warning",
1590
+ "The A2A endpoint is experimental and may change without notice.",
1591
+ operation=Operation.SETUP,
1592
+ )
1593
+ a2a_endpoint = os.environ["WORK_IQ_A2A_ENDPOINT"]
1594
+ validate_endpoint_url(a2a_endpoint, ALLOWED_ENDPOINTS)
1595
+
1596
+ a2a_scopes_str = os.environ.get("WORK_IQ_A2A_SCOPES", "")
1597
+ a2a_auth_handler = AuthHandler(
1598
+ client_id=os.environ["WORK_IQ_A2A_CLIENT_ID"],
1599
+ tenant_id=os.environ["TENANT_ID"],
1600
+ scopes_str=a2a_scopes_str,
1601
+ )
1602
+ try:
1603
+ a2a_auth_result = a2a_auth_handler.acquire_token_interactive() or {}
1604
+ a2a_access_token = a2a_auth_result.get("access_token") or ""
1605
+ if not a2a_access_token:
1606
+ raise RuntimeError("Failed to acquire A2A access token")
1607
+ except Exception as e:
1608
+ emit_structured_log(
1609
+ "error",
1610
+ f"Error during A2A authentication: {e}",
1611
+ operation=Operation.AUTHENTICATE,
1612
+ )
1613
+ if effective_log_level == "debug":
1614
+ import traceback
1615
+ traceback.print_exc()
1616
+ sys.exit(1)
1617
+ try:
1618
+ agent_client = A2AClient(
1619
+ a2a_endpoint=a2a_endpoint,
1620
+ access_token=a2a_access_token,
1621
+ logger=CLI_LOGGER,
1622
+ diagnostic_records=DIAGNOSTIC_RECORDS,
1623
+ )
1624
+ except Exception as e:
1625
+ emit_structured_log(
1626
+ "error",
1627
+ f"Failed to initialize A2A client: {e}",
1628
+ operation=Operation.SETUP,
1629
+ )
1630
+ sys.exit(1)
1631
+
1632
+ case CallPath.COPILOT_AUTH:
1633
+ copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
1634
+ validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
1635
+ auth_handler = AuthHandler(
1636
+ client_id=os.environ["M365_EVAL_CLIENT_ID"],
1637
+ tenant_id=os.environ["TENANT_ID"],
1638
+ scopes_str=os.environ.get("COPILOT_SCOPES", ""),
1639
+ )
1640
+
1641
+ # Signout user
1642
+ if args.signout:
1643
+ try:
1644
+ auth_handler.clear_cache()
1645
+ except Exception as e:
1646
+ emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
1647
+ sys.exit(1)
1648
+ sys.exit(0)
915
1649
 
916
- # 1. Load evaluation datasets (prompts and expected_responses)
917
- prompts, expected_responses = get_prompt_datasets(args)
1650
+ try:
1651
+ auth_result = auth_handler.acquire_token_interactive() or {}
1652
+ access_token = auth_result.get("access_token") or ""
1653
+ if not access_token:
1654
+ raise RuntimeError("Failed to acquire access token from authentication result")
1655
+
1656
+ id_token_claims = auth_result.get("id_token_claims")
1657
+ if not isinstance(id_token_claims, dict):
1658
+ emit_structured_log(
1659
+ "warning", "id_token_claims is missing or invalid in authentication result",
1660
+ operation=Operation.AUTHENTICATE,
1661
+ )
1662
+ else:
1663
+ user_oid = id_token_claims.get("oid") or ""
1664
+
1665
+ if not user_oid:
1666
+ user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
918
1667
 
919
- if not args.quiet:
920
- print(f"Running evaluation on {len(prompts)} prompt(s)...")
1668
+ except Exception as e:
1669
+ emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
1670
+ if effective_log_level == "debug":
1671
+ import traceback
1672
+ traceback.print_exc()
1673
+ sys.exit(1)
1674
+
1675
+ agent_client = SydneyClient(
1676
+ copilot_api_endpoint=copilot_api_endpoint,
1677
+ access_token=access_token,
1678
+ user_oid=user_oid,
1679
+ logger=CLI_LOGGER,
1680
+ diagnostic_records=DIAGNOSTIC_RECORDS,
1681
+ )
921
1682
 
1683
+ # 1. Load evaluation datasets
1684
+ eval_items, file_default_evaluators = get_prompt_datasets(args)
1685
+ default_evaluators = resolve_default_evaluators(file_default_evaluators)
1686
+
1687
+ if effective_log_level in ("info", "debug"):
1688
+ multi_turn_count = sum(1 for item in eval_items if "turns" in item)
1689
+ single_turn_count = len(eval_items) - multi_turn_count
1690
+ emit_structured_log(
1691
+ "info",
1692
+ f"Running evaluation on {len(eval_items)} item(s) "
1693
+ f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
1694
+ operation=Operation.SETUP,
1695
+ )
1696
+
1697
+ agent_name = None
922
1698
  try:
923
- # 3. Agent selection - if no agent ID provided, prompt user to select
1699
+ # 2. Agent selection - when no agent ID is provided, discover agents
1700
+ # via the active client (A2A or REST) and prompt interactively.
924
1701
  if not args.m365_agent_id:
925
- if not args.quiet:
926
- print("No agent ID provided. Fetching available agents...")
927
-
928
- available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
929
- if not available_agents:
930
- print("No agents are available for interactive selection. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
931
- sys.exit(1)
932
-
933
- if available_agents:
934
- selected_agent_id = select_agent_interactively(available_agents)
935
- if selected_agent_id:
936
- args.m365_agent_id = selected_agent_id
937
- if not args.quiet:
938
- print(f"Selected agent: {args.m365_agent_id}")
939
- else:
940
- print("No agent selected. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
941
- sys.exit(1)
1702
+ if effective_log_level in ("info", "debug"):
1703
+ emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)
942
1704
 
943
- # 4. Send prompts to chat API
944
- responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
1705
+ available_agents = agent_client.fetch_available_agents()
1706
+ if not available_agents:
1707
+ emit_structured_log(
1708
+ "error",
1709
+ "No agents are available for interactive selection. Re-run with "
1710
+ "--m365-agent-id or set M365_AGENT_ID.",
1711
+ operation=Operation.FETCH_AGENTS,
1712
+ )
1713
+ sys.exit(1)
1714
+
1715
+ selected_agent_id, agent_name = select_agent_interactively(available_agents)
1716
+ if selected_agent_id:
1717
+ args.m365_agent_id = selected_agent_id
1718
+ if effective_log_level in ("info", "debug"):
1719
+ emit_structured_log("info", f"Selected agent: {args.m365_agent_id}", operation=Operation.FETCH_AGENTS)
1720
+ else:
1721
+ emit_structured_log(
1722
+ "error",
1723
+ "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
1724
+ operation=Operation.FETCH_AGENTS,
1725
+ )
1726
+ sys.exit(1)
945
1727
  except Exception as e:
946
- print(f"\033[91mError sending prompts to chat API: {e}\033[0m")
947
- if args.verbose:
1728
+ emit_structured_log("error", f"Error during agent discovery: {e}", operation=Operation.FETCH_AGENTS)
1729
+ if effective_log_level == "debug":
948
1730
  import traceback
949
1731
  traceback.print_exc()
950
1732
  sys.exit(1)
1733
+
1734
+ # Pre-resolve agent endpoint (A2A agent card lookup; no-op for REST)
1735
+ if args.m365_agent_id:
1736
+ agent_client.resolve_agent(args.m365_agent_id)
1737
+
1738
+ # 3. Build pipeline config and run evaluation pipeline
1739
+ model_config = AzureOpenAIModelConfiguration(
1740
+ azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
1741
+ api_key=os.environ.get("AZURE_AI_API_KEY"),
1742
+ api_version=os.environ.get("AZURE_AI_API_VERSION"),
1743
+ azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
1744
+ )
1745
+ has_azure_openai = bool(
1746
+ os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
1747
+ and os.environ.get("AZURE_AI_API_KEY")
1748
+ )
1749
+
1750
+ pipeline = PipelineConfig(
1751
+ agent_client=agent_client,
1752
+ model_config=model_config,
1753
+ has_azure_openai=has_azure_openai,
1754
+ default_evaluators=default_evaluators,
1755
+ )
1756
+
1757
+ results = run_pipeline(pipeline, eval_items, args)
951
1758
 
952
- # 5. Run evaluations
953
- if not args.quiet:
954
- print("Running evaluations...")
955
- results = run_evaluations(args, responses, expected_responses)
956
-
957
- # 6. Output results
958
- output_results(results, args)
1759
+ # 4. Output results
1760
+ output_results(results, args, default_evaluators=default_evaluators,
1761
+ agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
959
1762
 
960
- if not args.quiet:
961
- print(f"\nEvaluation completed successfully! Processed {len(prompts)} prompt(s).")
1763
+ if effective_log_level in ("info", "debug"):
1764
+ emit_structured_log(
1765
+ "info",
1766
+ f"Evaluation completed successfully. Processed {len(eval_items)} item(s).",
1767
+ operation=Operation.EVALUATE,
1768
+ )
962
1769
 
963
1770
  # Call the main function when script is run directly
964
1771
  if __name__ == "__main__":