@microsoft/m365-copilot-eval 1.1.1-preview.1 → 1.2.1-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,10 @@
  """
  CitationsEvaluator - A custom evaluator for analyzing citations in M365 Copilot responses.
  
- This evaluator uses regex-based pattern matching to detect citations in two formats:
- 1. New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
- 2. Old format: [^i^] where i is the citation index
+ This evaluator uses regex-based pattern matching to detect citations in three modes:
+ 1. OAI_UNICODE: New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
+ 2. LEGACY_BRACKET: Old format: [^i^] where i is the citation index
+ 3. AUTO: Automatically detects both formats simultaneously
  
  Where X, Y, and i are natural numbers representing conversation turn, search result index, or citation index.
  """
@@ -17,48 +18,60 @@ class CitationFormat(Enum):
      """Enum for different citation formats supported by the evaluator."""
      OAI_UNICODE = "oai_unicode" # New format: \ue200cite\ue202turn{X}search{Y}\ue201
      LEGACY_BRACKET = "legacy_bracket" # Old format: [^i^]
+     AUTO = "auto" # Automatically detect both formats
  
  
  class CitationsEvaluator:
      """
      A custom evaluator that analyzes citations in response text without using an LLM.
-
+
      This evaluator detects citation patterns and returns:
      - Whether at least one citation is present
      - The number of unique citations found
-
-     Supports both new OAI unicode format and legacy bracket format.
+
+     Supports three modes:
+     - OAI_UNICODE: Detects only OAI unicode format citations
+     - LEGACY_BRACKET: Detects only legacy bracket format citations
+     - AUTO: Automatically detects both formats simultaneously
      """
  
      def __init__(self, citation_format: CitationFormat = CitationFormat.OAI_UNICODE):
          """
          Initialize the CitationsEvaluator with the specified citation format.
-
+
          Args:
              citation_format (CitationFormat): The format of citations to detect.
                  Defaults to OAI_UNICODE format.
          """
          self.citation_format = citation_format
-
+
+         oai_pattern = r'\ue200cite\ue202turn\d+search\d+\ue201'
+         legacy_pattern = r'\[\^\d+\^\]'
+
          if citation_format == CitationFormat.OAI_UNICODE:
              # Pattern to match citations: \ue200cite\ue202turn{number}search{number}\ue201
-             self.citation_pattern = r'\ue200cite\ue202turn\d+search\d+\ue201'
+             self.citation_pattern = oai_pattern
          elif citation_format == CitationFormat.LEGACY_BRACKET:
              # Pattern to match citations: [^number^]
-             self.citation_pattern = r'\[\^\d+\^\]'
+             self.citation_pattern = legacy_pattern
+         elif citation_format == CitationFormat.AUTO:
+             # Auto-detect both formats using alternation (|)
+             # Matches either OAI unicode OR legacy bracket format
+             self.citation_pattern = rf'(?:{oai_pattern})|(?:{legacy_pattern})'
          else:
              raise ValueError(f"Unsupported citation format: {citation_format}")
-
+
+         # Compile the pattern once after determining which format to use
          self.compiled_pattern = re.compile(self.citation_pattern)
  
      def __call__(self, *, response: str, **kwargs) -> Dict[str, Any]:
          """
          Evaluate the response text for citations.
-
+
          Args:
              response (str): The response text from the M365 Copilot agent
              **kwargs: Additional keyword arguments (not used but kept for compatibility)
-
+
          Returns:
              Dict[str, Any]: Evaluation results containing:
                  - citation_format (str): The format used for detection
@@ -69,40 +82,71 @@ class CitationsEvaluator:
          """
          if not isinstance(response, str):
              response = str(response) if response is not None else ""
-
-         # Find all citation matches
+
+         # Find all citations and get unique ones (same for all modes)
          citation_matches = self.compiled_pattern.findall(response)
-
-         # Get unique citations (remove duplicates)
          unique_citations = list(set(citation_matches))
-
-         # Extract citation identifiers for reporting
+
+         # Initialize citation details list (used by all modes)
          citation_details = []
+
+         # Initialize counters only for AUTO mode
+         if self.citation_format == CitationFormat.AUTO:
+             oai_count = 0
+             legacy_count = 0
+
+         # Process all citations (unified extraction logic)
          for citation in unique_citations:
-             if self.citation_format == CitationFormat.OAI_UNICODE:
-                 # Extract the turn and search numbers from the citation
+             # Determine citation type and extract details
+             if '\ue200' in citation:
+                 # OAI format (contains start marker)
                  turn_search_match = re.search(r'turn(\d+)search(\d+)', citation)
                  if turn_search_match:
                      turn_num = turn_search_match.group(1)
                      search_num = turn_search_match.group(2)
-                     citation_details.append(f"turn{turn_num}search{search_num}")
-             elif self.citation_format == CitationFormat.LEGACY_BRACKET:
-                 # Extract the citation number from [^number^]
+
+                     # Add appropriate prefix based on mode
+                     if self.citation_format == CitationFormat.AUTO:
+                         citation_details.append(f"oai:turn{turn_num}search{search_num}")
+                         oai_count += 1
+                     else: # OAI_UNICODE mode
+                         citation_details.append(f"turn{turn_num}search{search_num}")
+             else:
+                 # Legacy bracket format
                  bracket_match = re.search(r'\[\^(\d+)\^\]', citation)
                  if bracket_match:
                      citation_num = bracket_match.group(1)
-                     citation_details.append(f"citation{citation_num}")
-
-         # Prepare results in a format compatible with the HTML report generator
+
+                     # Add appropriate prefix based on mode
+                     if self.citation_format == CitationFormat.AUTO:
+                         citation_details.append(f"legacy:citation{citation_num}")
+                         legacy_count += 1
+                     else: # LEGACY_BRACKET mode
+                         citation_details.append(f"citation{citation_num}")
+
+         format_info = None
+         if self.citation_format == CitationFormat.AUTO:
+             format_info = f"OAI: {oai_count}, Legacy: {legacy_count}"
+
+         # Build results (common for all modes)
+         total_citations = len(unique_citations)
+
+         # Construct reason string with optional format info
+         reason_parts = [f"Found {total_citations} unique citation(s)"]
+         if format_info:
+             reason_parts.append(f"[{format_info}]:")
+         else:
+             reason_parts.append(":")
+         reason_parts.append(', '.join(citation_details) if citation_details else 'None')
+
          results = {
              "citation_format": self.citation_format.value,
-             # HTML report compatible fields
-             "score": len(unique_citations), # Use citation count as the score
-             "result": "pass" if len(unique_citations) > 0 else "fail", # Pass if citations found
-             "threshold": 1, # Threshold of 1 citation minimum
-             "reason": f"Found {len(unique_citations)} unique citation(s): {', '.join(citation_details) if citation_details else 'None'}"
+             "score": total_citations,
+             "result": "pass" if total_citations > 0 else "fail",
+             "threshold": 1,
+             "reason": " ".join(reason_parts)
          }
-
+
          return results
  
      def get_name(self) -> str:
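
For quick reference, a minimal standalone sketch of the AUTO-mode alternation introduced above; the sample response text is invented, and the two patterns are copied verbatim from the evaluator. It shows that a single `findall` pass picks up both citation formats:

```python
# Minimal sketch of the AUTO-mode alternation shown in the diff above.
# The sample text is illustrative; the patterns mirror the evaluator's.
import re

oai_pattern = r'\ue200cite\ue202turn\d+search\d+\ue201'
legacy_pattern = r'\[\^\d+\^\]'
auto_pattern = re.compile(rf'(?:{oai_pattern})|(?:{legacy_pattern})')

sample = (
    "Files shared in Teams are stored in SharePoint"
    "\ue200cite\ue202turn0search1\ue201 or OneDrive.[^2^]"
)
matches = auto_pattern.findall(sample)
print(len(set(matches)))  # 2 -> one OAI unicode citation and one legacy bracket citation
```
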
@@ -3,6 +3,7 @@ import os
  import argparse
  import sys
  import csv
+ import functools
  import webbrowser
  import urllib.request
  import urllib.error
@@ -24,7 +25,11 @@ from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFor
  #from custom_evaluators.PII.PII import PIIEvaluator
  from generate_report import generate_html_report, calculate_aggregate_statistics
  from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
+ from schema_handler import DocumentUpgrader, SchemaVersionManager
+ from version_check import check_min_version, get_cli_version
  from datetime import datetime, timezone
+ from pathlib import Path
+ import tzlocal
  
  # Allowed endpoints for URL validation
  ALLOWED_ENDPOINTS = [
@@ -36,6 +41,18 @@ class CallPath(Enum):
      ACCESS_TOKEN = "access_token"
      COPILOT_AUTH = "copilot_auth"
  
+
+ # Flags that should bypass remote min-version enforcement.
+ # --help is not needed here because argparse exits before runtime checks.
+ VERSION_CHECK_BYPASS_FLAGS = (
+     "signout",
+ )
+
+
+ def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
+     """Return True if the current invocation should skip min-version checks."""
+     return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
+
  def write_results_to_html(results: List[Dict], output_file: str):
      """Write results to HTML file using generate_html_report from generate_report.py."""
      try:
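
A quick illustration of the bypass logic added above (standalone sketch; the Namespace values are made up). Only flags named in VERSION_CHECK_BYPASS_FLAGS skip the min-version check:

```python
# Standalone sketch mirroring should_bypass_min_version_check above.
import argparse

VERSION_CHECK_BYPASS_FLAGS = ("signout",)

def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
    return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)

print(should_bypass_min_version_check(argparse.Namespace(signout=True)))   # True  -> skip check
print(should_bypass_min_version_check(argparse.Namespace(signout=False)))  # False -> enforce min version
```
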
@@ -58,23 +75,68 @@ def get_default_prompts_and_responses():
      return prompts, expected_responses
  
  def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
-     """Load prompts and expected responses from a JSON file."""
+     """Load prompts and expected responses from a JSON file.
+
+     Supports three formats:
+     1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
+     2. Array format: [{"prompt": "...", "expected_response": "..."}]
+     3. Dict format: {"prompts": [...], "expected_responses": [...]}
+
+     For eval documents (format 1) and array format (format 2), schema validation
+     and auto-upgrade are applied via DocumentUpgrader.
+     """
      try:
          with open(file_path, 'r', encoding='utf-8') as f:
              data = json.load(f)
-
+
+         # Detect if this is an eval document (has "items" key) or could be upgraded
+         is_eval_document = (
+             isinstance(data, dict) and "items" in data
+         ) or isinstance(data, list)
+
+         # Run schema validation and auto-upgrade for eval documents
+         if is_eval_document:
+             try:
+                 upgrader = DocumentUpgrader()
+             except Exception as e:
+                 # Schema infrastructure not available (missing files, etc.) — skip
+                 print(f"Warning: Unable to initialize document upgrader: {e}")
+                 upgrader = None
+
+             if upgrader is not None:
+                 result = upgrader.upgrade(Path(file_path))
+
+                 if result.error:
+                     print(f"Schema validation error: {result.error}")
+                     sys.exit(1)
+
+                 if result.upgraded and result.message:
+                     print(result.message)
+
+                 # Use the parsed document from the upgrade result
+                 if result.document is not None:
+                     data = result.document
+
          if isinstance(data, list):
              # Format: [{"prompt": "...", "expected_response": "..."}, ...]
              prompts = [item.get("prompt", "") for item in data]
              expected_responses = [item.get("expected_response", "") for item in data]
          elif isinstance(data, dict):
-             # Format: {"prompts": [...], "expected_responses": [...]}
-             prompts = data.get("prompts", [])
-             expected_responses = data.get("expected_responses", [])
+             if "items" in data:
+                 # Eval document format: {"schemaVersion": "...", "items": [...]}
+                 items = data["items"]
+                 prompts = [item.get("prompt", "") for item in items]
+                 expected_responses = [item.get("expected_response", "") for item in items]
+             else:
+                 # Format: {"prompts": [...], "expected_responses": [...]}
+                 prompts = data.get("prompts", [])
+                 expected_responses = data.get("expected_responses", [])
          else:
              raise ValueError("Invalid file format")
-
+
          return prompts, expected_responses
+     except SystemExit:
+         raise
      except Exception as e:
          print(f"Error loading prompts from file: {e}")
          sys.exit(1)
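
A small sketch of how the array format and the new eval-document format handled above reduce to the same prompt/expected-response lists. It is simplified: schema validation and the {"prompts": [...]} dict branch are omitted, and the data values are illustrative:

```python
# Simplified sketch of the list vs. eval-document parsing branches above.
legacy_array = [
    {"prompt": "What is Microsoft 365?", "expected_response": "A cloud productivity suite..."},
]
eval_document = {
    "schemaVersion": "1.0.0",
    "items": [
        {"prompt": "What is Microsoft 365?", "expected_response": "A cloud productivity suite..."},
    ],
}

def extract(data):
    items = data["items"] if isinstance(data, dict) else data
    prompts = [item.get("prompt", "") for item in items]
    expected = [item.get("expected_response", "") for item in items]
    return prompts, expected

assert extract(legacy_array) == extract(eval_document)
```
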
@@ -181,7 +243,7 @@ def run_evaluations(args, responses: dict, expected_responses: list) -> list:
          )
  
          tool_call_accuracy = None
-         if args.agent_id and enhanced_response.get("tool_definitions"):
+         if args.m365_agent_id and enhanced_response.get("tool_definitions"):
              tool_call_accuracy = tool_call_accuracy_evaluator(
                  query=prompt,
                  response=enhanced_response.get("response", actual_response_text),
@@ -267,18 +329,120 @@ def write_results_to_console(results):
              print(f"{BOLD}{color}{name}:{RESET} {v}")
          print(f"{BLUE}{'-' * 30}{RESET}")
  
- def write_results_to_json(results: List[Dict], output_file: str):
-     """Write results to JSON file."""
+ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
+     """Extract an EvalScore object from a decorated metric dict.
+
+     Maps internal decorated-metric format to schema EvalScore:
+     {score, result, threshold} (required) + reason, evaluator (optional).
+     """
+     DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this
+
+     score_val = None
+     for k in (metric_id, f"{metric_id}_score", "score", "value"):
+         if k in data and isinstance(data[k], (int, float)):
+             score_val = data[k]
+             break
+     if score_val is None:
+         return None
+
+     result = data.get("result")
+     if result not in ("pass", "fail"):
+         result = "pass" if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else "fail"
+
+     eval_score: Dict[str, Any] = {
+         "score": score_val,
+         "result": result,
+         "threshold": data.get("threshold", DEFAULT_THRESHOLD),
+     }
+     reason = data.get(f"{metric_id}_reason") or data.get("reason")
+     if reason:
+         eval_score["reason"] = reason
+     return eval_score
+
+
+ def convert_result_to_eval_item(result: Dict) -> Dict:
+     """Convert an internal evaluation result dict to a schema-compliant EvalItem.
+
+     Internal format (from run_evaluations):
+         {prompt, response, expected_response, results: {relevance_score: "JSON", ...}}
+     Schema EvalItem format:
+         {prompt, response, expected_response, scores: {relevance: EvalScore, ...}}
+     """
+     item: Dict[str, Any] = {
+         "prompt": result["prompt"],
+         "response": result["response"],
+         "expected_response": result["expected_response"],
+     }
+
+     scores: Dict[str, Any] = {}
+     results_dict = result.get("results", {})
+
+     # EvalScore metrics (all share the same schema shape: {score, result, threshold})
+     # Tuple: (internal results key, metric ID for score lookup, schema output key)
+     for internal_key, metric_id, schema_key in [
+         ("relevance_score", "relevance", "relevance"),
+         ("coherence_score", "coherence", "coherence"),
+         ("groundedness_score", "groundedness", "groundedness"),
+         ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+     ]:
+         raw = results_dict.get(internal_key)
+         if not raw:
+             continue
+         data = json.loads(raw) if isinstance(raw, str) else raw
+         eval_score = extract_eval_score(data, metric_id)
+         if eval_score:
+             scores[schema_key] = eval_score
+
+     # Citations → CitationScore (different schema shape: {count, result, threshold} + format)
+     raw_citations = results_dict.get("citations_score")
+     if raw_citations:
+         data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
+         count = data.get("score", 0)
+         cit_result = data.get("result")
+         if cit_result not in ("pass", "fail"):
+             cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
+
+         citation_score: Dict[str, Any] = {
+             "count": count,
+             "result": cit_result,
+             "threshold": data.get("threshold", 1),
+         }
+         if "citation_format" in data:
+             citation_score["format"] = data["citation_format"]
+         scores["citations"] = citation_score
+
+     if scores:
+         item["scores"] = scores
+
+     return item
+
+
+ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None):
+     """Write results to a schema-compliant eval document JSON file.
+
+     Output follows the eval-document.schema.json format:
+         {schemaVersion, metadata, items: [EvalItem]}
+     """
      try:
-         output_data = {
-             "individual_results": results
+         try:
+             current_version = SchemaVersionManager().get_current_version()
+         except Exception:
+             current_version = "1.0.0"
+
+         items = [convert_result_to_eval_item(r) for r in results]
+
+         metadata: Dict[str, Any] = {
+             "evaluatedAt": datetime.now(timezone.utc).isoformat(),
          }
-
-         # Add aggregate statistics if multiple results
-         if len(results) > 1:
-             aggregates = calculate_aggregate_statistics(results)
-             output_data["aggregate_statistics"] = aggregates
-
+         if agent_id:
+             metadata["agentId"] = agent_id
+
+         output_data: Dict[str, Any] = {
+             "schemaVersion": current_version,
+             "metadata": metadata,
+             "items": items,
+         }
+
          with open(output_file, 'w', encoding='utf-8') as f:
              json.dump(output_data, f, indent=2, ensure_ascii=False)
          print(f"Results saved to {output_file}")
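
For orientation, the eval document written by the new write_results_to_json should look roughly like this; the values are placeholders, while the key names follow the conversion code above:

```python
# Illustrative output shape only; all values are invented.
example_eval_document = {
    "schemaVersion": "1.0.0",
    "metadata": {
        "evaluatedAt": "2025-01-01T12:00:00+00:00",
        "agentId": "00000000-0000-0000-0000-000000000000",
    },
    "items": [
        {
            "prompt": "What is Microsoft Graph?",
            "response": "Microsoft Graph is a gateway...",
            "expected_response": "Microsoft Graph is a gateway...",
            "scores": {
                "relevance": {"score": 4, "result": "pass", "threshold": 3},
                "citations": {"count": 2, "result": "pass", "threshold": 1, "format": "auto"},
            },
        }
    ],
}
```
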
@@ -372,12 +536,12 @@ Examples:
          help='List of expected responses (must match number of prompts)'
      )
  
-     # Agent ID
+     # Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
      parser.add_argument(
-         '--agent-id',
-         type=str,
-         default=os.environ.get("AGENT_ID"),
-         help='Azure AI Agent ID (default from environment variable)'
+         '--m365-agent-id', '--agent-id',
+         type=str,
+         default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
+         help='Agent ID (default from M365_AGENT_ID environment variable)'
      )
  
      # Output options
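
Because --m365-agent-id is listed first, argparse derives the attribute name from it, so both spellings land on args.m365_agent_id. A quick standalone check (illustrative parser only, not the package's full argument set):

```python
# Standalone check that both flag spellings map to the same attribute.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--m365-agent-id', '--agent-id', type=str, default=None)

print(parser.parse_args(['--agent-id', 'abc']).m365_agent_id)       # abc
print(parser.parse_args(['--m365-agent-id', 'xyz']).m365_agent_id)  # xyz
```
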
@@ -566,6 +730,35 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
  
      return selected_agent
  
+ @functools.lru_cache(maxsize=1)
+ def _get_iana_timezone_name() -> str:
+     """Get the IANA timezone name from the system using tzlocal.
+
+     Tries get_localzone_name() first; falls back to str(get_localzone()) when the
+     former raises (e.g. no zone configured on some Unix systems). Result is cached
+     after the first call so tzlocal is only invoked once per session.
+     """
+     try:
+         return tzlocal.get_localzone_name()
+     except Exception:
+         return str(tzlocal.get_localzone())
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_location_info() -> Dict[str, Any]:
+     """Return a locationInfo dict containing the local UTC offset and IANA timezone name.
+
+     Result is cached after the first call so the computation runs only once per session.
+     """
+     now = datetime.now().astimezone()
+     utc_offset = now.utcoffset()
+     offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
+     return {
+         "timeZoneOffset": offset_hours,
+         "timeZone": _get_iana_timezone_name(),
+     }
+
+
  def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
      message = {
          "message": {
@@ -573,6 +766,7 @@ def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
              "author": "user",
              "messageType": "chat",
              "timestamp": datetime.now(timezone.utc).isoformat(),
+             "locationInfo": _get_location_info(),
              "from": {
                  "id": user_oid,
              }
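
The locationInfo value attached to the payload above is a two-field dict. A standalone sketch of the shape _get_location_info returns; it mirrors the helper without the tzlocal lookup, and the timezone name shown is only an example (tzlocal resolves the real IANA name at runtime):

```python
# Standalone sketch of the locationInfo shape produced by _get_location_info above.
from datetime import datetime

now = datetime.now().astimezone()
offset = now.utcoffset()
location_info = {
    "timeZoneOffset": int(offset.total_seconds() // 3600) if offset is not None else 0,
    "timeZone": "Europe/Berlin",  # example value; tzlocal.get_localzone_name() supplies the real one
}
print(location_info)  # e.g. {'timeZoneOffset': 1, 'timeZone': 'Europe/Berlin'}
```
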
@@ -607,7 +801,7 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
          print(f"Processing prompt {i}/{len(prompts)}...")
  
          # Build the payload
-         payload = build_chat_payload(prompt, user_oid, args.agent_id)
+         payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
          if args.verbose:
              print(f"[Sydney] Sending payload: {payload.decode('utf-8')}")
  
@@ -644,7 +838,7 @@ def output_results(results: List[Dict], args):
      if args.output:
          output_lower = args.output.lower()
          if output_lower.endswith('.json'):
-             write_results_to_json(results, args.output)
+             write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
          elif output_lower.endswith('.csv'):
              write_results_to_csv(results, args.output)
          elif output_lower.endswith('.html'):
@@ -652,7 +846,7 @@ def output_results(results: List[Dict], args):
              abs_path = os.path.abspath(args.output)
              webbrowser.open(f'file://{abs_path}')
          else:
-             write_results_to_json(results, args.output)
+             write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
      else:
          write_results_to_console(results)
  
@@ -661,6 +855,11 @@ def main():
      load_dotenv()
      args = parse_arguments()
  
+     # Check minimum version before proceeding
+     cli_version = get_cli_version(quiet=args.quiet)
+     if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=args.quiet):
+         sys.exit(1)
+
      # Validate environment variables required for evaluation
      call_path = validate_environment()
      copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
@@ -722,23 +921,23 @@ def main():
  
      try:
          # 3. Agent selection - if no agent ID provided, prompt user to select
-         if not args.agent_id:
+         if not args.m365_agent_id:
              if not args.quiet:
                  print("No agent ID provided. Fetching available agents...")
  
              available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
              if not available_agents:
-                 print("No agents are available for interactive selection. Please re-run with --agent-id or set the AGENT_ID environment variable.")
+                 print("No agents are available for interactive selection. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
                  sys.exit(1)
  
              if available_agents:
                  selected_agent_id = select_agent_interactively(available_agents)
                  if selected_agent_id:
-                     args.agent_id = selected_agent_id
+                     args.m365_agent_id = selected_agent_id
                      if not args.quiet:
-                         print(f"Selected agent: {args.agent_id}")
+                         print(f"Selected agent: {args.m365_agent_id}")
                  else:
-                     print("No agent selected. Please re-run with --agent-id or set the AGENT_ID environment variable.")
+                     print("No agent selected. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
                      sys.exit(1)
  
          # 4. Send prompts to chat API
@@ -49,8 +49,8 @@ M365_EVAL_CLIENT_ID="<app-registration-client-id>"
  TENANT_ID="<aad-tenant-id>"
  COPILOT_SCOPES="https://substrate.office.com/sydney/.default"
  
- # Optional: default agent id (overridable via --agent-id)
- AGENT_ID="00000000-0000-0000-0000-000000000000"
+ # Optional: default agent id (overridable via --m365-agent-id)
+ M365_AGENT_ID="00000000-0000-0000-0000-000000000000"
  ```
  
  ### 3. Run the Agent Evaluation
@@ -71,7 +71,7 @@ python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph
  python main.py --prompts "What is Microsoft Graph?" "How does authentication work?" --expected "Microsoft Graph is a gateway..." "Authentication works by..."
  
  # Override the agent configured in environment variables
- python main.py --agent-id "00000000-0000-0000-0000-000000000000" --prompts "What is Microsoft Graph?"
+ python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000" --prompts "What is Microsoft Graph?"
  ```
  
  #### Using Prompts from File
@@ -106,8 +106,8 @@ python main.py --quiet
  # Get help and see all options
  python main.py --help
  
- # Specify / override the Agent ID (takes precedence over AZURE_AI_AGENT_ID env var)
- python main.py --agent-id "00000000-0000-0000-0000-000000000000"
+ # Specify / override the Agent ID (takes precedence over M365_AGENT_ID env var)
+ python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
  
  # Citation format options
  python main.py --citation-format oai_unicode # Default: New OAI format
@@ -1,10 +1,13 @@
  ansible-core==2.19.0
+ tzlocal>=5.0
  azure-ai-evaluation==1.10.0
  azure-ai-projects==1.0.0
  msal[broker]>=1.34,<2
  msal-extensions>=1.3.1
+ packaging>=20.0
  PyJWT>=2.11.0
  python-dotenv==1.1.1
  markdown==3.8.2
  promptflow>=1.18.1
  questionary>=2.1.1
+ jsonschema>=4.26.0,<5
@@ -1,10 +1,13 @@
- [
-     {
-         "prompt": "What is Microsoft 365?",
-         "expected_response": "Microsoft 365 is a cloud-based productivity suite that includes applications like Word, Excel, PowerPoint, Teams, and other collaboration tools."
-     },
-     {
-         "prompt": "How can I share a file in Teams?",
-         "expected_response": "You can share a file in Teams by uploading it to a channel or chat, or by sharing a link from OneDrive or SharePoint."
-     }
- ]
+ {
+     "schemaVersion": "1.0.0",
+     "items": [
+         {
+             "prompt": "What is Microsoft 365?",
+             "expected_response": "Microsoft 365 is a cloud-based productivity suite that includes applications like Word, Excel, PowerPoint, Teams, and other collaboration tools."
+         },
+         {
+             "prompt": "How can I share a file in Teams?",
+             "expected_response": "You can share a file in Teams by uploading it to a channel or chat, or by sharing a link from OneDrive or SharePoint."
+         }
+     ]
+ }