@microsoft/m365-copilot-eval 1.1.1-preview.1 → 1.2.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,10 @@
 """
 CitationsEvaluator - A custom evaluator for analyzing citations in M365 Copilot responses.
 
-This evaluator uses regex-based pattern matching to detect citations in two formats:
-1. New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
-2. Old format: [^i^] where i is the citation index
+This evaluator uses regex-based pattern matching to detect citations in three modes:
+1. OAI_UNICODE: New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
+2. LEGACY_BRACKET: Old format: [^i^] where i is the citation index
+3. AUTO: Automatically detects both formats simultaneously
 
 Where X, Y, and i are natural numbers representing conversation turn, search result index, or citation index.
 """
@@ -17,48 +18,60 @@ class CitationFormat(Enum):
     """Enum for different citation formats supported by the evaluator."""
     OAI_UNICODE = "oai_unicode" # New format: \ue200cite\ue202turn{X}search{Y}\ue201
     LEGACY_BRACKET = "legacy_bracket" # Old format: [^i^]
+    AUTO = "auto" # Automatically detect both formats
 
 
 class CitationsEvaluator:
     """
     A custom evaluator that analyzes citations in response text without using an LLM.
-
+
     This evaluator detects citation patterns and returns:
     - Whether at least one citation is present
     - The number of unique citations found
-
-    Supports both new OAI unicode format and legacy bracket format.
+
+    Supports three modes:
+    - OAI_UNICODE: Detects only OAI unicode format citations
+    - LEGACY_BRACKET: Detects only legacy bracket format citations
+    - AUTO: Automatically detects both formats simultaneously
     """
 
     def __init__(self, citation_format: CitationFormat = CitationFormat.OAI_UNICODE):
         """
         Initialize the CitationsEvaluator with the specified citation format.
-
+
         Args:
             citation_format (CitationFormat): The format of citations to detect.
                 Defaults to OAI_UNICODE format.
         """
         self.citation_format = citation_format
-
+
+        oai_pattern = r'\ue200cite\ue202turn\d+search\d+\ue201'
+        legacy_pattern = r'\[\^\d+\^\]'
+
         if citation_format == CitationFormat.OAI_UNICODE:
             # Pattern to match citations: \ue200cite\ue202turn{number}search{number}\ue201
-            self.citation_pattern = r'\ue200cite\ue202turn\d+search\d+\ue201'
+            self.citation_pattern = oai_pattern
         elif citation_format == CitationFormat.LEGACY_BRACKET:
             # Pattern to match citations: [^number^]
-            self.citation_pattern = r'\[\^\d+\^\]'
+            self.citation_pattern = legacy_pattern
+        elif citation_format == CitationFormat.AUTO:
+            # Auto-detect both formats using alternation (|)
+            # Matches either OAI unicode OR legacy bracket format
+            self.citation_pattern = rf'(?:{oai_pattern})|(?:{legacy_pattern})'
         else:
             raise ValueError(f"Unsupported citation format: {citation_format}")
-
+
+        # Compile the pattern once after determining which format to use
         self.compiled_pattern = re.compile(self.citation_pattern)
 
     def __call__(self, *, response: str, **kwargs) -> Dict[str, Any]:
         """
         Evaluate the response text for citations.
-
+
         Args:
             response (str): The response text from the M365 Copilot agent
             **kwargs: Additional keyword arguments (not used but kept for compatibility)
-
+
         Returns:
             Dict[str, Any]: Evaluation results containing:
                 - citation_format (str): The format used for detection
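
For reference, the new AUTO mode boils down to a single alternation of the two existing regexes. A minimal sketch of that behavior, using the patterns from the diff above (the sample response string is invented for illustration):

```python
# Minimal sketch of the AUTO alternation pattern introduced above.
# The sample response string is invented for illustration only.
import re

oai_pattern = r'\ue200cite\ue202turn\d+search\d+\ue201'
legacy_pattern = r'\[\^\d+\^\]'
auto_pattern = re.compile(rf'(?:{oai_pattern})|(?:{legacy_pattern})')

sample = "Fact A.\ue200cite\ue202turn1search2\ue201 Fact B.[^3^]"
# Non-capturing groups keep findall() returning whole matches, one per citation.
print(auto_pattern.findall(sample))
# Expected: ['\ue200cite\ue202turn1search2\ue201', '[^3^]']
```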
@@ -69,40 +82,71 @@ class CitationsEvaluator:
         """
         if not isinstance(response, str):
             response = str(response) if response is not None else ""
-
-        # Find all citation matches
+
+        # Find all citations and get unique ones (same for all modes)
         citation_matches = self.compiled_pattern.findall(response)
-
-        # Get unique citations (remove duplicates)
         unique_citations = list(set(citation_matches))
-
-        # Extract citation identifiers for reporting
+
+        # Initialize citation details list (used by all modes)
         citation_details = []
+
+        # Initialize counters only for AUTO mode
+        if self.citation_format == CitationFormat.AUTO:
+            oai_count = 0
+            legacy_count = 0
+
+        # Process all citations (unified extraction logic)
         for citation in unique_citations:
-            if self.citation_format == CitationFormat.OAI_UNICODE:
-                # Extract the turn and search numbers from the citation
+            # Determine citation type and extract details
+            if '\ue200' in citation:
+                # OAI format (contains start marker)
                 turn_search_match = re.search(r'turn(\d+)search(\d+)', citation)
                 if turn_search_match:
                     turn_num = turn_search_match.group(1)
                     search_num = turn_search_match.group(2)
-                    citation_details.append(f"turn{turn_num}search{search_num}")
-            elif self.citation_format == CitationFormat.LEGACY_BRACKET:
-                # Extract the citation number from [^number^]
+
+                    # Add appropriate prefix based on mode
+                    if self.citation_format == CitationFormat.AUTO:
+                        citation_details.append(f"oai:turn{turn_num}search{search_num}")
+                        oai_count += 1
+                    else: # OAI_UNICODE mode
+                        citation_details.append(f"turn{turn_num}search{search_num}")
+            else:
+                # Legacy bracket format
                 bracket_match = re.search(r'\[\^(\d+)\^\]', citation)
                 if bracket_match:
                     citation_num = bracket_match.group(1)
-                    citation_details.append(f"citation{citation_num}")
-
-        # Prepare results in a format compatible with the HTML report generator
+
+                    # Add appropriate prefix based on mode
+                    if self.citation_format == CitationFormat.AUTO:
+                        citation_details.append(f"legacy:citation{citation_num}")
+                        legacy_count += 1
+                    else: # LEGACY_BRACKET mode
+                        citation_details.append(f"citation{citation_num}")
+
+        format_info = None
+        if self.citation_format == CitationFormat.AUTO:
+            format_info = f"OAI: {oai_count}, Legacy: {legacy_count}"
+
+        # Build results (common for all modes)
+        total_citations = len(unique_citations)
+
+        # Construct reason string with optional format info
+        reason_parts = [f"Found {total_citations} unique citation(s)"]
+        if format_info:
+            reason_parts.append(f"[{format_info}]:")
+        else:
+            reason_parts.append(":")
+        reason_parts.append(', '.join(citation_details) if citation_details else 'None')
+
         results = {
             "citation_format": self.citation_format.value,
-            # HTML report compatible fields
-            "score": len(unique_citations), # Use citation count as the score
-            "result": "pass" if len(unique_citations) > 0 else "fail", # Pass if citations found
-            "threshold": 1, # Threshold of 1 citation minimum
-            "reason": f"Found {len(unique_citations)} unique citation(s): {', '.join(citation_details) if citation_details else 'None'}"
+            "score": total_citations,
+            "result": "pass" if total_citations > 0 else "fail",
+            "threshold": 1,
+            "reason": " ".join(reason_parts)
         }
-
+
         return results
 
     def get_name(self) -> str:
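
Put together, the evaluator's output in AUTO mode looks roughly like the sketch below, assuming the import path used by main.py elsewhere in this diff; the response text is invented and the order of the details in the reason string can vary because unique citations come from a set:

```python
# Sketch of the evaluator's output shape in AUTO mode (illustrative only).
from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat

evaluator = CitationsEvaluator(citation_format=CitationFormat.AUTO)
result = evaluator(response="See\ue200cite\ue202turn1search2\ue201 and also [^4^].")

# Expected shape (detail order may vary):
# {
#   "citation_format": "auto",
#   "score": 2,
#   "result": "pass",
#   "threshold": 1,
#   "reason": "Found 2 unique citation(s) [OAI: 1, Legacy: 1]: oai:turn1search2, legacy:citation4"
# }
print(result["score"], result["result"])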
@@ -24,7 +24,10 @@ from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFor
 #from custom_evaluators.PII.PII import PIIEvaluator
 from generate_report import generate_html_report, calculate_aggregate_statistics
 from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
+from schema_handler import DocumentUpgrader, SchemaVersionManager
+from version_check import check_min_version, get_cli_version
 from datetime import datetime, timezone
+from pathlib import Path
 
 # Allowed endpoints for URL validation
 ALLOWED_ENDPOINTS = [
@@ -36,6 +39,18 @@ class CallPath(Enum):
     ACCESS_TOKEN = "access_token"
     COPILOT_AUTH = "copilot_auth"
 
+
+# Flags that should bypass remote min-version enforcement.
+# --help is not needed here because argparse exits before runtime checks.
+VERSION_CHECK_BYPASS_FLAGS = (
+    "signout",
+)
+
+
+def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
+    """Return True if the current invocation should skip min-version checks."""
+    return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
+
 def write_results_to_html(results: List[Dict], output_file: str):
     """Write results to HTML file using generate_html_report from generate_report.py."""
     try:
  try:
@@ -58,23 +73,68 @@ def get_default_prompts_and_responses():
58
73
  return prompts, expected_responses
59
74
 
60
75
  def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
61
- """Load prompts and expected responses from a JSON file."""
76
+ """Load prompts and expected responses from a JSON file.
77
+
78
+ Supports three formats:
79
+ 1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
80
+ 2. Array format: [{"prompt": "...", "expected_response": "..."}]
81
+ 3. Dict format: {"prompts": [...], "expected_responses": [...]}
82
+
83
+ For eval documents (format 1) and array format (format 2), schema validation
84
+ and auto-upgrade are applied via DocumentUpgrader.
85
+ """
62
86
  try:
63
87
  with open(file_path, 'r', encoding='utf-8') as f:
64
88
  data = json.load(f)
65
-
89
+
90
+ # Detect if this is an eval document (has "items" key) or could be upgraded
91
+ is_eval_document = (
92
+ isinstance(data, dict) and "items" in data
93
+ ) or isinstance(data, list)
94
+
95
+ # Run schema validation and auto-upgrade for eval documents
96
+ if is_eval_document:
97
+ try:
98
+ upgrader = DocumentUpgrader()
99
+ except Exception as e:
100
+ # Schema infrastructure not available (missing files, etc.) — skip
101
+ print(f"Warning: Unable to initialize document upgrader: {e}")
102
+ upgrader = None
103
+
104
+ if upgrader is not None:
105
+ result = upgrader.upgrade(Path(file_path))
106
+
107
+ if result.error:
108
+ print(f"Schema validation error: {result.error}")
109
+ sys.exit(1)
110
+
111
+ if result.upgraded and result.message:
112
+ print(result.message)
113
+
114
+ # Use the parsed document from the upgrade result
115
+ if result.document is not None:
116
+ data = result.document
117
+
66
118
  if isinstance(data, list):
67
119
  # Format: [{"prompt": "...", "expected_response": "..."}, ...]
68
120
  prompts = [item.get("prompt", "") for item in data]
69
121
  expected_responses = [item.get("expected_response", "") for item in data]
70
122
  elif isinstance(data, dict):
71
- # Format: {"prompts": [...], "expected_responses": [...]}
72
- prompts = data.get("prompts", [])
73
- expected_responses = data.get("expected_responses", [])
123
+ if "items" in data:
124
+ # Eval document format: {"schemaVersion": "...", "items": [...]}
125
+ items = data["items"]
126
+ prompts = [item.get("prompt", "") for item in items]
127
+ expected_responses = [item.get("expected_response", "") for item in items]
128
+ else:
129
+ # Format: {"prompts": [...], "expected_responses": [...]}
130
+ prompts = data.get("prompts", [])
131
+ expected_responses = data.get("expected_responses", [])
74
132
  else:
75
133
  raise ValueError("Invalid file format")
76
-
134
+
77
135
  return prompts, expected_responses
136
+ except SystemExit:
137
+ raise
78
138
  except Exception as e:
79
139
  print(f"Error loading prompts from file: {e}")
80
140
  sys.exit(1)
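
Concretely, the three input shapes the loader now accepts look like this; structure only, all values are placeholders:

```python
# The three accepted shapes for a prompts file, as Python literals (placeholder values).

eval_document = {            # format 1: schema-versioned eval document (validated / auto-upgraded)
    "schemaVersion": "1.0.0",
    "items": [{"prompt": "What is Microsoft 365?", "expected_response": "..."}],
}

array_format = [             # format 2: plain array of prompt/expected pairs (also upgraded)
    {"prompt": "What is Microsoft 365?", "expected_response": "..."},
]

dict_format = {              # format 3: parallel lists; no schema upgrade is applied
    "prompts": ["What is Microsoft 365?"],
    "expected_responses": ["..."],
}
```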
@@ -181,7 +241,7 @@ def run_evaluations(args, responses: dict, expected_responses: list) -> list:
         )
 
         tool_call_accuracy = None
-        if args.agent_id and enhanced_response.get("tool_definitions"):
+        if args.m365_agent_id and enhanced_response.get("tool_definitions"):
             tool_call_accuracy = tool_call_accuracy_evaluator(
                 query=prompt,
                 response=enhanced_response.get("response", actual_response_text),
@@ -267,18 +327,120 @@ def write_results_to_console(results):
             print(f"{BOLD}{color}{name}:{RESET} {v}")
         print(f"{BLUE}{'-' * 30}{RESET}")
 
-def write_results_to_json(results: List[Dict], output_file: str):
-    """Write results to JSON file."""
+def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
+    """Extract an EvalScore object from a decorated metric dict.
+
+    Maps internal decorated-metric format to schema EvalScore:
+    {score, result, threshold} (required) + reason, evaluator (optional).
+    """
+    DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this
+
+    score_val = None
+    for k in (metric_id, f"{metric_id}_score", "score", "value"):
+        if k in data and isinstance(data[k], (int, float)):
+            score_val = data[k]
+            break
+    if score_val is None:
+        return None
+
+    result = data.get("result")
+    if result not in ("pass", "fail"):
+        result = "pass" if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else "fail"
+
+    eval_score: Dict[str, Any] = {
+        "score": score_val,
+        "result": result,
+        "threshold": data.get("threshold", DEFAULT_THRESHOLD),
+    }
+    reason = data.get(f"{metric_id}_reason") or data.get("reason")
+    if reason:
+        eval_score["reason"] = reason
+    return eval_score
+
+
+def convert_result_to_eval_item(result: Dict) -> Dict:
+    """Convert an internal evaluation result dict to a schema-compliant EvalItem.
+
+    Internal format (from run_evaluations):
+        {prompt, response, expected_response, results: {relevance_score: "JSON", ...}}
+    Schema EvalItem format:
+        {prompt, response, expected_response, scores: {relevance: EvalScore, ...}}
+    """
+    item: Dict[str, Any] = {
+        "prompt": result["prompt"],
+        "response": result["response"],
+        "expected_response": result["expected_response"],
+    }
+
+    scores: Dict[str, Any] = {}
+    results_dict = result.get("results", {})
+
+    # EvalScore metrics (all share the same schema shape: {score, result, threshold})
+    # Tuple: (internal results key, metric ID for score lookup, schema output key)
+    for internal_key, metric_id, schema_key in [
+        ("relevance_score", "relevance", "relevance"),
+        ("coherence_score", "coherence", "coherence"),
+        ("groundedness_score", "groundedness", "groundedness"),
+        ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+    ]:
+        raw = results_dict.get(internal_key)
+        if not raw:
+            continue
+        data = json.loads(raw) if isinstance(raw, str) else raw
+        eval_score = extract_eval_score(data, metric_id)
+        if eval_score:
+            scores[schema_key] = eval_score
+
+    # Citations → CitationScore (different schema shape: {count, result, threshold} + format)
+    raw_citations = results_dict.get("citations_score")
+    if raw_citations:
+        data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
+        count = data.get("score", 0)
+        cit_result = data.get("result")
+        if cit_result not in ("pass", "fail"):
+            cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
+
+        citation_score: Dict[str, Any] = {
+            "count": count,
+            "result": cit_result,
+            "threshold": data.get("threshold", 1),
+        }
+        if "citation_format" in data:
+            citation_score["format"] = data["citation_format"]
+        scores["citations"] = citation_score
+
+    if scores:
+        item["scores"] = scores
+
+    return item
+
+
+def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None):
+    """Write results to a schema-compliant eval document JSON file.
+
+    Output follows the eval-document.schema.json format:
+        {schemaVersion, metadata, items: [EvalItem]}
+    """
     try:
-        output_data = {
-            "individual_results": results
+        try:
+            current_version = SchemaVersionManager().get_current_version()
+        except Exception:
+            current_version = "1.0.0"
+
+        items = [convert_result_to_eval_item(r) for r in results]
+
+        metadata: Dict[str, Any] = {
+            "evaluatedAt": datetime.now(timezone.utc).isoformat(),
         }
-
-        # Add aggregate statistics if multiple results
-        if len(results) > 1:
-            aggregates = calculate_aggregate_statistics(results)
-            output_data["aggregate_statistics"] = aggregates
-
+        if agent_id:
+            metadata["agentId"] = agent_id
+
+        output_data: Dict[str, Any] = {
+            "schemaVersion": current_version,
+            "metadata": metadata,
+            "items": items,
+        }
+
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(output_data, f, indent=2, ensure_ascii=False)
         print(f"Results saved to {output_file}")
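
The JSON document that write_results_to_json now emits has roughly the shape sketched below; the values are placeholders and which score keys appear depends on the evaluators that actually ran. Note that the aggregate_statistics block from the old output is no longer part of the JSON file.

```python
# Rough shape of the eval document written by write_results_to_json (placeholder values).
expected_output = {
    "schemaVersion": "1.0.0",
    "metadata": {
        "evaluatedAt": "2025-01-01T00:00:00+00:00",
        "agentId": "00000000-0000-0000-0000-000000000000",  # present only when an agent id was supplied
    },
    "items": [
        {
            "prompt": "What is Microsoft 365?",
            "response": "...",
            "expected_response": "...",
            "scores": {
                "relevance": {"score": 4.0, "result": "pass", "threshold": 3},
                "citations": {"count": 2, "result": "pass", "threshold": 1, "format": "auto"},
            },
        }
    ],
}
```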
@@ -372,12 +534,12 @@ Examples:
         help='List of expected responses (must match number of prompts)'
     )
 
-    # Agent ID
+    # Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
     parser.add_argument(
-        '--agent-id',
-        type=str,
-        default=os.environ.get("AGENT_ID"),
-        help='Azure AI Agent ID (default from environment variable)'
+        '--m365-agent-id', '--agent-id',
+        type=str,
+        default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
+        help='Agent ID (default from M365_AGENT_ID environment variable)'
     )
 
     # Output options
@@ -607,7 +769,7 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
         print(f"Processing prompt {i}/{len(prompts)}...")
 
         # Build the payload
-        payload = build_chat_payload(prompt, user_oid, args.agent_id)
+        payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
         if args.verbose:
             print(f"[Sydney] Sending payload: {payload.decode('utf-8')}")
 
@@ -644,7 +806,7 @@ def output_results(results: List[Dict], args):
     if args.output:
         output_lower = args.output.lower()
         if output_lower.endswith('.json'):
-            write_results_to_json(results, args.output)
+            write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
         elif output_lower.endswith('.csv'):
            write_results_to_csv(results, args.output)
         elif output_lower.endswith('.html'):
@@ -652,7 +814,7 @@ def output_results(results: List[Dict], args):
             abs_path = os.path.abspath(args.output)
             webbrowser.open(f'file://{abs_path}')
         else:
-            write_results_to_json(results, args.output)
+            write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
     else:
         write_results_to_console(results)
 
@@ -661,6 +823,11 @@ def main():
     load_dotenv()
     args = parse_arguments()
 
+    # Check minimum version before proceeding
+    cli_version = get_cli_version(quiet=args.quiet)
+    if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=args.quiet):
+        sys.exit(1)
+
     # Validate environment variables required for evaluation
     call_path = validate_environment()
     copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
@@ -722,23 +889,23 @@ def main():
 
     try:
         # 3. Agent selection - if no agent ID provided, prompt user to select
-        if not args.agent_id:
+        if not args.m365_agent_id:
             if not args.quiet:
                 print("No agent ID provided. Fetching available agents...")
 
             available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
             if not available_agents:
-                print("No agents are available for interactive selection. Please re-run with --agent-id or set the AGENT_ID environment variable.")
+                print("No agents are available for interactive selection. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
                 sys.exit(1)
 
             if available_agents:
                 selected_agent_id = select_agent_interactively(available_agents)
                 if selected_agent_id:
-                    args.agent_id = selected_agent_id
+                    args.m365_agent_id = selected_agent_id
                     if not args.quiet:
-                        print(f"Selected agent: {args.agent_id}")
+                        print(f"Selected agent: {args.m365_agent_id}")
                 else:
-                    print("No agent selected. Please re-run with --agent-id or set the AGENT_ID environment variable.")
+                    print("No agent selected. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
                     sys.exit(1)
 
         # 4. Send prompts to chat API
@@ -49,8 +49,8 @@ M365_EVAL_CLIENT_ID="<app-registration-client-id>"
 TENANT_ID="<aad-tenant-id>"
 COPILOT_SCOPES="https://substrate.office.com/sydney/.default"
 
-# Optional: default agent id (overridable via --agent-id)
-AGENT_ID="00000000-0000-0000-0000-000000000000"
+# Optional: default agent id (overridable via --m365-agent-id)
+M365_AGENT_ID="00000000-0000-0000-0000-000000000000"
 ```
 
 ### 3. Run the Agent Evaluation
@@ -71,7 +71,7 @@ python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph
 python main.py --prompts "What is Microsoft Graph?" "How does authentication work?" --expected "Microsoft Graph is a gateway..." "Authentication works by..."
 
 # Override the agent configured in environment variables
-python main.py --agent-id "00000000-0000-0000-0000-000000000000" --prompts "What is Microsoft Graph?"
+python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000" --prompts "What is Microsoft Graph?"
 ```
 
 #### Using Prompts from File
@@ -106,8 +106,8 @@ python main.py --quiet
 # Get help and see all options
 python main.py --help
 
-# Specify / override the Agent ID (takes precedence over AZURE_AI_AGENT_ID env var)
-python main.py --agent-id "00000000-0000-0000-0000-000000000000"
+# Specify / override the Agent ID (takes precedence over M365_AGENT_ID env var)
+python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
 
 # Citation format options
 python main.py --citation-format oai_unicode # Default: New OAI format
@@ -3,8 +3,10 @@ azure-ai-evaluation==1.10.0
 azure-ai-projects==1.0.0
 msal[broker]>=1.34,<2
 msal-extensions>=1.3.1
+packaging>=20.0
 PyJWT>=2.11.0
 python-dotenv==1.1.1
 markdown==3.8.2
 promptflow>=1.18.1
 questionary>=2.1.1
+jsonschema>=4.26.0,<5
@@ -1,10 +1,13 @@
-[
-  {
-    "prompt": "What is Microsoft 365?",
-    "expected_response": "Microsoft 365 is a cloud-based productivity suite that includes applications like Word, Excel, PowerPoint, Teams, and other collaboration tools."
-  },
-  {
-    "prompt": "How can I share a file in Teams?",
-    "expected_response": "You can share a file in Teams by uploading it to a channel or chat, or by sharing a link from OneDrive or SharePoint."
-  }
-]
+{
+  "schemaVersion": "1.0.0",
+  "items": [
+    {
+      "prompt": "What is Microsoft 365?",
+      "expected_response": "Microsoft 365 is a cloud-based productivity suite that includes applications like Word, Excel, PowerPoint, Teams, and other collaboration tools."
+    },
+    {
+      "prompt": "How can I share a file in Teams?",
+      "expected_response": "You can share a file in Teams by uploading it to a channel or chat, or by sharing a link from OneDrive or SharePoint."
+    }
+  ]
+}