@microsoft/m365-copilot-eval 1.0.1-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +415 -0
  3. package/TERMS.txt +65 -0
  4. package/package.json +82 -0
  5. package/src/clients/cli/auth/__init__.py +1 -0
  6. package/src/clients/cli/auth/auth_handler.py +262 -0
  7. package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +136 -0
  8. package/src/clients/cli/custom_evaluators/ConcisenessNonLLMEvaluator.py +18 -0
  9. package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +25 -0
  10. package/src/clients/cli/custom_evaluators/PII/PII.py +45 -0
  11. package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +39 -0
  12. package/src/clients/cli/custom_evaluators/__init__.py +1 -0
  13. package/src/clients/cli/demo_usage.py +83 -0
  14. package/src/clients/cli/generate_report.py +251 -0
  15. package/src/clients/cli/main.py +766 -0
  16. package/src/clients/cli/readme.md +301 -0
  17. package/src/clients/cli/requirements.txt +10 -0
  18. package/src/clients/cli/response_extractor.py +589 -0
  19. package/src/clients/cli/samples/PartnerSuccess.json +122 -0
  20. package/src/clients/cli/samples/example_prompts.json +14 -0
  21. package/src/clients/cli/samples/example_prompts_alt.json +12 -0
  22. package/src/clients/cli/samples/prompts_ambiguity.json +22 -0
  23. package/src/clients/cli/samples/prompts_rag_grounding.json +22 -0
  24. package/src/clients/cli/samples/prompts_security_injection.json +22 -0
  25. package/src/clients/cli/samples/prompts_tool_use_negatives.json +22 -0
  26. package/src/clients/cli/samples/psaSample.json +18 -0
  27. package/src/clients/cli/samples/starter.json +10 -0
  28. package/src/clients/node-js/bin/runevals.js +505 -0
  29. package/src/clients/node-js/config/default.js +25 -0
  30. package/src/clients/node-js/lib/cache-utils.js +119 -0
  31. package/src/clients/node-js/lib/expiry-check.js +164 -0
  32. package/src/clients/node-js/lib/index.js +25 -0
  33. package/src/clients/node-js/lib/python-runtime.js +253 -0
  34. package/src/clients/node-js/lib/venv-manager.js +242 -0
@@ -0,0 +1,589 @@
1
+ """
2
+ Enhanced Response Extraction Module
3
+
4
+ This module provides functionality for extracting detailed response information including
5
+ tool calls, tool results, and message flow reconstruction from agent responses.
6
+
7
+ Key Components:
8
+ - Message flow reconstruction
9
+ - Tool invocation parsing
10
+ - Enhanced response data structure
11
+ - Tool definition extraction from telemetry
23
+
24
+ Author: GitHub Copilot
25
+ Date: September 21, 2025
26
+ """
27
+
28
+ import json
29
+ import logging
30
+ from typing import Dict, List, Any, Optional
31
+ from datetime import datetime
32
+ from enum import Enum
33
+
34
+ # Configure logging at import time (INFO level) and create this module's logger.
35
+ logging.basicConfig(level=logging.INFO)
36
+ logger = logging.getLogger(__name__)
37
+
38
class MessageRole(Enum):
    """Roles written to the ``role`` field of reconstructed messages."""

    USER = "user"
    ASSISTANT = "assistant"
    TOOL = "tool"
43
+
44
class ContentType(Enum):
    """Kinds of content item: plain text, a tool call, or a tool result."""

    TEXT = "text"
    TOOL_CALL = "tool_call"
    TOOL_RESULT = "tool_result"
49
+
50
class ToolStatus(Enum):
    """Status strings used for tool invocations (note the capitalised values)."""

    SUCCESS = "Success"
    FAILURE = "Failure"
54
+
55
class MessageType(Enum):
    """Author / message-type markers compared against conversation messages."""

    USER = "user"
    BOT = "bot"
    INTERNAL = "Internal"
    INTERNAL_SEARCH = "InternalSearchResult"
61
+
62
class EnhancedResponseExtractor:
    """Turn a raw agent response into a structured message flow.

    The extractor parses the JSON response, collects tool invocations and tool
    definitions from the messages and the telemetry metrics, drops the tools
    listed in ``INTERNAL_TOOLS``, and emits assistant / tool messages.
    """

    # Tool names that are filtered out of every output.
    INTERNAL_TOOLS = {
        "hydrate_tool_response",
        "meta_prioritize",
        "reason",
        "generate_express_response",
        "generate_response",
    }

    # Telemetry services whose metrics carry the result of a queued invocation.
    _RESULT_SERVICES = (
        "ExtensionRunner:ext:OpenAPI-spec",
        "ExtensionRunner:ext:enterprise-search",
    )

    # Output prefix marking the metric whose input lists the tool definitions.
    _TOOL_DEFINITION_OUTPUT_PREFIX = "CallTags: fluxv3:invokingfunction,"

    def __init__(self):
        # Incremented for every generated tool call ID.
        self.tool_call_counter = 0

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as an ISO-8601 string ending in ``Z``."""
        from datetime import timezone

        # The "Z" suffix means UTC, so the clock must be read in UTC rather
        # than in local time.
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    @staticmethod
    def _text_message(timestamp: str, run_id: str, text: str) -> Dict[str, Any]:
        """Build an assistant message holding a single text content item."""
        return {
            "createdAt": timestamp,
            "run_id": run_id,
            "role": MessageRole.ASSISTANT.value,
            "content": [
                {
                    "type": ContentType.TEXT.value,
                    "text": text,
                }
            ],
        }

    @staticmethod
    def _tool_call_message(timestamp: str, run_id: str, tool_call_id: str,
                           name: str, arguments: Any) -> Dict[str, Any]:
        """Build an assistant message holding a single tool call."""
        return {
            "createdAt": timestamp,
            "run_id": run_id,
            "role": MessageRole.ASSISTANT.value,
            "content": [
                {
                    "type": ContentType.TOOL_CALL.value,
                    "tool_call_id": tool_call_id,
                    "name": name,
                    "arguments": arguments,
                }
            ],
        }

    @staticmethod
    def _tool_result_message(timestamp: str, run_id: str, tool_call_id: str,
                             results: Any) -> Dict[str, Any]:
        """Build a tool message holding the result of a tool call."""
        return {
            "createdAt": timestamp,
            "run_id": run_id,
            "role": MessageRole.TOOL.value,
            "tool_call_id": tool_call_id,
            "content": [
                {
                    "type": ContentType.TOOL_RESULT.value,
                    ContentType.TOOL_RESULT.value: results,
                }
            ],
        }

    @staticmethod
    def _interpret_tool_payload(payload: Any) -> tuple:
        """Split a decoded tool payload into ``(results, failed)``.

        A dict whose ``searchMetadata.status`` string contains ``"error"``
        yields ``({"error": status}, True)``. Any other dict yields its
        ``"results"`` entry, or the dict itself when that key is absent.
        Non-dict payloads are returned unchanged.
        """
        if not isinstance(payload, dict):
            return payload, False

        # searchMetadata / status may be null or of an unexpected type; only
        # a real status string can report an error.
        metadata = payload.get("searchMetadata")
        status = metadata.get("status") if isinstance(metadata, dict) else None
        if isinstance(status, str) and "error" in status:
            return {"error": status}, True

        return payload.get("results", payload), False

    def _merge_tool_definitions(self, tools: List[Dict[str, Any]],
                                tool_definitions: List[Dict[str, Any]]) -> int:
        """Append new, non-internal function definitions to ``tool_definitions``.

        Returns:
            int: Number of definitions appended
        """
        added = 0
        for tool in tools:
            function_def = tool.get("function", {})
            if self._is_internal_tool(function_def.get("name", "")):
                continue
            # Compare the function definition itself (the value that gets
            # stored), otherwise duplicates are never detected.
            if function_def in tool_definitions:
                continue
            tool_definitions.append(function_def)
            added += 1
        return added

    def _fallback_response(self, text: str, raw_text: str,
                           error: Optional[str] = None) -> Dict[str, Any]:
        """Build the minimal single-text-message structure used on failures."""
        metadata = {
            "conversation_id": None,
            "request_id": None,
            "telemetry_available": False,
        }
        if error is not None:
            metadata["error"] = error
        return {
            "response": [self._text_message(self._utc_timestamp(), "0", text)],
            "tool_definitions": [],
            "raw_response_text": raw_text,
            "metadata": metadata,
        }

    def _generate_tool_call_id(self, tool_name: str) -> str:
        """Generate a unique tool call ID from the date, a counter and the tool name."""
        self.tool_call_counter += 1
        timestamp = datetime.now().strftime("%Y%m%d")
        return f"tool_call_{timestamp}_{self.tool_call_counter:03d}_{tool_name}"

    def _is_internal_tool(self, tool_name: str) -> bool:
        """
        Check if a tool is an internal tool that should be filtered out.

        Args:
            tool_name: Name of the tool

        Returns:
            bool: True if internal tool, False otherwise
        """
        return tool_name in self.INTERNAL_TOOLS

    def _is_tool_message(self, message: Dict[str, Any]) -> bool:
        """
        Check if a message represents a tool invocation (excluding internal tools).

        A message counts as a tool message when its ``messageType`` is
        ``InternalSearchResult``, or when it is ``Internal`` and either has
        ``contentOrigin == "OpenAPI-spec"`` or carries an ``invocation``.

        Args:
            message: Message dictionary

        Returns:
            bool: True if tool message and not internal tool, False otherwise
        """
        message_type = message.get("messageType")
        is_internal_type = message_type == MessageType.INTERNAL.value
        is_tool = (
            message_type == MessageType.INTERNAL_SEARCH.value
            or (is_internal_type and message.get("contentOrigin") == "OpenAPI-spec")
            or (is_internal_type and message.get("invocation") is not None)
        )
        if not is_tool:
            return False

        # Tool messages that invoke one of INTERNAL_TOOLS are filtered out.
        invocation_str = message.get("invocation", "")
        if invocation_str:
            tool_name = self._parse_tool_invocation(invocation_str).get("name", "")
            if self._is_internal_tool(tool_name):
                return False

        return True

    # ------------------------------------------------------------------
    # Parsing
    # ------------------------------------------------------------------

    def _parse_tool_invocation(self, invocation_str: str) -> Dict[str, Any]:
        """
        Parse tool invocation string to extract tool name and arguments.

        Supported shapes:
          * JSON array (Flux v3): ``[{"function": {"name": ..., "arguments": ...}}]``
            where ``arguments`` is a JSON-encoded string or an already decoded
            object; an empty or null value becomes ``{}``.
          * Call syntax: ``tool_name(arg1="value1", arg2='value2')`` (only
            quoted values are captured).
          * Anything else is treated as a bare tool name.

        Args:
            invocation_str: Tool invocation string

        Returns:
            Dict with keys "name" and "arguments"; on any parsing error the
            name is "unknown_tool" and the arguments are empty
        """
        import re

        try:
            text = invocation_str.strip()

            # Handle Flux v3 format (JSON array)
            if text.startswith("["):
                invocation_data = json.loads(text)
                if isinstance(invocation_data, list) and invocation_data:
                    func_data = invocation_data[0].get("function", {})
                    tool_name = func_data.get("name", "unknown_tool")
                    raw_arguments = func_data.get("arguments")
                    if raw_arguments is None:
                        arguments = {}
                    elif isinstance(raw_arguments, str):
                        # Arguments normally arrive JSON-encoded.
                        arguments = json.loads(raw_arguments) if raw_arguments.strip() else {}
                    else:
                        # Already decoded: keep as is, do not lose the tool name.
                        arguments = raw_arguments
                    return {"name": tool_name, "arguments": arguments}

            # Handle standard format: tool_name(arg1="value1", arg2="value2")
            if "(" in text and ")" in text:
                tool_name = text.split("(")[0].strip()
                args_str = text[text.find("(") + 1:text.rfind(")")]

                arguments = {}
                if args_str.strip():
                    # Simple parsing for key="value" / key='value' pairs
                    for key, _quote, value in re.findall(r'(\w+)=(["\'])(.*?)\2', args_str):
                        arguments[key] = value

                return {"name": tool_name, "arguments": arguments}

            # Fallback: treat as tool name without arguments
            return {"name": text, "arguments": {}}

        except Exception as e:
            # Best effort by design: a malformed invocation must not abort extraction.
            logger.warning(f"Failed to parse tool invocation '{invocation_str}': {e}")
            return {"name": "unknown_tool", "arguments": {}}

    def _extract_tool_results(self, message: Dict[str, Any]) -> Any:
        """
        Extract tool results from a message.

        Args:
            message: Message dictionary; its "text" field is decoded as JSON

        Returns:
            The decoded results (see ``_interpret_tool_payload``), the raw text
            when it is not JSON, or None when there is no text or extraction fails
        """
        try:
            text = message.get("text", "")
            if not text:
                return None
            results, _failed = self._interpret_tool_payload(json.loads(text))
            return results

        except json.JSONDecodeError:
            # Return raw text if not JSON
            return message.get("text", "")
        except Exception as e:
            logger.warning(f"Failed to extract tool results: {e}")
            return None

    # ------------------------------------------------------------------
    # Telemetry
    # ------------------------------------------------------------------

    def _extract_telemetry_tools(self, telemetry: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Extract tool invocations from telemetry data.

        Successful "FluxToolInvoker" metrics queue their (non-internal)
        invocations; each later ExtensionRunner metric consumes the oldest
        queued invocation and supplies its status and results.

        Args:
            telemetry: Telemetry data dictionary

        Returns:
            List of dicts with keys "invocation", "tool_name", "arguments",
            "status" and "results"
        """
        from collections import deque

        tools_invoked = []
        pending_invocations = deque()

        for metric in telemetry.get("metrics", []):
            service_name = metric.get("serviceName")
            succeeded = metric.get("status") == ToolStatus.SUCCESS.value

            # Track tool invocations from FluxToolInvoker
            if service_name == "FluxToolInvoker" and succeeded:
                try:
                    for item in json.loads(metric["output"]):
                        invocation = item.get("invocation", "")
                        tool_name = self._parse_tool_invocation(invocation).get("name", "")
                        if not self._is_internal_tool(tool_name):
                            pending_invocations.append(invocation)
                except (json.JSONDecodeError, KeyError, TypeError, AttributeError) as e:
                    # A malformed metric is skipped instead of aborting the
                    # whole extraction.
                    logger.warning(f"Skipping unparseable FluxToolInvoker output: {e}")
                    continue

            # Track tool results from ExtensionRunner
            elif service_name in self._RESULT_SERVICES:
                if not pending_invocations:
                    continue

                invocation_str = pending_invocations.popleft()
                tool_info = self._parse_tool_invocation(invocation_str)
                tool_data = {
                    "invocation": invocation_str,
                    "tool_name": tool_info["name"],
                    "arguments": tool_info["arguments"],
                    "status": metric.get("status", ToolStatus.FAILURE.value),
                    "results": None,
                }

                if succeeded:
                    try:
                        api_response = json.loads(metric.get("output", "")).get("responses", [])
                        if api_response:
                            payload = json.loads(api_response[0].get("text", ""))
                            results, failed = self._interpret_tool_payload(payload)
                            tool_data["results"] = results
                            if failed:
                                tool_data["status"] = ToolStatus.FAILURE.value
                    except Exception as e:
                        logger.warning(f"Failed to parse tool results from telemetry: {e}")
                        tool_data["status"] = ToolStatus.FAILURE.value

                tools_invoked.append(tool_data)

        return tools_invoked

    def _extract_tool_definitions(self, telemetry: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Extract tool definitions from telemetry data.

        Definitions are read from the "tools" list in the JSON "input" of
        "DeepLeoImprovedNetworking" metrics whose output starts with the
        function-invocation call tag. Internal tools and duplicates are skipped.

        Args:
            telemetry: Telemetry data dictionary

        Returns:
            List of tool definition dictionaries (the "function" objects)
        """
        tool_definitions = []

        for metric in telemetry.get("metrics", []):
            if metric.get("serviceName") != "DeepLeoImprovedNetworking":
                continue

            output = metric.get("output", "")
            if not isinstance(output, str) or not output.startswith(self._TOOL_DEFINITION_OUTPUT_PREFIX):
                continue

            try:
                # Parse the input field which contains the tool definitions
                input_str = metric.get("input", "")
                if not input_str:
                    continue

                tools = json.loads(input_str).get("tools", [])
                if not tools or not isinstance(tools, list):
                    continue

                added = self._merge_tool_definitions(tools, tool_definitions)
                logger.info(f"Extracted {added} tool definitions from telemetry")

            except json.JSONDecodeError as e:
                logger.warning(f"Failed to parse tool definitions from telemetry input: {e}")
            except Exception as e:
                logger.warning(f"Error extracting tool definitions: {e}")

        return tool_definitions

    # ------------------------------------------------------------------
    # Message flow
    # ------------------------------------------------------------------

    def _reconstruct_message_flow(self, messages: List[Dict[str, Any]], telemetry_tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Reconstruct the message flow including tool calls and results.

        User messages are skipped. Each tool message becomes a tool-call
        message followed by a tool-result message. All telemetry tools are
        emitted immediately before the first plain bot message.

        Args:
            messages: Original messages from response
            telemetry_tools: Tool invocations from telemetry

        Returns:
            List of reconstructed messages
        """
        reconstructed = []
        telemetry_emitted = False

        for message in messages:
            timestamp = message.get("createdAt", self._utc_timestamp())
            run_id = message.get("conversationId", "0")
            author = message.get("author", "")

            # Skip user messages entirely - we only want the agent's response flow
            if author == MessageType.USER.value:
                continue

            # Handle tool messages
            if self._is_tool_message(message):
                # NOTE(review): calls taken from tool messages are not
                # reconciled with telemetry_tools, so a call present in both
                # sources is emitted twice - confirm whether that is intended.
                tool_info = self._parse_tool_invocation(message.get("invocation", ""))
                tool_call_id = self._generate_tool_call_id(tool_info["name"])
                reconstructed.append(self._tool_call_message(
                    timestamp, run_id, tool_call_id, tool_info["name"], tool_info["arguments"]))
                reconstructed.append(self._tool_result_message(
                    timestamp, run_id, tool_call_id, self._extract_tool_results(message)))
                continue

            # Handle bot response messages
            if author == MessageType.BOT.value and "messageType" not in message:
                # Telemetry tools go in front of the first bot response only.
                if not telemetry_emitted:
                    for tool in telemetry_tools:
                        tool_call_id = self._generate_tool_call_id(tool["tool_name"])
                        reconstructed.append(self._tool_call_message(
                            timestamp, run_id, tool_call_id, tool["tool_name"], tool["arguments"]))
                        reconstructed.append(self._tool_result_message(
                            timestamp, run_id, tool_call_id, tool["results"]))
                    telemetry_emitted = True

                # Add final assistant response
                reconstructed.append(self._text_message(timestamp, run_id, message.get("text", "")))

        return reconstructed

    def extract_enhanced_response(self, raw_response: str) -> Dict[str, Any]:
        """
        Extract enhanced response information from raw agent response.

        Args:
            raw_response: Raw response string from agent

        Returns:
            Dict with keys "response" (reconstructed message flow),
            "tool_definitions", "raw_response_text" and "metadata". Non-JSON
            input and unexpected errors yield a single-text-message structure;
            on unexpected errors the metadata also carries an "error" entry
        """
        try:
            parsed = json.loads(raw_response)
            # Valid JSON that is not an object (list, string, number) has no
            # messages, telemetry or IDs; treat it as an empty object so the
            # raw text is still returned through the normal path.
            response_data = parsed if isinstance(parsed, dict) else {}
            messages = response_data.get("messages", [])

            # Basic response text: first plain bot message with text
            response_text = ""
            for message in messages:
                if (message.get("author") == MessageType.BOT.value
                        and "messageType" not in message
                        and message.get("text")):
                    response_text = message.get("text", "").strip()
                    break
            if not response_text:
                response_text = raw_response.strip()

            enhanced_response = {
                "response": [],  # Filled with the reconstructed message flow below
                "tool_definitions": [],  # Filled from telemetry when available
                "raw_response_text": response_text,  # Backward compatibility
                "metadata": {
                    "conversation_id": response_data.get("conversationId"),
                    "request_id": response_data.get("requestId"),
                    "telemetry_available": False,
                },
            }

            # Extract telemetry tools and tool definitions if available
            telemetry_tools = []
            telemetry = response_data.get("telemetry", {})
            if telemetry:
                enhanced_response["metadata"]["telemetry_available"] = True
                telemetry_tools = self._extract_telemetry_tools(telemetry)
                enhanced_response["tool_definitions"] = self._extract_tool_definitions(telemetry)

            # Reconstruct message flow, or fall back to a simple text response
            if messages:
                enhanced_response["response"] = self._reconstruct_message_flow(messages, telemetry_tools)
            else:
                enhanced_response["response"] = [
                    self._text_message(self._utc_timestamp(), "0", response_text)
                ]

            return enhanced_response

        except json.JSONDecodeError:
            # Handle non-JSON responses
            logger.warning("Received non-JSON response, creating simple text response")
            stripped = raw_response.strip()
            return self._fallback_response(stripped, stripped)

        except Exception as e:
            logger.error(f"Failed to extract enhanced response: {e}")
            # The handler itself must not raise, so only strip real text.
            has_text = isinstance(raw_response, (str, bytes)) and bool(raw_response)
            stripped = raw_response.strip() if has_text else ""
            return self._fallback_response(
                stripped if has_text else "Error processing response",
                stripped,
                error=str(e),
            )
559
+
560
def extract_enhanced_responses(responses: Dict[str, str]) -> Dict[str, Dict[str, Any]]:
    """
    Build the enhanced response structure for every prompt in a batch.

    One extractor instance is created and reused for all entries.

    Args:
        responses: Dictionary mapping prompts to raw response strings

    Returns:
        Dictionary mapping each prompt to its enhanced response data
    """
    shared_extractor = EnhancedResponseExtractor()
    return {
        prompt_text: shared_extractor.extract_enhanced_response(raw_text)
        for prompt_text, raw_text in responses.items()
    }
577
+
578
def get_response_text_for_evaluation(enhanced_response: Dict[str, Any]) -> str:
    """
    Return the plain response text used by evaluators (backward compatibility).

    Args:
        enhanced_response: Enhanced response dictionary

    Returns:
        The value stored under "raw_response_text", or "" when the key is absent
    """
    # The raw_response_text entry carries the text for backward compatibility.
    plain_text = enhanced_response.get("raw_response_text", "")
    return plain_text