@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/README.md +135 -100
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +7 -0
  4. package/schema/v1/eval-document.schema.json +143 -11
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  11. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  12. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  13. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  14. package/schema/version.json +2 -2
  15. package/src/clients/cli/agent_selector.py +74 -0
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
  18. package/src/clients/cli/api_clients/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/base_agent_client.py +77 -0
  20. package/src/clients/cli/cli_args.py +136 -0
  21. package/src/clients/cli/cli_logging/cli_logger.py +33 -0
  22. package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
  23. package/src/clients/cli/cli_logging/logging_utils.py +0 -1
  24. package/src/clients/cli/common.py +64 -0
  25. package/src/clients/cli/env_validator.py +73 -0
  26. package/src/clients/cli/evaluation_runner.py +653 -0
  27. package/src/clients/cli/evaluator_resolver.py +9 -6
  28. package/src/clients/cli/generate_report.py +272 -129
  29. package/src/clients/cli/main.py +157 -1174
  30. package/src/clients/cli/parallel_executor.py +57 -0
  31. package/src/clients/cli/prompt_loader.py +148 -0
  32. package/src/clients/cli/readme.md +9 -53
  33. package/src/clients/cli/requirements.txt +1 -1
  34. package/src/clients/cli/response_extractor.py +4 -603
  35. package/src/clients/cli/result_writer.py +488 -0
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +82 -20
  40. package/src/clients/node-js/config/default.js +12 -11
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +14 -20
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
@@ -1,607 +1,8 @@
1
- """
2
- Enhanced Response Extraction Module
1
+ """Response text extraction for evaluation."""
3
2
 
4
- This module provides functionality for extracting detailed response information including
5
- tool calls, tool results, and message flow reconstruction from agent responses.
3
+ from typing import Any, Dict
6
4
 
7
- Key Components:
8
- - Message flow reconstruction
9
- - Tool invocation parsing
10
- - Enhanced response data structure
11
- - Too ]
12
- ]
13
- ]
14
- })
15
-
16
- return reconstructed })
17
-
18
- return reconstructedi ]
19
- ]
20
- })
21
-
22
- return reconstructedn extraction (placeholder for future implementation)
23
-
24
- Author: GitHub Copilot
25
- Date: September 21, 2025
26
- """
27
-
28
- import json
29
- import logging
30
- from typing import Dict, List, Any, Optional, Tuple
31
- from datetime import datetime
32
- from enum import Enum
33
- from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel
34
-
35
- # Configure logging
36
- if not logging.getLogger().handlers:
37
- logging.basicConfig(level=logging.INFO)
38
- logger = logging.getLogger(__name__)
39
-
40
- def _log_level_to_python_level(log_level: str) -> int:
41
- normalized = (log_level or "info").strip().lower()
42
- return LOG_LEVEL_MAP.get(normalized, logging.INFO)
43
-
44
- class MessageRole(Enum):
45
- """Enumeration for message roles."""
46
- USER = "user"
47
- ASSISTANT = "assistant"
48
- TOOL = "tool"
49
-
50
- class ContentType(Enum):
51
- """Enumeration for content types."""
52
- TEXT = "text"
53
- TOOL_CALL = "tool_call"
54
- TOOL_RESULT = "tool_result"
55
-
56
- class ToolStatus(Enum):
57
- """Enumeration for tool invocation status."""
58
- SUCCESS = "Success"
59
- FAILURE = "Failure"
60
-
61
- class MessageType(Enum):
62
- """Enumeration for different message types in conversations."""
63
- USER = "user"
64
- BOT = "bot"
65
- INTERNAL = "Internal"
66
- INTERNAL_SEARCH = "InternalSearchResult"
67
-
68
- class EnhancedResponseExtractor:
69
- """Enhanced extractor for detailed response information."""
70
-
71
- # List of internal tool names that should be filtered out
72
- INTERNAL_TOOLS = {
73
- "hydrate_tool_response",
74
- "meta_prioritize",
75
- "reason",
76
- "generate_express_response",
77
- "generate_response"
78
- }
79
-
80
- def __init__(self, log_level: str = "info"):
81
- self.tool_call_counter = 0
82
- self.log_level = (log_level or "info").strip().lower()
83
- logger.setLevel(_log_level_to_python_level(self.log_level))
84
-
85
- def _generate_tool_call_id(self, tool_name: str) -> str:
86
- """Generate a unique tool call ID."""
87
- self.tool_call_counter += 1
88
- timestamp = datetime.now().strftime("%Y%m%d")
89
- return f"tool_call_{timestamp}_{self.tool_call_counter:03d}_{tool_name}"
90
-
91
- def _is_internal_tool(self, tool_name: str) -> bool:
92
- """
93
- Check if a tool is an internal tool that should be filtered out.
94
-
95
- Args:
96
- tool_name: Name of the tool
97
-
98
- Returns:
99
- bool: True if internal tool, False otherwise
100
- """
101
- return tool_name in self.INTERNAL_TOOLS
102
-
103
- def _is_tool_message(self, message: Dict[str, Any]) -> bool:
104
- """
105
- Check if a message represents a tool invocation (excluding internal tools).
106
-
107
- Args:
108
- message: Message dictionary
109
-
110
- Returns:
111
- bool: True if tool message and not internal tool, False otherwise
112
- """
113
- # First check if it's a tool message at all
114
- is_tool = (
115
- (message.get("messageType") == MessageType.INTERNAL.value and
116
- message.get("contentOrigin") == "OpenAPI-spec") or
117
- message.get("messageType") == MessageType.INTERNAL_SEARCH.value or
118
- (message.get("messageType") == MessageType.INTERNAL.value and
119
- message.get("invocation") is not None)
120
- )
121
-
122
- if not is_tool:
123
- return False
124
-
125
- # Check if it's an internal tool that should be filtered out
126
- invocation_str = message.get("invocation", "")
127
- if invocation_str:
128
- tool_info = self._parse_tool_invocation(invocation_str)
129
- tool_name = tool_info.get("name", "")
130
- if self._is_internal_tool(tool_name):
131
- return False
132
-
133
- return True
134
-
135
- def _parse_tool_invocation(self, invocation_str: str) -> Dict[str, Any]:
136
- """
137
- Parse tool invocation string to extract tool name and arguments.
138
-
139
- Args:
140
- invocation_str: Tool invocation string
141
-
142
- Returns:
143
- Dict containing tool name and arguments
144
- """
145
- try:
146
- # Handle Flux v3 format (JSON array)
147
- if invocation_str.startswith('['):
148
- invocation_data = json.loads(invocation_str)
149
- if isinstance(invocation_data, list) and len(invocation_data) > 0:
150
- func_data = invocation_data[0].get("function", {})
151
- tool_name = func_data.get("name", "unknown_tool")
152
- arguments = json.loads(func_data.get("arguments", "{}"))
153
- return {"name": tool_name, "arguments": arguments}
154
-
155
- # Handle standard format: tool_name(arg1="value1", arg2="value2")
156
- if "(" in invocation_str and ")" in invocation_str:
157
- tool_name = invocation_str.split("(")[0].strip()
158
- args_str = invocation_str[invocation_str.find("(")+1:invocation_str.rfind(")")]
159
-
160
- # Parse arguments
161
- arguments = {}
162
- if args_str.strip():
163
- # Simple parsing for key="value" format
164
- import re
165
- matches = re.findall(r'(\w+)=(["\'])(.*?)\2', args_str)
166
- for match in matches:
167
- key, _, value = match
168
- arguments[key] = value
169
-
170
- return {"name": tool_name, "arguments": arguments}
171
-
172
- # Fallback: treat as tool name without arguments
173
- return {"name": invocation_str.strip(), "arguments": {}}
174
-
175
- except Exception as e:
176
- logger.warning(f"Failed to parse tool invocation '{invocation_str}': {e}")
177
- return {"name": "unknown_tool", "arguments": {}}
178
-
179
- def _extract_tool_results(self, message: Dict[str, Any]) -> Any:
180
- """
181
- Extract tool results from a message.
182
-
183
- Args:
184
- message: Message dictionary
185
-
186
- Returns:
187
- Tool results or None if extraction fails
188
- """
189
- try:
190
- text = message.get("text", "")
191
- if text:
192
- result_data = json.loads(text)
193
-
194
- # Check for search metadata errors
195
- if isinstance(result_data, dict):
196
- search_metadata = result_data.get("searchMetadata", {})
197
- if "error" in search_metadata.get("status", ""):
198
- return {"error": search_metadata.get("status", "Unknown error")}
199
-
200
- # Return results or the whole object
201
- return result_data.get("results", result_data)
202
-
203
- return result_data
204
-
205
- except json.JSONDecodeError:
206
- # Return raw text if not JSON
207
- return message.get("text", "")
208
- except Exception as e:
209
- logger.warning(f"Failed to extract tool results: {e}")
210
- return None
211
-
212
- return None
213
-
214
- def _extract_telemetry_tools(self, telemetry: Dict[str, Any]) -> List[Dict[str, Any]]:
215
- """
216
- Extract tool invocations from telemetry data.
217
-
218
- Args:
219
- telemetry: Telemetry data dictionary
220
-
221
- Returns:
222
- List of tool invocation details
223
- """
224
- tools_invoked = []
225
- tools_queue = []
226
-
227
- for metric in telemetry.get("metrics", []):
228
- service_name = metric.get("serviceName")
229
-
230
- # Track tool invocations from FluxToolInvoker
231
- if service_name == "FluxToolInvoker" and metric.get("status") == ToolStatus.SUCCESS.value:
232
- try:
233
- output = json.loads(metric["output"])
234
- for item in output:
235
- invocation = item.get("invocation", "")
236
- # Parse tool name to check if it's internal
237
- tool_info = self._parse_tool_invocation(invocation)
238
- tool_name = tool_info.get("name", "")
239
-
240
- # Skip internal tools
241
- if not self._is_internal_tool(tool_name):
242
- tools_queue.append(invocation)
243
- except (json.JSONDecodeError, KeyError):
244
- continue
245
-
246
- # Track tool results from ExtensionRunner
247
- elif service_name in ["ExtensionRunner:ext:OpenAPI-spec", "ExtensionRunner:ext:enterprise-search"]:
248
- if tools_queue:
249
- invocation_str = tools_queue.pop(0)
250
- tool_info = self._parse_tool_invocation(invocation_str)
251
-
252
- tool_data = {
253
- "invocation": invocation_str,
254
- "tool_name": tool_info["name"],
255
- "arguments": tool_info["arguments"],
256
- "status": metric.get("status", ToolStatus.FAILURE.value),
257
- "results": None
258
- }
259
-
260
- if metric.get("status") == ToolStatus.SUCCESS.value:
261
- try:
262
- api_response = json.loads(metric.get("output", "")).get("responses", [])
263
- if api_response:
264
- response_data = json.loads(api_response[0].get("text", ""))
265
-
266
- # Check for errors
267
- search_metadata = response_data.get("searchMetadata", {})
268
- if "error" in search_metadata.get("status", ""):
269
- tool_data["status"] = ToolStatus.FAILURE.value
270
- tool_data["results"] = {"error": search_metadata.get("status", "")}
271
- else:
272
- tool_data["results"] = response_data.get("results", response_data)
273
- except Exception as e:
274
- logger.warning(f"Failed to parse tool results from telemetry: {e}")
275
- tool_data["status"] = ToolStatus.FAILURE.value
276
-
277
- tools_invoked.append(tool_data)
278
-
279
- return tools_invoked
280
-
281
- def _extract_tool_definitions(self, telemetry: Dict[str, Any]) -> List[Dict[str, Any]]:
282
- """
283
- Extract tool definitions from telemetry data.
284
-
285
- Args:
286
- telemetry: Telemetry data dictionary
287
-
288
- Returns:
289
- List of tool definition dictionaries
290
- """
291
- tool_definitions = []
292
-
293
- for metric in telemetry.get("metrics", []):
294
- service_name = metric.get("serviceName")
295
-
296
- # Look for DeepLeoImprovedNetworking service with function invocation
297
- if (service_name == "DeepLeoImprovedNetworking" and
298
- metric.get("output", "").startswith("CallTags: fluxv3:invokingfunction,")):
299
-
300
- try:
301
- # Parse the input field which contains the tool definitions
302
- input_str = metric.get("input", "")
303
- if input_str:
304
- input_data = json.loads(input_str)
305
- tools = input_data.get("tools", [])
306
-
307
- if tools and isinstance(tools, list):
308
- # Add tools to our definitions list (avoid duplicates and filter out internal tools)
309
- for tool in tools:
310
- # Check if this tool is internal
311
- tool_name = tool.get("function", {}).get("name", "")
312
- if not self._is_internal_tool(tool_name) and tool not in tool_definitions:
313
- tool_definitions.append(tool.get("function", {}))
314
-
315
- logger.info(f"Extracted {len(tools)} tool definitions from telemetry")
316
-
317
- except json.JSONDecodeError as e:
318
- logger.warning(f"Failed to parse tool definitions from telemetry input: {e}")
319
- except Exception as e:
320
- logger.warning(f"Error extracting tool definitions: {e}")
321
-
322
- return tool_definitions
323
-
324
- def _reconstruct_message_flow(self, messages: List[Dict[str, Any]], telemetry_tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
325
- """
326
- Reconstruct the message flow including tool calls and results.
327
-
328
- Args:
329
- messages: Original messages from response
330
- telemetry_tools: Tool invocations from telemetry
331
-
332
- Returns:
333
- List of reconstructed messages
334
- """
335
- reconstructed = []
336
- telemetry_tool_index = 0
337
-
338
- for message in messages:
339
- timestamp = message.get("createdAt", datetime.now().isoformat() + "Z")
340
- run_id = message.get("conversationId", "0")
341
- author = message.get("author", "")
342
-
343
- # Skip user messages entirely - we only want the agent's response flow
344
- if author == MessageType.USER.value:
345
- continue
346
-
347
- # Handle tool messages
348
- elif self._is_tool_message(message):
349
- invocation_str = message.get("invocation", "")
350
- tool_info = self._parse_tool_invocation(invocation_str)
351
- tool_call_id = self._generate_tool_call_id(tool_info["name"])
352
-
353
- # Add tool call message
354
- reconstructed.append({
355
- "createdAt": timestamp,
356
- "run_id": run_id,
357
- "role": MessageRole.ASSISTANT.value,
358
- "content": [
359
- {
360
- "type": ContentType.TOOL_CALL.value,
361
- "tool_call_id": tool_call_id,
362
- "name": tool_info["name"],
363
- "arguments": tool_info["arguments"]
364
- }
365
- ]
366
- })
367
-
368
- # Add tool result message
369
- results = self._extract_tool_results(message)
370
- reconstructed.append({
371
- "createdAt": timestamp,
372
- "run_id": run_id,
373
- "role": MessageRole.TOOL.value,
374
- "tool_call_id": tool_call_id,
375
- "content": [
376
- {
377
- "type": ContentType.TOOL_RESULT.value,
378
- ContentType.TOOL_RESULT.value: results
379
- }
380
- ]
381
- })
382
-
383
- # Handle bot response messages
384
- elif author == MessageType.BOT.value and "messageType" not in message:
385
- # Check if we have unused telemetry tools to add before the final response
386
- while telemetry_tool_index < len(telemetry_tools):
387
- tool = telemetry_tools[telemetry_tool_index]
388
- tool_call_id = self._generate_tool_call_id(tool["tool_name"])
389
-
390
- # Add tool call
391
- reconstructed.append({
392
- "createdAt": timestamp,
393
- "run_id": run_id,
394
- "role": MessageRole.ASSISTANT.value,
395
- "content": [
396
- {
397
- "type": ContentType.TOOL_CALL.value,
398
- "tool_call_id": tool_call_id,
399
- "name": tool["tool_name"],
400
- "arguments": tool["arguments"]
401
- }
402
- ]
403
- })
404
-
405
- # Add tool result
406
- reconstructed.append({
407
- "createdAt": timestamp,
408
- "run_id": run_id,
409
- "role": MessageRole.TOOL.value,
410
- "tool_call_id": tool_call_id,
411
- "content": [
412
- {
413
- "type": ContentType.TOOL_RESULT.value,
414
- ContentType.TOOL_RESULT.value: tool["results"]
415
- }
416
- ]
417
- })
418
-
419
- telemetry_tool_index += 1
420
-
421
- # Add final assistant response
422
- reconstructed.append({
423
- "createdAt": timestamp,
424
- "run_id": run_id,
425
- "role": MessageRole.ASSISTANT.value,
426
- "content": [
427
- {
428
- "type": ContentType.TEXT.value,
429
- "text": message.get("text", "")
430
- }
431
- ]
432
- })
433
- return reconstructed
434
-
435
- def extract_enhanced_response(self, raw_response: str) -> Dict[str, Any]:
436
- """
437
- Extract enhanced response information from raw agent response.
438
-
439
- Args:
440
- raw_response: Raw response string from agent
441
-
442
- Returns:
443
- Dict containing enhanced response data
444
- """
445
- try:
446
- # Parse the raw response
447
- response_data = json.loads(raw_response)
448
-
449
- # Extract basic response text (fallback to original behavior)
450
- response_text = ""
451
- if isinstance(response_data, dict):
452
- # Look for bot response in messages array
453
- messages_for_text = response_data.get("messages", [])
454
- for message in messages_for_text:
455
- if (message.get("author") == "bot" and
456
- "messageType" not in message and
457
- message.get("text")):
458
- response_text = message.get("text", "").strip()
459
- break
460
-
461
- if not response_text:
462
- response_text = raw_response.strip()
463
-
464
- # Initialize enhanced structure
465
- enhanced_response = {
466
- "response": [], # Will contain reconstructed message flow
467
- "tool_definitions": [], # Placeholder for future implementation
468
- "raw_response_text": response_text, # Backward compatibility
469
- "metadata": {
470
- "conversation_id": response_data.get("conversationId"),
471
- "request_id": response_data.get("requestId"),
472
- "message_id": None,
473
- "telemetry_available": False
474
- }
475
- }
476
-
477
- # Extract messages if available
478
- messages = []
479
- if isinstance(response_data, dict):
480
- # Messages are directly in the response_data object
481
- messages = response_data.get("messages", [])
482
-
483
- # Extract message_id from the last bot message in this response
484
- bot_messages = [m for m in messages if m.get("author") != "user"]
485
- if bot_messages and bot_messages[-1].get("messageId"):
486
- enhanced_response["metadata"]["message_id"] = bot_messages[-1]["messageId"]
487
-
488
- # Extract telemetry tools if available
489
- telemetry_tools = []
490
- tool_definitions = []
491
- telemetry = response_data.get("telemetry", {})
492
- if telemetry:
493
- enhanced_response["metadata"]["telemetry_available"] = True
494
- telemetry_tools = self._extract_telemetry_tools(telemetry)
495
- tool_definitions = self._extract_tool_definitions(telemetry)
496
-
497
- # Update tool definitions in the response
498
- enhanced_response["tool_definitions"] = tool_definitions
499
-
500
- # Reconstruct message flow
501
- if messages:
502
- enhanced_response["response"] = self._reconstruct_message_flow(messages, telemetry_tools)
503
- else:
504
- # Fallback: create simple text response
505
- enhanced_response["response"] = [
506
- {
507
- "createdAt": datetime.now().isoformat() + "Z",
508
- "run_id": "0",
509
- "role": MessageRole.ASSISTANT.value,
510
- "content": [
511
- {
512
- "type": ContentType.TEXT.value,
513
- "text": response_text
514
- }
515
- ]
516
- }
517
- ]
518
-
519
- return enhanced_response
520
-
521
- except json.JSONDecodeError:
522
- # Handle non-JSON responses
523
- logger.warning("Received non-JSON response, creating simple text response")
524
- return {
525
- "response": [
526
- {
527
- "createdAt": datetime.now().isoformat() + "Z",
528
- "run_id": "0",
529
- "role": MessageRole.ASSISTANT.value,
530
- "content": [
531
- {
532
- "type": ContentType.TEXT.value,
533
- "text": raw_response.strip()
534
- }
535
- ]
536
- }
537
- ],
538
- "tool_definitions": [],
539
- "raw_response_text": raw_response.strip(),
540
- "metadata": {
541
- "conversation_id": None,
542
- "request_id": None,
543
- "message_id": None,
544
- "telemetry_available": False
545
- }
546
- }
547
-
548
- except Exception as e:
549
- logger.error(f"Failed to extract enhanced response: {e}")
550
- # Return minimal structure on error
551
- return {
552
- "response": [
553
- {
554
- "createdAt": datetime.now().isoformat() + "Z",
555
- "run_id": "0",
556
- "role": MessageRole.ASSISTANT.value,
557
- "content": [
558
- {
559
- "type": ContentType.TEXT.value,
560
- "text": raw_response.strip() if raw_response else "Error processing response"
561
- }
562
- ]
563
- }
564
- ],
565
- "tool_definitions": [],
566
- "raw_response_text": raw_response.strip() if raw_response else "",
567
- "metadata": {
568
- "conversation_id": None,
569
- "request_id": None,
570
- "message_id": None,
571
- "telemetry_available": False,
572
- "error": str(e)
573
- }
574
- }
575
-
576
- def extract_enhanced_responses(responses: List[Tuple[str, str]], log_level: str = "info") -> List[Dict[str, Any]]:
577
- """
578
- Extract enhanced response information for multiple responses.
579
-
580
- Args:
581
- responses: List of (prompt_text, raw_response_string) tuples, one per prompt
582
- sent to the chat API. Order and duplicates are preserved.
583
-
584
- Returns:
585
- List of enhanced response dicts (one per prompt, same order as input).
586
- """
587
- extractor = EnhancedResponseExtractor(log_level=log_level)
588
- enhanced_responses = []
589
-
590
- for prompt, raw_response in responses:
591
- enhanced = extractor.extract_enhanced_response(raw_response)
592
- enhanced_responses.append(enhanced)
593
-
594
- return enhanced_responses
595
5
 
596
6
  def get_response_text_for_evaluation(enhanced_response: Dict[str, Any]) -> str:
597
- """
598
- Extract simple text response for evaluation purposes (backward compatibility).
599
-
600
- Args:
601
- enhanced_response: Enhanced response dictionary
602
-
603
- Returns:
604
- Simple text response string
605
- """
606
- # Use raw_response_text for backward compatibility
607
- return enhanced_response.get("raw_response_text", "")
7
+ """Extract plain text from an agent response dict for evaluation."""
8
+ return enhanced_response.get("raw_response_text", "")