azure-ai-evaluation 1.11.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of azure-ai-evaluation might be problematic.

Files changed (35)
  1. azure/ai/evaluation/_aoai/aoai_grader.py +63 -19
  2. azure/ai/evaluation/_aoai/label_grader.py +8 -3
  3. azure/ai/evaluation/_aoai/python_grader.py +8 -3
  4. azure/ai/evaluation/_aoai/score_model_grader.py +8 -3
  5. azure/ai/evaluation/_aoai/string_check_grader.py +9 -4
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +9 -4
  7. azure/ai/evaluation/_eval_mapping.py +2 -0
  8. azure/ai/evaluation/_evaluate/_evaluate.py +106 -4
  9. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +252 -48
  10. azure/ai/evaluation/_evaluate/_utils.py +7 -3
  11. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  12. azure/ai/evaluation/_evaluators/_common/_base_eval.py +77 -3
  13. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  14. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +6 -0
  15. azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +7 -0
  16. azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +342 -0
  17. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +7 -1
  18. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  19. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  20. azure/ai/evaluation/_evaluators/_task_success/__init__.py +7 -0
  21. azure/ai/evaluation/_evaluators/_task_success/_task_success.py +168 -0
  22. azure/ai/evaluation/_evaluators/_task_success/task_success.prompty +220 -0
  23. azure/ai/evaluation/_exceptions.py +1 -0
  24. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +7 -2
  25. azure/ai/evaluation/_version.py +1 -1
  26. azure/ai/evaluation/red_team/_mlflow_integration.py +454 -35
  27. azure/ai/evaluation/red_team/_red_team.py +9 -0
  28. azure/ai/evaluation/red_team/_red_team_result.py +230 -1
  29. azure/ai/evaluation/red_team/_result_processor.py +416 -23
  30. azure/ai/evaluation/red_team/_utils/formatting_utils.py +1 -1
  31. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/METADATA +13 -3
  32. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/RECORD +35 -30
  33. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/WHEEL +0 -0
  34. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/licenses/NOTICE.txt +0 -0
  35. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -3,10 +3,11 @@
 # ---------------------------------------------------------
 
 import logging
+import re
 
 from openai import AzureOpenAI, OpenAI
 import pandas as pd
-from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List
+from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List, cast, Set
 from time import sleep
 
 from ._batch_run import CodeClient, ProxyClient
@@ -21,6 +22,15 @@ from azure.ai.evaluation._common._experimental import experimental
 TClient = TypeVar("TClient", ProxyClient, CodeClient)
 LOGGER = logging.getLogger(__name__)
 
+# Precompiled regex for extracting data paths from mapping expressions of the form
+# ${data.some.dotted.path}. Compiled once at import time to avoid repeated
+# recompilation on each call to _generate_data_source_config.
+DATA_PATH_PATTERN = re.compile(r"^\$\{data\.([a-zA-Z0-9_\.]+)\}$")
+
+# Canonical top-level wrapper key expected in nested JSONL evaluation rows.
+# Centralizing here avoids magic strings sprinkled through schema/content generation code.
+WRAPPER_KEY = "item"
+
 
 class OAIEvalRunCreationInfo(TypedDict, total=True):
    """Configuration for an evaluator"""
@@ -102,7 +112,7 @@ def _begin_aoai_evaluation(
 
 
 def _begin_single_aoai_evaluation(
-    graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Dict[str, str], run_name: str
+    graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Optional[Dict[str, str]], run_name: str
 ) -> OAIEvalRunCreationInfo:
     """
     Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
@@ -111,8 +121,10 @@
 
     :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
     :type graders: Dict[str, AoaiGrader]
-    :param data_source_config: The data source configuration to apply to the
-    :type data_source_config: pd.DataFrame
+    :param data: The input data to evaluate, as a pandas DataFrame.
+    :type data: pd.DataFrame
+    :param column_mapping: The column mapping to apply. If None, an empty mapping is used.
+    :type column_mapping: Optional[Dict[str, str]]
     :param run_name: The name of the evaluation run.
     :type run_name: str
     :return: A tuple containing the eval group ID and eval run ID of the resultant eval run, as well as a dictionary
@@ -130,7 +142,8 @@
     for name, grader in graders.items():
         grader_name_list.append(name)
         grader_list.append(grader._grader_config)
-    data_source_config = _generate_data_source_config(data, column_mapping)
+    effective_column_mapping: Dict[str, str] = column_mapping or {}
+    data_source_config = _generate_data_source_config(data, effective_column_mapping)
 
     # Create eval group
     # import pdb; pdb.set_trace()
@@ -154,7 +167,7 @@
         grader_name_map[criteria.id] = name
 
     # Create eval run
-    eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, column_mapping)
+    eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, effective_column_mapping)
     LOGGER.info(
         f"AOAI: Eval run created with id {eval_run_id}."
         + " Results will be retrieved after normal evaluation is complete..."
@@ -479,36 +492,166 @@ def _get_graders_and_column_mappings(
     ]
 
 
+def _build_schema_tree_from_paths(
+    paths: List[str],
+    force_leaf_type: str = "string",
+) -> Dict[str, Any]:
+    """
+    Build a nested JSON schema (object) from a list of dot-delimited paths.
+    Each path represents a leaf. Intermediate segments become nested object properties.
+
+    Example input paths:
+        ["item.query",
+         "item.context.company.policy.security.passwords.rotation_days",
+         "item.context.company.policy.security.network.vpn.required"]
+
+    Returns schema fragment:
+        {
+            "type": "object",
+            "properties": {
+                "item": {
+                    "type": "object",
+                    "properties": {
+                        "query": {"type": "string"},
+                        "context": {
+                            "type": "object",
+                            "properties": {
+                                "company": { ... }
+                            },
+                            "required": ["company"]
+                        }
+                    },
+                    "required": ["query", "context"]
+                }
+            },
+            "required": ["item"]
+        }
+
+    :param paths: A list of dot-delimited strings, each representing a leaf path
+        in the logical object hierarchy (e.g. ``"item.context.company.policy.security.passwords.rotation_days"``).
+        Empty path segments are ignored.
+    :type paths: List[str]
+    :param force_leaf_type: The JSON Schema ``type`` value to assign to every leaf node
+        produced from the supplied paths. Defaults to ``"string"``.
+    :type force_leaf_type: str
+    :return: A JSON Schema fragment describing the hierarchical structure implied by
+        the input paths. The returned schema root always has ``type: object`` with
+        recursively nested ``properties`` / ``required`` keys.
+    :rtype: Dict[str, Any]
+    """
+    # Build tree where each node: {"__children__": { segment: node, ... }, "__leaf__": bool }
+    root: Dict[str, Any] = {"__children__": {}, "__leaf__": False}
+
+    def insert(path: str):
+        parts = [p for p in path.split(".") if p]
+        node = root
+        for i, part in enumerate(parts):
+            children = node["__children__"]
+            if part not in children:
+                children[part] = {"__children__": {}, "__leaf__": False}
+            node = children[part]
+            if i == len(parts) - 1:
+                node["__leaf__"] = True
+
+    for p in paths:
+        insert(p)
+
+    def to_schema(node: Dict[str, Any]) -> Dict[str, Any]:
+        children = node["__children__"]
+        if not children:
+            # Leaf node
+            return {"type": force_leaf_type}
+        props = {}
+        required = []
+        for name, child in children.items():
+            props[name] = to_schema(child)
+            required.append(name)
+        return {
+            "type": "object",
+            "properties": props,
+            "required": required,
+        }
+
+    return to_schema(root)
+
+
 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
-    """Produce a data source config that maps all columns from the supplied data source into
-    the OAI API. The mapping is naive unless a column mapping is provided, in which case
-    the column mapping's values overrule the relevant naive mappings
+    """
+    Produce a data source config (JSON schema) that reflects nested object structure
+    when column mappings reference dotted paths (e.g., item.context.company...).
+
+    Backward compatibility:
+    - If all referenced source paths are single tokens (flat), fall back to legacy flat schema.
+    - Otherwise build a nested object schema covering only referenced leaves.
 
-    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
-        helper function.
     :type input_data_df: pd.DataFrame
-    :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
+    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
     :type column_mapping: Optional[Dict[str, str]]
+    :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
     :return: A dictionary that can act as data source config for OAI evaluation group creation.
     :rtype: Dict[str, Any]
+        helper function.
     """
+    # Extract referenced data paths from mapping values of the form ${data.<path>} (ignore ${run.outputs.*})
+    referenced_paths: List[str] = []
+    for v in column_mapping.values():
+        m = DATA_PATH_PATTERN.match(v)
+        if m:
+            referenced_paths.append(m.group(1))
+
+    # Decide if we have nested structures
+    has_nested = any("." in p for p in referenced_paths)
+
+    if not referenced_paths or not has_nested:
+        # Legacy flat behavior (existing logic): treat each mapping key as independent string field
+        data_source_config = {
+            "type": "custom",
+            "item_schema": {
+                "type": "object",
+                "properties": {},
+                "required": [],
+            },
+        }
+        props = data_source_config["item_schema"]["properties"]
+        req = data_source_config["item_schema"]["required"]
+        for key in column_mapping.keys():
+            props[key] = {"type": "string"}
+            req.append(key)
+        return data_source_config
+
+    # NEW: If all nested paths share the same first segment (e.g. 'item'),
+    # treat that segment as the wrapper already provided by the JSONL line ("item": {...})
+    # so we exclude it from the schema (schema describes the *inside* of "item").
+    first_segments = {p.split(".")[0] for p in referenced_paths}
+    strip_wrapper = False
+    wrapper_name = None
+    if len(first_segments) == 1:
+        only_seg = next(iter(first_segments))
+        # We only strip if that segment looks like the canonical wrapper.
+        if only_seg == WRAPPER_KEY:
+            strip_wrapper = True
+            wrapper_name = only_seg
+
+    effective_paths = referenced_paths
+    if strip_wrapper:
+        stripped = []
+        for p in referenced_paths:
+            parts = p.split(".", 1)
+            if len(parts) == 2:
+                stripped.append(parts[1])  # drop leading 'item.'
+            else:
+                # Path was just 'item' (no leaf) - ignore; it doesn't define a leaf value.
+                continue
+        # If stripping produced at least one usable path, adopt; else fall back to original.
+        if stripped:
+            effective_paths = stripped
 
-    data_source_config = {
+    nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string")
+
+    return {
         "type": "custom",
-        "item_schema": {
-            "type": "object",
-            "properties": {},
-            "required": [],
-        },
+        "item_schema": nested_schema,
     }
-    properties = data_source_config["item_schema"]["properties"]
-    required = data_source_config["item_schema"]["required"]
-    for key in column_mapping.keys():
-        properties[key] = {
-            "type": "string",
-        }
-        required.append(key)
-    return data_source_config
 
 
 def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
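
Illustrative sketch (not part of the diff): a small worked example of the new _generate_data_source_config branching, with invented mapping values. A flat mapping keeps the legacy schema; dotted paths under the "item" wrapper yield a nested schema describing the inside of "item".

    # Flat mapping -> legacy branch: every mapping key becomes a string property.
    flat_mapping = {"query": "${data.query}", "response": "${run.outputs.response}"}
    # item_schema == {"type": "object",
    #                 "properties": {"query": {"type": "string"}, "response": {"type": "string"}},
    #                 "required": ["query", "response"]}

    # Nested mapping -> wrapper "item" is stripped and the remainder becomes a schema tree.
    nested_mapping = {"rotation_days": "${data.item.context.policy.rotation_days}"}
    # item_schema == {"type": "object",
    #                 "properties": {"context": {"type": "object",
    #                     "properties": {"policy": {"type": "object",
    #                         "properties": {"rotation_days": {"type": "string"}},
    #                         "required": ["rotation_days"]}},
    #                     "required": ["policy"]}},
    #                 "required": ["context"]}
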
@@ -543,9 +686,9 @@ def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
 
 def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
     """
-    Given a dataframe of data to be evaluated, and an optional column mapping,
-    produce a dictionary can be used as the data source input for an OAI evaluation run.
-
+    Given a dataframe of data to be evaluated, and a column mapping,
+    produce a dictionary that can be used as the data source input for an OAI evaluation run.
+    Builds a nested 'item' object mirroring the hierarchical paths in the mapping values.
     :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
         helper function.
     :type input_data_df: pd.DataFrame
@@ -554,25 +697,86 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
     :return: A dictionary that can be used as the data source input for an OAI evaluation run.
     :rtype: Dict[str, Any]
     """
-    content = []
-    column_to_source_map = {}
-    # Convert from column mapping's format to figure out actual column names in
-    # input dataframe, and map those to the appropriate OAI input names.
+    # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name)
+    # relative_parts excludes the wrapper (so schema + content align).
+    path_specs: List[Tuple[str, List[str], str]] = []
+
     for name, formatted_entry in column_mapping.items():
-        # From "${" from start and "}" from end before splitting.
-        entry_pieces = formatted_entry[2:-1].split(".")
-        if len(entry_pieces) == 2 and entry_pieces[0] == "data":
-            column_to_source_map[name] = entry_pieces[1]
-        elif len(entry_pieces) == 3 and entry_pieces[0] == "run" and entry_pieces[1] == "outputs":
-            column_to_source_map[name] = f"__outputs.{entry_pieces[2]}"
-
-    # Using the above mapping, transform the input dataframe into a content
-    # dictionary that'll work in an OAI data source.
-    for row in input_data_df.iterrows():
-        row_dict = {}
-        for oai_key, dataframe_key in column_to_source_map.items():
-            row_dict[oai_key] = str(row[1][dataframe_key])
-        content.append({"item": row_dict})
+        if not (
+            isinstance(formatted_entry, str) and formatted_entry.startswith("${") and formatted_entry.endswith("}")
+        ):
+            continue
+        body = formatted_entry[2:-1]  # remove ${ }
+        pieces = body.split(".")
+
+        if not pieces:
+            continue
+
+        if pieces[0] == "data":
+            # Data path: data.<maybe wrapper>.<...>
+            if len(pieces) == 1:
+                continue
+            source_path = ".".join(pieces[1:])  # e.g. item.context.company...
+            # Skip mapping of wrapper itself
+            if source_path == WRAPPER_KEY:
+                continue
+
+            # Determine dataframe column name (it is the full dotted path as flattened earlier)
+            dataframe_col = source_path
+
+            # Relative parts for nested insertion (drop leading wrapper if present)
+            if source_path.startswith(WRAPPER_KEY + "."):
+                relative_path = source_path[len(WRAPPER_KEY) + 1 :]
+            else:
+                # Path not under wrapper; treat its segments as is (will live directly under wrapper)
+                relative_path = source_path
+
+            relative_parts = [p for p in relative_path.split(".") if p]
+
+            # Defensive: if mapping alias differs from leaf, prefer actual path leaf to stay consistent.
+            # (If you want alias override, replace relative_parts[-1] with name when name != path_leaf.)
+            if not relative_parts:
+                continue
+
+            path_specs.append((formatted_entry, relative_parts, dataframe_col))
+
+        elif pieces[0] == "run" and len(pieces) >= 3 and pieces[1] == "outputs":
+            # Target / run outputs become __outputs.<rest> columns
+            run_col = "__outputs." + ".".join(pieces[2:])
+            leaf_name = pieces[-1]
+            path_specs.append((formatted_entry, [leaf_name], run_col))
+
+    content: List[Dict[str, Any]] = []
+
+    for _, row in input_data_df.iterrows():
+        item_root: Dict[str, Any] = {}
+
+        for _, rel_parts, df_col in path_specs:
+            # Safely fetch value
+            val = row.get(df_col, None)
+
+            # Convert value to string to match schema's "type": "string" leaves.
+            # (If you later infer types, you can remove the stringify.)
+            if val is None:
+                str_val = ""
+            elif isinstance(val, (str, int, float, bool)):
+                str_val = str(val)
+            else:
+                # Lists / dicts / other -> string for now
+                str_val = str(val)
+
+            # Insert into nested dict
+            cursor = item_root
+            for seg in rel_parts[:-1]:
+                nxt = cursor.get(seg)
+                if not isinstance(nxt, dict):
+                    nxt = {}
+                    cursor[seg] = nxt
+                cursor = nxt
+            leaf_key = rel_parts[-1]
+            cursor[leaf_key] = str_val
+
+        content.append({WRAPPER_KEY: item_root})
 
     return {
         "type": "jsonl",
@@ -611,7 +815,7 @@ def _begin_eval_run(
     data_source = _get_data_source(input_data_df, column_mapping)
     eval_run = client.evals.runs.create(
         eval_id=eval_group_id,
-        data_source=data_source,
+        data_source=cast(Any, data_source),  # Cast for type checker: dynamic schema dict accepted by SDK at runtime
         name=run_name,
         metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
         # TODO decide if we want to add our own timeout value?
azure/ai/evaluation/_evaluate/_utils.py

@@ -330,7 +330,11 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
         json.dump(data_dict, f, ensure_ascii=False)
 
     # Use tqdm.write to print message without interfering with any current progress bar
-    tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
+    # Fall back to regular print if tqdm.write fails (e.g., when progress bar is closed)
+    try:
+        tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
+    except Exception:
+        print(f'Evaluation results saved to "{p.resolve()}".\n')
 
 
 def _apply_column_mapping(
@@ -460,7 +464,7 @@ class JSONLDataFileLoader:
        self.filename = filename
 
    def load(self) -> pd.DataFrame:
-        return pd.read_json(self.filename, lines=True)
+        return pd.read_json(self.filename, lines=True, dtype=object)
 
 
 class CSVDataFileLoader:
@@ -468,7 +472,7 @@ class CSVDataFileLoader:
        self.filename = filename
 
    def load(self) -> pd.DataFrame:
-        return pd.read_csv(self.filename)
+        return pd.read_csv(self.filename, dtype=str)
 
 
 class DataLoaderFactory:
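
Illustrative sketch (not part of the diff): the new dtype arguments guard against pandas' default type inference, which would otherwise mutate values before they reach the evaluators. A minimal before/after with invented data:

    import io
    import pandas as pd

    csv = io.StringIO("id,score\n007,3\n042,5\n")
    print(pd.read_csv(csv)["id"].tolist())             # [7, 42] - leading zeros lost
    csv.seek(0)
    print(pd.read_csv(csv, dtype=str)["id"].tolist())  # ['007', '042'] - preserved as text
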
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -23,6 +23,11 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param threshold: The threshold for the coherence evaluator. Default is 3.
     :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
     .. admonition:: Example:
 
@@ -66,7 +71,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3, credential=None):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -78,6 +83,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             threshold=threshold,
             credential=credential,
             _higher_is_better=self._higher_is_better,
+            **kwargs,
         )
 
     @overload
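
Illustrative sketch (not part of the diff): the **kwargs pass-through is what lets the documented is_reasoning_model keyword reach the underlying prompty flow; the same change is applied to the fluency and groundedness evaluators below. Configuration values here are placeholders:

    from azure.ai.evaluation import CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "azure_deployment": "<your-reasoning-model-deployment>",
    }
    # Forwarded via **kwargs; switches to reasoning-model parameters
    # such as max_completion_tokens per the docstring above.
    evaluator = CoherenceEvaluator(model_config=model_config, is_reasoning_model=True)
    result = evaluator(query="What is the capital of France?", response="Paris.")
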
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -4,12 +4,14 @@
 
 import inspect
 from abc import ABC, abstractmethod
+import json
 from typing import (
     Any,
     Callable,
     Dict,
     Generic,
     List,
+    Tuple,
     TypedDict,
     TypeVar,
     Union,
@@ -510,6 +512,67 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return tool_calls
 
+    def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
+        """Extract tool names and parameters from the response.
+
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
+        :rtype: List[Tuple[str, Dict[str, str]]]
+        """
+        tool_calls = self._parse_tools_from_response(response)
+        tool_name_param_pairs = []
+        for tool_call in tool_calls:
+            if not isinstance(tool_call, dict):
+                raise EvaluationException(
+                    "Tool call must be a dictionary.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.UNKNOWN,
+                )
+            if tool_call.get("type") != "tool_call":
+                raise EvaluationException(
+                    "Tool call must have 'type' set to 'tool_call'.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.INVALID_VALUE,
+                )
+
+            if "name" not in tool_call:
+                raise EvaluationException(
+                    "Tool call missing 'name' field.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.MISSING_FIELD,
+                )
+
+            tool_name = str(tool_call["name"]).strip()
+
+            # Extract parameters/arguments
+            parameters = {}
+            if "arguments" in tool_call:
+                args = tool_call["arguments"]
+                if isinstance(args, dict):
+                    # Convert all values to strings for consistent comparison
+                    parameters = {str(k): str(v) for k, v in args.items()}
+                elif isinstance(args, str):
+                    # If arguments is a string, try to parse it as JSON
+                    try:
+                        parsed_args = json.loads(args)
+                        if isinstance(parsed_args, dict):
+                            parameters = {str(k): str(v) for k, v in parsed_args.items()}
+                    except json.JSONDecodeError:
+                        raise EvaluationException(
+                            "Failed to parse tool call arguments as JSON.",
+                            internal_message=str(tool_call),
+                            target=ErrorTarget.EVALUATE,
+                            category=ErrorCategory.INVALID_VALUE,
+                        )
+
+            tool_name_param_pairs.append((tool_name, parameters))
+
+        return tool_name_param_pairs
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
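Illustrative sketch (not part of the diff): the shape the new helper consumes and produces, assuming _parse_tools_from_response surfaces tool-call dicts with "type", "name", and "arguments" keys as the validation above requires. The call below is fabricated:

    tool_call = {
        "type": "tool_call",
        "name": "lookup_policy",
        "arguments": '{"topic": "passwords", "max_results": 3}',
    }
    # _extract_tool_names_and_params_from_response would yield, for this call:
    # [("lookup_policy", {"topic": "passwords", "max_results": "3"})]
    # Every argument value is stringified for consistent comparison.
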
@@ -532,14 +595,25 @@
                 base_key = key[:-6]  # Remove "_score" suffix
                 result_key = f"{base_key}_result"
                 threshold_key = f"{base_key}_threshold"
-                result[threshold_key] = self._threshold
+                threshold_value = (
+                    self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
+                )
+                if not isinstance(threshold_value, (int, float)):
+                    raise EvaluationException(
+                        "Threshold value must be a number.",
+                        internal_message=str(threshold_value),
+                        target=ErrorTarget.EVALUATE,
+                        category=ErrorCategory.INVALID_VALUE,
+                    )
+
+                result[threshold_key] = threshold_value
                 if self._higher_is_better:
-                    if float(score_value) >= self._threshold:
+                    if float(score_value) >= threshold_value:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                     else:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
                 else:
-                    if float(score_value) <= self._threshold:
+                    if float(score_value) <= threshold_value:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                     else:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
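
Illustrative sketch (not part of the diff): threshold resolution now accepts either a single number or a per-metric dict keyed by the "_score" base name. The metric names below are invented:

    self._threshold = {"coherence": 3, "fluency": 4}
    # For result key "coherence_score": base_key == "coherence", so
    # threshold_value == 3 and "coherence_threshold" / "coherence_result"
    # are emitted next to the score. A missing or non-numeric entry raises
    # EvaluationException("Threshold value must be a number.").
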
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -25,6 +25,11 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param threshold: The threshold for the fluency evaluator. Default is 3.
     :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
     .. admonition:: Example:
 
@@ -68,7 +73,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, credential=None, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -80,6 +85,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             threshold=threshold,
             credential=credential,
             _higher_is_better=self._higher_is_better,
+            **kwargs,
         )
 
     @overload
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -49,6 +49,11 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param threshold: The threshold for the groundedness evaluator. Default is 3.
     :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
     .. admonition:: Example:
 
@@ -106,6 +111,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             threshold=threshold,
             credential=credential,
             _higher_is_better=self._higher_is_better,
+            **kwargs,
         )
         self._model_config = model_config
         self.threshold = threshold
azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py (new file)

@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._path_efficiency import PathEfficiencyEvaluator
+
+__all__ = ["PathEfficiencyEvaluator"]
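
Illustrative sketch (not part of the diff): the new file only re-exports the evaluator, so the minimal import is

    from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator

Its constructor and metrics live in _path_efficiency.py (+342 lines), which this section does not include.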