azure-ai-evaluation 1.11.0__py3-none-any.whl → 1.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/_aoai/aoai_grader.py +63 -19
- azure/ai/evaluation/_aoai/label_grader.py +8 -3
- azure/ai/evaluation/_aoai/python_grader.py +8 -3
- azure/ai/evaluation/_aoai/score_model_grader.py +8 -3
- azure/ai/evaluation/_aoai/string_check_grader.py +9 -4
- azure/ai/evaluation/_aoai/text_similarity_grader.py +9 -4
- azure/ai/evaluation/_eval_mapping.py +2 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +106 -4
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +279 -50
- azure/ai/evaluation/_evaluate/_utils.py +7 -3
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +77 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +6 -0
- azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +342 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +7 -1
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_success/_task_success.py +168 -0
- azure/ai/evaluation/_evaluators/_task_success/task_success.prompty +220 -0
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +7 -2
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_mlflow_integration.py +454 -35
- azure/ai/evaluation/red_team/_red_team.py +9 -0
- azure/ai/evaluation/red_team/_red_team_result.py +230 -1
- azure/ai/evaluation/red_team/_result_processor.py +416 -23
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +1 -1
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/METADATA +19 -3
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/RECORD +35 -30
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -3,10 +3,11 @@
 # ---------------------------------------------------------
 
 import logging
+import re
 
 from openai import AzureOpenAI, OpenAI
 import pandas as pd
-from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List
+from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List, cast, Set
 from time import sleep
 
 from ._batch_run import CodeClient, ProxyClient
@@ -21,6 +22,15 @@ from azure.ai.evaluation._common._experimental import experimental
 TClient = TypeVar("TClient", ProxyClient, CodeClient)
 LOGGER = logging.getLogger(__name__)
 
+# Precompiled regex for extracting data paths from mapping expressions of the form
+# ${data.some.dotted.path}. Compiled once at import time to avoid repeated
+# recompilation on each call to _generate_data_source_config.
+DATA_PATH_PATTERN = re.compile(r"^\$\{data\.([a-zA-Z0-9_\.]+)\}$")
+
+# Canonical top-level wrapper key expected in nested JSONL evaluation rows.
+# Centralizing here avoids magic strings sprinkled through schema/content generation code.
+WRAPPER_KEY = "item"
+
 
 class OAIEvalRunCreationInfo(TypedDict, total=True):
     """Configuration for an evaluator"""
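Note: the pattern above anchors on the whole mapping expression, so only pure `${data.<dotted.path>}` references count as data paths; `${run.outputs.*}` references do not match and are handled separately. An illustrative standalone sketch of the matching behavior (the pattern is re-declared here so the snippet runs on its own):

    import re

    DATA_PATH_PATTERN = re.compile(r"^\$\{data\.([a-zA-Z0-9_\.]+)\}$")

    assert DATA_PATH_PATTERN.match("${data.item.query}").group(1) == "item.query"
    assert DATA_PATH_PATTERN.match("${run.outputs.response}") is None  # run outputs are not data paths
    assert DATA_PATH_PATTERN.match("${data.item.query} extra") is None  # must match the full string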
@@ -102,7 +112,7 @@ def _begin_aoai_evaluation(
 
 
 def _begin_single_aoai_evaluation(
-    graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Dict[str, str], run_name: str
+    graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Optional[Dict[str, str]], run_name: str
 ) -> OAIEvalRunCreationInfo:
     """
     Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
@@ -111,8 +121,10 @@ def _begin_single_aoai_evaluation(
 
     :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
     :type graders: Dict[str, AoaiGrader]
-    :param
-    :type
+    :param data: The input data to evaluate, as a pandas DataFrame.
+    :type data: pd.DataFrame
+    :param column_mapping: The column mapping to apply. If None, an empty mapping is used.
+    :type column_mapping: Optional[Dict[str, str]]
     :param run_name: The name of the evaluation run.
     :type run_name: str
     :return: A tuple containing the eval group ID and eval run ID of the resultant eval run, as well as a dictionary
@@ -130,7 +142,8 @@ def _begin_single_aoai_evaluation(
     for name, grader in graders.items():
         grader_name_list.append(name)
         grader_list.append(grader._grader_config)
-
+    effective_column_mapping: Dict[str, str] = column_mapping or {}
+    data_source_config = _generate_data_source_config(data, effective_column_mapping)
 
     # Create eval group
     # import pdb; pdb.set_trace()
@@ -154,7 +167,7 @@ def _begin_single_aoai_evaluation(
         grader_name_map[criteria.id] = name
 
     # Create eval run
-    eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data,
+    eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, effective_column_mapping)
     LOGGER.info(
         f"AOAI: Eval run created with id {eval_run_id}."
         + " Results will be retrieved after normal evaluation is complete..."
@@ -272,8 +285,33 @@ def _get_single_run_results(
     for row_result in all_results:
         listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
-
-
+            if isinstance(single_grader_row_result, dict):
+                result_dict = single_grader_row_result
+            elif hasattr(single_grader_row_result, "model_dump"):
+                result_dict = single_grader_row_result.model_dump()
+            elif hasattr(single_grader_row_result, "dict"):
+                result_dict = single_grader_row_result.dict()
+            elif hasattr(single_grader_row_result, "__dict__"):
+                result_dict = vars(single_grader_row_result)
+            else:
+                raise EvaluationException(
+                    message=("Unsupported AOAI evaluation result type: " f"{type(single_grader_row_result)!r}."),
+                    blame=ErrorBlame.UNKNOWN,
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    target=ErrorTarget.AOAI_GRADER,
+                )
+
+            grader_result_name = result_dict.get("name", None)
+            if grader_result_name is None:
+                raise EvaluationException(
+                    message="AOAI evaluation response missing grader result name; unable to map to original grader.",
+                    blame=ErrorBlame.UNKNOWN,
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    target=ErrorTarget.AOAI_GRADER,
+                )
+
+            grader_name = run_info["grader_name_map"][grader_result_name]
+            for name, value in result_dict.items():
                 if name in ["name"]:
                     continue
                 if name.lower() == "passed":
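Note: the added branch normalizes each grader row result to a plain dict before reading fields, covering raw dicts, Pydantic v2 objects (`model_dump`), Pydantic v1 objects (`.dict()`), and generic objects (`__dict__`). A minimal standalone sketch of the same fallback chain, using a hypothetical result class rather than the SDK's:

    from dataclasses import dataclass

    @dataclass
    class FakeRowResult:  # hypothetical stand-in for an SDK result object
        name: str
        passed: bool

    def to_result_dict(row):
        if isinstance(row, dict):
            return row
        if hasattr(row, "model_dump"):  # Pydantic v2
            return row.model_dump()
        if hasattr(row, "dict"):        # Pydantic v1
            return row.dict()
        if hasattr(row, "__dict__"):    # plain objects
            return vars(row)
        raise TypeError(f"Unsupported result type: {type(row)!r}")

    print(to_result_dict(FakeRowResult(name="labeler", passed=True)))
    # -> {'name': 'labeler', 'passed': True}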
@@ -454,36 +492,166 @@ def _get_graders_and_column_mappings(
     ]
 
 
+def _build_schema_tree_from_paths(
+    paths: List[str],
+    force_leaf_type: str = "string",
+) -> Dict[str, Any]:
+    """
+    Build a nested JSON schema (object) from a list of dot-delimited paths.
+    Each path represents a leaf. Intermediate segments become nested object properties.
+
+    Example input paths:
+        ["item.query",
+         "item.context.company.policy.security.passwords.rotation_days",
+         "item.context.company.policy.security.network.vpn.required"]
+
+    Returns schema fragment:
+    {
+        "type": "object",
+        "properties": {
+            "item": {
+                "type": "object",
+                "properties": {
+                    "query": {"type": "string"},
+                    "context": {
+                        "type": "object",
+                        "properties": {
+                            "company": { ... }
+                        },
+                        "required": ["company"]
+                    }
+                },
+                "required": ["query", "context"]
+            }
+        },
+        "required": ["item"]
+    }
+
+    :param paths: A list of dot-delimited strings, each representing a leaf path
+        in the logical object hierarchy (e.g. ``"item.context.company.policy.security.passwords.rotation_days"``).
+        Empty path segments are ignored.
+    :type paths: List[str]
+    :param force_leaf_type: The JSON Schema ``type`` value to assign to every leaf node
+        produced from the supplied paths. Defaults to ``"string"``.
+    :type force_leaf_type: str
+    :return: A JSON Schema fragment describing the hierarchical structure implied by
+        the input paths. The returned schema root always has ``type: object`` with
+        recursively nested ``properties`` / ``required`` keys.
+    :rtype: Dict[str, Any]
+    """
+    # Build tree where each node: {"__children__": { segment: node, ... }, "__leaf__": bool }
+    root: Dict[str, Any] = {"__children__": {}, "__leaf__": False}
+
+    def insert(path: str):
+        parts = [p for p in path.split(".") if p]
+        node = root
+        for i, part in enumerate(parts):
+            children = node["__children__"]
+            if part not in children:
+                children[part] = {"__children__": {}, "__leaf__": False}
+            node = children[part]
+            if i == len(parts) - 1:
+                node["__leaf__"] = True
+
+    for p in paths:
+        insert(p)
+
+    def to_schema(node: Dict[str, Any]) -> Dict[str, Any]:
+        children = node["__children__"]
+        if not children:
+            # Leaf node
+            return {"type": force_leaf_type}
+        props = {}
+        required = []
+        for name, child in children.items():
+            props[name] = to_schema(child)
+            required.append(name)
+        return {
+            "type": "object",
+            "properties": props,
+            "required": required,
+        }
+
+    return to_schema(root)
+
+
 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
-    """
-
-
+    """
+    Produce a data source config (JSON schema) that reflects nested object structure
+    when column mappings reference dotted paths (e.g., item.context.company...).
+
+    Backward compatibility:
+    - If all referenced source paths are single tokens (flat), fall back to legacy flat schema.
+    - Otherwise build a nested object schema covering only referenced leaves.
 
-    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
-        helper function.
     :type input_data_df: pd.DataFrame
-    :param
+    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
     :type column_mapping: Optional[Dict[str, str]]
+    :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
     :return: A dictionary that can act as data source config for OAI evaluation group creation.
     :rtype: Dict[str, Any]
+        helper function.
     """
+    # Extract referenced data paths from mapping values of the form ${data.<path>} (ignore ${run.outputs.*})
+    referenced_paths: List[str] = []
+    for v in column_mapping.values():
+        m = DATA_PATH_PATTERN.match(v)
+        if m:
+            referenced_paths.append(m.group(1))
+
+    # Decide if we have nested structures
+    has_nested = any("." in p for p in referenced_paths)
+
+    if not referenced_paths or not has_nested:
+        # Legacy flat behavior (existing logic): treat each mapping key as independent string field
+        data_source_config = {
+            "type": "custom",
+            "item_schema": {
+                "type": "object",
+                "properties": {},
+                "required": [],
+            },
+        }
+        props = data_source_config["item_schema"]["properties"]
+        req = data_source_config["item_schema"]["required"]
+        for key in column_mapping.keys():
+            props[key] = {"type": "string"}
+            req.append(key)
+        return data_source_config
+
+    # NEW: If all nested paths share the same first segment (e.g. 'item'),
+    # treat that segment as the wrapper already provided by the JSONL line ("item": {...})
+    # so we exclude it from the schema (schema describes the *inside* of "item").
+    first_segments = {p.split(".")[0] for p in referenced_paths}
+    strip_wrapper = False
+    wrapper_name = None
+    if len(first_segments) == 1:
+        only_seg = next(iter(first_segments))
+        # We only strip if that segment looks like the canonical wrapper.
+        if only_seg == WRAPPER_KEY:
+            strip_wrapper = True
+            wrapper_name = only_seg
+
+    effective_paths = referenced_paths
+    if strip_wrapper:
+        stripped = []
+        for p in referenced_paths:
+            parts = p.split(".", 1)
+            if len(parts) == 2:
+                stripped.append(parts[1])  # drop leading 'item.'
+            else:
+                # Path was just 'item' (no leaf) – ignore; it doesn't define a leaf value.
+                continue
+        # If stripping produced at least one usable path, adopt; else fall back to original.
+        if stripped:
+            effective_paths = stripped
 
-
+    nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string")
+
+    return {
         "type": "custom",
-        "item_schema":
-            "type": "object",
-            "properties": {},
-            "required": [],
-        },
+        "item_schema": nested_schema,
     }
-    properties = data_source_config["item_schema"]["properties"]
-    required = data_source_config["item_schema"]["required"]
-    for key in column_mapping.keys():
-        properties[key] = {
-            "type": "string",
-        }
-        required.append(key)
-    return data_source_config
 
 
 def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
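Note: a worked example of the two helpers above, assuming both are imported from this module. Two dotted paths that share a prefix collapse into a single nested object schema, with every segment marked required:

    paths = ["query", "context.company.name"]
    schema = _build_schema_tree_from_paths(paths)
    # schema ==
    # {
    #     "type": "object",
    #     "properties": {
    #         "query": {"type": "string"},
    #         "context": {
    #             "type": "object",
    #             "properties": {
    #                 "company": {
    #                     "type": "object",
    #                     "properties": {"name": {"type": "string"}},
    #                     "required": ["name"],
    #                 }
    #             },
    #             "required": ["company"],
    #         },
    #     },
    #     "required": ["query", "context"],
    # }

With a mapping such as {"query": "${data.item.query}", "company_name": "${data.item.context.company.name}"}, _generate_data_source_config strips the shared `item` wrapper and returns {"type": "custom", "item_schema": <the schema above>}.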
@@ -518,9 +686,9 @@ def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[st
 
 def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
     """
-    Given a dataframe of data to be evaluated, and
-    produce a dictionary can be used as the data source input for an OAI evaluation run.
-
+    Given a dataframe of data to be evaluated, and a column mapping,
+    produce a dictionary that can be used as the data source input for an OAI evaluation run.
+    Builds a nested 'item' object mirroring the hierarchical paths in the mapping values.
     :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
         helper function.
     :type input_data_df: pd.DataFrame
@@ -529,25 +697,86 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
     :return: A dictionary that can be used as the data source input for an OAI evaluation run.
     :rtype: Dict[str, Any]
     """
-
-
-
-
+    # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name)
+    # relative_parts excludes the wrapper (so schema + content align).
+    path_specs: List[Tuple[str, List[str], str]] = []
+
     for name, formatted_entry in column_mapping.items():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not (
+            isinstance(formatted_entry, str) and formatted_entry.startswith("${") and formatted_entry.endswith("}")
+        ):
+            continue
+        body = formatted_entry[2:-1]  # remove ${ }
+        pieces = body.split(".")
+
+        if not pieces:
+            continue
+
+        if pieces[0] == "data":
+            # Data path: data.<maybe wrapper>.<...>
+            if len(pieces) == 1:
+                continue
+            source_path = ".".join(pieces[1:])  # e.g. item.context.company...
+            # Skip mapping of wrapper itself
+            if source_path == WRAPPER_KEY:
+                continue
+
+            # Determine dataframe column name (it is the full dotted path as flattened earlier)
+            dataframe_col = source_path
+
+            # Relative parts for nested insertion (drop leading wrapper if present)
+            if source_path.startswith(WRAPPER_KEY + "."):
+                relative_path = source_path[len(WRAPPER_KEY) + 1 :]
+            else:
+                # Path not under wrapper; treat its segments as is (will live directly under wrapper)
+                relative_path = source_path
+
+            relative_parts = [p for p in relative_path.split(".") if p]
+
+            # Defensive: if mapping alias differs from leaf, prefer actual path leaf to stay consistent.
+            # (If you want alias override, replace relative_parts[-1] with name when name != path_leaf.)
+            if not relative_parts:
+                continue
+
+            path_specs.append((formatted_entry, relative_parts, dataframe_col))
+
+        elif pieces[0] == "run" and len(pieces) >= 3 and pieces[1] == "outputs":
+            # Target / run outputs become __outputs.<rest> columns
+            run_col = "__outputs." + ".".join(pieces[2:])
+            leaf_name = pieces[-1]
+            path_specs.append((formatted_entry, [leaf_name], run_col))
+
+    content: List[Dict[str, Any]] = []
+
+    for _, row in input_data_df.iterrows():
+        item_root: Dict[str, Any] = {}
+
+        for _, rel_parts, df_col in path_specs:
+            # Safely fetch value
+            val = row.get(df_col, None)
+
+            # Convert value to string to match schema's "type": "string" leaves.
+            # (If you later infer types, you can remove the stringify.)
+            if val is None:
+                str_val = ""
+            elif isinstance(val, (str, int, float, bool)):
+                str_val = str(val)
+            else:
+                # Lists / dicts / other -> string for now
+                str_val = str(val)
+
+            # Insert into nested dict
+            cursor = item_root
+            for seg in rel_parts[:-1]:
+                nxt = cursor.get(seg)
+                if not isinstance(nxt, dict):
+                    nxt = {}
+                    cursor[seg] = nxt
+                cursor = nxt
+            leaf_key = rel_parts[-1]
+            cursor[leaf_key] = str_val
+
+        content.append({WRAPPER_KEY: item_root})
 
     return {
         "type": "jsonl",
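Note: tracing the content builder above with one row makes the wrapper handling concrete. Assuming the dataframe was flattened so that nested JSONL fields became dotted column names (hypothetical data below):

    import pandas as pd

    df = pd.DataFrame([{"item.query": "What is the VPN policy?", "item.context.company.name": "Contoso"}])
    column_mapping = {
        "query": "${data.item.query}",
        "company_name": "${data.item.context.company.name}",
    }
    # For this input, the `content` list built above would be:
    # [{"item": {"query": "What is the VPN policy?",
    #            "context": {"company": {"name": "Contoso"}}}}]

Each mapping value loses its `data.` prefix, the leading `item` wrapper is dropped from the relative parts, and the row's values are re-nested under a single "item" key so the rows line up with the item_schema generated earlier.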
@@ -586,7 +815,7 @@ def _begin_eval_run(
     data_source = _get_data_source(input_data_df, column_mapping)
     eval_run = client.evals.runs.create(
         eval_id=eval_group_id,
-        data_source=data_source,
+        data_source=cast(Any, data_source),  # Cast for type checker: dynamic schema dict accepted by SDK at runtime
         name=run_name,
         metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
         # TODO decide if we want to add our own timeout value?
azure/ai/evaluation/_evaluate/_utils.py

@@ -330,7 +330,11 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
         json.dump(data_dict, f, ensure_ascii=False)
 
     # Use tqdm.write to print message without interfering with any current progress bar
-    tqdm.write(
+    # Fall back to regular print if tqdm.write fails (e.g., when progress bar is closed)
+    try:
+        tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
+    except Exception:
+        print(f'Evaluation results saved to "{p.resolve()}".\n')
 
 
 def _apply_column_mapping(
@@ -460,7 +464,7 @@ class JSONLDataFileLoader:
         self.filename = filename
 
     def load(self) -> pd.DataFrame:
-        return pd.read_json(self.filename, lines=True)
+        return pd.read_json(self.filename, lines=True, dtype=object)
 
 
 class CSVDataFileLoader:
@@ -468,7 +472,7 @@ class CSVDataFileLoader:
         self.filename = filename
 
     def load(self) -> pd.DataFrame:
-        return pd.read_csv(self.filename)
+        return pd.read_csv(self.filename, dtype=str)
 
 
 class DataLoaderFactory:
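Note: the new dtype arguments stop pandas from type-inferring input values before evaluation (the JSONL loader keeps values as objects; the CSV loader reads every cell as a string). Without them, identifier-like values are silently coerced, as this standalone snippet shows:

    import io
    import pandas as pd

    csv_data = "id,score\n007,3\n"
    print(pd.read_csv(io.StringIO(csv_data)).loc[0, "id"])             # 7 (inferred as int)
    print(pd.read_csv(io.StringIO(csv_data), dtype=str).loc[0, "id"])  # '007' (preserved)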
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -23,6 +23,11 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param threshold: The threshold for the coherence evaluator. Default is 3.
     :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
     .. admonition:: Example:
 
@@ -66,7 +71,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3, credential=None):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -78,6 +83,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             threshold=threshold,
             credential=credential,
             _higher_is_better=self._higher_is_better,
+            **kwargs,
         )
 
     @overload
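Note: with **kwargs now forwarded to the base class, the documented `is_reasoning_model` keyword can be passed at construction time. A usage sketch (endpoint/deployment values are placeholders):

    from azure.ai.evaluation import CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<o-series-deployment>",                   # placeholder
        "api_key": "<api-key>",                                        # placeholder
    }

    # is_reasoning_model travels through **kwargs to the prompty flow,
    # adjusting parameters (e.g., max_completion_tokens) for o1/o3-style models.
    evaluator = CoherenceEvaluator(model_config=model_config, is_reasoning_model=True)
    result = evaluator(query="What is the capital of France?", response="Paris is the capital of France.")

The same pattern applies to the FluencyEvaluator and GroundednessEvaluator changes below.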
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -4,12 +4,14 @@
 
 import inspect
 from abc import ABC, abstractmethod
+import json
 from typing import (
     Any,
     Callable,
     Dict,
     Generic,
     List,
+    Tuple,
     TypedDict,
     TypeVar,
     Union,
@@ -510,6 +512,67 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return tool_calls
 
+    def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
+        """Extract tool names and parameters from the response.
+
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
+        :rtype: List[Tuple[str, Dict[str, str]]]
+        """
+        tool_calls = self._parse_tools_from_response(response)
+        tool_name_param_pairs = []
+        for tool_call in tool_calls:
+            if not isinstance(tool_call, dict):
+                raise EvaluationException(
+                    "Tool call must be a dictionary.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.UNKNOWN,
+                )
+            if tool_call.get("type") != "tool_call":
+                raise EvaluationException(
+                    "Tool call must have 'type' set to 'tool_call'.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.INVALID_VALUE,
+                )
+
+            if "name" not in tool_call:
+                raise EvaluationException(
+                    "Tool call missing 'name' field.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.MISSING_FIELD,
+                )
+
+            tool_name = str(tool_call["name"]).strip()
+
+            # Extract parameters/arguments
+            parameters = {}
+            if "arguments" in tool_call:
+                args = tool_call["arguments"]
+                if isinstance(args, dict):
+                    # Convert all values to strings for consistent comparison
+                    parameters = {str(k): str(v) for k, v in args.items()}
+                elif isinstance(args, str):
+                    # If arguments is a string, try to parse it as JSON
+                    try:
+                        parsed_args = json.loads(args)
+                        if isinstance(parsed_args, dict):
+                            parameters = {str(k): str(v) for k, v in parsed_args.items()}
+                    except json.JSONDecodeError:
+                        raise EvaluationException(
+                            "Failed to parse tool call arguments as JSON.",
+                            internal_message=str(tool_call),
+                            target=ErrorTarget.EVALUATE,
+                            category=ErrorCategory.INVALID_VALUE,
+                        )
+
+            tool_name_param_pairs.append((tool_name, parameters))
+
+        return tool_name_param_pairs
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
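Note: per the validation above, every tool call must be a dict with type == "tool_call" and a "name"; arguments may arrive as a dict or as a JSON string. A standalone sketch of just the arguments normalization (the EvaluationException plumbing is omitted here):

    import json

    def extract_params(tool_call: dict) -> dict:
        args = tool_call.get("arguments", {})
        if isinstance(args, str):  # JSON-encoded arguments
            args = json.loads(args)
        if not isinstance(args, dict):
            return {}
        return {str(k): str(v) for k, v in args.items()}

    call = {"type": "tool_call", "name": "get_weather", "arguments": '{"city": "Paris", "days": 3}'}
    print((call["name"], extract_params(call)))
    # -> ('get_weather', {'city': 'Paris', 'days': '3'})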
@@ -532,14 +595,25 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                 base_key = key[:-6]  # Remove "_score" suffix
                 result_key = f"{base_key}_result"
                 threshold_key = f"{base_key}_threshold"
-
+                threshold_value = (
+                    self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
+                )
+                if not isinstance(threshold_value, (int, float)):
+                    raise EvaluationException(
+                        "Threshold value must be a number.",
+                        internal_message=str(threshold_value),
+                        target=ErrorTarget.EVALUATE,
+                        category=ErrorCategory.INVALID_VALUE,
+                    )
+
+                result[threshold_key] = threshold_value
                 if self._higher_is_better:
-                    if float(score_value) >=
+                    if float(score_value) >= threshold_value:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                     else:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
                 else:
-                    if float(score_value) <=
+                    if float(score_value) <= threshold_value:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                     else:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
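Note: the threshold lookup now accepts either a single number applied to every metric or a dict keyed by the score's base name (the key with its `_score` suffix removed). A minimal sketch of the resolution rule:

    def resolve_threshold(threshold, base_key):
        # Dict thresholds are per-metric; scalar thresholds apply to every metric.
        value = threshold.get(base_key) if isinstance(threshold, dict) else threshold
        if not isinstance(value, (int, float)):
            raise ValueError(f"Threshold value must be a number, got {value!r}")
        return value

    print(resolve_threshold(3, "coherence"))                             # 3
    print(resolve_threshold({"coherence": 4, "fluency": 2}, "fluency"))  # 2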
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -25,6 +25,11 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param threshold: The threshold for the fluency evaluator. Default is 3.
     :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
     .. admonition:: Example:
 
@@ -68,7 +73,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, credential=None, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -80,6 +85,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             threshold=threshold,
             credential=credential,
             _higher_is_better=self._higher_is_better,
+            **kwargs,
         )
 
     @overload
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -49,6 +49,11 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param threshold: The threshold for the groundedness evaluator. Default is 3.
     :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
     .. admonition:: Example:
 
@@ -106,6 +111,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             threshold=threshold,
             credential=credential,
             _higher_is_better=self._higher_is_better,
+            **kwargs,
         )
         self._model_config = model_config
         self.threshold = threshold
azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py (new file)

@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._path_efficiency import PathEfficiencyEvaluator
+
+__all__ = ["PathEfficiencyEvaluator"]