azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (43)
  1. azure/ai/evaluation/__init__.py +9 -5
  2. azure/ai/evaluation/_common/utils.py +24 -9
  3. azure/ai/evaluation/_constants.py +4 -0
  4. azure/ai/evaluation/_evaluate/_evaluate.py +57 -39
  5. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +34 -81
  6. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  7. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  8. azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
  9. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
  10. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
  11. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  12. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
  13. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
  14. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
  15. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
  16. azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
  17. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +29 -74
  18. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  19. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +33 -80
  20. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  21. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
  22. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +34 -83
  23. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  24. azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
  25. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +16 -22
  26. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
  27. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -11
  28. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  29. azure/ai/evaluation/_evaluators/_xpia/xpia.py +16 -90
  30. azure/ai/evaluation/_exceptions.py +0 -1
  31. azure/ai/evaluation/_model_configurations.py +36 -8
  32. azure/ai/evaluation/_version.py +1 -1
  33. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
  34. azure/ai/evaluation/simulator/_simulator.py +19 -8
  35. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA +59 -1
  36. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/RECORD +38 -39
  37. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  38. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
  39. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  40. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  41. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  42. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/WHEEL +0 -0
  43. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py

@@ -4,10 +4,8 @@
 
 from ._evaluate._evaluate import evaluate
 from ._evaluators._bleu import BleuScoreEvaluator
-from ._evaluators._chat import ChatEvaluator
 from ._evaluators._coherence import CoherenceEvaluator
 from ._evaluators._content_safety import (
-    ContentSafetyChatEvaluator,
     ContentSafetyEvaluator,
     HateUnfairnessEvaluator,
     SelfHarmEvaluator,
@@ -22,10 +20,16 @@ from ._evaluators._meteor import MeteorScoreEvaluator
 from ._evaluators._protected_material import ProtectedMaterialEvaluator
 from ._evaluators._qa import QAEvaluator
 from ._evaluators._relevance import RelevanceEvaluator
+from ._evaluators._retrieval import RetrievalEvaluator
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
-from ._model_configurations import AzureAIProject, AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from ._model_configurations import (
+    AzureAIProject,
+    AzureOpenAIModelConfiguration,
+    OpenAIModelConfiguration,
+    EvaluatorConfig,
+)
 
 __all__ = [
     "evaluate",
@@ -36,21 +40,21 @@ __all__ = [
     "RelevanceEvaluator",
     "SimilarityEvaluator",
     "QAEvaluator",
-    "ChatEvaluator",
     "ViolenceEvaluator",
     "SexualEvaluator",
     "SelfHarmEvaluator",
     "HateUnfairnessEvaluator",
     "ContentSafetyEvaluator",
-    "ContentSafetyChatEvaluator",
     "IndirectAttackEvaluator",
     "BleuScoreEvaluator",
     "GleuScoreEvaluator",
     "MeteorScoreEvaluator",
+    "RetrievalEvaluator",
     "RougeScoreEvaluator",
     "RougeType",
     "ProtectedMaterialEvaluator",
     "AzureAIProject",
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
+    "EvaluatorConfig",
 ]
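
Net effect of these `__init__.py` hunks: `ChatEvaluator` and `ContentSafetyChatEvaluator` are no longer exported, while `RetrievalEvaluator` (lifted out of the former `_chat/retrieval` sub-package, per the renames in the file list) and `EvaluatorConfig` become top-level exports. A minimal import sketch against the 1.0.0b3 surface shown in the updated `__all__`:

```python
# Symbols below come straight from the new __all__; ChatEvaluator and
# ContentSafetyChatEvaluator are gone in 1.0.0b3 and would raise ImportError.
from azure.ai.evaluation import (
    evaluate,
    CoherenceEvaluator,
    RetrievalEvaluator,   # newly exported; previously lived under _chat/retrieval
    EvaluatorConfig,      # new export used to type evaluate()'s evaluator_config
)
```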
azure/ai/evaluation/_common/utils.py

@@ -3,12 +3,13 @@
 # ---------------------------------------------------------
 
 import threading
-from typing import List, Optional, Union
+from typing import List, Union
 
 import nltk
 import numpy as np
 
 from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
 
 from . import constants
 
@@ -70,18 +71,32 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)
 
 
-def ensure_api_version_in_aoai_model_config(
+def parse_model_config_type(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
-    default_api_version: str,
 ) -> None:
     if "azure_endpoint" in model_config or "azure_deployment" in model_config:
-        model_config["api_version"] = model_config.get("api_version", default_api_version)
+        model_config["type"] = AZURE_OPENAI_TYPE
+    else:
+        model_config["type"] = OPENAI_TYPE
 
 
-def ensure_user_agent_in_aoai_model_config(
+def construct_prompty_model_config(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
-    prompty_model_config: dict,
-    user_agent: Optional[str] = None,
-) -> None:
-    if user_agent and ("azure_endpoint" in model_config or "azure_deployment" in model_config):
+    default_api_version: str,
+    user_agent: str,
+) -> dict:
+    parse_model_config_type(model_config)
+
+    if model_config["type"] == AZURE_OPENAI_TYPE:
+        model_config["api_version"] = model_config.get("api_version", default_api_version)
+
+    prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+
+    # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+    # https://github.com/encode/httpx/discussions/2959
+    prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+    if model_config["type"] == AZURE_OPENAI_TYPE and user_agent:
         prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
+
+    return prompty_model_config
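
A hedged sketch of how the reworked helper can be exercised, based only on the signature and body shown above. Note that `_common.utils` is a private module, the endpoint/deployment/user-agent values are placeholders, and `"2024-02-15-preview"` simply mirrors the default API version that appears elsewhere in this diff:

```python
from azure.ai.evaluation._common.utils import construct_prompty_model_config

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment>",                       # placeholder
    "api_key": "<key>",                                       # placeholder
}

prompty_config = construct_prompty_model_config(
    model_config,
    default_api_version="2024-02-15-preview",  # default taken from the evaluators in this diff
    user_agent="my-app/0.1",                   # illustrative user-agent string
)

# Per the function body above, model_config now carries "type": "azure_openai" and an
# "api_version", and prompty_config has the shape:
# {"configuration": model_config,
#  "parameters": {"extra_headers": {"Connection": "close", "x-ms-useragent": "my-app/0.1"}}}
```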
azure/ai/evaluation/_constants.py

@@ -57,3 +57,7 @@ PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
 
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
+
+AZURE_OPENAI_TYPE = "azure_openai"
+
+OPENAI_TYPE = "openai"
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -19,7 +19,7 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject
+from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
@@ -158,6 +158,12 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta
     ]
 
     missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+    if missing_inputs and "conversation" in required_inputs:
+        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
+        if len(missing_inputs) == len(non_conversation_inputs) and [
+            input in non_conversation_inputs for input in missing_inputs
+        ]:
+            missing_inputs = []
     if missing_inputs:
         if not is_target_fn:
             msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
@@ -273,7 +279,7 @@
     df: pd.DataFrame,
     evaluators: Dict[str, Any],
     target: Optional[Callable],
-    evaluator_config: Dict[str, Dict[str, str]],
+    column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
     """
     Check that all columns needed by evaluator or target function are present.
@@ -284,8 +290,8 @@
     :type evaluators: Dict[str, Any]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+    :type column_mapping: Dict[str, Dict[str, str]]
     :raises EvaluationException: If column starts from "__outputs." while target is defined.
     """
     if target:
@@ -306,7 +312,7 @@
     else:
         for evaluator_name, evaluator in evaluators.items():
             # Apply column mapping
-            mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
+            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
             new_df = _apply_column_mapping(df, mapping_config)
 
             # Validate input data for evaluator
@@ -372,11 +378,11 @@ def _apply_target_to_data(
     return target_output, generated_columns, run
 
 
-def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
-    """Process evaluator_config to replace ${target.} with ${data.}
+def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}
 
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Dict[str, str]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
@@ -385,15 +391,15 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Di
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
-    if evaluator_config:
-        for evaluator, mapping_config in evaluator_config.items():
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}
 
                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,
@@ -439,7 +445,7 @@
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -458,10 +464,10 @@
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
-        keys as the column names in the evaluator input and values as the column names in the input data or data
-        generated by target.
-    :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
+        names as keys and a values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
@@ -482,7 +488,7 @@
         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }
 
         coherence_eval = CoherenceEvaluator(model_config=model_config)
@@ -497,15 +503,19 @@
            },
            evaluator_config={
                "coherence": {
-                   "response": "${data.response}",
-                   "query": "${data.query}"
+                   "column_mapping": {
+                       "response": "${data.response}",
+                       "query": "${data.query}",
+                   },
                },
                "relevance": {
-                   "response": "${data.response}",
-                   "context": "${data.context}",
-                   "query": "${data.query}"
-               }
-           }
+                   "column_mapping": {
+                       "response": "${data.response}",
+                       "context": "${data.context}",
+                       "query": "${data.query}",
+                   },
+               },
+           },
        )
 
    """
@@ -544,13 +554,13 @@
         raise e
 
 
-def _evaluate(  # pylint: disable=too-many-locals
+def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
    data: Optional[str] = None,
     evaluators: Optional[Dict[str, Callable]] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -560,8 +570,13 @@ def _evaluate( # pylint: disable=too-many-locals
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-    evaluator_config = _process_evaluator_config(evaluator_config)
-    _validate_columns(input_data_df, evaluators, target, evaluator_config)
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = {
+        evaluator_name: evaluator_configuration.get("column_mapping", None)
+        for evaluator_name, evaluator_configuration in evaluator_config.items()
+    }
+    column_mapping = _process_column_mappings(column_mapping)
+    _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
     pf_client = PFClient(
@@ -577,8 +592,8 @@
 
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
-    evaluator_config = evaluator_config or {}
-    evaluator_config.setdefault("default", {})
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})
 
     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
@@ -586,21 +601,21 @@
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-        for evaluator_name, mapping in evaluator_config.items():
+        for evaluator_name, mapping in column_mapping.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in _process_evaluator_config
+                # in _process_column_mappings
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-                    evaluator_config[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+                    column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
 
     # After we have generated all columns we can check if we have
     # everything we need for evaluators.
-    _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
+    _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
 
     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -610,13 +625,16 @@
     for col in input_data_df.columns:
         # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
         # Also ignore columns that are already in config, since they've been covered by target mapping.
-        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
-            evaluator_config["default"][col] = f"${{data.{col}}}"
+        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+            column_mapping["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        batch_run_client = ProxyClient(pf_client)
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
 
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
@@ -632,7 +650,7 @@
                 flow=evaluator,
                 run=target_run,
                 evaluator_name=evaluator_name,
-                column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
                 data=data,
                 stream=True,
                 name=kwargs.get("_run_name"),
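
The `evaluate()` changes above amount to a breaking change in `evaluator_config`: per-evaluator column mappings now sit under a nested `"column_mapping"` key, which `_evaluate` extracts before calling `_process_column_mappings`. A minimal before/after sketch (evaluator names and columns are illustrative):

```python
# 1.0.0b2 style: column mappings were passed directly per evaluator.
evaluator_config_b2 = {
    "coherence": {
        "response": "${data.response}",
        "query": "${data.query}",
    },
}

# 1.0.0b3 style: the same mappings are nested under "column_mapping",
# matching the updated docstring example above.
evaluator_config_b3 = {
    "coherence": {
        "column_mapping": {
            "response": "${data.response}",
            "query": "${data.query}",
        },
    },
}
```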
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -1,77 +1,14 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
 import os
-import re
-
-import numpy as np
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
-
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
-
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncCoherenceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "coherence.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+from typing import Optional
+from typing_extensions import override
 
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
 
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-
-        if not (query.strip() and response.strip()):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.INVALID_VALUE,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.COHERENCE_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(query=query, response=response, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_coherence": float(score)}
-
-
-class CoherenceEvaluator:
+class CoherenceEvaluator(PromptyEvaluatorBase):
     """
     Initialize a coherence evaluator configured for a specific Azure OpenAI model.
 
@@ -97,21 +34,37 @@ class CoherenceEvaluator:
     }
     """
 
-    def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncCoherenceEvaluator(model_config)
+    PROMPTY_FILE = "coherence.prompty"
+    RESULT_KEY = "gpt_coherence"
 
-    def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluate coherence.
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs
+    ):
+        """Evaluate coherence. Accepts either a query and response for a single evaluation,
+        or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
+        turns, the evaluator will aggregate the results of each turn.
 
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
         :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The coherence score.
-        :rtype: Dict[str, float]
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages". Conversation turns are expected
+            to be dictionaries with keys "content" and "role".
+        :paramtype conversation: Optional[Dict]
+        :return: The relevance score.
+        :rtype: dict
         """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
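
Based on the new `__call__` signature and docstring, the refactored evaluator keeps the b2-style single-turn call and adds a `conversation` form whose per-turn results are aggregated. A short usage sketch (the model configuration values are placeholders):

```python
from azure.ai.evaluation import CoherenceEvaluator

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment>",                       # placeholder
    "api_key": "<key>",                                       # placeholder
}
coherence = CoherenceEvaluator(model_config=model_config)

# Single-turn evaluation, unchanged from 1.0.0b2; returns {"gpt_coherence": <score>}.
single = coherence(query="What is the capital of France?", response="Paris.")

# New in 1.0.0b3: pass a conversation with "messages" made of "role"/"content" dicts;
# per the docstring, results are aggregated across turns.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
    ]
}
multi = coherence(conversation=conversation)
```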
azure/ai/evaluation/_evaluators/_coherence/coherence.prompty

@@ -3,11 +3,6 @@ name: Coherence
 description: Evaluates coherence score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
azure/ai/evaluation/_evaluators/_common/__init__.py (new file)

@@ -0,0 +1,13 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._base_eval import EvaluatorBase
+from ._base_prompty_eval import PromptyEvaluatorBase
+from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+
+__all__ = [
+    "EvaluatorBase",
+    "PromptyEvaluatorBase",
+    "RaiServiceEvaluatorBase",
+]
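
The new `_common` package collects the shared machinery (`EvaluatorBase`, `PromptyEvaluatorBase`, `RaiServiceEvaluatorBase`) that the concrete evaluators in this release now subclass. As an illustration only, and assuming the base-class keywords inferred from the `CoherenceEvaluator` refactor earlier in this diff, a prompty-backed evaluator reduces to roughly:

```python
import os

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase


class MyMetricEvaluator(PromptyEvaluatorBase):  # hypothetical evaluator, not part of the package
    PROMPTY_FILE = "my_metric.prompty"  # assumed to exist next to this module
    RESULT_KEY = "gpt_my_metric"

    def __init__(self, model_config: dict):
        prompty_path = os.path.join(os.path.dirname(__file__), self.PROMPTY_FILE)
        # Keyword names mirror CoherenceEvaluator.__init__ as shown earlier in this diff.
        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
```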