azure-ai-evaluation 1.0.0b1__py3-none-any.whl → 1.0.0b3__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic.

Files changed (76)
  1. azure/ai/evaluation/__init__.py +4 -4
  2. azure/ai/evaluation/_common/rai_service.py +4 -4
  3. azure/ai/evaluation/_common/utils.py +40 -25
  4. azure/ai/evaluation/_constants.py +13 -0
  5. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +2 -1
  6. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +39 -17
  7. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +23 -13
  8. azure/ai/evaluation/_evaluate/_eval_run.py +38 -18
  9. azure/ai/evaluation/_evaluate/_evaluate.py +88 -63
  10. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +13 -8
  11. azure/ai/evaluation/_evaluate/_utils.py +29 -22
  12. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  13. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +34 -86
  14. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  15. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  16. azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
  17. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
  18. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
  19. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  20. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -2
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +9 -4
  22. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
  23. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
  24. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
  25. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
  26. azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
  27. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +2 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +29 -79
  29. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +33 -85
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  33. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -0
  34. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
  35. azure/ai/evaluation/_evaluators/_qa/_qa.py +3 -14
  36. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +34 -88
  37. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  38. azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
  39. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +17 -29
  40. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
  41. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +3 -2
  42. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +5 -18
  43. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  44. azure/ai/evaluation/_evaluators/_xpia/xpia.py +16 -91
  45. azure/ai/evaluation/_exceptions.py +0 -1
  46. azure/ai/evaluation/_http_utils.py +3 -3
  47. azure/ai/evaluation/_model_configurations.py +36 -8
  48. azure/ai/evaluation/_version.py +1 -1
  49. azure/ai/evaluation/simulator/__init__.py +1 -1
  50. azure/ai/evaluation/simulator/_adversarial_simulator.py +8 -6
  51. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  52. azure/ai/evaluation/simulator/_conversation/_conversation.py +16 -16
  53. azure/ai/evaluation/simulator/_direct_attack_simulator.py +6 -6
  54. azure/ai/evaluation/simulator/_helpers/__init__.py +3 -2
  55. azure/ai/evaluation/simulator/_helpers/_experimental.py +157 -0
  56. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +11 -29
  57. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +6 -6
  58. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -3
  59. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +18 -11
  60. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  61. azure/ai/evaluation/simulator/_model_tools/models.py +9 -11
  62. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  63. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
  64. azure/ai/evaluation/simulator/{simulator.py → _simulator.py} +166 -88
  65. azure/ai/evaluation/simulator/_tracing.py +21 -24
  66. azure/ai/evaluation/simulator/_utils.py +4 -1
  67. {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA +144 -14
  68. azure_ai_evaluation-1.0.0b3.dist-info/RECORD +98 -0
  69. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -350
  70. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
  71. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -66
  72. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  73. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  74. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +0 -97
  75. {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/WHEEL +0 -0
  76. {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_evaluate.py

@@ -8,27 +8,26 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
 
 import numpy as np
 import pandas as pd
-
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 
-from .._model_configurations import AzureAIProject
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
     Prefixes,
     _InternalEvaluationMetrics,
 )
+from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
-from ._telemetry import log_evaluate_activity
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 # pylint: disable=line-too-long
@@ -159,6 +158,12 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta
     ]
 
     missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+    if missing_inputs and "conversation" in required_inputs:
+        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
+        if len(missing_inputs) == len(non_conversation_inputs) and [
+            input in non_conversation_inputs for input in missing_inputs
+        ]:
+            missing_inputs = []
     if missing_inputs:
         if not is_target_fn:
             msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
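The block added above relaxes input validation for evaluators that can consume a whole conversation: when every missing column is one of the evaluator's non-conversation inputs, the missing-input list is cleared. A standalone Python sketch of that apparent intent follows; it is not the package's code, and it uses all() where the diff uses a bare list comprehension (which is truthy for any non-empty list):

from typing import List

def relax_missing_inputs(required_inputs: List[str], missing_inputs: List[str]) -> List[str]:
    # If the evaluator accepts a "conversation" input, forgive the individual inputs
    # when every missing column is one of the non-conversation inputs.
    if missing_inputs and "conversation" in required_inputs:
        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
        if len(missing_inputs) == len(non_conversation_inputs) and all(
            value in non_conversation_inputs for value in missing_inputs
        ):
            return []
    return missing_inputs

# Hypothetical example: a conversation column satisfies the evaluator even though
# "query" and "response" are not present as standalone columns.
print(relax_missing_inputs(["conversation", "query", "response"], ["query", "response"]))  # []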
@@ -260,12 +265,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-                message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
-                internal_message="Failed to load data. Confirm that it is valid jsonl data.",
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            ) from e
+            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
+            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        ) from e
 
     return initial_data_df
 
@@ -274,7 +279,7 @@ def _validate_columns(
     df: pd.DataFrame,
     evaluators: Dict[str, Any],
     target: Optional[Callable],
-    evaluator_config: Dict[str, Dict[str, str]],
+    column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
     """
     Check that all columns needed by evaluator or target function are present.
@@ -285,8 +290,8 @@ def _validate_columns(
     :type evaluators: Dict[str, Any]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+    :type column_mapping: Dict[str, Dict[str, str]]
     :raises EvaluationException: If column starts from "__outputs." while target is defined.
     """
     if target:
@@ -307,7 +312,7 @@ def _validate_columns(
     else:
         for evaluator_name, evaluator in evaluators.items():
             # Apply column mapping
-            mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
+            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
             new_df = _apply_column_mapping(df, mapping_config)
 
             # Validate input data for evaluator
@@ -373,11 +378,11 @@ def _apply_target_to_data(
     return target_output, generated_columns, run
 
 
-def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
-    """Process evaluator_config to replace ${target.} with ${data.}
+def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}
 
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Dict[str, str]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
@@ -386,15 +391,15 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Di
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
-    if evaluator_config:
-        for evaluator, mapping_config in evaluator_config.items():
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}
 
                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,
@@ -436,11 +441,11 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
+    data: str,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: Optional[str] = None,
-    evaluators: Optional[Dict[str, Callable]] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -448,21 +453,21 @@ def evaluate(
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.
 
+    :keyword data: Path to the data to be evaluated or passed to target if target is set.
+        Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+    :paramtype data: str
+    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
+        and value as the evaluator function. Required.
+    :paramtype evaluators: Dict[str, Callable]
     :keyword evaluation_name: Display name of the evaluation.
     :paramtype evaluation_name: Optional[str]
    :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
-    :keyword data: Path to the data to be evaluated or passed to target if target is set.
-        Only .jsonl format files are supported. `target` and `data` both cannot be None
-    :paramtype data: Optional[str]
-    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-        and value as the evaluator function.
-    :paramtype evaluators: Optional[Dict[str, Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
-        keys as the column names in the evaluator input and values as the column names in the input data or data
-        generated by target.
-    :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
+        names as keys and a values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
@@ -483,7 +488,7 @@ def evaluate(
         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }
 
         coherence_eval = CoherenceEvaluator(model_config=model_config)
@@ -498,15 +503,19 @@ def evaluate(
             },
             evaluator_config={
                 "coherence": {
-                    "response": "${data.response}",
-                    "query": "${data.query}"
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "query": "${data.query}",
+                    },
                 },
                 "relevance": {
-                    "response": "${data.response}",
-                    "context": "${data.context}",
-                    "query": "${data.query}"
-                }
-            }
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "context": "${data.context}",
+                        "query": "${data.query}",
+                    },
+                },
+            },
         )
 
     """
@@ -545,13 +554,13 @@ def evaluate(
     raise e
 
 
-def _evaluate(  # pylint: disable=too-many-locals
+def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     data: Optional[str] = None,
     evaluators: Optional[Dict[str, Callable]] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -561,8 +570,13 @@ def _evaluate( # pylint: disable=too-many-locals
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-    evaluator_config = _process_evaluator_config(evaluator_config)
-    _validate_columns(input_data_df, evaluators, target, evaluator_config)
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = {
+        evaluator_name: evaluator_configuration.get("column_mapping", None)
+        for evaluator_name, evaluator_configuration in evaluator_config.items()
+    }
+    column_mapping = _process_column_mappings(column_mapping)
+    _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
     pf_client = PFClient(
@@ -572,43 +586,55 @@ def _evaluate( # pylint: disable=too-many-locals
         user_agent=USER_AGENT,
     )
 
-    trace_destination = pf_client._config.get_trace_destination()
-
+    trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
     target_run = None
-
     target_generated_columns = set()
+
+    # Create default configuration for evaluators that directly maps
+    # input data names to keyword inputs of the same name in the evaluators.
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-        # Make sure, the default is always in the configuration.
-        if not evaluator_config:
-            evaluator_config = {}
-        if "default" not in evaluator_config:
-            evaluator_config["default"] = {}
-
-        for evaluator_name, mapping in evaluator_config.items():
+        for evaluator_name, mapping in column_mapping.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in _process_evaluator_config
+                # in _process_column_mappings
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-                    evaluator_config[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+                    column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
 
         # After we have generated all columns we can check if we have
         # everything we need for evaluators.
-        _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
-
+        _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
+
+    # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
+    # via target mapping.
+    # If both the data and the output dictionary of the target function
+    # have the same column, then the target function value is used.
+    if input_data_df is not None:
+        for col in input_data_df.columns:
+            # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
+            # Also ignore columns that are already in config, since they've been covered by target mapping.
+            if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+                column_mapping["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        batch_run_client = ProxyClient(pf_client)
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
 
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
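To make the new mapping defaults concrete, the following Python sketch walks through how column_mapping["default"] ends up for a hypothetical dataset with "query" and "ground_truth" columns and a target that generated "response"; it is illustrative only, not code from the package:

# Illustrative only (hypothetical columns): the target's output column "response"
# was added back to the data frame as "__outputs.response".
column_mapping = {"default": {}}
target_generated_columns = {"response"}
input_columns = ["query", "ground_truth", "__outputs.response"]

# Target outputs are mapped to ${run.outputs.<col>} unless the user already mapped them.
for col in target_generated_columns:
    column_mapping["default"].setdefault(col, f"${{run.outputs.{col}}}")

# Remaining data columns get a 1-1 ${data.<col>} mapping; "__outputs." columns are skipped.
for col in input_columns:
    if not col.startswith("__outputs.") and col not in column_mapping["default"]:
        column_mapping["default"][col] = f"${{data.{col}}}"

print(column_mapping["default"])
# {'response': '${run.outputs.response}', 'query': '${data.query}', 'ground_truth': '${data.ground_truth}'}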
@@ -624,7 +650,7 @@ def _evaluate( # pylint: disable=too-many-locals
                 flow=evaluator,
                 run=target_run,
                 evaluator_name=evaluator_name,
-                column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
                 data=data,
                 stream=True,
                 name=kwargs.get("_run_name"),
@@ -672,7 +698,6 @@ def _evaluate( # pylint: disable=too-many-locals
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-
     studio_url = _log_metrics_and_instance_results(
         metrics,
         result_df,

azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -6,21 +6,24 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict
+from typing import Callable, Dict, TypeVar
 
 import pandas as pd
-
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
 from promptflow._sdk.entities._flows import Prompty as prompty_sdk
 from promptflow._sdk.entities._flows.dag import Flow as dag_flow
 from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
+from typing_extensions import ParamSpec
 
 from ..._user_agent import USER_AGENT
 from .._utils import _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)
 
+P = ParamSpec("P")
+R = TypeVar("R")
+
 
 
 def _get_evaluator_type(evaluator: Dict[str, Callable]):
@@ -82,7 +85,7 @@ def _get_evaluator_properties(evaluator, evaluator_name)
         name = str(evaluator)
         pf_type = "Unknown"
     except Exception as e:  # pylint: disable=broad-exception-caught
-        LOGGER.debug(f"Failed to get evaluator properties: {e}")
+        LOGGER.debug("Failed to get evaluator properties: %s", e)
         name = str(evaluator)
         pf_type = "Unknown"
 
@@ -95,15 +98,17 @@ def _get_evaluator_properties(evaluator, evaluator_name)
 
 
 # cspell:ignore isna
-def log_evaluate_activity(func) -> None:
+def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
     """Decorator to log evaluate activity
 
     :param func: The function to be decorated
     :type func: Callable
+    :returns: The decorated function
+    :rtype: Callable[P, R]
     """
 
     @functools.wraps(func)
-    def wrapper(*args, **kwargs) -> Callable:
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
 
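The typing changes above are the standard ParamSpec/TypeVar recipe for decorators that should not erase the wrapped function's signature. A generic, self-contained Python sketch of the pattern (not the SDK's telemetry implementation):

import functools
from typing import Callable, TypeVar

from typing_extensions import ParamSpec

P = ParamSpec("P")
R = TypeVar("R")


def log_calls(func: Callable[P, R]) -> Callable[P, R]:
    """Wraps func while keeping its parameter and return types visible to type checkers."""

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        print(f"calling {func.__name__}")  # stand-in for the telemetry/log_activity call
        return func(*args, **kwargs)

    return wrapper


@log_calls
def add(a: int, b: int) -> int:
    return a + b


assert add(2, 3) == 5  # type checkers still see (int, int) -> int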
@@ -119,7 +124,7 @@ def log_evaluate_activity(func) -> None:
             user_agent=USER_AGENT,
         )
 
-        track_in_cloud = bool(pf_client._config.get_trace_destination())
+        track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
         custom_dimensions = {
@@ -154,7 +159,7 @@ def log_evaluate_activity(func) -> None:
                     evaluator_info["failed_rows"] = failed_rows
                     evaluator_info["total_rows"] = total_rows
                 except Exception as e:  # pylint: disable=broad-exception-caught
-                    LOGGER.debug(f"Failed to collect evaluate failed row info for {evaluator_name}: {e}")
+                    LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
                 evaluators_info.append(evaluator_info)
 
             custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
@@ -167,7 +172,7 @@ def log_evaluate_activity(func) -> None:
             ):
                 pass
         except Exception as e:  # pylint: disable=broad-exception-caught
-            LOGGER.debug(f"Failed to collect evaluate usage info: {e}")
+            LOGGER.debug("Failed to collect evaluate usage info: %s", e)
 
         return result
 
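The three LOGGER.debug changes in this file replace f-strings with %-style lazy formatting, so arguments are only interpolated when a handler will actually emit the record. A minimal Python illustration, not taken from the package:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

evaluator_name = "coherence"
error = ValueError("boom")

# Eager: the f-string is built even though DEBUG is disabled here.
logger.debug(f"Failed to collect info for {evaluator_name}: {error}")

# Lazy: the message is only formatted if the record will be emitted.
logger.debug("Failed to collect info for %s: %s", evaluator_name, error)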

azure/ai/evaluation/_evaluate/_utils.py

@@ -8,12 +8,13 @@ import re
 import tempfile
 from collections import namedtuple
 from pathlib import Path
+from typing import Dict
 
 import pandas as pd
 
-from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, Prefixes
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, Prefixes
 from azure.ai.evaluation._evaluate._eval_run import EvalRun
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 LOGGER = logging.getLogger(__name__)
 
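DefaultOpenEncoding is among the 13 lines added to azure/ai/evaluation/_constants.py in this release and replaces hard-coded "utf-8" literals in the hunks below. Its definition is not shown in this diff; a plausible shape, assuming it simply centralizes the read/write encodings (the values here are a guess), would be:

class DefaultOpenEncoding:
    """Assumed shape: central constants for open() encodings (values are a guess)."""

    READ = "utf-8-sig"  # tolerate a BOM when reading
    WRITE = "utf-8"     # never emit a BOM when writing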
@@ -33,16 +34,16 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint:
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
-                message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
-                "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
-                f"workspaces/<workspace_name>, got {trace_provider}",
-                internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
-                "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
-                "workspaces/<workspace_name>,",
-                target=ErrorTarget.UNKNOWN,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.UNKNOWN,
-            )
+            message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            f"workspaces/<workspace_name>, got {trace_provider}",
+            internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            "workspaces/<workspace_name>,",
+            target=ErrorTarget.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.UNKNOWN,
+        )
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
@@ -50,7 +51,7 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint:
 
 
 def load_jsonl(path):
-    with open(path, "r", encoding="utf-8") as f:
+    with open(path, "r", encoding=DefaultOpenEncoding.READ) as f:
         return [json.loads(line) for line in f.readlines()]
 
 
@@ -99,7 +100,7 @@ def _log_metrics_and_instance_results(
     with tempfile.TemporaryDirectory() as tmpdir:
         tmp_path = os.path.join(tmpdir, artifact_name)
 
-        with open(tmp_path, "w", encoding="utf-8") as f:
+        with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
             f.write(instance_results.to_json(orient="records", lines=True))
 
         ev_run.log_artifact(tmpdir, artifact_name)
@@ -155,11 +156,13 @@ def _write_output(path, data_dict):
     if os.path.isdir(path):
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
-    with open(p, "w") as f:
+    with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)
 
 
-def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False) -> pd.DataFrame:
+def _apply_column_mapping(
+    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
+) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
 
@@ -167,10 +170,11 @@ def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace
     :param source_df: the data frame to be changed.
     :type source_df: pd.DataFrame
     :param mapping_config: The configuration, containing column mapping.
-    :type mapping_config: dict.
+    :type mapping_config: Dict[str, str].
     :param inplace: If true, the source_df will be changed inplace.
     :type inplace: bool
     :return: The modified data frame.
+    :rtype: pd.DataFrame
     """
     result_df = source_df
 
@@ -211,19 +215,22 @@ def _has_aggregator(evaluator):
     return hasattr(evaluator, "__aggregate__")
 
 
-def get_int_env_var(env_var_name, default_value=None):
+def get_int_env_var(env_var_name: str, default_value: int) -> int:
     """
-    The function `get_int_env_var` retrieves an integer environment variable value, with an optional
+    The function `get_int_env_var` retrieves an integer environment variable value, with a
     default value if the variable is not set or cannot be converted to an integer.
 
     :param env_var_name: The name of the environment variable you want to retrieve the value of
+    :type env_var_name: str
     :param default_value: The default value is the value that will be returned if the environment
-    variable is not found or if it cannot be converted to an integer
+        variable is not found or if it cannot be converted to an integer
+    :type default_value: int
     :return: an integer value.
+    :rtype: int
     """
     try:
-        return int(os.environ.get(env_var_name, default_value))
-    except Exception:
+        return int(os.environ[env_var_name])
+    except (ValueError, KeyError):
        return default_value
 
 
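The get_int_env_var change narrows the fallback: instead of wrapping os.environ.get in a bare except, the variable is read directly and the (now required) default is returned only on KeyError (unset) or ValueError (not an integer). A self-contained Python sketch of the new behavior, with made-up environment variable names:

import os


def get_int_env_var_sketch(env_var_name: str, default_value: int) -> int:
    # Mirrors the revised logic shown in the diff above.
    try:
        return int(os.environ[env_var_name])
    except (ValueError, KeyError):
        return default_value


os.environ["FAKE_TIMEOUT_SECONDS"] = "not-a-number"
assert get_int_env_var_sketch("FAKE_TIMEOUT_SECONDS", 600) == 600   # ValueError -> default
assert get_int_env_var_sketch("FAKE_MISSING_VARIABLE", 600) == 600  # KeyError -> default

os.environ["FAKE_TIMEOUT_SECONDS"] = "30"
assert get_int_env_var_sketch("FAKE_TIMEOUT_SECONDS", 600) == 30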

azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.utils import nltk_tokenize
 