azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. azure/ai/evaluation/__init__.py +82 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/_experimental.py +172 -0
  4. azure/ai/evaluation/_common/constants.py +72 -0
  5. azure/ai/evaluation/_common/math.py +89 -0
  6. azure/ai/evaluation/_common/rai_service.py +632 -0
  7. azure/ai/evaluation/_common/utils.py +445 -0
  8. azure/ai/evaluation/_constants.py +72 -0
  9. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
  12. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
  13. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
  14. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  15. azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
  16. azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
  17. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
  18. azure/ai/evaluation/_evaluate/_utils.py +298 -0
  19. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  20. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  21. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
  22. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  23. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
  24. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
  25. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  26. azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
  27. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
  28. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  47. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  48. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  49. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
  50. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  51. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  52. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  53. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  54. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  55. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  56. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  57. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  58. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  59. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
  60. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
  62. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
  64. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
  65. azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
  66. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
  67. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  68. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  69. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  70. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  72. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  73. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
  74. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
  75. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  76. azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
  77. azure/ai/evaluation/_exceptions.py +128 -0
  78. azure/ai/evaluation/_http_utils.py +466 -0
  79. azure/ai/evaluation/_model_configurations.py +123 -0
  80. azure/ai/evaluation/_user_agent.py +6 -0
  81. azure/ai/evaluation/_vendor/__init__.py +3 -0
  82. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  83. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  84. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  85. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  86. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  87. azure/ai/evaluation/_version.py +5 -0
  88. azure/ai/evaluation/py.typed +0 -0
  89. azure/ai/evaluation/simulator/__init__.py +16 -0
  90. azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
  91. azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
  92. azure/ai/evaluation/simulator/_constants.py +27 -0
  93. azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
  94. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  95. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  96. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  97. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  98. azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
  99. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  100. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  101. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
  102. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
  103. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  104. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
  105. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
  106. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
  107. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
  108. azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
  109. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  110. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
  111. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
  112. azure/ai/evaluation/simulator/_simulator.py +716 -0
  113. azure/ai/evaluation/simulator/_tracing.py +89 -0
  114. azure/ai/evaluation/simulator/_utils.py +132 -0
  115. azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
  116. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
  117. azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
  118. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
  119. azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
  120. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  121. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  122. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
@@ -0,0 +1,89 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+ # pylint: disable=C0103,C0114,C0116,E0401,E0611
5
+
6
+ import functools
7
+ from typing import Callable, TypeVar
8
+
9
+ from promptflow._sdk._telemetry.activity import ActivityType, monitor_operation
10
+ from typing_extensions import ParamSpec
11
+
12
+ P = ParamSpec("P")
13
+ R = TypeVar("R")
14
+
15
+
16
def monitor_adversarial_scenario(activity_name: str = "adversarial.simulator.call"):
    """
    Create a decorator that routes calls through ``monitor_operation``.

    :param activity_name: The activity name handed to ``monitor_operation``.
    :type activity_name: str
    :returns: A decorator
    :rtype: Callable[[Callable], Callable]
    """

    def decorator(func: Callable[P, R]) -> Callable[P, R]:
        """
        Wrap ``func`` so each call is monitored as an adversarial scenario.

        :param func: The function to be decorated.
        :type func: Callable[P, R]
        :returns: The decorated function
        :rtype: Callable[P, R]
        """

        @functools.wraps(func)
        def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
            # Dimensions are taken from keyword arguments only; a value passed
            # positionally (or left to its default) is reported as missing.
            dimensions = {
                # The scenario is always stringified, so an absent one is "None".
                "scenario": str(kwargs.get("scenario")),
                "max_conversation_turns": kwargs.get("max_conversation_turns"),
                "max_simulation_results": kwargs.get("max_simulation_results"),
                "jailbreak": kwargs.get("jailbreak"),
            }
            # The monitor is rebuilt on every call because the dimensions
            # depend on that call's arguments.
            monitor = monitor_operation(
                activity_name=activity_name,
                activity_type=ActivityType.PUBLICAPI,
                custom_dimensions=dimensions,
            )
            monitored_func = monitor(func)
            return monitored_func(*args, **kwargs)

        return wrapper

    return decorator
58
+
59
+
60
def monitor_task_simulator(func: Callable[P, R]) -> Callable[P, R]:
    """
    Wrap ``func`` so each call is routed through ``monitor_operation``
    under the activity name ``task.simulator.call``.

    :param func: The function to be decorated.
    :type func: Callable[P, R]
    :returns: The decorated function
    :rtype: Callable[P, R]
    """

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        # Only keyword arguments are inspected; positional values are not seen.
        text = kwargs.get("text")
        user_persona = kwargs.get("user_persona")

        # Sizes are reported instead of the values themselves; anything that
        # is not of the expected type counts as length 0.
        text_length = len(text) if isinstance(text, str) else 0
        persona_count = len(user_persona) if isinstance(user_persona, list) else 0

        dimensions = {
            "text_length": text_length,
            "user_persona_length": persona_count,
            "number_of_queries": kwargs.get("num_queries", 0),
            "max_conversation_turns": kwargs.get("max_conversation_turns", 0),
        }
        # The monitor is rebuilt on every call because the dimensions depend
        # on that call's arguments.
        monitor = monitor_operation(
            activity_name="task.simulator.call",
            activity_type=ActivityType.PUBLICAPI,
            custom_dimensions=dimensions,
        )
        monitored_func = monitor(func)
        return monitored_func(*args, **kwargs)

    return wrapper
@@ -0,0 +1,132 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+ """
5
+ This module contains a utility class for managing a list of JSON lines.
6
+ """
7
+ import json
8
+
9
+
10
class JsonLineList(list):
    """
    A util to manage a list of JSON lines.
    """

    def to_json_lines(self):
        """
        Converts the list to a string of JSON lines.
        Each item in the list is serialized with ``json.dumps`` and
        terminated with a newline.

        :returns: A string of JSON lines, where each line is a JSON representation of an item in the list.
            An empty list yields an empty string.
        :rtype: str
        """
        # A single join avoids repeatedly copying a growing string.
        return "".join(json.dumps(item) + "\n" for item in self)

    def to_eval_qr_json_lines(self):
        """
        Converts the list to a string of JSON lines suitable for evaluation in a query & response format.
        Each item in the list is expected to be a dictionary with
        'messages' key. The 'messages' value is a list of
        dictionaries, each with a 'role' key and a 'content' key.
        The 'role' value should be either 'user' or 'assistant',
        and the 'content' value should be a string.

        One line is emitted each time both a non-empty user message and a
        non-empty assistant message have been seen. If either message carried
        a truthy 'context', the line gets a 'context' field holding the
        ``str()`` of a dict with 'user_context' and 'assistant_context'.
        Every line carries a 'category' field read from the item's
        'template_parameters' (``null`` when unavailable).

        :returns: A string of JSON lines.
        :rtype: str
        :raises KeyError: If an item has no 'messages' key, or a message lacks 'role' or 'content'.
        """
        lines = []
        for item in self:
            user_message = None
            assistant_message = None
            user_context = None
            assistant_context = None
            # ``or {}`` also covers an explicit ``"template_parameters": None``,
            # which would otherwise fail on the ``.get`` below.
            template_parameters = item.get("template_parameters") or {}
            category = template_parameters.get("category", None)
            for message in item["messages"]:
                role = message["role"]
                if role == "user":
                    user_message = message["content"]
                    user_context = message.get("context", "")
                elif role == "assistant":
                    assistant_message = message["content"]
                    assistant_context = message.get("context", "")

                if not (user_message and assistant_message):
                    continue

                # Keys are inserted in the order query, response, [context], category
                # so the serialized field order stays stable.
                record = {"query": user_message, "response": assistant_message}
                if user_context or assistant_context:
                    # The context is stored as the Python repr of the dict, not as nested JSON.
                    record["context"] = str(
                        {
                            "user_context": user_context,
                            "assistant_context": assistant_context,
                        }
                    )
                record["category"] = category
                lines.append(json.dumps(record) + "\n")
                # Start collecting the next query/response pair.
                user_message = assistant_message = None

        return "".join(lines)
85
+
86
+
87
class JsonLineChatProtocol(dict):
    """
    A util to manage a JSON object that follows the chat protocol.
    """

    def to_json(self):
        """
        Converts the object to a JSON string.

        :returns: A JSON representation of the object.
        :rtype: str
        """
        return json.dumps(self)

    def to_eval_qr_json_lines(self) -> str:
        """
        Converts the object to a string of JSON lines suitable for evaluation in a query and response format.
        The object is expected to be a dictionary with 'messages' key.

        One line is emitted each time both a non-empty user message and a
        non-empty assistant message have been seen. The context starts as the
        object's own 'context' value (if any) and is replaced by any message
        'context' that is not None; it is not cleared between pairs, so later
        lines keep the most recent context. A 'context' field is written only
        when the current context is truthy.

        :returns: A json lines document
        :rtype: str
        :raises KeyError: If 'messages' is missing, or a message lacks 'role' or 'content'.
        """
        user_message = None
        assistant_message = None
        context = self.get("context")
        lines = []
        for message in self["messages"]:
            role = message["role"]
            if role == "user":
                user_message = message["content"]
            elif role == "assistant":
                assistant_message = message["content"]

            # One lookup replaces the membership test plus ``.get`` with a dead default.
            message_context = message.get("context")
            if message_context is not None:
                context = message_context

            if not (user_message and assistant_message):
                continue

            record = {"query": user_message, "response": assistant_message}
            if context:
                record["context"] = context
            lines.append(json.dumps(record) + "\n")
            # Start collecting the next query/response pair.
            user_message = assistant_message = None

        # A single join avoids repeatedly copying a growing string.
        return "".join(lines)