azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. azure/ai/evaluation/__init__.py +82 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/_experimental.py +172 -0
  4. azure/ai/evaluation/_common/constants.py +72 -0
  5. azure/ai/evaluation/_common/math.py +89 -0
  6. azure/ai/evaluation/_common/rai_service.py +632 -0
  7. azure/ai/evaluation/_common/utils.py +445 -0
  8. azure/ai/evaluation/_constants.py +72 -0
  9. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
  12. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
  13. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
  14. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  15. azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
  16. azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
  17. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
  18. azure/ai/evaluation/_evaluate/_utils.py +298 -0
  19. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  20. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  21. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
  22. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  23. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
  24. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
  25. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  26. azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
  27. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
  28. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  47. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  48. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  49. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
  50. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  51. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  52. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  53. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  54. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  55. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  56. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  57. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  58. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  59. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
  60. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
  62. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
  64. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
  65. azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
  66. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
  67. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  68. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  69. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  70. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  72. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  73. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
  74. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
  75. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  76. azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
  77. azure/ai/evaluation/_exceptions.py +128 -0
  78. azure/ai/evaluation/_http_utils.py +466 -0
  79. azure/ai/evaluation/_model_configurations.py +123 -0
  80. azure/ai/evaluation/_user_agent.py +6 -0
  81. azure/ai/evaluation/_vendor/__init__.py +3 -0
  82. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  83. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  84. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  85. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  86. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  87. azure/ai/evaluation/_version.py +5 -0
  88. azure/ai/evaluation/py.typed +0 -0
  89. azure/ai/evaluation/simulator/__init__.py +16 -0
  90. azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
  91. azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
  92. azure/ai/evaluation/simulator/_constants.py +27 -0
  93. azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
  94. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  95. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  96. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  97. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  98. azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
  99. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  100. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  101. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
  102. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
  103. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  104. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
  105. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
  106. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
  107. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
  108. azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
  109. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  110. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
  111. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
  112. azure/ai/evaluation/simulator/_simulator.py +716 -0
  113. azure/ai/evaluation/simulator/_tracing.py +89 -0
  114. azure/ai/evaluation/simulator/_utils.py +132 -0
  115. azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
  116. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
  117. azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
  118. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
  119. azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
  120. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  121. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  122. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
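
The 0.0.0b0 wheel was an empty placeholder (metadata only), so every file above is effectively new in 1.0.0: the evaluators, the adversarial and task simulators, the vendored rouge_score implementation, and the shared RAI-service plumbing. A minimal smoke test of the released package is sketched below; the F1ScoreEvaluator import comes from the file list above, while the keyword-argument names and the "f1_score" result key are assumptions about the 1.0.0 public API rather than something shown in this diff.

# pip install azure-ai-evaluation==1.0.0
# Sketch only: exercises one of the prompt-free evaluators shipped in this wheel.
from azure.ai.evaluation import F1ScoreEvaluator

f1 = F1ScoreEvaluator()
result = f1(
    response="The capital of Japan is Tokyo.",      # model output under test (assumed kwarg name)
    ground_truth="Tokyo is the capital of Japan.",  # reference answer (assumed kwarg name)
)
print(result)  # expected to contain an "f1_score" entry between 0 and 1

The two largest additions, models.py (+614) and task_query_response.prompty (+65), are shown in full below.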
azure/ai/evaluation/simulator/_model_tools/models.py
@@ -0,0 +1,614 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+# pylint: skip-file
+import ast
+import asyncio
+import copy
+import logging
+import time
+import uuid
+from abc import ABC, abstractmethod
+from collections import deque
+from typing import Deque, Dict, List, Optional, Union
+from urllib.parse import urlparse
+
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._http_utils import AsyncHttpPipeline
+
+from ._identity_manager import APITokenManager
+
+MIN_ERRORS_TO_FAIL = 3
+MAX_TIME_TAKEN_RECORDS = 20_000
+
+
+def get_model_class_from_url(endpoint_url: str):
+    """Convert an endpoint URL to the appropriate model class."""
+    endpoint_path = urlparse(endpoint_url).path  # remove query params
+
+    if endpoint_path.endswith("chat/completions"):
+        return OpenAIChatCompletionsModel
+    if endpoint_path.endswith("completions"):
+        return OpenAICompletionsModel
+    raise EvaluationException(
+        message=f"Unknown API type for endpoint {endpoint_url}",
+        internal_message="Unknown API type",
+        error_category=ErrorCategory.UNKNOWN_FIELD,
+        error_blame=ErrorBlame.USER_ERROR,
+        error_target=ErrorTarget.MODELS,
+    )
+
+
+# ===========================================================
+# ===================== LLMBase Class =======================
+# ===========================================================
+
+
+class LLMBase(ABC):
+    """
+    Base class for all LLM models.
+    """
+
+    def __init__(self, endpoint_url: str, name: str = "unknown", additional_headers: Optional[Dict[str, str]] = None):
+        self.endpoint_url = endpoint_url
+        self.name = name
+        self.additional_headers = additional_headers or {}
+        self.logger = logging.getLogger(repr(self))
+
+        # Metric tracking
+        self._lock = None
+        self.response_times: Deque[Union[int, float]] = deque(maxlen=MAX_TIME_TAKEN_RECORDS)
+        self.step = 0
+        self.error_count = 0
+
+    @property
+    async def lock(self):
+        if self._lock is None:
+            self._lock = asyncio.Lock()
+        return self._lock
+
+    @abstractmethod
+    def get_model_params(self) -> dict:
+        pass
+
+    @abstractmethod
+    def format_request_data(self, prompt: str, **request_params) -> dict:
+        pass
+
+    async def get_completion(
+        self,
+        prompt: str,
+        session: AsyncHttpPipeline,
+        **request_params,
+    ) -> dict:
+        """
+        Query the model a single time with a prompt.
+
+        Parameters
+        ----------
+        prompt: Prompt str to query model with.
+        session: AsyncHttpPipeline object to use for the request.
+        **request_params: Additional parameters to pass to the request.
+        """
+        request_data = self.format_request_data(prompt, **request_params)
+        return await self.request_api(
+            session=session,
+            request_data=request_data,
+        )
+
+    @abstractmethod
+    async def get_all_completions(
+        self,
+        prompts: List[str],
+        session: AsyncHttpPipeline,
+        api_call_max_parallel_count: int,
+        api_call_delay_seconds: float,
+        request_error_rate_threshold: float,
+        **request_params,
+    ) -> List[dict]:
+        pass
+
+    @abstractmethod
+    async def request_api(
+        self,
+        session: AsyncHttpPipeline,
+        request_data: dict,
+    ) -> dict:
+        pass
+
+    @abstractmethod
+    async def get_conversation_completion(
+        self,
+        messages: List[dict],
+        session: AsyncHttpPipeline,
+        role: str,
+        **request_params,
+    ) -> dict:
+        pass
+
+    @abstractmethod
+    async def request_api_parallel(
+        self,
+        request_datas: List[dict],
+        output_collector: List,
+        session: AsyncHttpPipeline,
+        api_call_delay_seconds: float,
+        request_error_rate_threshold: float,
+    ) -> None:
+        pass
+
+    def _log_request(self, request: dict) -> None:
+        self.logger.info(f"Request: {request}")
+
+    async def _add_successful_response(self, time_taken: Union[int, float]) -> None:
+        async with self.lock:
+            self.response_times.append(time_taken)
+            self.step += 1
+
+    async def _add_error(self) -> None:
+        async with self.lock:
+            self.error_count += 1
+            self.step += 1
+
+    async def get_response_count(self) -> int:
+        async with self.lock:
+            return len(self.response_times)
+
+    async def get_response_times(self) -> List[float]:
+        async with self.lock:
+            return list(self.response_times)
+
+    async def get_average_response_time(self) -> float:
+        async with self.lock:
+            return sum(self.response_times) / len(self.response_times)
+
+    async def get_error_rate(self) -> float:
+        async with self.lock:
+            return self.error_count / self.step
+
+    async def get_error_count(self) -> int:
+        async with self.lock:
+            return self.error_count
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(name={self.name})"
+
+
+# ===========================================================
+# ================== OpenAICompletions ======================
+# ===========================================================
+
+
+class OpenAICompletionsModel(LLMBase):
+    """
+    Object for calling a Completions-style API for OpenAI models.
+    """
+
+    prompt_idx_key = "__prompt_idx__"
+
+    max_stop_tokens = 4
+    stop_tokens = ["<|im_end|>", "<|endoftext|>"]
+
+    model_param_names = [
+        "model",
+        "temperature",
+        "max_tokens",
+        "top_p",
+        "n",
+        "frequency_penalty",
+        "presence_penalty",
+        "stop",
+    ]
+
+    CHAT_START_TOKEN = "<|im_start|>"
+    CHAT_END_TOKEN = "<|im_end|>"
+
+    def __init__(
+        self,
+        *,
+        endpoint_url: str,
+        name: str = "OpenAICompletionsModel",
+        additional_headers: Optional[Dict[str, str]] = None,
+        api_version: Optional[str] = "2023-03-15-preview",
+        token_manager: APITokenManager,
+        azureml_model_deployment: Optional[str] = None,
+        model: Optional[str] = None,
+        temperature: Optional[float] = 0.7,
+        max_tokens: Optional[int] = 300,
+        top_p: Optional[float] = None,  # Recommended to use top_p or temp, not both
+        n: Optional[int] = 1,
+        frequency_penalty: Optional[float] = 0,
+        presence_penalty: Optional[float] = 0,
+        stop: Optional[Union[List[str], str]] = None,
+        image_captions: Optional[Dict[str, str]] = None,
+        images_dir: Optional[str] = None,  # Note: unused, kept for class compatibility
+    ):
+        super().__init__(endpoint_url=endpoint_url, name=name, additional_headers=additional_headers)
+        self.api_version = api_version
+        self.token_manager = token_manager
+        self.azureml_model_deployment = azureml_model_deployment
+        self.model = model
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.top_p = top_p
+        self.n = n
+        self.frequency_penalty = frequency_penalty
+        self.presence_penalty = presence_penalty
+        self.image_captions = image_captions or {}
+
+        # Default stop to end token if not provided
+        if not stop:
+            stop = []
+        # Else if stop sequence is given as a string (Ex: "["\n", "<im_end>"]"), convert
+        elif type(stop) is str and stop.startswith("[") and stop.endswith("]"):
+            stop = ast.literal_eval(stop)
+        elif type(stop) is str:
+            stop = [stop]
+        self.stop: List = stop  # type: ignore[assignment]
+
+        # If stop tokens do not include default end tokens, add them
+        for token in self.stop_tokens:
+            if len(self.stop) >= self.max_stop_tokens:
+                break
+            if token not in self.stop:
+                self.stop.append(token)
+
+        if top_p not in [None, 1.0] and temperature is not None:
+            self.logger.warning(
+                "Both top_p and temperature are set. OpenAI advises against using both at the same time."
+            )
+
+        self.logger.info(f"Default model settings: {self.get_model_params()}")
+
+    def get_model_params(self):
+        return {param: getattr(self, param) for param in self.model_param_names if getattr(self, param) is not None}
+
+    def format_request_data(self, prompt: Dict[str, str], **request_params) -> Dict[str, str]:  # type: ignore[override]
+        """
+        Format the request data for the OpenAI API.
+        """
+        request_data = {"prompt": prompt, **self.get_model_params()}
+        request_data.update(request_params)
+        return request_data
+
+    async def get_conversation_completion(
+        self,
+        messages: List[dict],
+        session: AsyncHttpPipeline,
+        role: str = "assistant",
+        **request_params,
+    ) -> dict:
+        """
+        Query the model a single time with a message.
+
+        Parameters
+        ----------
+        messages: List of messages to query the model with.
+            Expected format: [{"role": "user", "content": "Hello!"}, ...]
+        session: AsyncHttpPipeline object to query the model with.
+        role: Role of the user sending the message.
+        request_params: Additional parameters to pass to the model.
+        """
+        prompt = []
+        for message in messages:
+            prompt.append(f"{self.CHAT_START_TOKEN}{message['role']}\n{message['content']}\n{self.CHAT_END_TOKEN}\n")
+        prompt_string: str = "".join(prompt)
+        prompt_string += f"{self.CHAT_START_TOKEN}{role}\n"
+
+        return await self.get_completion(
+            prompt=prompt_string,
+            session=session,
+            **request_params,
+        )
+
+    async def get_all_completions(  # type: ignore[override]
+        self,
+        prompts: List[Dict[str, str]],
+        session: AsyncHttpPipeline,
+        api_call_max_parallel_count: int = 1,
+        api_call_delay_seconds: float = 0.1,
+        request_error_rate_threshold: float = 0.5,
+        **request_params,
+    ) -> List[dict]:
+        """
+        Run a batch of prompts through the model and return the results in the order given.
+
+        Parameters
+        ----------
+        prompts: List of prompts to query the model with.
+        session: AsyncHttpPipeline to use for the request.
+        api_call_max_parallel_count: Number of parallel requests to make to the API.
+        api_call_delay_seconds: Number of seconds to wait between API requests.
+        request_error_rate_threshold: Maximum error rate allowed before raising an error.
+        request_params: Additional parameters to pass to the API.
+        """
+        if api_call_max_parallel_count > 1:
+            self.logger.info(f"Using {api_call_max_parallel_count} parallel workers to query the API..")
+
+        # Format prompts and tag with index
+        request_datas: List[Dict] = []
+        for idx, prompt in enumerate(prompts):
+            prompt = self.format_request_data(prompt, **request_params)
+            prompt[self.prompt_idx_key] = idx  # type: ignore[assignment]
+            request_datas.append(prompt)
+
+        # Perform inference
+        if len(prompts) == 0:
+            return []  # queue is empty
+
+        output_collector: List = []
+        tasks = [  # create a set of worker-tasks to query inference endpoint in parallel
+            asyncio.create_task(
+                self.request_api_parallel(
+                    request_datas=request_datas,
+                    output_collector=output_collector,
+                    session=session,
+                    api_call_delay_seconds=api_call_delay_seconds,
+                    request_error_rate_threshold=request_error_rate_threshold,
+                )
+            )
+            for _ in range(api_call_max_parallel_count)
+        ]
+
+        # Await the completion of all tasks, and propagate any exceptions
+        await asyncio.gather(*tasks, return_exceptions=False)
+        if len(request_datas):
+            msg = "All inference tasks were finished, but the queue is not empty"
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.MODELS,
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.UNKNOWN,
+            )
+
+        # Output results back to the caller
+        output_collector.sort(key=lambda x: x[self.prompt_idx_key])
+        for output in output_collector:
+            output.pop(self.prompt_idx_key)
+        return output_collector
+
+    async def request_api_parallel(
+        self,
+        request_datas: List[dict],
+        output_collector: List,
+        session: AsyncHttpPipeline,
+        api_call_delay_seconds: float = 0.1,
+        request_error_rate_threshold: float = 0.5,
+    ) -> None:
+        """
+        Query the model for all prompts given as a list and append the output to output_collector.
+        No return value, output_collector is modified in place.
+        """
+        logger_tasks: List = []  # to await for logging to finish
+
+        while True:  # process data from queue until it's empty
+            try:
+                request_data = request_datas.pop()
+                prompt_idx = request_data.pop(self.prompt_idx_key)
+
+                try:
+                    response = await self.request_api(
+                        session=session,
+                        request_data=request_data,
+                    )
+                    await self._add_successful_response(response["time_taken"])
+                except Exception as e:
+                    response = {
+                        "request": request_data,
+                        "response": {
+                            "finish_reason": "error",
+                            "error": str(e),
+                        },
+                    }
+                    await self._add_error()
+
+                    self.logger.exception(f"Errored on prompt #{prompt_idx}")
+
+                    # if we count too many errors, we stop and raise an exception
+                    response_count = await self.get_response_count()
+                    error_rate = await self.get_error_rate()
+                    if response_count >= MIN_ERRORS_TO_FAIL and error_rate >= request_error_rate_threshold:
+                        error_msg = (
+                            f"Error rate is more than {request_error_rate_threshold:.0%} -- something is broken!"
+                        )
+                        raise EvaluationException(
+                            message=error_msg,
+                            internal_message=error_msg,
+                            target=ErrorTarget.MODELS,
+                            category=ErrorCategory.FAILED_EXECUTION,
+                            blame=ErrorBlame.UNKNOWN,
+                        )
+
+                response[self.prompt_idx_key] = prompt_idx
+                output_collector.append(response)
+
+                # Sleep between consecutive requests to avoid rate limit
+                await asyncio.sleep(api_call_delay_seconds)
+
+            except IndexError:  # when the queue is empty, the worker is done
+                # wait for logging tasks to finish
+                await asyncio.gather(*logger_tasks)
+                return
+
+    async def request_api(
+        self,
+        session: AsyncHttpPipeline,
+        request_data: dict,
+    ) -> dict:
+        """
+        Request the model with a body of data.
+
+        Parameters
+        ----------
+        session: HTTPS Session for invoking the endpoint.
+        request_data: Prompt dictionary to query the model with. (Pass {"prompt": prompt} instead of prompt.)
+        """
+
+        self._log_request(request_data)
+
+        token = self.token_manager.get_token()
+
+        headers = {
+            "Content-Type": "application/json",
+            "X-CV": f"{uuid.uuid4()}",
+            "X-ModelType": self.model or "",
+        }
+
+        if self.token_manager.auth_header == "Bearer":
+            headers["Authorization"] = f"Bearer {token}"
+        elif self.token_manager.auth_header == "api-key":
+            headers["api-key"] = token
+            headers["Authorization"] = "api-key"
+
+        # Update timeout for proxy endpoint
+        if self.azureml_model_deployment:
+            headers["azureml-model-deployment"] = self.azureml_model_deployment
+
+        # add all additional headers
+        if self.additional_headers:
+            headers.update(self.additional_headers)
+
+        params = {}
+        if self.api_version:
+            params["api-version"] = self.api_version
+
+        time_start = time.time()
+        full_response = None
+
+        response = await session.post(url=self.endpoint_url, headers=headers, json=request_data, params=params)
+
+        response.raise_for_status()
+
+        response_data = response.json()
+
+        self.logger.info(f"Response: {response_data}")
+
+        # Copy the full response and return it to be saved in jsonl.
+        full_response = copy.copy(response_data)
+
+        time_taken = time.time() - time_start
+
+        parsed_response = self._parse_response(response_data, request_data=request_data)
+
+        return {
+            "request": request_data,
+            "response": parsed_response,
+            "time_taken": time_taken,
+            "full_response": full_response,
+        }
+
+    def _parse_response(self, response_data: dict, request_data: Optional[dict] = None) -> dict:
+        # https://platform.openai.com/docs/api-reference/completions
+        samples = []
+        finish_reason = []
+        for choice in response_data["choices"]:
+            if "text" in choice:
+                samples.append(choice["text"])
+            if "finish_reason" in choice:
+                finish_reason.append(choice["finish_reason"])
+
+        return {"samples": samples, "finish_reason": finish_reason, "id": response_data["id"]}
+
+
+# ===========================================================
+# ============== OpenAIChatCompletionsModel =================
+# ===========================================================
+
+
+class OpenAIChatCompletionsModel(OpenAICompletionsModel):
+    """
+    OpenAIChatCompletionsModel is a wrapper around OpenAICompletionsModel that
+    formats the prompt for chat completion.
+    """
+
+    def __init__(self, name="OpenAIChatCompletionsModel", **kwargs):
+        super().__init__(name=name, **kwargs)
+
+    def format_request_data(self, messages: List[dict], **request_params):  # type: ignore[override]
+        request_data = {"messages": messages, **self.get_model_params()}
+        request_data.update(request_params)
+        return request_data
+
+    async def get_conversation_completion(
+        self,
+        messages: List[dict],
+        session: AsyncHttpPipeline,
+        role: str = "assistant",
+        **request_params,
+    ) -> dict:
+        """
+        Query the model a single time with a message.
+
+        Parameters
+        ----------
+        messages: List of messages to query the model with.
+            Expected format: [{"role": "user", "content": "Hello!"}, ...]
+        session: AsyncHttpPipeline object to query the model with.
+        role: Not used for this model, since it is a chat model.
+        request_params: Additional parameters to pass to the model.
+        """
+        request_data = self.format_request_data(
+            messages=messages,
+            **request_params,
+        )
+        return await self.request_api(
+            session=session,
+            request_data=request_data,
+        )
+
+    async def get_completion(
+        self,
+        prompt: str,
+        session: AsyncHttpPipeline,
+        **request_params,
+    ) -> dict:
+        """
+        Query a ChatCompletions model with a single prompt. Note: entire message will be inserted into a "system" call.
+
+        Parameters
+        ----------
+        prompt: Prompt str to query model with.
+        session: AsyncHttpPipeline object to use for the request.
+        **request_params: Additional parameters to pass to the request.
+        """
+        messages = [{"role": "system", "content": prompt}]
+
+        request_data = self.format_request_data(messages=messages, **request_params)
+        return await self.request_api(
+            session=session,
+            request_data=request_data,
+        )
+
+    async def get_all_completions(
+        self,
+        prompts: List[str],  # type: ignore[override]
+        session: AsyncHttpPipeline,
+        api_call_max_parallel_count: int = 1,
+        api_call_delay_seconds: float = 0.1,
+        request_error_rate_threshold: float = 0.5,
+        **request_params,
+    ) -> List[dict]:
+        prompts_list = [{"role": "system", "content": prompt} for prompt in prompts]
+
+        return await super().get_all_completions(
+            prompts=prompts_list,
+            session=session,
+            api_call_max_parallel_count=api_call_max_parallel_count,
+            api_call_delay_seconds=api_call_delay_seconds,
+            request_error_rate_threshold=request_error_rate_threshold,
+            **request_params,
+        )
+
+    def _parse_response(self, response_data: dict, request_data: Optional[dict] = None) -> dict:
+        # https://platform.openai.com/docs/api-reference/chat
+        samples = []
+        finish_reason = []
+
+        for choice in response_data["choices"]:
+            if "message" in choice and "content" in choice["message"]:
+                samples.append(choice["message"]["content"])
+            if "message" in choice and "finish_reason" in choice["message"]:
+                finish_reason.append(choice["message"]["finish_reason"])
+
+        return {"samples": samples, "finish_reason": finish_reason, "id": response_data["id"]}
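
The module above is the low-level client stack used by the simulators: get_model_class_from_url picks a wrapper class from the endpoint path, and get_all_completions fans the prompts out to a pool of worker tasks that share one request queue, track response times and error rates, and abort once the error rate crosses request_error_rate_threshold. A rough usage sketch follows; the endpoint URL is illustrative, and the token_manager and session arguments are stand-ins for an APITokenManager implementation (from _identity_manager.py) and an AsyncHttpPipeline (from _http_utils.py), not calls documented in this diff.

import asyncio

from azure.ai.evaluation.simulator._model_tools.models import get_model_class_from_url

async def run_batch(token_manager, session):
    # token_manager: any APITokenManager implementation; session: an AsyncHttpPipeline instance.
    endpoint = "https://example.openai.azure.com/openai/deployments/my-gpt/chat/completions"  # illustrative

    model_cls = get_model_class_from_url(endpoint)  # resolves to OpenAIChatCompletionsModel here
    model = model_cls(endpoint_url=endpoint, token_manager=token_manager, max_tokens=200)

    # Each result dict carries "request", "response" ({"samples", "finish_reason", "id"}),
    # "time_taken", and "full_response", in the same order as the input prompts.
    results = await model.get_all_completions(
        prompts=["Summarize the plot of Hamlet in one sentence."],
        session=session,
        api_call_max_parallel_count=2,
    )
    return [r["response"].get("samples", []) for r in results]

# asyncio.run(run_batch(my_token_manager, my_session))  # with concrete instances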
azure/ai/evaluation/simulator/_prompty/__init__.py
File without changes
azure/ai/evaluation/simulator/_prompty/task_query_response.prompty
@@ -0,0 +1,65 @@
+---
+name: TaskSimulatorQueryResponse
+description: Gets queries and responses from a blob of text
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: json_object
+
+inputs:
+  text:
+    type: string
+  num_queries:
+    type: integer
+
+---
+system:
+You're an AI that helps in preparing a Question/Answer quiz from Text for "Who wants to be a millionaire" tv show
+Both Questions and Answers MUST BE extracted from given Text
+Frame Question in a way so that Answer is RELEVANT SHORT BITE-SIZED info from Text
+RELEVANT info could be: NUMBER, DATE, STATISTIC, MONEY, NAME
+A sentence should contribute multiple QnAs if it has more info in it
+Answer must not be more than 5 words
+Answer must be picked from Text as is
+Question should be as descriptive as possible and must include as much context as possible from Text
+Output must always have the provided number of QnAs
+Output must be in JSON format.
+Output must have {{num_queries}} objects in the format specified below. Any other count is unacceptable.
+Text:
+<|text_start|>
+On January 24, 1984, former Apple CEO Steve Jobs introduced the first Macintosh. In late 2003, Apple had 2.06 percent of the desktop share in the United States.
+Some years later, research firms IDC and Gartner reported that Apple's market share in the U.S. had increased to about 6%.
+<|text_end|>
+Output with 5 QnAs:
+{
+    "qna": [{
+        "q": "When did the former Apple CEO Steve Jobs introduced the first Macintosh?",
+        "r": "January 24, 1984"
+    },
+    {
+        "q": "Who was the former Apple CEO that introduced the first Macintosh on January 24, 1984?",
+        "r": "Steve Jobs"
+    },
+    {
+        "q": "What percent of the desktop share did Apple have in the United States in late 2003?",
+        "r": "2.06 percent"
+    },
+    {
+        "q": "What were the research firms that reported on Apple's market share in the U.S.?",
+        "r": "IDC and Gartner"
+    },
+    {
+        "q": "What was the percentage increase of Apple's market share in the U.S., as reported by research firms IDC and Gartner?",
+        "r": "6%"
+    }]
+}
+Text:
+<|text_start|>
+{{ text }}
+<|text_end|>
+Output with {{ num_queries }} QnAs:
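
The template constrains the model, via response_format: json_object and the instructions above, to return a single JSON object of the form {"qna": [{"q": ..., "r": ...}, ...]} with exactly {{ num_queries }} entries; the Simulator (_simulator.py in the file list) is its intended consumer, rendering it with seed text and a query count and unpacking the result into query/response pairs. A small sketch of that unpacking, assuming only the contract stated in the template itself:

import json
from typing import List, Tuple

def parse_query_response_output(raw: str, num_queries: int) -> List[Tuple[str, str]]:
    """Unpack the JSON payload requested by TaskSimulatorQueryResponse."""
    payload = json.loads(raw)
    qna = payload.get("qna", [])
    if len(qna) != num_queries:
        # The prompt demands exactly num_queries items; anything else is a bad generation.
        raise ValueError(f"expected {num_queries} QnA pairs, got {len(qna)}")
    return [(item["q"], item["r"]) for item in qna]

# Example with a payload shaped like the one shown in the template:
pairs = parse_query_response_output(
    '{"qna": [{"q": "Who introduced the first Macintosh?", "r": "Steve Jobs"}]}',
    num_queries=1,
)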