arize-phoenix 1.9.1rc3__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

This version of arize-phoenix has been flagged as a potentially problematic release.

@@ -1,15 +1,9 @@
 from __future__ import annotations
 
-import asyncio
-import json
 import logging
-import signal
-import traceback
 from collections import defaultdict
 from typing import (
     Any,
-    Callable,
-    Coroutine,
     DefaultDict,
     Dict,
     Iterable,
@@ -17,7 +11,6 @@ from typing import (
     Mapping,
     NamedTuple,
     Optional,
-    Sequence,
     Tuple,
     Union,
     cast,
@@ -25,13 +18,12 @@ from typing import (
 
 import pandas as pd
 from pandas import DataFrame
-from tqdm.auto import tqdm
 from typing_extensions import TypeAlias
 
-from phoenix.experimental.evals.evaluators import LLMEvaluator, _snap_to_rail
+from phoenix.experimental.evals.evaluators import LLMEvaluator
+from phoenix.experimental.evals.functions.executor import get_executor_on_sync_context
 from phoenix.experimental.evals.models import BaseEvalModel, OpenAIModel, set_verbosity
 from phoenix.experimental.evals.templates import (
-    NOT_PARSABLE,
     RAG_RELEVANCY_PROMPT_RAILS_MAP,
     RAG_RELEVANCY_PROMPT_TEMPLATE,
     ClassificationTemplate,
@@ -40,7 +32,12 @@ from phoenix.experimental.evals.templates import (
     map_template,
     normalize_classification_template,
 )
-from phoenix.experimental.evals.utils import get_tqdm_progress_bar_formatter
+from phoenix.experimental.evals.utils import (
+    NOT_PARSABLE,
+    get_tqdm_progress_bar_formatter,
+    parse_openai_function_call,
+    snap_to_rail,
+)
 from phoenix.trace.semantic_conventions import DOCUMENT_CONTENT, INPUT_VALUE, RETRIEVAL_DOCUMENTS
 from phoenix.utilities.logging import printif
 
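For downstream imports, the hunks above relocate the parsing helpers: NOT_PARSABLE moves from phoenix.experimental.evals.templates to phoenix.experimental.evals.utils, the private _snap_to_rail helper is replaced by snap_to_rail in that same utils module, and executor selection now comes from phoenix.experimental.evals.functions.executor. A minimal sketch of the corresponding 2.0.0 import lines, mirroring the added lines above:

# 2.0.0 import locations as introduced in this diff
from phoenix.experimental.evals.utils import NOT_PARSABLE, parse_openai_function_call, snap_to_rail
from phoenix.experimental.evals.functions.executor import get_executor_on_sync_context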
@@ -55,312 +52,31 @@ OPENINFERENCE_DOCUMENT_COLUMN_NAME = "attributes." + RETRIEVAL_DOCUMENTS
 _RESPONSE = "response"
 _EXPLANATION = "explanation"
 
-EvalName: TypeAlias = str
-EvalPrediction: TypeAlias = str
+ColumnName: TypeAlias = str
+Label: TypeAlias = str
+Explanation: TypeAlias = Optional[str]
 Record: TypeAlias = Mapping[str, Any]
+EvaluatorIndex: TypeAlias = int
 RowIndex: TypeAlias = Any
 
-
-class Unset:
-    pass
-
-
-_unset = Unset()
-
-
-class AsyncExecutor:
-    """
-    A class that provides asynchronous execution of tasks using a producer-consumer pattern.
-
-    An async interface is provided by the `execute` method, which returns a coroutine, and a sync
-    interface is provided by the `run` method.
-
-    Args:
-        generation_fn (Callable[[Any], Coroutine[Any, Any, Any]]): A coroutine function that
-            generates tasks to be executed.
-
-        concurrency (int, optional): The number of concurrent consumers. Defaults to 3.
-
-        tqdm_bar_format (Optional[str], optional): The format string for the progress bar. Defaults
-            to None.
-
-        exit_on_error (bool, optional): Whether to exit execution on the first encountered error.
-            Defaults to True.
-
-        fallback_return_value (Union[Unset, Any], optional): The fallback return value for tasks
-            that encounter errors. Defaults to _unset.
-    """
-
-    def __init__(
-        self,
-        generation_fn: Callable[[Any], Coroutine[Any, Any, Any]],
-        concurrency: int = 3,
-        tqdm_bar_format: Optional[str] = None,
-        exit_on_error: bool = True,
-        max_retries: int = 10,
-        fallback_return_value: Union[Unset, Any] = _unset,
-    ):
-        self.generate = generation_fn
-        self.fallback_return_value = fallback_return_value
-        self.concurrency = concurrency
-        self.tqdm_bar_format = tqdm_bar_format
-        self.exit_on_error = exit_on_error
-        self.max_retries = max_retries
-        self.base_priority = 0
-
-        self._TERMINATE = asyncio.Event()
-
-    def _signal_handler(self, signum: int, frame: Any) -> None:
-        self._TERMINATE.set()
-        tqdm.write("Process was interrupted. The return value will be incomplete...")
-
-    async def producer(
-        self,
-        inputs: Sequence[Any],
-        queue: asyncio.PriorityQueue[Tuple[int, Any]],
-        max_fill: int,
-        done_producing: asyncio.Event,
-    ) -> None:
-        try:
-            for index, input in enumerate(inputs):
-                if self._TERMINATE.is_set():
-                    break
-                while queue.qsize() >= max_fill:
-                    # keep room in the queue for requeues
-                    await asyncio.sleep(1)
-                await queue.put((self.base_priority, (index, input)))
-        finally:
-            done_producing.set()
-
-    async def consumer(
-        self,
-        output: List[Any],
-        queue: asyncio.PriorityQueue[Tuple[int, Any]],
-        done_producing: asyncio.Event,
-        progress_bar: tqdm[Any],
-    ) -> None:
-        termination_signal_task = None
-        while True:
-            marked_done = False
-            try:
-                priority, item = await asyncio.wait_for(queue.get(), timeout=1)
-            except asyncio.TimeoutError:
-                if done_producing.is_set() and queue.empty():
-                    break
-                continue
-            if self._TERMINATE.is_set():
-                # discard any remaining items in the queue
-                queue.task_done()
-                marked_done = True
-                continue
-
-            index, payload = item
-            try:
-                generate_task = asyncio.create_task(self.generate(payload))
-                termination_signal_task = asyncio.create_task(self._TERMINATE.wait())
-                done, pending = await asyncio.wait(
-                    [generate_task, termination_signal_task],
-                    timeout=360 * 2,
-                    return_when=asyncio.FIRST_COMPLETED,
-                )
-                if generate_task in done:
-                    output[index] = generate_task.result()
-                    progress_bar.update()
-                elif self._TERMINATE.is_set():
-                    # discard the pending task and remaining items in the queue
-                    if not generate_task.done():
-                        generate_task.cancel()
-                        try:
-                            # allow any cleanup to finish for the cancelled task
-                            await generate_task
-                        except asyncio.CancelledError:
-                            # Handle the cancellation exception
-                            pass
-                    queue.task_done()
-                    marked_done = True
-                    continue
-                else:
-                    tqdm.write("Worker timeout, requeuing")
-                    await queue.put(item)
-            except Exception:
-                tqdm.write("Worker exception and requeuing")
-                await queue.put(item)
-                #tqdm.write(f"Exception in worker: {traceback.format_exc()}")
-                #if self.exit_on_error:
-                #    self._TERMINATE.set()
-                #else:
-                #    progress_bar.update()
-            finally:
-                if not marked_done:
-                    queue.task_done()
-                if termination_signal_task and not termination_signal_task.done():
-                    termination_signal_task.cancel()
-
-    async def execute(self, inputs: Sequence[Any]) -> List[Any]:
-        signal.signal(signal.SIGINT, self._signal_handler)
-        outputs = [self.fallback_return_value] * len(inputs)
-        progress_bar = tqdm(total=len(inputs), bar_format=self.tqdm_bar_format)
-
-        max_queue_size = 5 * self.concurrency  # limit the queue to bound memory usage
-        max_fill = max_queue_size - (2 * self.concurrency)  # ensure there is always room to requeue
-        queue: asyncio.PriorityQueue[Tuple[int, Any]] = asyncio.PriorityQueue(
-            maxsize=max_queue_size
-        )
-        done_producing = asyncio.Event()
-
-        producer = asyncio.create_task(self.producer(inputs, queue, max_fill, done_producing))
-        consumers = [
-            asyncio.create_task(self.consumer(outputs, queue, done_producing, progress_bar))
-            for _ in range(self.concurrency)
-        ]
-
-        await asyncio.gather(producer, *consumers)
-        join_task = asyncio.create_task(queue.join())
-        termination_signal_task = asyncio.create_task(self._TERMINATE.wait())
-        done, pending = await asyncio.wait(
-            [join_task, termination_signal_task], return_when=asyncio.FIRST_COMPLETED
-        )
-        if termination_signal_task in done:
-            # Cancel all tasks
-            if not join_task.done():
-                join_task.cancel()
-            if not producer.done():
-                producer.cancel()
-            for task in consumers:
-                if not task.done():
-                    task.cancel()
-
-        if not termination_signal_task.done():
-            termination_signal_task.cancel()
-        return outputs
-
-    def run(self, inputs: Sequence[Any]) -> List[Any]:
-        return asyncio.run(self.execute(inputs))
-
-
-class SyncExecutor:
-    """
-    Synchronous executor for generating outputs from inputs using a given generation function.
-
-    Args:
-        generation_fn (Callable[[Any], Any]): The generation function that takes an input and
-            returns an output.
-
-        tqdm_bar_format (Optional[str], optional): The format string for the progress bar. Defaults
-            to None.
-
-        exit_on_error (bool, optional): Whether to exit execution on the first encountered error.
-            Defaults to True.
-
-        fallback_return_value (Union[Unset, Any], optional): The fallback return value for tasks
-            that encounter errors. Defaults to _unset.
-    """
-
-    def __init__(
-        self,
-        generation_fn: Callable[[Any], Any],
-        tqdm_bar_format: Optional[str] = None,
-        exit_on_error: bool = True,
-        fallback_return_value: Union[Unset, Any] = _unset,
-    ):
-        self.generate = generation_fn
-        self.fallback_return_value = fallback_return_value
-        self.tqdm_bar_format = tqdm_bar_format
-        self.exit_on_error = exit_on_error
-
-        self._TERMINATE = False
-
-    def _signal_handler(self, signum: int, frame: Any) -> None:
-        tqdm.write("Process was interrupted. The return value will be incomplete...")
-        self._TERMINATE = True
-
-    def run(self, inputs: Sequence[Any]) -> List[Any]:
-        signal.signal(signal.SIGINT, self._signal_handler)
-        outputs = [self.fallback_return_value] * len(inputs)
-        progress_bar = tqdm(total=len(inputs), bar_format=self.tqdm_bar_format)
-
-        for index, input in enumerate(inputs):
-            if self._TERMINATE:
-                break
-            try:
-                result = self.generate(input)
-                outputs[index] = result
-                progress_bar.update()
-            except Exception as e:
-                tqdm.write(f"Exception in worker: {e}")
-                if self.exit_on_error:
-                    break
-                else:
-                    progress_bar.update()
-        return outputs
-
-
-def get_executor_on_sync_context(
-    sync_fn: Callable[[Any], Any],
-    async_fn: Callable[[Any], Coroutine[Any, Any, Any]],
-    concurrency: int = 3,
-    tqdm_bar_format: Optional[str] = None,
-    exit_on_error: bool = True,
-    fallback_return_value: Union[Unset, Any] = _unset,
-) -> Union[AsyncExecutor, SyncExecutor]:
-    if _running_event_loop_exists():
-        if getattr(asyncio, "_nest_patched", False):
-            return AsyncExecutor(
-                async_fn,
-                concurrency=concurrency,
-                tqdm_bar_format=tqdm_bar_format,
-                exit_on_error=exit_on_error,
-                fallback_return_value=fallback_return_value,
-            )
-        else:
-            logger.warning(
-                "🐌!! If running llm_classify inside a notebook, patching the event loop with "
-                "nest_asyncio will allow asynchronous eval submission, and is significantly "
-                "faster. To patch the event loop, run `nest_asyncio.apply()`."
-            )
-            return SyncExecutor(
-                sync_fn,
-                tqdm_bar_format=tqdm_bar_format,
-                exit_on_error=exit_on_error,
-                fallback_return_value=fallback_return_value,
-            )
-    else:
-        return AsyncExecutor(
-            async_fn,
-            concurrency=concurrency,
-            tqdm_bar_format=tqdm_bar_format,
-            exit_on_error=exit_on_error,
-            fallback_return_value=fallback_return_value,
-        )
-
-
-def _running_event_loop_exists() -> bool:
-    """Checks for a running event loop.
-
-    Returns:
-        bool: True if a running event loop exists, False otherwise.
-    """
-    try:
-        asyncio.get_running_loop()
-        return True
-    except RuntimeError:
-        return False
+# snapped_response, explanation, response
+ParsedLLMResponse: TypeAlias = Tuple[Optional[str], Optional[str], str]
 
 
 def llm_classify(
     dataframe: pd.DataFrame,
     model: BaseEvalModel,
     template: Union[ClassificationTemplate, PromptTemplate, str],
-    rails: Union[List[str], List[List[str]]],
+    rails: List[str],
     system_instruction: Optional[str] = None,
     verbose: bool = False,
     use_function_calling_if_available: bool = True,
     provide_explanation: bool = False,
+    include_prompt: bool = False,
+    include_response: bool = False,
+    run_sync: bool = False,
     concurrency: int = 20,
-    return_prompt: bool = False,
-    return_response: bool = False,
 ) -> pd.DataFrame:
-    print("llm_classify_new")
     """Classifies each input row of the dataframe using an LLM. Returns a pandas.DataFrame
     where the first column is named `label` and contains the classification labels. An optional
     column named `explanation` is added when `provide_explanation=True`.
@@ -392,9 +108,18 @@ def llm_classify(
 
         provide_explanation (bool, default=False): If True, provides an explanation for each
             classification label. A column named `explanation` is added to the output dataframe.
-            Currently, this is only available for models with function calling.
 
-        concurrency (int, default=20): The number of concurrent evals.
+        include_prompt (bool, default=False): If True, includes a column named `prompt` in the
+            output dataframe containing the prompt used for each classification.
+
+        include_response (bool, default=False): If True, includes a column named `response` in the
+            output dataframe containing the raw response from the LLM.
+
+        run_sync (bool, default=False): If True, forces synchronous request submission. Otherwise
+            evaluations will be run asynchronously if possible.
+
+        concurrency (int, default=20): The number of concurrent evals if async submission is
+            possible.
 
     Returns:
         pandas.DataFrame: A dataframe where the `label` column (at column position 0) contains
@@ -404,18 +129,6 @@ def llm_classify(
         from the entries in the rails argument or "NOT_PARSABLE" if the model's output could
         not be parsed.
     """
-    # Check if rails is a single rail to be applied to all, expand if necessary
-    # Check if rails is a list of lists
-    if all(isinstance(sublist, list) for sublist in rails):
-        rails_list = rails
-        if use_function_calling_if_available:
-            raise ValueError("When using function calling, rails must be a single rail.")
-    elif isinstance(rails, list):
-        # Assuming rails is a list of strings if it's not a list of lists
-        rails_list = [rails] * len(dataframe)
-    else:
-        raise TypeError("rails must be either a list of strings or a list of lists")
-
     tqdm_bar_format = get_tqdm_progress_bar_formatter("llm_classify")
     use_openai_function_call = (
         use_function_calling_if_available
@@ -441,8 +154,7 @@
     if generation_info := model.verbose_generation_info():
         printif(verbose, generation_info)
 
-    def _process_response(response_combo: Tuple[str, List[str]]) -> Tuple[str, Optional[str]]:
-        response, rails_per_response = response_combo
+    def _process_response(response: str) -> Tuple[str, Optional[str]]:
         if not use_openai_function_call:
             if provide_explanation:
                 unrailed_label, explanation = (
@@ -457,68 +169,46 @@ def llm_classify(
                 unrailed_label = response
                 explanation = None
         else:
-            try:
-                function_arguments = json.loads(response, strict=False)
-                unrailed_label = function_arguments.get(_RESPONSE)
-                explanation = function_arguments.get(_EXPLANATION)
-            except json.JSONDecodeError:
-                unrailed_label = response
-                explanation = None
-        return _snap_to_rail(unrailed_label, rails_per_response, verbose=verbose), explanation
+            unrailed_label, explanation = parse_openai_function_call(response)
+        return snap_to_rail(unrailed_label, rails, verbose=verbose), explanation
 
-    async def _run_llm_classification_async(
-        prompt_combo: Tuple[str, List[str]],
-    ) -> Tuple[str, Optional[str], Optional[str], Optional[str], Optional[str]]:
-        prompt, rails_per_prompt = prompt_combo
+    async def _run_llm_classification_async(prompt: str) -> ParsedLLMResponse:
         with set_verbosity(model, verbose) as verbose_model:
             response = await verbose_model._async_generate(
                 prompt, instruction=system_instruction, **model_kwargs
             )
-        processed_response = _process_response([response, rails_per_prompt])
-        prompt_to_return = prompt if return_prompt else None
-        system_instruction_to_return = system_instruction if return_prompt else None
-        response_to_return = response if return_response else None
-        # Combine processed_response with prompt and system_instruction
-        # proccessed_response, explanation, prompt, system_instruction, unprocessed_response
-        final_result= (*processed_response, prompt_to_return, system_instruction_to_return, response_to_return)
-        return final_result
-
-    def _run_llm_classification_sync(
-        prompt_combo: Tuple[str, List[str]],
-    ) -> Tuple[str, Optional[str], Optional[str], Optional[str], Optional[str]]:
-        prompt, rails_per_prompt = prompt_combo
+        inference, explanation = _process_response(response)
+        return inference, explanation, response
+
+    def _run_llm_classification_sync(prompt: str) -> ParsedLLMResponse:
         with set_verbosity(model, verbose) as verbose_model:
             response = verbose_model._generate(
                 prompt, instruction=system_instruction, **model_kwargs
            )
-        processed_response = _process_response([response, rails_per_prompt])
-        prompt_to_return = prompt if return_prompt else None
-        system_instruction_to_return = system_instruction if return_prompt else None
-        response_to_return = response if return_response else None
-        # Combine processed_response with prompt and system_instruction
-        # proccessed_response, explanation, prompt, system_instruction, unprocessed_response
-        final_result= (*processed_response, prompt_to_return, system_instruction_to_return, response_to_return)
-        return final_result
+        inference, explanation = _process_response(response)
+        return inference, explanation, response
+
+    fallback_return_value: ParsedLLMResponse = (None, None, "")
 
     executor = get_executor_on_sync_context(
         _run_llm_classification_sync,
         _run_llm_classification_async,
+        run_sync=run_sync,
         concurrency=concurrency,
         tqdm_bar_format=tqdm_bar_format,
         exit_on_error=True,
-        fallback_return_value=(None, None),
+        fallback_return_value=fallback_return_value,
     )
-    combined_prompt_rails = list(zip(prompts.tolist(), rails_list))
-    results = executor.run(combined_prompt_rails)
-    labels, explanations, prompt_to_return, system_instruction_to_return,response_to_return = zip(*results)
+
+    results = executor.run(prompts.tolist())
+    labels, explanations, responses = zip(*results)
 
     return pd.DataFrame(
         data={
             "label": labels,
             **({"explanation": explanations} if provide_explanation else {}),
-            **({"prompt": prompt_to_return} if return_prompt else {}),
-            **({"system_intruction": system_instruction_to_return} if return_prompt else {}),
-            **({"response": response_to_return} if return_response else {}),
+            **({"prompt": prompts} if include_prompt else {}),
+            **({"response": responses} if include_response else {}),
         },
         index=dataframe.index,
     )
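For orientation, a minimal calling sketch of the 2.0.0 llm_classify signature shown above. Everything in it (the dataframe contents, the template string, the rails values, and the model configuration) is an illustrative assumption, not part of this diff:

# Hypothetical usage sketch of llm_classify as changed above (all data and settings are assumed).
df = pd.DataFrame({"text": ["The striker scored twice.", "Rates rose by 25 basis points."]})
classifications = llm_classify(
    dataframe=df,
    model=OpenAIModel(),  # assumed model configuration
    template="Is the following text about sports? Answer 'sports' or 'other'. Text: {text}",
    rails=["sports", "other"],  # rails is now a flat List[str]
    provide_explanation=True,   # adds an `explanation` column
    include_prompt=True,        # new in 2.0.0: adds a `prompt` column
    include_response=True,      # new in 2.0.0: adds a `response` column
    run_sync=False,             # new in 2.0.0: forces synchronous submission when True
)
# `classifications` has a `label` column (snapped to the rails, or "NOT_PARSABLE" when the
# output cannot be parsed), plus the optional columns above, and shares its index with df.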
@@ -675,60 +365,67 @@ def _default_openai_function(
 
 
 class RunEvalsPayload(NamedTuple):
+    evaluator_index: EvaluatorIndex
+    row_index: RowIndex
     evaluator: LLMEvaluator
     record: Record
-    row_index: RowIndex
 
 
 def run_evals(
     dataframe: DataFrame,
     evaluators: List[LLMEvaluator],
+    provide_explanation: bool = False,
+    verbose: bool = False,
     concurrency: int = 20,
-) -> DataFrame:
+) -> List[DataFrame]:
     """
-    Applies a list of evaluators to every row of a dataframe. Outputs a
-    dataframe where each column corresponds to an evaluator and each row
-    corresponds to a row in the input dataframe.
+    Applies a list of evaluators to a dataframe. Outputs a list of dataframes in
+    which each dataframe contains the outputs of the corresponding evaluator
+    applied to the input dataframe.
 
     Args:
-        dataframe (pd.DataFrame): A pandas dataframe in which each row
-            represents a record to be evaluated. All template variable names must
-            appear as column names in the dataframe (extra columns unrelated to the
-            template are permitted).
+        dataframe (DataFrame): A pandas dataframe in which each row represents a
+            record to be evaluated. All template variable names must appear as
+            column names in the dataframe (extra columns unrelated to the template
+            are permitted).
+
+        evaluators (List[LLMEvaluator]): A list of evaluators.
 
-        evaluators (List[Evaluator]): A list of evaluators with unique names.
+        provide_explanation (bool, optional): If True, provides an explanation
+            for each evaluation. A column named "explanation" is added to each
+            output dataframe.
 
-        concurrency (int, optional): An optional concurrency parameter. Defaults
-            to 20.
+        verbose (bool, optional): If True, prints detailed info to stdout such
+            as model invocation parameters and details about retries and snapping to
+            rails.
+
+        concurrency (int, optional): The number of concurrent evals if async
+            submission is possible.
 
     Returns:
-        DataFrame: A dataframe where each row contains the outputs of the
-            evaluators applied to the corresponding row of the input dataframe and
-            the column names match the names of the evaluators. The index of the
-            dataframe is the same as the index of the input dataframe.
+        List[DataFrame]: A list of dataframes, one for each evaluator, all of
+            which have the same number of rows as the input dataframe.
     """
-    if len(set(evaluator.name for evaluator in evaluators)) != len(evaluators):
-        raise ValueError("Evaluators must have unique names.")
 
-    async def _run_eval_async(
+    async def _arun_eval(
+        payload: RunEvalsPayload,
+    ) -> Tuple[EvaluatorIndex, RowIndex, Label, Explanation]:
+        label, explanation = await payload.evaluator.aevaluate(
+            payload.record, provide_explanation=provide_explanation
+        )
+        return payload.evaluator_index, payload.row_index, label, explanation
+
+    def _run_eval(
         payload: RunEvalsPayload,
-    ) -> Tuple[RowIndex, EvalName, EvalPrediction]:
-        row_index = payload.row_index
-        evaluator = payload.evaluator
-        record = payload.record
-        eval_result = await evaluator.aevaluate(record)
-        return row_index, evaluator.name, eval_result
-
-    def _run_eval_sync(payload: RunEvalsPayload) -> Tuple[RowIndex, EvalName, EvalPrediction]:
-        row_index = payload.row_index
-        evaluator = payload.evaluator
-        record = payload.record
-        eval_result = evaluator.evaluate(record)
-        return row_index, evaluator.name, eval_result
+    ) -> Tuple[EvaluatorIndex, RowIndex, Label, Explanation]:
+        label, explanation = payload.evaluator.evaluate(
+            payload.record, provide_explanation=provide_explanation
+        )
+        return payload.evaluator_index, payload.row_index, label, explanation
 
     executor = get_executor_on_sync_context(
-        _run_eval_sync,
-        _run_eval_async,
+        _run_eval,
+        _arun_eval,
         concurrency=concurrency,
         tqdm_bar_format=get_tqdm_progress_bar_formatter("run_evals"),
         exit_on_error=True,
@@ -736,15 +433,23 @@ def run_evals(
     )
     payloads = [
         RunEvalsPayload(
+            evaluator_index=evaluator_index,
             row_index=row_index,
             evaluator=evaluator,
             record=row.to_dict(),
         )
         for row_index, row in dataframe.iterrows()
-        for evaluator in evaluators
+        for evaluator_index, evaluator in enumerate(evaluators)
+    ]
+    eval_results: List[DefaultDict[RowIndex, Dict[ColumnName, Union[Label, Explanation]]]] = [
+        defaultdict(dict) for _ in range(len(evaluators))
     ]
-    results: DefaultDict[RowIndex, Dict[EvalName, EvalPrediction]] = defaultdict(dict)
-    for row_index, eval_name, eval_result in executor.run(payloads):
-        results[row_index][eval_name] = eval_result
-    index, data = zip(*results.items())
-    return DataFrame(data, index=index)
+    for evaluator_index, row_index, label, explanation in executor.run(payloads):
+        eval_results[evaluator_index][row_index]["label"] = label
+        if explanation is not None:
+            eval_results[evaluator_index][row_index]["explanation"] = explanation
+    eval_dataframes: List[DataFrame] = []
+    for eval_result in eval_results:
+        index, eval_data = zip(*eval_result.items())
+        eval_dataframes.append(DataFrame(eval_data, index=index))
+    return eval_dataframes
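A brief, hypothetical sketch of consuming the reshaped run_evals output in 2.0.0; the evaluators and the spans_df dataframe are assumed to already exist and are not part of this diff:

# run_evals now returns one dataframe per evaluator instead of a single wide dataframe.
relevance_df, toxicity_df = run_evals(
    dataframe=spans_df,  # assumed: columns match the evaluators' template variables
    evaluators=[relevance_evaluator, toxicity_evaluator],  # assumed LLMEvaluator instances
    provide_explanation=True,  # adds an "explanation" column to each returned dataframe
)
# Each returned dataframe has a "label" column and the same number of rows as spans_df,
# whereas the 1.9.x version returned a single dataframe with one column per evaluator name.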