arize-phoenix 1.9.1rc2__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arize-phoenix might be problematic.

@@ -1,15 +1,9 @@
  from __future__ import annotations

- import asyncio
- import json
  import logging
- import signal
- import traceback
  from collections import defaultdict
  from typing import (
      Any,
-     Callable,
-     Coroutine,
      DefaultDict,
      Dict,
      Iterable,
@@ -17,7 +11,6 @@ from typing import (
      Mapping,
      NamedTuple,
      Optional,
-     Sequence,
      Tuple,
      Union,
      cast,
@@ -25,13 +18,12 @@ from typing import (

  import pandas as pd
  from pandas import DataFrame
- from tqdm.auto import tqdm
  from typing_extensions import TypeAlias

- from phoenix.experimental.evals.evaluators import LLMEvaluator, _snap_to_rail
+ from phoenix.experimental.evals.evaluators import LLMEvaluator
+ from phoenix.experimental.evals.functions.executor import get_executor_on_sync_context
  from phoenix.experimental.evals.models import BaseEvalModel, OpenAIModel, set_verbosity
  from phoenix.experimental.evals.templates import (
-     NOT_PARSABLE,
      RAG_RELEVANCY_PROMPT_RAILS_MAP,
      RAG_RELEVANCY_PROMPT_TEMPLATE,
      ClassificationTemplate,
@@ -40,7 +32,12 @@ from phoenix.experimental.evals.templates import (
      map_template,
      normalize_classification_template,
  )
- from phoenix.experimental.evals.utils import get_tqdm_progress_bar_formatter
+ from phoenix.experimental.evals.utils import (
+     NOT_PARSABLE,
+     get_tqdm_progress_bar_formatter,
+     parse_openai_function_call,
+     snap_to_rail,
+ )
  from phoenix.trace.semantic_conventions import DOCUMENT_CONTENT, INPUT_VALUE, RETRIEVAL_DOCUMENTS
  from phoenix.utilities.logging import printif

@@ -55,316 +52,31 @@ OPENINFERENCE_DOCUMENT_COLUMN_NAME = "attributes." + RETRIEVAL_DOCUMENTS
  _RESPONSE = "response"
  _EXPLANATION = "explanation"

- EvalName: TypeAlias = str
- EvalPrediction: TypeAlias = str
+ ColumnName: TypeAlias = str
+ Label: TypeAlias = str
+ Explanation: TypeAlias = Optional[str]
  Record: TypeAlias = Mapping[str, Any]
+ EvaluatorIndex: TypeAlias = int
  RowIndex: TypeAlias = Any

-
- class Unset:
-     pass
-
-
- _unset = Unset()
-
-
- class AsyncExecutor:
-     """
-     A class that provides asynchronous execution of tasks using a producer-consumer pattern.
-
-     An async interface is provided by the `execute` method, which returns a coroutine, and a sync
-     interface is provided by the `run` method.
-
-     Args:
-         generation_fn (Callable[[Any], Coroutine[Any, Any, Any]]): A coroutine function that
-             generates tasks to be executed.
-
-         concurrency (int, optional): The number of concurrent consumers. Defaults to 3.
-
-         tqdm_bar_format (Optional[str], optional): The format string for the progress bar. Defaults
-             to None.
-
-         exit_on_error (bool, optional): Whether to exit execution on the first encountered error.
-             Defaults to True.
-
-         fallback_return_value (Union[Unset, Any], optional): The fallback return value for tasks
-             that encounter errors. Defaults to _unset.
-     """
-
-     def __init__(
-         self,
-         generation_fn: Callable[[Any], Coroutine[Any, Any, Any]],
-         concurrency: int = 3,
-         tqdm_bar_format: Optional[str] = None,
-         exit_on_error: bool = True,
-         max_retries: int = 10,
-         fallback_return_value: Union[Unset, Any] = _unset,
-     ):
-         self.generate = generation_fn
-         self.fallback_return_value = fallback_return_value
-         self.concurrency = concurrency
-         self.tqdm_bar_format = tqdm_bar_format
-         self.exit_on_error = exit_on_error
-         self.max_retries = max_retries
-         self.base_priority = 0
-
-         self._TERMINATE = asyncio.Event()
-
-     def _signal_handler(self, signum: int, frame: Any) -> None:
-         self._TERMINATE.set()
-         tqdm.write("Process was interrupted. The return value will be incomplete...")
-
-     async def producer(
-         self,
-         inputs: Sequence[Any],
-         queue: asyncio.PriorityQueue[Tuple[int, Any]],
-         max_fill: int,
-         done_producing: asyncio.Event,
-     ) -> None:
-         try:
-             for index, input in enumerate(inputs):
-                 if self._TERMINATE.is_set():
-                     break
-                 while queue.qsize() >= max_fill:
-                     # keep room in the queue for requeues
-                     await asyncio.sleep(1)
-                 await queue.put((self.base_priority, (index, input)))
-         finally:
-             done_producing.set()
-
-     async def consumer(
-         self,
-         output: List[Any],
-         queue: asyncio.PriorityQueue[Tuple[int, Any]],
-         done_producing: asyncio.Event,
-         progress_bar: tqdm[Any],
-     ) -> None:
-         termination_signal_task = None
-         while True:
-             marked_done = False
-             try:
-                 priority, item = await asyncio.wait_for(queue.get(), timeout=1)
-             except asyncio.TimeoutError:
-                 if done_producing.is_set() and queue.empty():
-                     break
-                 continue
-             if self._TERMINATE.is_set():
-                 # discard any remaining items in the queue
-                 queue.task_done()
-                 marked_done = True
-                 continue
-
-             index, payload = item
-             try:
-                 generate_task = asyncio.create_task(self.generate(payload))
-                 termination_signal_task = asyncio.create_task(self._TERMINATE.wait())
-                 done, pending = await asyncio.wait(
-                     [generate_task, termination_signal_task],
-                     timeout=360 * 2,
-                     return_when=asyncio.FIRST_COMPLETED,
-                 )
-                 if generate_task in done:
-                     output[index] = generate_task.result()
-                     progress_bar.update()
-                 elif self._TERMINATE.is_set():
-                     # discard the pending task and remaining items in the queue
-                     if not generate_task.done():
-                         generate_task.cancel()
-                         try:
-                             # allow any cleanup to finish for the cancelled task
-                             await generate_task
-                         except asyncio.CancelledError:
-                             # Handle the cancellation exception
-                             pass
-                     queue.task_done()
-                     marked_done = True
-                     continue
-                 else:
-                     tqdm.write("Worker timeout, requeuing")
-                     # task timeouts are requeued at base priority
-                     await queue.put((self.base_priority, item))
-             except Exception as exc:
-                 if (retry_count := abs(priority)) <= self.max_retries:
-                     tqdm.write(
-                         f"Exception in worker on attempt {retry_count + 1}: raised {repr(exc)}"
-                     )
-                     tqdm.write("Requeuing...")
-                     await queue.put((priority - 1, item))
-                 else:
-                     tqdm.write(f"Exception in worker: {traceback.format_exc()}")
-                     if self.exit_on_error:
-                         self._TERMINATE.set()
-                     else:
-                         progress_bar.update()
-             finally:
-                 if not marked_done:
-                     queue.task_done()
-                 if termination_signal_task and not termination_signal_task.done():
-                     termination_signal_task.cancel()
-
-     async def execute(self, inputs: Sequence[Any]) -> List[Any]:
-         signal.signal(signal.SIGINT, self._signal_handler)
-         outputs = [self.fallback_return_value] * len(inputs)
-         progress_bar = tqdm(total=len(inputs), bar_format=self.tqdm_bar_format)
-
-         max_queue_size = 5 * self.concurrency  # limit the queue to bound memory usage
-         max_fill = max_queue_size - (2 * self.concurrency)  # ensure there is always room to requeue
-         queue: asyncio.PriorityQueue[Tuple[int, Any]] = asyncio.PriorityQueue(
-             maxsize=max_queue_size
-         )
-         done_producing = asyncio.Event()
-
-         producer = asyncio.create_task(self.producer(inputs, queue, max_fill, done_producing))
-         consumers = [
-             asyncio.create_task(self.consumer(outputs, queue, done_producing, progress_bar))
-             for _ in range(self.concurrency)
-         ]
-
-         await asyncio.gather(producer, *consumers)
-         join_task = asyncio.create_task(queue.join())
-         termination_signal_task = asyncio.create_task(self._TERMINATE.wait())
-         done, pending = await asyncio.wait(
-             [join_task, termination_signal_task], return_when=asyncio.FIRST_COMPLETED
-         )
-         if termination_signal_task in done:
-             # Cancel all tasks
-             if not join_task.done():
-                 join_task.cancel()
-             if not producer.done():
-                 producer.cancel()
-             for task in consumers:
-                 if not task.done():
-                     task.cancel()
-
-         if not termination_signal_task.done():
-             termination_signal_task.cancel()
-         return outputs
-
-     def run(self, inputs: Sequence[Any]) -> List[Any]:
-         return asyncio.run(self.execute(inputs))
-
-
- class SyncExecutor:
-     """
-     Synchronous executor for generating outputs from inputs using a given generation function.
-
-     Args:
-         generation_fn (Callable[[Any], Any]): The generation function that takes an input and
-             returns an output.
-
-         tqdm_bar_format (Optional[str], optional): The format string for the progress bar. Defaults
-             to None.
-
-         exit_on_error (bool, optional): Whether to exit execution on the first encountered error.
-             Defaults to True.
-
-         fallback_return_value (Union[Unset, Any], optional): The fallback return value for tasks
-             that encounter errors. Defaults to _unset.
-     """
-
-     def __init__(
-         self,
-         generation_fn: Callable[[Any], Any],
-         tqdm_bar_format: Optional[str] = None,
-         exit_on_error: bool = True,
-         fallback_return_value: Union[Unset, Any] = _unset,
-     ):
-         self.generate = generation_fn
-         self.fallback_return_value = fallback_return_value
-         self.tqdm_bar_format = tqdm_bar_format
-         self.exit_on_error = exit_on_error
-
-         self._TERMINATE = False
-
-     def _signal_handler(self, signum: int, frame: Any) -> None:
-         tqdm.write("Process was interrupted. The return value will be incomplete...")
-         self._TERMINATE = True
-
-     def run(self, inputs: Sequence[Any]) -> List[Any]:
-         signal.signal(signal.SIGINT, self._signal_handler)
-         outputs = [self.fallback_return_value] * len(inputs)
-         progress_bar = tqdm(total=len(inputs), bar_format=self.tqdm_bar_format)
-
-         for index, input in enumerate(inputs):
-             if self._TERMINATE:
-                 break
-             try:
-                 result = self.generate(input)
-                 outputs[index] = result
-                 progress_bar.update()
-             except Exception as e:
-                 tqdm.write(f"Exception in worker: {e}")
-                 if self.exit_on_error:
-                     break
-                 else:
-                     progress_bar.update()
-         return outputs
-
-
- def get_executor_on_sync_context(
-     sync_fn: Callable[[Any], Any],
-     async_fn: Callable[[Any], Coroutine[Any, Any, Any]],
-     concurrency: int = 3,
-     tqdm_bar_format: Optional[str] = None,
-     exit_on_error: bool = True,
-     fallback_return_value: Union[Unset, Any] = _unset,
- ) -> Union[AsyncExecutor, SyncExecutor]:
-     if _running_event_loop_exists():
-         if getattr(asyncio, "_nest_patched", False):
-             return AsyncExecutor(
-                 async_fn,
-                 concurrency=concurrency,
-                 tqdm_bar_format=tqdm_bar_format,
-                 exit_on_error=exit_on_error,
-                 fallback_return_value=fallback_return_value,
-             )
-         else:
-             logger.warning(
-                 "🐌!! If running llm_classify inside a notebook, patching the event loop with "
-                 "nest_asyncio will allow asynchronous eval submission, and is significantly "
-                 "faster. To patch the event loop, run `nest_asyncio.apply()`."
-             )
-             return SyncExecutor(
-                 sync_fn,
-                 tqdm_bar_format=tqdm_bar_format,
-                 exit_on_error=exit_on_error,
-                 fallback_return_value=fallback_return_value,
-             )
-     else:
-         return AsyncExecutor(
-             async_fn,
-             concurrency=concurrency,
-             tqdm_bar_format=tqdm_bar_format,
-             exit_on_error=exit_on_error,
-             fallback_return_value=fallback_return_value,
-         )
-
-
- def _running_event_loop_exists() -> bool:
-     """Checks for a running event loop.
-
-     Returns:
-         bool: True if a running event loop exists, False otherwise.
-     """
-     try:
-         asyncio.get_running_loop()
-         return True
-     except RuntimeError:
-         return False
+ # snapped_response, explanation, response
+ ParsedLLMResponse: TypeAlias = Tuple[Optional[str], Optional[str], str]


  def llm_classify(
      dataframe: pd.DataFrame,
      model: BaseEvalModel,
      template: Union[ClassificationTemplate, PromptTemplate, str],
-     rails: Union[List[str], List[List[str]]],
+     rails: List[str],
      system_instruction: Optional[str] = None,
      verbose: bool = False,
      use_function_calling_if_available: bool = True,
      provide_explanation: bool = False,
+     include_prompt: bool = False,
+     include_response: bool = False,
+     run_sync: bool = False,
      concurrency: int = 20,
  ) -> pd.DataFrame:
-     print("llm_classify_new")
      """Classifies each input row of the dataframe using an LLM. Returns a pandas.DataFrame
      where the first column is named `label` and contains the classification labels. An optional
      column named `explanation` is added when `provide_explanation=True`.
@@ -396,9 +108,18 @@ def llm_classify(

          provide_explanation (bool, default=False): If True, provides an explanation for each
              classification label. A column named `explanation` is added to the output dataframe.
-             Currently, this is only available for models with function calling.

-         concurrency (int, default=20): The number of concurrent evals.
+         include_prompt (bool, default=False): If True, includes a column named `prompt` in the
+             output dataframe containing the prompt used for each classification.
+
+         include_response (bool, default=False): If True, includes a column named `response` in the
+             output dataframe containing the raw response from the LLM.
+
+         run_sync (bool, default=False): If True, forces synchronous request submission. Otherwise
+             evaluations will be run asynchronously if possible.
+
+         concurrency (int, default=20): The number of concurrent evals if async submission is
+             possible.

      Returns:
          pandas.DataFrame: A dataframe where the `label` column (at column position 0) contains
@@ -408,18 +129,6 @@ def llm_classify(
              from the entries in the rails argument or "NOT_PARSABLE" if the model's output could
              not be parsed.
      """
-     # Check if rails is a single rail to be applied to all, expand if necessary
-     # Check if rails is a list of lists
-     if all(isinstance(sublist, list) for sublist in rails):
-         rails_list = rails
-         if use_function_calling_if_available:
-             raise ValueError("When using function calling, rails must be a single rail.")
-     elif isinstance(rails, list):
-         # Assuming rails is a list of strings if it's not a list of lists
-         rails_list = [rails] * len(dataframe)
-     else:
-         raise TypeError("rails must be either a list of strings or a list of lists")
-
      tqdm_bar_format = get_tqdm_progress_bar_formatter("llm_classify")
      use_openai_function_call = (
          use_function_calling_if_available
@@ -445,8 +154,7 @@ def llm_classify(
      if generation_info := model.verbose_generation_info():
          printif(verbose, generation_info)

-     def _process_response(response_combo: Tuple[str, List[str]]) -> Tuple[str, Optional[str]]:
-         response, rails_per_response = response_combo
+     def _process_response(response: str) -> Tuple[str, Optional[str]]:
          if not use_openai_function_call:
              if provide_explanation:
                  unrailed_label, explanation = (
@@ -461,53 +169,46 @@ def llm_classify(
                  unrailed_label = response
                  explanation = None
          else:
-             try:
-                 function_arguments = json.loads(response, strict=False)
-                 unrailed_label = function_arguments.get(_RESPONSE)
-                 explanation = function_arguments.get(_EXPLANATION)
-             except json.JSONDecodeError:
-                 unrailed_label = response
-                 explanation = None
-         return _snap_to_rail(unrailed_label, rails_per_response, verbose=verbose), explanation
+             unrailed_label, explanation = parse_openai_function_call(response)
+         return snap_to_rail(unrailed_label, rails, verbose=verbose), explanation

-     async def _run_llm_classification_async(
-         prompt_combo: Tuple[str, List[str]],
-     ) -> Tuple[str, Optional[str]]:
-         prompt, rails_per_prompt = prompt_combo
+     async def _run_llm_classification_async(prompt: str) -> ParsedLLMResponse:
          with set_verbosity(model, verbose) as verbose_model:
              response = await verbose_model._async_generate(
                  prompt, instruction=system_instruction, **model_kwargs
              )
-         combined = [response, rails_per_prompt]
-         return _process_response(combined)
+         inference, explanation = _process_response(response)
+         return inference, explanation, response

-     def _run_llm_classification_sync(
-         prompt_combo: Tuple[str, List[str]],
-     ) -> Tuple[str, Optional[str]]:
-         prompt, rails_per_prompt = prompt_combo
+     def _run_llm_classification_sync(prompt: str) -> ParsedLLMResponse:
          with set_verbosity(model, verbose) as verbose_model:
              response = verbose_model._generate(
                  prompt, instruction=system_instruction, **model_kwargs
              )
-         combined = [response, rails_per_prompt]
-         return _process_response(combined)
+         inference, explanation = _process_response(response)
+         return inference, explanation, response
+
+     fallback_return_value: ParsedLLMResponse = (None, None, "")

      executor = get_executor_on_sync_context(
          _run_llm_classification_sync,
          _run_llm_classification_async,
+         run_sync=run_sync,
          concurrency=concurrency,
          tqdm_bar_format=tqdm_bar_format,
          exit_on_error=True,
-         fallback_return_value=(None, None),
+         fallback_return_value=fallback_return_value,
      )
-     combined_prompt_rails = list(zip(prompts.tolist(), rails_list))
-     results = executor.run(combined_prompt_rails)
-     labels, explanations = zip(*results)
+
+     results = executor.run(prompts.tolist())
+     labels, explanations, responses = zip(*results)

      return pd.DataFrame(
          data={
              "label": labels,
              **({"explanation": explanations} if provide_explanation else {}),
+             **({"prompt": prompts} if include_prompt else {}),
+             **({"response": responses} if include_response else {}),
          },
          index=dataframe.index,
      )
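
For downstream callers, the hunks above change the public llm_classify interface: rails is once again a single flat list, and include_prompt, include_response, and run_sync are new keyword arguments. The following is a minimal usage sketch assuming the 2.0.0 import paths; the model settings, template text, and dataframe columns are placeholders rather than values taken from this diff.

    import pandas as pd
    from phoenix.experimental.evals import OpenAIModel, llm_classify  # import path assumed

    # Placeholder input; column names must match the template variables.
    df = pd.DataFrame({"input": ["What is Phoenix?"], "reference": ["Phoenix is an ML observability library."]})

    result = llm_classify(
        dataframe=df,
        model=OpenAIModel(model_name="gpt-4"),  # placeholder model configuration
        template="Is the {reference} relevant to the {input}? Answer relevant or irrelevant.",  # placeholder template
        rails=["relevant", "irrelevant"],  # 2.0.0: one flat list applied to every row
        provide_explanation=True,   # adds an `explanation` column
        include_prompt=True,        # new in 2.0.0: adds a `prompt` column
        include_response=True,      # new in 2.0.0: adds a `response` column
        run_sync=False,             # new in 2.0.0: set True to force synchronous submission
        concurrency=20,
    )
    # Per the docstring above, `result` is indexed like `df` and contains `label`
    # plus the optional `explanation`, `prompt`, and `response` columns.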
@@ -664,60 +365,67 @@ def _default_openai_function(


  class RunEvalsPayload(NamedTuple):
+     evaluator_index: EvaluatorIndex
+     row_index: RowIndex
      evaluator: LLMEvaluator
      record: Record
-     row_index: RowIndex


  def run_evals(
      dataframe: DataFrame,
      evaluators: List[LLMEvaluator],
+     provide_explanation: bool = False,
+     verbose: bool = False,
      concurrency: int = 20,
- ) -> DataFrame:
+ ) -> List[DataFrame]:
      """
-     Applies a list of evaluators to every row of a dataframe. Outputs a
-     dataframe where each column corresponds to an evaluator and each row
-     corresponds to a row in the input dataframe.
+     Applies a list of evaluators to a dataframe. Outputs a list of dataframes in
+     which each dataframe contains the outputs of the corresponding evaluator
+     applied to the input dataframe.

      Args:
-         dataframe (pd.DataFrame): A pandas dataframe in which each row
-             represents a record to be evaluated. All template variable names must
-             appear as column names in the dataframe (extra columns unrelated to the
-             template are permitted).
+         dataframe (DataFrame): A pandas dataframe in which each row represents a
+             record to be evaluated. All template variable names must appear as
+             column names in the dataframe (extra columns unrelated to the template
+             are permitted).
+
+         evaluators (List[LLMEvaluator]): A list of evaluators.

-         evaluators (List[Evaluator]): A list of evaluators with unique names.
+         provide_explanation (bool, optional): If True, provides an explanation
+             for each evaluation. A column named "explanation" is added to each
+             output dataframe.

-         concurrency (int, optional): An optional concurrency parameter. Defaults
-             to 20.
+         verbose (bool, optional): If True, prints detailed info to stdout such
+             as model invocation parameters and details about retries and snapping to
+             rails.
+
+         concurrency (int, optional): The number of concurrent evals if async
+             submission is possible.

      Returns:
-         DataFrame: A dataframe where each row contains the outputs of the
-             evaluators applied to the corresponding row of the input dataframe and
-             the column names match the names of the evaluators. The index of the
-             dataframe is the same as the index of the input dataframe.
+         List[DataFrame]: A list of dataframes, one for each evaluator, all of
+             which have the same number of rows as the input dataframe.
      """
-     if len(set(evaluator.name for evaluator in evaluators)) != len(evaluators):
-         raise ValueError("Evaluators must have unique names.")

-     async def _run_eval_async(
+     async def _arun_eval(
+         payload: RunEvalsPayload,
+     ) -> Tuple[EvaluatorIndex, RowIndex, Label, Explanation]:
+         label, explanation = await payload.evaluator.aevaluate(
+             payload.record, provide_explanation=provide_explanation
+         )
+         return payload.evaluator_index, payload.row_index, label, explanation
+
+     def _run_eval(
          payload: RunEvalsPayload,
-     ) -> Tuple[RowIndex, EvalName, EvalPrediction]:
-         row_index = payload.row_index
-         evaluator = payload.evaluator
-         record = payload.record
-         eval_result = await evaluator.aevaluate(record)
-         return row_index, evaluator.name, eval_result
-
-     def _run_eval_sync(payload: RunEvalsPayload) -> Tuple[RowIndex, EvalName, EvalPrediction]:
-         row_index = payload.row_index
-         evaluator = payload.evaluator
-         record = payload.record
-         eval_result = evaluator.evaluate(record)
-         return row_index, evaluator.name, eval_result
+     ) -> Tuple[EvaluatorIndex, RowIndex, Label, Explanation]:
+         label, explanation = payload.evaluator.evaluate(
+             payload.record, provide_explanation=provide_explanation
+         )
+         return payload.evaluator_index, payload.row_index, label, explanation

      executor = get_executor_on_sync_context(
-         _run_eval_sync,
-         _run_eval_async,
+         _run_eval,
+         _arun_eval,
          concurrency=concurrency,
          tqdm_bar_format=get_tqdm_progress_bar_formatter("run_evals"),
          exit_on_error=True,
@@ -725,15 +433,23 @@ def run_evals(
      )
      payloads = [
          RunEvalsPayload(
+             evaluator_index=evaluator_index,
              row_index=row_index,
              evaluator=evaluator,
              record=row.to_dict(),
          )
          for row_index, row in dataframe.iterrows()
-         for evaluator in evaluators
+         for evaluator_index, evaluator in enumerate(evaluators)
+     ]
+     eval_results: List[DefaultDict[RowIndex, Dict[ColumnName, Union[Label, Explanation]]]] = [
+         defaultdict(dict) for _ in range(len(evaluators))
      ]
-     results: DefaultDict[RowIndex, Dict[EvalName, EvalPrediction]] = defaultdict(dict)
-     for row_index, eval_name, eval_result in executor.run(payloads):
-         results[row_index][eval_name] = eval_result
-     index, data = zip(*results.items())
-     return DataFrame(data, index=index)
+     for evaluator_index, row_index, label, explanation in executor.run(payloads):
+         eval_results[evaluator_index][row_index]["label"] = label
+         if explanation is not None:
+             eval_results[evaluator_index][row_index]["explanation"] = explanation
+     eval_dataframes: List[DataFrame] = []
+     for eval_result in eval_results:
+         index, eval_data = zip(*eval_result.items())
+         eval_dataframes.append(DataFrame(eval_data, index=index))
+     return eval_dataframes
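
The run_evals hunks change both the signature and the return type: 2.0.0 accepts provide_explanation and verbose, and returns one dataframe per evaluator instead of a single wide dataframe keyed by evaluator name. A sketch of consuming the new return value follows, assuming the 2.0.0 import path; the evaluator and dataframe objects are placeholders since their construction is not shown in this diff.

    from phoenix.experimental.evals import run_evals  # import path assumed

    # `queries_df`, `relevance_evaluator`, and `toxicity_evaluator` are placeholders:
    # a dataframe of records and two LLMEvaluator instances built elsewhere.
    relevance_df, toxicity_df = run_evals(
        dataframe=queries_df,
        evaluators=[relevance_evaluator, toxicity_evaluator],
        provide_explanation=True,  # new in 2.0.0: adds an "explanation" column to each result
        concurrency=20,
    )
    # Each returned dataframe is indexed like the input and has a "label" column
    # (plus "explanation" when requested). In 1.x, run_evals returned a single
    # dataframe with one column per evaluator, so callers must now unpack a list.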