arize-phoenix 1.9.1rc3__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- {arize_phoenix-1.9.1rc3.dist-info → arize_phoenix-2.0.0.dist-info}/METADATA +1 -1
- {arize_phoenix-1.9.1rc3.dist-info → arize_phoenix-2.0.0.dist-info}/RECORD +21 -19
- {arize_phoenix-1.9.1rc3.dist-info → arize_phoenix-2.0.0.dist-info}/WHEEL +1 -1
- phoenix/__init__.py +1 -1
- phoenix/core/traces.py +1 -1
- phoenix/exceptions.py +2 -0
- phoenix/experimental/evals/__init__.py +3 -2
- phoenix/experimental/evals/evaluators.py +89 -46
- phoenix/experimental/evals/functions/classify.py +103 -398
- phoenix/experimental/evals/functions/executor.py +353 -0
- phoenix/experimental/evals/functions/generate.py +76 -32
- phoenix/experimental/evals/models/rate_limiters.py +25 -5
- phoenix/experimental/evals/templates/__init__.py +0 -2
- phoenix/experimental/evals/templates/template.py +2 -5
- phoenix/experimental/evals/utils/__init__.py +66 -0
- phoenix/server/app.py +3 -2
- phoenix/server/main.py +3 -0
- phoenix/server/static/index.js +459 -436
- phoenix/trace/openai/instrumentor.py +51 -14
- {arize_phoenix-1.9.1rc3.dist-info → arize_phoenix-2.0.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-1.9.1rc3.dist-info → arize_phoenix-2.0.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,15 +1,9 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import asyncio
|
|
4
|
-
import json
|
|
5
3
|
import logging
|
|
6
|
-
import signal
|
|
7
|
-
import traceback
|
|
8
4
|
from collections import defaultdict
|
|
9
5
|
from typing import (
|
|
10
6
|
Any,
|
|
11
|
-
Callable,
|
|
12
|
-
Coroutine,
|
|
13
7
|
DefaultDict,
|
|
14
8
|
Dict,
|
|
15
9
|
Iterable,
|
|
@@ -17,7 +11,6 @@ from typing import (
|
|
|
17
11
|
Mapping,
|
|
18
12
|
NamedTuple,
|
|
19
13
|
Optional,
|
|
20
|
-
Sequence,
|
|
21
14
|
Tuple,
|
|
22
15
|
Union,
|
|
23
16
|
cast,
|
|
@@ -25,13 +18,12 @@ from typing import (
|
|
|
25
18
|
|
|
26
19
|
import pandas as pd
|
|
27
20
|
from pandas import DataFrame
|
|
28
|
-
from tqdm.auto import tqdm
|
|
29
21
|
from typing_extensions import TypeAlias
|
|
30
22
|
|
|
31
|
-
from phoenix.experimental.evals.evaluators import LLMEvaluator
|
|
23
|
+
from phoenix.experimental.evals.evaluators import LLMEvaluator
|
|
24
|
+
from phoenix.experimental.evals.functions.executor import get_executor_on_sync_context
|
|
32
25
|
from phoenix.experimental.evals.models import BaseEvalModel, OpenAIModel, set_verbosity
|
|
33
26
|
from phoenix.experimental.evals.templates import (
|
|
34
|
-
NOT_PARSABLE,
|
|
35
27
|
RAG_RELEVANCY_PROMPT_RAILS_MAP,
|
|
36
28
|
RAG_RELEVANCY_PROMPT_TEMPLATE,
|
|
37
29
|
ClassificationTemplate,
|
|
@@ -40,7 +32,12 @@ from phoenix.experimental.evals.templates import (
|
|
|
40
32
|
map_template,
|
|
41
33
|
normalize_classification_template,
|
|
42
34
|
)
|
|
43
|
-
from phoenix.experimental.evals.utils import
|
|
35
|
+
from phoenix.experimental.evals.utils import (
|
|
36
|
+
NOT_PARSABLE,
|
|
37
|
+
get_tqdm_progress_bar_formatter,
|
|
38
|
+
parse_openai_function_call,
|
|
39
|
+
snap_to_rail,
|
|
40
|
+
)
|
|
44
41
|
from phoenix.trace.semantic_conventions import DOCUMENT_CONTENT, INPUT_VALUE, RETRIEVAL_DOCUMENTS
|
|
45
42
|
from phoenix.utilities.logging import printif
|
|
46
43
|
|
|
@@ -55,312 +52,31 @@ OPENINFERENCE_DOCUMENT_COLUMN_NAME = "attributes." + RETRIEVAL_DOCUMENTS
|
|
|
55
52
|
_RESPONSE = "response"
|
|
56
53
|
_EXPLANATION = "explanation"
|
|
57
54
|
|
|
58
|
-
|
|
59
|
-
|
|
55
|
+
ColumnName: TypeAlias = str
|
|
56
|
+
Label: TypeAlias = str
|
|
57
|
+
Explanation: TypeAlias = Optional[str]
|
|
60
58
|
Record: TypeAlias = Mapping[str, Any]
|
|
59
|
+
EvaluatorIndex: TypeAlias = int
|
|
61
60
|
RowIndex: TypeAlias = Any
|
|
62
61
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
pass
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
_unset = Unset()
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
class AsyncExecutor:
|
|
72
|
-
"""
|
|
73
|
-
A class that provides asynchronous execution of tasks using a producer-consumer pattern.
|
|
74
|
-
|
|
75
|
-
An async interface is provided by the `execute` method, which returns a coroutine, and a sync
|
|
76
|
-
interface is provided by the `run` method.
|
|
77
|
-
|
|
78
|
-
Args:
|
|
79
|
-
generation_fn (Callable[[Any], Coroutine[Any, Any, Any]]): A coroutine function that
|
|
80
|
-
generates tasks to be executed.
|
|
81
|
-
|
|
82
|
-
concurrency (int, optional): The number of concurrent consumers. Defaults to 3.
|
|
83
|
-
|
|
84
|
-
tqdm_bar_format (Optional[str], optional): The format string for the progress bar. Defaults
|
|
85
|
-
to None.
|
|
86
|
-
|
|
87
|
-
exit_on_error (bool, optional): Whether to exit execution on the first encountered error.
|
|
88
|
-
Defaults to True.
|
|
89
|
-
|
|
90
|
-
fallback_return_value (Union[Unset, Any], optional): The fallback return value for tasks
|
|
91
|
-
that encounter errors. Defaults to _unset.
|
|
92
|
-
"""
|
|
93
|
-
|
|
94
|
-
def __init__(
|
|
95
|
-
self,
|
|
96
|
-
generation_fn: Callable[[Any], Coroutine[Any, Any, Any]],
|
|
97
|
-
concurrency: int = 3,
|
|
98
|
-
tqdm_bar_format: Optional[str] = None,
|
|
99
|
-
exit_on_error: bool = True,
|
|
100
|
-
max_retries: int = 10,
|
|
101
|
-
fallback_return_value: Union[Unset, Any] = _unset,
|
|
102
|
-
):
|
|
103
|
-
self.generate = generation_fn
|
|
104
|
-
self.fallback_return_value = fallback_return_value
|
|
105
|
-
self.concurrency = concurrency
|
|
106
|
-
self.tqdm_bar_format = tqdm_bar_format
|
|
107
|
-
self.exit_on_error = exit_on_error
|
|
108
|
-
self.max_retries = max_retries
|
|
109
|
-
self.base_priority = 0
|
|
110
|
-
|
|
111
|
-
self._TERMINATE = asyncio.Event()
|
|
112
|
-
|
|
113
|
-
def _signal_handler(self, signum: int, frame: Any) -> None:
|
|
114
|
-
self._TERMINATE.set()
|
|
115
|
-
tqdm.write("Process was interrupted. The return value will be incomplete...")
|
|
116
|
-
|
|
117
|
-
async def producer(
|
|
118
|
-
self,
|
|
119
|
-
inputs: Sequence[Any],
|
|
120
|
-
queue: asyncio.PriorityQueue[Tuple[int, Any]],
|
|
121
|
-
max_fill: int,
|
|
122
|
-
done_producing: asyncio.Event,
|
|
123
|
-
) -> None:
|
|
124
|
-
try:
|
|
125
|
-
for index, input in enumerate(inputs):
|
|
126
|
-
if self._TERMINATE.is_set():
|
|
127
|
-
break
|
|
128
|
-
while queue.qsize() >= max_fill:
|
|
129
|
-
# keep room in the queue for requeues
|
|
130
|
-
await asyncio.sleep(1)
|
|
131
|
-
await queue.put((self.base_priority, (index, input)))
|
|
132
|
-
finally:
|
|
133
|
-
done_producing.set()
|
|
134
|
-
|
|
135
|
-
async def consumer(
|
|
136
|
-
self,
|
|
137
|
-
output: List[Any],
|
|
138
|
-
queue: asyncio.PriorityQueue[Tuple[int, Any]],
|
|
139
|
-
done_producing: asyncio.Event,
|
|
140
|
-
progress_bar: tqdm[Any],
|
|
141
|
-
) -> None:
|
|
142
|
-
termination_signal_task = None
|
|
143
|
-
while True:
|
|
144
|
-
marked_done = False
|
|
145
|
-
try:
|
|
146
|
-
priority, item = await asyncio.wait_for(queue.get(), timeout=1)
|
|
147
|
-
except asyncio.TimeoutError:
|
|
148
|
-
if done_producing.is_set() and queue.empty():
|
|
149
|
-
break
|
|
150
|
-
continue
|
|
151
|
-
if self._TERMINATE.is_set():
|
|
152
|
-
# discard any remaining items in the queue
|
|
153
|
-
queue.task_done()
|
|
154
|
-
marked_done = True
|
|
155
|
-
continue
|
|
156
|
-
|
|
157
|
-
index, payload = item
|
|
158
|
-
try:
|
|
159
|
-
generate_task = asyncio.create_task(self.generate(payload))
|
|
160
|
-
termination_signal_task = asyncio.create_task(self._TERMINATE.wait())
|
|
161
|
-
done, pending = await asyncio.wait(
|
|
162
|
-
[generate_task, termination_signal_task],
|
|
163
|
-
timeout=360 * 2,
|
|
164
|
-
return_when=asyncio.FIRST_COMPLETED,
|
|
165
|
-
)
|
|
166
|
-
if generate_task in done:
|
|
167
|
-
output[index] = generate_task.result()
|
|
168
|
-
progress_bar.update()
|
|
169
|
-
elif self._TERMINATE.is_set():
|
|
170
|
-
# discard the pending task and remaining items in the queue
|
|
171
|
-
if not generate_task.done():
|
|
172
|
-
generate_task.cancel()
|
|
173
|
-
try:
|
|
174
|
-
# allow any cleanup to finish for the cancelled task
|
|
175
|
-
await generate_task
|
|
176
|
-
except asyncio.CancelledError:
|
|
177
|
-
# Handle the cancellation exception
|
|
178
|
-
pass
|
|
179
|
-
queue.task_done()
|
|
180
|
-
marked_done = True
|
|
181
|
-
continue
|
|
182
|
-
else:
|
|
183
|
-
tqdm.write("Worker timeout, requeuing")
|
|
184
|
-
await queue.put(item)
|
|
185
|
-
except Exception:
|
|
186
|
-
tqdm.write("Worker exception and requeuing")
|
|
187
|
-
await queue.put(item)
|
|
188
|
-
#tqdm.write(f"Exception in worker: {traceback.format_exc()}")
|
|
189
|
-
#if self.exit_on_error:
|
|
190
|
-
# self._TERMINATE.set()
|
|
191
|
-
#else:
|
|
192
|
-
# progress_bar.update()
|
|
193
|
-
finally:
|
|
194
|
-
if not marked_done:
|
|
195
|
-
queue.task_done()
|
|
196
|
-
if termination_signal_task and not termination_signal_task.done():
|
|
197
|
-
termination_signal_task.cancel()
|
|
198
|
-
|
|
199
|
-
async def execute(self, inputs: Sequence[Any]) -> List[Any]:
|
|
200
|
-
signal.signal(signal.SIGINT, self._signal_handler)
|
|
201
|
-
outputs = [self.fallback_return_value] * len(inputs)
|
|
202
|
-
progress_bar = tqdm(total=len(inputs), bar_format=self.tqdm_bar_format)
|
|
203
|
-
|
|
204
|
-
max_queue_size = 5 * self.concurrency # limit the queue to bound memory usage
|
|
205
|
-
max_fill = max_queue_size - (2 * self.concurrency) # ensure there is always room to requeue
|
|
206
|
-
queue: asyncio.PriorityQueue[Tuple[int, Any]] = asyncio.PriorityQueue(
|
|
207
|
-
maxsize=max_queue_size
|
|
208
|
-
)
|
|
209
|
-
done_producing = asyncio.Event()
|
|
210
|
-
|
|
211
|
-
producer = asyncio.create_task(self.producer(inputs, queue, max_fill, done_producing))
|
|
212
|
-
consumers = [
|
|
213
|
-
asyncio.create_task(self.consumer(outputs, queue, done_producing, progress_bar))
|
|
214
|
-
for _ in range(self.concurrency)
|
|
215
|
-
]
|
|
216
|
-
|
|
217
|
-
await asyncio.gather(producer, *consumers)
|
|
218
|
-
join_task = asyncio.create_task(queue.join())
|
|
219
|
-
termination_signal_task = asyncio.create_task(self._TERMINATE.wait())
|
|
220
|
-
done, pending = await asyncio.wait(
|
|
221
|
-
[join_task, termination_signal_task], return_when=asyncio.FIRST_COMPLETED
|
|
222
|
-
)
|
|
223
|
-
if termination_signal_task in done:
|
|
224
|
-
# Cancel all tasks
|
|
225
|
-
if not join_task.done():
|
|
226
|
-
join_task.cancel()
|
|
227
|
-
if not producer.done():
|
|
228
|
-
producer.cancel()
|
|
229
|
-
for task in consumers:
|
|
230
|
-
if not task.done():
|
|
231
|
-
task.cancel()
|
|
232
|
-
|
|
233
|
-
if not termination_signal_task.done():
|
|
234
|
-
termination_signal_task.cancel()
|
|
235
|
-
return outputs
|
|
236
|
-
|
|
237
|
-
def run(self, inputs: Sequence[Any]) -> List[Any]:
|
|
238
|
-
return asyncio.run(self.execute(inputs))
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
class SyncExecutor:
|
|
242
|
-
"""
|
|
243
|
-
Synchronous executor for generating outputs from inputs using a given generation function.
|
|
244
|
-
|
|
245
|
-
Args:
|
|
246
|
-
generation_fn (Callable[[Any], Any]): The generation function that takes an input and
|
|
247
|
-
returns an output.
|
|
248
|
-
|
|
249
|
-
tqdm_bar_format (Optional[str], optional): The format string for the progress bar. Defaults
|
|
250
|
-
to None.
|
|
251
|
-
|
|
252
|
-
exit_on_error (bool, optional): Whether to exit execution on the first encountered error.
|
|
253
|
-
Defaults to True.
|
|
254
|
-
|
|
255
|
-
fallback_return_value (Union[Unset, Any], optional): The fallback return value for tasks
|
|
256
|
-
that encounter errors. Defaults to _unset.
|
|
257
|
-
"""
|
|
258
|
-
|
|
259
|
-
def __init__(
|
|
260
|
-
self,
|
|
261
|
-
generation_fn: Callable[[Any], Any],
|
|
262
|
-
tqdm_bar_format: Optional[str] = None,
|
|
263
|
-
exit_on_error: bool = True,
|
|
264
|
-
fallback_return_value: Union[Unset, Any] = _unset,
|
|
265
|
-
):
|
|
266
|
-
self.generate = generation_fn
|
|
267
|
-
self.fallback_return_value = fallback_return_value
|
|
268
|
-
self.tqdm_bar_format = tqdm_bar_format
|
|
269
|
-
self.exit_on_error = exit_on_error
|
|
270
|
-
|
|
271
|
-
self._TERMINATE = False
|
|
272
|
-
|
|
273
|
-
def _signal_handler(self, signum: int, frame: Any) -> None:
|
|
274
|
-
tqdm.write("Process was interrupted. The return value will be incomplete...")
|
|
275
|
-
self._TERMINATE = True
|
|
276
|
-
|
|
277
|
-
def run(self, inputs: Sequence[Any]) -> List[Any]:
|
|
278
|
-
signal.signal(signal.SIGINT, self._signal_handler)
|
|
279
|
-
outputs = [self.fallback_return_value] * len(inputs)
|
|
280
|
-
progress_bar = tqdm(total=len(inputs), bar_format=self.tqdm_bar_format)
|
|
281
|
-
|
|
282
|
-
for index, input in enumerate(inputs):
|
|
283
|
-
if self._TERMINATE:
|
|
284
|
-
break
|
|
285
|
-
try:
|
|
286
|
-
result = self.generate(input)
|
|
287
|
-
outputs[index] = result
|
|
288
|
-
progress_bar.update()
|
|
289
|
-
except Exception as e:
|
|
290
|
-
tqdm.write(f"Exception in worker: {e}")
|
|
291
|
-
if self.exit_on_error:
|
|
292
|
-
break
|
|
293
|
-
else:
|
|
294
|
-
progress_bar.update()
|
|
295
|
-
return outputs
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
def get_executor_on_sync_context(
|
|
299
|
-
sync_fn: Callable[[Any], Any],
|
|
300
|
-
async_fn: Callable[[Any], Coroutine[Any, Any, Any]],
|
|
301
|
-
concurrency: int = 3,
|
|
302
|
-
tqdm_bar_format: Optional[str] = None,
|
|
303
|
-
exit_on_error: bool = True,
|
|
304
|
-
fallback_return_value: Union[Unset, Any] = _unset,
|
|
305
|
-
) -> Union[AsyncExecutor, SyncExecutor]:
|
|
306
|
-
if _running_event_loop_exists():
|
|
307
|
-
if getattr(asyncio, "_nest_patched", False):
|
|
308
|
-
return AsyncExecutor(
|
|
309
|
-
async_fn,
|
|
310
|
-
concurrency=concurrency,
|
|
311
|
-
tqdm_bar_format=tqdm_bar_format,
|
|
312
|
-
exit_on_error=exit_on_error,
|
|
313
|
-
fallback_return_value=fallback_return_value,
|
|
314
|
-
)
|
|
315
|
-
else:
|
|
316
|
-
logger.warning(
|
|
317
|
-
"🐌!! If running llm_classify inside a notebook, patching the event loop with "
|
|
318
|
-
"nest_asyncio will allow asynchronous eval submission, and is significantly "
|
|
319
|
-
"faster. To patch the event loop, run `nest_asyncio.apply()`."
|
|
320
|
-
)
|
|
321
|
-
return SyncExecutor(
|
|
322
|
-
sync_fn,
|
|
323
|
-
tqdm_bar_format=tqdm_bar_format,
|
|
324
|
-
exit_on_error=exit_on_error,
|
|
325
|
-
fallback_return_value=fallback_return_value,
|
|
326
|
-
)
|
|
327
|
-
else:
|
|
328
|
-
return AsyncExecutor(
|
|
329
|
-
async_fn,
|
|
330
|
-
concurrency=concurrency,
|
|
331
|
-
tqdm_bar_format=tqdm_bar_format,
|
|
332
|
-
exit_on_error=exit_on_error,
|
|
333
|
-
fallback_return_value=fallback_return_value,
|
|
334
|
-
)
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
def _running_event_loop_exists() -> bool:
|
|
338
|
-
"""Checks for a running event loop.
|
|
339
|
-
|
|
340
|
-
Returns:
|
|
341
|
-
bool: True if a running event loop exists, False otherwise.
|
|
342
|
-
"""
|
|
343
|
-
try:
|
|
344
|
-
asyncio.get_running_loop()
|
|
345
|
-
return True
|
|
346
|
-
except RuntimeError:
|
|
347
|
-
return False
|
|
62
|
+
# snapped_response, explanation, response
|
|
63
|
+
ParsedLLMResponse: TypeAlias = Tuple[Optional[str], Optional[str], str]
|
|
348
64
|
|
|
349
65
|
|
|
350
66
|
def llm_classify(
|
|
351
67
|
dataframe: pd.DataFrame,
|
|
352
68
|
model: BaseEvalModel,
|
|
353
69
|
template: Union[ClassificationTemplate, PromptTemplate, str],
|
|
354
|
-
rails:
|
|
70
|
+
rails: List[str],
|
|
355
71
|
system_instruction: Optional[str] = None,
|
|
356
72
|
verbose: bool = False,
|
|
357
73
|
use_function_calling_if_available: bool = True,
|
|
358
74
|
provide_explanation: bool = False,
|
|
75
|
+
include_prompt: bool = False,
|
|
76
|
+
include_response: bool = False,
|
|
77
|
+
run_sync: bool = False,
|
|
359
78
|
concurrency: int = 20,
|
|
360
|
-
return_prompt: bool = False,
|
|
361
|
-
return_response: bool = False,
|
|
362
79
|
) -> pd.DataFrame:
|
|
363
|
-
print("llm_classify_new")
|
|
364
80
|
"""Classifies each input row of the dataframe using an LLM. Returns a pandas.DataFrame
|
|
365
81
|
where the first column is named `label` and contains the classification labels. An optional
|
|
366
82
|
column named `explanation` is added when `provide_explanation=True`.
|
|
@@ -392,9 +108,18 @@ def llm_classify(
|
|
|
392
108
|
|
|
393
109
|
provide_explanation (bool, default=False): If True, provides an explanation for each
|
|
394
110
|
classification label. A column named `explanation` is added to the output dataframe.
|
|
395
|
-
Currently, this is only available for models with function calling.
|
|
396
111
|
|
|
397
|
-
|
|
112
|
+
include_prompt (bool, default=False): If True, includes a column named `prompt` in the
|
|
113
|
+
output dataframe containing the prompt used for each classification.
|
|
114
|
+
|
|
115
|
+
include_response (bool, default=False): If True, includes a column named `response` in the
|
|
116
|
+
output dataframe containing the raw response from the LLM.
|
|
117
|
+
|
|
118
|
+
run_sync (bool, default=False): If True, forces synchronous request submission. Otherwise
|
|
119
|
+
evaluations will be run asynchronously if possible.
|
|
120
|
+
|
|
121
|
+
concurrency (int, default=20): The number of concurrent evals if async submission is
|
|
122
|
+
possible.
|
|
398
123
|
|
|
399
124
|
Returns:
|
|
400
125
|
pandas.DataFrame: A dataframe where the `label` column (at column position 0) contains
|
|
@@ -404,18 +129,6 @@ def llm_classify(
|
|
|
404
129
|
from the entries in the rails argument or "NOT_PARSABLE" if the model's output could
|
|
405
130
|
not be parsed.
|
|
406
131
|
"""
|
|
407
|
-
# Check if rails is a single rail to be applied to all, expand if necessary
|
|
408
|
-
# Check if rails is a list of lists
|
|
409
|
-
if all(isinstance(sublist, list) for sublist in rails):
|
|
410
|
-
rails_list = rails
|
|
411
|
-
if use_function_calling_if_available:
|
|
412
|
-
raise ValueError("When using function calling, rails must be a single rail.")
|
|
413
|
-
elif isinstance(rails, list):
|
|
414
|
-
# Assuming rails is a list of strings if it's not a list of lists
|
|
415
|
-
rails_list = [rails] * len(dataframe)
|
|
416
|
-
else:
|
|
417
|
-
raise TypeError("rails must be either a list of strings or a list of lists")
|
|
418
|
-
|
|
419
132
|
tqdm_bar_format = get_tqdm_progress_bar_formatter("llm_classify")
|
|
420
133
|
use_openai_function_call = (
|
|
421
134
|
use_function_calling_if_available
|
|
@@ -441,8 +154,7 @@ def llm_classify(
|
|
|
441
154
|
if generation_info := model.verbose_generation_info():
|
|
442
155
|
printif(verbose, generation_info)
|
|
443
156
|
|
|
444
|
-
def _process_response(
|
|
445
|
-
response, rails_per_response = response_combo
|
|
157
|
+
def _process_response(response: str) -> Tuple[str, Optional[str]]:
|
|
446
158
|
if not use_openai_function_call:
|
|
447
159
|
if provide_explanation:
|
|
448
160
|
unrailed_label, explanation = (
|
|
@@ -457,68 +169,46 @@ def llm_classify(
|
|
|
457
169
|
unrailed_label = response
|
|
458
170
|
explanation = None
|
|
459
171
|
else:
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
unrailed_label = function_arguments.get(_RESPONSE)
|
|
463
|
-
explanation = function_arguments.get(_EXPLANATION)
|
|
464
|
-
except json.JSONDecodeError:
|
|
465
|
-
unrailed_label = response
|
|
466
|
-
explanation = None
|
|
467
|
-
return _snap_to_rail(unrailed_label, rails_per_response, verbose=verbose), explanation
|
|
172
|
+
unrailed_label, explanation = parse_openai_function_call(response)
|
|
173
|
+
return snap_to_rail(unrailed_label, rails, verbose=verbose), explanation
|
|
468
174
|
|
|
469
|
-
async def _run_llm_classification_async(
|
|
470
|
-
prompt_combo: Tuple[str, List[str]],
|
|
471
|
-
) -> Tuple[str, Optional[str], Optional[str], Optional[str], Optional[str]]:
|
|
472
|
-
prompt, rails_per_prompt = prompt_combo
|
|
175
|
+
async def _run_llm_classification_async(prompt: str) -> ParsedLLMResponse:
|
|
473
176
|
with set_verbosity(model, verbose) as verbose_model:
|
|
474
177
|
response = await verbose_model._async_generate(
|
|
475
178
|
prompt, instruction=system_instruction, **model_kwargs
|
|
476
179
|
)
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
# Combine processed_response with prompt and system_instruction
|
|
482
|
-
# proccessed_response, explanation, prompt, system_instruction, unprocessed_response
|
|
483
|
-
final_result= (*processed_response, prompt_to_return, system_instruction_to_return, response_to_return)
|
|
484
|
-
return final_result
|
|
485
|
-
|
|
486
|
-
def _run_llm_classification_sync(
|
|
487
|
-
prompt_combo: Tuple[str, List[str]],
|
|
488
|
-
) -> Tuple[str, Optional[str], Optional[str], Optional[str], Optional[str]]:
|
|
489
|
-
prompt, rails_per_prompt = prompt_combo
|
|
180
|
+
inference, explanation = _process_response(response)
|
|
181
|
+
return inference, explanation, response
|
|
182
|
+
|
|
183
|
+
def _run_llm_classification_sync(prompt: str) -> ParsedLLMResponse:
|
|
490
184
|
with set_verbosity(model, verbose) as verbose_model:
|
|
491
185
|
response = verbose_model._generate(
|
|
492
186
|
prompt, instruction=system_instruction, **model_kwargs
|
|
493
187
|
)
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
# Combine processed_response with prompt and system_instruction
|
|
499
|
-
# proccessed_response, explanation, prompt, system_instruction, unprocessed_response
|
|
500
|
-
final_result= (*processed_response, prompt_to_return, system_instruction_to_return, response_to_return)
|
|
501
|
-
return final_result
|
|
188
|
+
inference, explanation = _process_response(response)
|
|
189
|
+
return inference, explanation, response
|
|
190
|
+
|
|
191
|
+
fallback_return_value: ParsedLLMResponse = (None, None, "")
|
|
502
192
|
|
|
503
193
|
executor = get_executor_on_sync_context(
|
|
504
194
|
_run_llm_classification_sync,
|
|
505
195
|
_run_llm_classification_async,
|
|
196
|
+
run_sync=run_sync,
|
|
506
197
|
concurrency=concurrency,
|
|
507
198
|
tqdm_bar_format=tqdm_bar_format,
|
|
508
199
|
exit_on_error=True,
|
|
509
|
-
fallback_return_value=
|
|
200
|
+
fallback_return_value=fallback_return_value,
|
|
510
201
|
)
|
|
511
|
-
|
|
512
|
-
results = executor.run(
|
|
513
|
-
labels, explanations,
|
|
202
|
+
|
|
203
|
+
results = executor.run(prompts.tolist())
|
|
204
|
+
labels, explanations, responses = zip(*results)
|
|
514
205
|
|
|
515
206
|
return pd.DataFrame(
|
|
516
207
|
data={
|
|
517
208
|
"label": labels,
|
|
518
209
|
**({"explanation": explanations} if provide_explanation else {}),
|
|
519
|
-
**({"prompt":
|
|
520
|
-
**({"
|
|
521
|
-
**({"response": response_to_return} if return_response else {}),
|
|
210
|
+
**({"prompt": prompts} if include_prompt else {}),
|
|
211
|
+
**({"response": responses} if include_response else {}),
|
|
522
212
|
},
|
|
523
213
|
index=dataframe.index,
|
|
524
214
|
)
|
|
@@ -675,60 +365,67 @@ def _default_openai_function(
|
|
|
675
365
|
|
|
676
366
|
|
|
677
367
|
class RunEvalsPayload(NamedTuple):
|
|
368
|
+
evaluator_index: EvaluatorIndex
|
|
369
|
+
row_index: RowIndex
|
|
678
370
|
evaluator: LLMEvaluator
|
|
679
371
|
record: Record
|
|
680
|
-
row_index: RowIndex
|
|
681
372
|
|
|
682
373
|
|
|
683
374
|
def run_evals(
|
|
684
375
|
dataframe: DataFrame,
|
|
685
376
|
evaluators: List[LLMEvaluator],
|
|
377
|
+
provide_explanation: bool = False,
|
|
378
|
+
verbose: bool = False,
|
|
686
379
|
concurrency: int = 20,
|
|
687
|
-
) -> DataFrame:
|
|
380
|
+
) -> List[DataFrame]:
|
|
688
381
|
"""
|
|
689
|
-
Applies a list of evaluators to
|
|
690
|
-
|
|
691
|
-
|
|
382
|
+
Applies a list of evaluators to a dataframe. Outputs a list of dataframes in
|
|
383
|
+
which each dataframe contains the outputs of the corresponding evaluator
|
|
384
|
+
applied to the input dataframe.
|
|
692
385
|
|
|
693
386
|
Args:
|
|
694
|
-
dataframe (
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
387
|
+
dataframe (DataFrame): A pandas dataframe in which each row represents a
|
|
388
|
+
record to be evaluated. All template variable names must appear as
|
|
389
|
+
column names in the dataframe (extra columns unrelated to the template
|
|
390
|
+
are permitted).
|
|
391
|
+
|
|
392
|
+
evaluators (List[LLMEvaluator]): A list of evaluators.
|
|
698
393
|
|
|
699
|
-
|
|
394
|
+
provide_explanation (bool, optional): If True, provides an explanation
|
|
395
|
+
for each evaluation. A column named "explanation" is added to each
|
|
396
|
+
output dataframe.
|
|
700
397
|
|
|
701
|
-
|
|
702
|
-
to
|
|
398
|
+
verbose (bool, optional): If True, prints detailed info to stdout such
|
|
399
|
+
as model invocation parameters and details about retries and snapping to
|
|
400
|
+
rails.
|
|
401
|
+
|
|
402
|
+
concurrency (int, optional): The number of concurrent evals if async
|
|
403
|
+
submission is possible.
|
|
703
404
|
|
|
704
405
|
Returns:
|
|
705
|
-
DataFrame: A
|
|
706
|
-
|
|
707
|
-
the column names match the names of the evaluators. The index of the
|
|
708
|
-
dataframe is the same as the index of the input dataframe.
|
|
406
|
+
List[DataFrame]: A list of dataframes, one for each evaluator, all of
|
|
407
|
+
which have the same number of rows as the input dataframe.
|
|
709
408
|
"""
|
|
710
|
-
if len(set(evaluator.name for evaluator in evaluators)) != len(evaluators):
|
|
711
|
-
raise ValueError("Evaluators must have unique names.")
|
|
712
409
|
|
|
713
|
-
async def
|
|
410
|
+
async def _arun_eval(
|
|
411
|
+
payload: RunEvalsPayload,
|
|
412
|
+
) -> Tuple[EvaluatorIndex, RowIndex, Label, Explanation]:
|
|
413
|
+
label, explanation = await payload.evaluator.aevaluate(
|
|
414
|
+
payload.record, provide_explanation=provide_explanation
|
|
415
|
+
)
|
|
416
|
+
return payload.evaluator_index, payload.row_index, label, explanation
|
|
417
|
+
|
|
418
|
+
def _run_eval(
|
|
714
419
|
payload: RunEvalsPayload,
|
|
715
|
-
) -> Tuple[RowIndex,
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
return row_index, evaluator.name, eval_result
|
|
721
|
-
|
|
722
|
-
def _run_eval_sync(payload: RunEvalsPayload) -> Tuple[RowIndex, EvalName, EvalPrediction]:
|
|
723
|
-
row_index = payload.row_index
|
|
724
|
-
evaluator = payload.evaluator
|
|
725
|
-
record = payload.record
|
|
726
|
-
eval_result = evaluator.evaluate(record)
|
|
727
|
-
return row_index, evaluator.name, eval_result
|
|
420
|
+
) -> Tuple[EvaluatorIndex, RowIndex, Label, Explanation]:
|
|
421
|
+
label, explanation = payload.evaluator.evaluate(
|
|
422
|
+
payload.record, provide_explanation=provide_explanation
|
|
423
|
+
)
|
|
424
|
+
return payload.evaluator_index, payload.row_index, label, explanation
|
|
728
425
|
|
|
729
426
|
executor = get_executor_on_sync_context(
|
|
730
|
-
|
|
731
|
-
|
|
427
|
+
_run_eval,
|
|
428
|
+
_arun_eval,
|
|
732
429
|
concurrency=concurrency,
|
|
733
430
|
tqdm_bar_format=get_tqdm_progress_bar_formatter("run_evals"),
|
|
734
431
|
exit_on_error=True,
|
|
@@ -736,15 +433,23 @@ def run_evals(
|
|
|
736
433
|
)
|
|
737
434
|
payloads = [
|
|
738
435
|
RunEvalsPayload(
|
|
436
|
+
evaluator_index=evaluator_index,
|
|
739
437
|
row_index=row_index,
|
|
740
438
|
evaluator=evaluator,
|
|
741
439
|
record=row.to_dict(),
|
|
742
440
|
)
|
|
743
441
|
for row_index, row in dataframe.iterrows()
|
|
744
|
-
for evaluator in evaluators
|
|
442
|
+
for evaluator_index, evaluator in enumerate(evaluators)
|
|
443
|
+
]
|
|
444
|
+
eval_results: List[DefaultDict[RowIndex, Dict[ColumnName, Union[Label, Explanation]]]] = [
|
|
445
|
+
defaultdict(dict) for _ in range(len(evaluators))
|
|
745
446
|
]
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
447
|
+
for evaluator_index, row_index, label, explanation in executor.run(payloads):
|
|
448
|
+
eval_results[evaluator_index][row_index]["label"] = label
|
|
449
|
+
if explanation is not None:
|
|
450
|
+
eval_results[evaluator_index][row_index]["explanation"] = explanation
|
|
451
|
+
eval_dataframes: List[DataFrame] = []
|
|
452
|
+
for eval_result in eval_results:
|
|
453
|
+
index, eval_data = zip(*eval_result.items())
|
|
454
|
+
eval_dataframes.append(DataFrame(eval_data, index=index))
|
|
455
|
+
return eval_dataframes
|