arize-phoenix 1.9.1rc2__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arize_phoenix-1.9.1rc2.dist-info → arize_phoenix-2.0.0.dist-info}/METADATA +2 -2
- {arize_phoenix-1.9.1rc2.dist-info → arize_phoenix-2.0.0.dist-info}/RECORD +21 -19
- {arize_phoenix-1.9.1rc2.dist-info → arize_phoenix-2.0.0.dist-info}/WHEEL +1 -1
- phoenix/__init__.py +1 -1
- phoenix/core/traces.py +1 -1
- phoenix/exceptions.py +2 -0
- phoenix/experimental/evals/__init__.py +3 -2
- phoenix/experimental/evals/evaluators.py +89 -46
- phoenix/experimental/evals/functions/classify.py +102 -386
- phoenix/experimental/evals/functions/executor.py +353 -0
- phoenix/experimental/evals/functions/generate.py +76 -32
- phoenix/experimental/evals/models/rate_limiters.py +25 -5
- phoenix/experimental/evals/templates/__init__.py +0 -2
- phoenix/experimental/evals/templates/template.py +2 -5
- phoenix/experimental/evals/utils/__init__.py +66 -0
- phoenix/server/app.py +3 -2
- phoenix/server/main.py +3 -0
- phoenix/server/static/index.js +548 -449
- phoenix/trace/openai/instrumentor.py +51 -14
- {arize_phoenix-1.9.1rc2.dist-info → arize_phoenix-2.0.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-1.9.1rc2.dist-info → arize_phoenix-2.0.0.dist-info}/licenses/LICENSE +0 -0
phoenix/experimental/evals/functions/classify.py

@@ -1,15 +1,9 @@
 from __future__ import annotations

-import asyncio
-import json
 import logging
-import signal
-import traceback
 from collections import defaultdict
 from typing import (
     Any,
-    Callable,
-    Coroutine,
     DefaultDict,
     Dict,
     Iterable,
@@ -17,7 +11,6 @@ from typing import (
     Mapping,
     NamedTuple,
     Optional,
-    Sequence,
     Tuple,
     Union,
     cast,
@@ -25,13 +18,12 @@ from typing import (

 import pandas as pd
 from pandas import DataFrame
-from tqdm.auto import tqdm
 from typing_extensions import TypeAlias

-from phoenix.experimental.evals.evaluators import LLMEvaluator
+from phoenix.experimental.evals.evaluators import LLMEvaluator
+from phoenix.experimental.evals.functions.executor import get_executor_on_sync_context
 from phoenix.experimental.evals.models import BaseEvalModel, OpenAIModel, set_verbosity
 from phoenix.experimental.evals.templates import (
-    NOT_PARSABLE,
     RAG_RELEVANCY_PROMPT_RAILS_MAP,
     RAG_RELEVANCY_PROMPT_TEMPLATE,
     ClassificationTemplate,
@@ -40,7 +32,12 @@ from phoenix.experimental.evals.templates import (
     map_template,
     normalize_classification_template,
 )
-from phoenix.experimental.evals.utils import
+from phoenix.experimental.evals.utils import (
+    NOT_PARSABLE,
+    get_tqdm_progress_bar_formatter,
+    parse_openai_function_call,
+    snap_to_rail,
+)
 from phoenix.trace.semantic_conventions import DOCUMENT_CONTENT, INPUT_VALUE, RETRIEVAL_DOCUMENTS
 from phoenix.utilities.logging import printif

@@ -55,316 +52,31 @@ OPENINFERENCE_DOCUMENT_COLUMN_NAME = "attributes." + RETRIEVAL_DOCUMENTS
 _RESPONSE = "response"
 _EXPLANATION = "explanation"

-
-
+ColumnName: TypeAlias = str
+Label: TypeAlias = str
+Explanation: TypeAlias = Optional[str]
 Record: TypeAlias = Mapping[str, Any]
+EvaluatorIndex: TypeAlias = int
 RowIndex: TypeAlias = Any

-
-class Unset:
-    pass
-
-
-_unset = Unset()
-
-
-class AsyncExecutor:
-    """
-    A class that provides asynchronous execution of tasks using a producer-consumer pattern.
-
-    An async interface is provided by the `execute` method, which returns a coroutine, and a sync
-    interface is provided by the `run` method.
-
-    Args:
-        generation_fn (Callable[[Any], Coroutine[Any, Any, Any]]): A coroutine function that
-            generates tasks to be executed.
-
-        concurrency (int, optional): The number of concurrent consumers. Defaults to 3.
-
-        tqdm_bar_format (Optional[str], optional): The format string for the progress bar. Defaults
-            to None.
-
-        exit_on_error (bool, optional): Whether to exit execution on the first encountered error.
-            Defaults to True.
-
-        fallback_return_value (Union[Unset, Any], optional): The fallback return value for tasks
-            that encounter errors. Defaults to _unset.
-    """
-
-    def __init__(
-        self,
-        generation_fn: Callable[[Any], Coroutine[Any, Any, Any]],
-        concurrency: int = 3,
-        tqdm_bar_format: Optional[str] = None,
-        exit_on_error: bool = True,
-        max_retries: int = 10,
-        fallback_return_value: Union[Unset, Any] = _unset,
-    ):
-        self.generate = generation_fn
-        self.fallback_return_value = fallback_return_value
-        self.concurrency = concurrency
-        self.tqdm_bar_format = tqdm_bar_format
-        self.exit_on_error = exit_on_error
-        self.max_retries = max_retries
-        self.base_priority = 0
-
-        self._TERMINATE = asyncio.Event()
-
-    def _signal_handler(self, signum: int, frame: Any) -> None:
-        self._TERMINATE.set()
-        tqdm.write("Process was interrupted. The return value will be incomplete...")
-
-    async def producer(
-        self,
-        inputs: Sequence[Any],
-        queue: asyncio.PriorityQueue[Tuple[int, Any]],
-        max_fill: int,
-        done_producing: asyncio.Event,
-    ) -> None:
-        try:
-            for index, input in enumerate(inputs):
-                if self._TERMINATE.is_set():
-                    break
-                while queue.qsize() >= max_fill:
-                    # keep room in the queue for requeues
-                    await asyncio.sleep(1)
-                await queue.put((self.base_priority, (index, input)))
-        finally:
-            done_producing.set()
-
-    async def consumer(
-        self,
-        output: List[Any],
-        queue: asyncio.PriorityQueue[Tuple[int, Any]],
-        done_producing: asyncio.Event,
-        progress_bar: tqdm[Any],
-    ) -> None:
-        termination_signal_task = None
-        while True:
-            marked_done = False
-            try:
-                priority, item = await asyncio.wait_for(queue.get(), timeout=1)
-            except asyncio.TimeoutError:
-                if done_producing.is_set() and queue.empty():
-                    break
-                continue
-            if self._TERMINATE.is_set():
-                # discard any remaining items in the queue
-                queue.task_done()
-                marked_done = True
-                continue
-
-            index, payload = item
-            try:
-                generate_task = asyncio.create_task(self.generate(payload))
-                termination_signal_task = asyncio.create_task(self._TERMINATE.wait())
-                done, pending = await asyncio.wait(
-                    [generate_task, termination_signal_task],
-                    timeout=360 * 2,
-                    return_when=asyncio.FIRST_COMPLETED,
-                )
-                if generate_task in done:
-                    output[index] = generate_task.result()
-                    progress_bar.update()
-                elif self._TERMINATE.is_set():
-                    # discard the pending task and remaining items in the queue
-                    if not generate_task.done():
-                        generate_task.cancel()
-                        try:
-                            # allow any cleanup to finish for the cancelled task
-                            await generate_task
-                        except asyncio.CancelledError:
-                            # Handle the cancellation exception
-                            pass
-                    queue.task_done()
-                    marked_done = True
-                    continue
-                else:
-                    tqdm.write("Worker timeout, requeuing")
-                    # task timeouts are requeued at base priority
-                    await queue.put((self.base_priority, item))
-            except Exception as exc:
-                if (retry_count := abs(priority)) <= self.max_retries:
-                    tqdm.write(
-                        f"Exception in worker on attempt {retry_count + 1}: raised {repr(exc)}"
-                    )
-                    tqdm.write("Requeuing...")
-                    await queue.put((priority - 1, item))
-                else:
-                    tqdm.write(f"Exception in worker: {traceback.format_exc()}")
-                    if self.exit_on_error:
-                        self._TERMINATE.set()
-                    else:
-                        progress_bar.update()
-            finally:
-                if not marked_done:
-                    queue.task_done()
-                if termination_signal_task and not termination_signal_task.done():
-                    termination_signal_task.cancel()
-
-    async def execute(self, inputs: Sequence[Any]) -> List[Any]:
-        signal.signal(signal.SIGINT, self._signal_handler)
-        outputs = [self.fallback_return_value] * len(inputs)
-        progress_bar = tqdm(total=len(inputs), bar_format=self.tqdm_bar_format)
-
-        max_queue_size = 5 * self.concurrency  # limit the queue to bound memory usage
-        max_fill = max_queue_size - (2 * self.concurrency)  # ensure there is always room to requeue
-        queue: asyncio.PriorityQueue[Tuple[int, Any]] = asyncio.PriorityQueue(
-            maxsize=max_queue_size
-        )
-        done_producing = asyncio.Event()
-
-        producer = asyncio.create_task(self.producer(inputs, queue, max_fill, done_producing))
-        consumers = [
-            asyncio.create_task(self.consumer(outputs, queue, done_producing, progress_bar))
-            for _ in range(self.concurrency)
-        ]
-
-        await asyncio.gather(producer, *consumers)
-        join_task = asyncio.create_task(queue.join())
-        termination_signal_task = asyncio.create_task(self._TERMINATE.wait())
-        done, pending = await asyncio.wait(
-            [join_task, termination_signal_task], return_when=asyncio.FIRST_COMPLETED
-        )
-        if termination_signal_task in done:
-            # Cancel all tasks
-            if not join_task.done():
-                join_task.cancel()
-            if not producer.done():
-                producer.cancel()
-            for task in consumers:
-                if not task.done():
-                    task.cancel()
-
-        if not termination_signal_task.done():
-            termination_signal_task.cancel()
-        return outputs
-
-    def run(self, inputs: Sequence[Any]) -> List[Any]:
-        return asyncio.run(self.execute(inputs))
-
-
-class SyncExecutor:
-    """
-    Synchronous executor for generating outputs from inputs using a given generation function.
-
-    Args:
-        generation_fn (Callable[[Any], Any]): The generation function that takes an input and
-            returns an output.
-
-        tqdm_bar_format (Optional[str], optional): The format string for the progress bar. Defaults
-            to None.
-
-        exit_on_error (bool, optional): Whether to exit execution on the first encountered error.
-            Defaults to True.
-
-        fallback_return_value (Union[Unset, Any], optional): The fallback return value for tasks
-            that encounter errors. Defaults to _unset.
-    """
-
-    def __init__(
-        self,
-        generation_fn: Callable[[Any], Any],
-        tqdm_bar_format: Optional[str] = None,
-        exit_on_error: bool = True,
-        fallback_return_value: Union[Unset, Any] = _unset,
-    ):
-        self.generate = generation_fn
-        self.fallback_return_value = fallback_return_value
-        self.tqdm_bar_format = tqdm_bar_format
-        self.exit_on_error = exit_on_error
-
-        self._TERMINATE = False
-
-    def _signal_handler(self, signum: int, frame: Any) -> None:
-        tqdm.write("Process was interrupted. The return value will be incomplete...")
-        self._TERMINATE = True
-
-    def run(self, inputs: Sequence[Any]) -> List[Any]:
-        signal.signal(signal.SIGINT, self._signal_handler)
-        outputs = [self.fallback_return_value] * len(inputs)
-        progress_bar = tqdm(total=len(inputs), bar_format=self.tqdm_bar_format)
-
-        for index, input in enumerate(inputs):
-            if self._TERMINATE:
-                break
-            try:
-                result = self.generate(input)
-                outputs[index] = result
-                progress_bar.update()
-            except Exception as e:
-                tqdm.write(f"Exception in worker: {e}")
-                if self.exit_on_error:
-                    break
-                else:
-                    progress_bar.update()
-        return outputs
-
-
-def get_executor_on_sync_context(
-    sync_fn: Callable[[Any], Any],
-    async_fn: Callable[[Any], Coroutine[Any, Any, Any]],
-    concurrency: int = 3,
-    tqdm_bar_format: Optional[str] = None,
-    exit_on_error: bool = True,
-    fallback_return_value: Union[Unset, Any] = _unset,
-) -> Union[AsyncExecutor, SyncExecutor]:
-    if _running_event_loop_exists():
-        if getattr(asyncio, "_nest_patched", False):
-            return AsyncExecutor(
-                async_fn,
-                concurrency=concurrency,
-                tqdm_bar_format=tqdm_bar_format,
-                exit_on_error=exit_on_error,
-                fallback_return_value=fallback_return_value,
-            )
-        else:
-            logger.warning(
-                "🐌!! If running llm_classify inside a notebook, patching the event loop with "
-                "nest_asyncio will allow asynchronous eval submission, and is significantly "
-                "faster. To patch the event loop, run `nest_asyncio.apply()`."
-            )
-            return SyncExecutor(
-                sync_fn,
-                tqdm_bar_format=tqdm_bar_format,
-                exit_on_error=exit_on_error,
-                fallback_return_value=fallback_return_value,
-            )
-    else:
-        return AsyncExecutor(
-            async_fn,
-            concurrency=concurrency,
-            tqdm_bar_format=tqdm_bar_format,
-            exit_on_error=exit_on_error,
-            fallback_return_value=fallback_return_value,
-        )
-
-
-def _running_event_loop_exists() -> bool:
-    """Checks for a running event loop.
-
-    Returns:
-        bool: True if a running event loop exists, False otherwise.
-    """
-    try:
-        asyncio.get_running_loop()
-        return True
-    except RuntimeError:
-        return False
+# snapped_response, explanation, response
+ParsedLLMResponse: TypeAlias = Tuple[Optional[str], Optional[str], str]


 def llm_classify(
     dataframe: pd.DataFrame,
     model: BaseEvalModel,
     template: Union[ClassificationTemplate, PromptTemplate, str],
-    rails:
+    rails: List[str],
     system_instruction: Optional[str] = None,
     verbose: bool = False,
     use_function_calling_if_available: bool = True,
     provide_explanation: bool = False,
+    include_prompt: bool = False,
+    include_response: bool = False,
+    run_sync: bool = False,
     concurrency: int = 20,
 ) -> pd.DataFrame:
-    print("llm_classify_new")
     """Classifies each input row of the dataframe using an LLM. Returns a pandas.DataFrame
     where the first column is named `label` and contains the classification labels. An optional
     column named `explanation` is added when `provide_explanation=True`.
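The executor machinery deleted above was not dropped: per the file summary at the top, it moved into the new phoenix/experimental/evals/functions/executor.py (+353). The part worth internalizing is how get_executor_on_sync_context picks an executor. Below is a minimal sketch of that dispatch rule, distilled from the removed code; it is not the packaged 2.0.0 implementation, which also accepts the new run_sync flag.

import asyncio

def running_event_loop_exists() -> bool:
    # asyncio.get_running_loop() raises RuntimeError outside a running loop.
    try:
        asyncio.get_running_loop()
        return True
    except RuntimeError:
        return False

def choose_executor() -> str:
    if running_event_loop_exists():
        # Inside a notebook-style loop, async fan-out is only possible once
        # nest_asyncio has patched the loop (it sets asyncio._nest_patched).
        if getattr(asyncio, "_nest_patched", False):
            return "AsyncExecutor"
        return "SyncExecutor"  # with a warning suggesting nest_asyncio.apply()
    return "AsyncExecutor"  # no loop running, so asyncio.run() is safe

print(choose_executor())  # "AsyncExecutor" when run as a plain script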
@@ -396,9 +108,18 @@ def llm_classify(

         provide_explanation (bool, default=False): If True, provides an explanation for each
             classification label. A column named `explanation` is added to the output dataframe.
-            Currently, this is only available for models with function calling.

-
+        include_prompt (bool, default=False): If True, includes a column named `prompt` in the
+            output dataframe containing the prompt used for each classification.
+
+        include_response (bool, default=False): If True, includes a column named `response` in the
+            output dataframe containing the raw response from the LLM.
+
+        run_sync (bool, default=False): If True, forces synchronous request submission. Otherwise
+            evaluations will be run asynchronously if possible.
+
+        concurrency (int, default=20): The number of concurrent evals if async submission is
+            possible.

     Returns:
         pandas.DataFrame: A dataframe where the `label` column (at column position 0) contains
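Taken together, the new flags let one call return the label plus full audit columns. A hedged usage sketch of the expanded signature follows; the input dataframe and model name are illustrative assumptions, and only the keyword arguments come from the diff.

import pandas as pd
from phoenix.experimental.evals import OpenAIModel, llm_classify
from phoenix.experimental.evals.templates import (
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
)

# Illustrative input: one record per row, columns matching the template variables.
df = pd.DataFrame({"input": ["What is Phoenix?"], "reference": ["Phoenix is..."]})

result = llm_classify(
    dataframe=df,
    model=OpenAIModel(model_name="gpt-4"),  # assumed model choice
    template=RAG_RELEVANCY_PROMPT_TEMPLATE,
    rails=list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,  # adds an "explanation" column
    include_prompt=True,       # new in 2.0.0: adds a "prompt" column
    include_response=True,     # new in 2.0.0: adds a "response" column
    run_sync=False,            # new in 2.0.0: set True to force sync submission
)
# result columns: label, explanation, prompt, response (indexed like df)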
@@ -408,18 +129,6 @@ def llm_classify(
         from the entries in the rails argument or "NOT_PARSABLE" if the model's output could
         not be parsed.
     """
-    # Check if rails is a single rail to be applied to all, expand if necessary
-    # Check if rails is a list of lists
-    if all(isinstance(sublist, list) for sublist in rails):
-        rails_list = rails
-        if use_function_calling_if_available:
-            raise ValueError("When using function calling, rails must be a single rail.")
-    elif isinstance(rails, list):
-        # Assuming rails is a list of strings if it's not a list of lists
-        rails_list = [rails] * len(dataframe)
-    else:
-        raise TypeError("rails must be either a list of strings or a list of lists")
-
     tqdm_bar_format = get_tqdm_progress_bar_formatter("llm_classify")
     use_openai_function_call = (
         use_function_calling_if_available
@@ -445,8 +154,7 @@ def llm_classify(
     if generation_info := model.verbose_generation_info():
         printif(verbose, generation_info)

-    def _process_response(
-        response, rails_per_response = response_combo
+    def _process_response(response: str) -> Tuple[str, Optional[str]]:
         if not use_openai_function_call:
             if provide_explanation:
                 unrailed_label, explanation = (
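The next hunk replaces the inline json.loads / function_arguments.get(...) handling (its remnants are visible in the deleted lines below) with the new parse_openai_function_call helper from utils. A conceptual stand-in, reconstructed from the deleted logic rather than from the packaged helper:

import json
from typing import Optional, Tuple

def parse_function_call_sketch(raw: str) -> Tuple[Optional[str], Optional[str]]:
    # The keys mirror the module constants _RESPONSE and _EXPLANATION above.
    try:
        arguments = json.loads(raw)
        return arguments.get("response"), arguments.get("explanation")
    except json.JSONDecodeError:
        # Unparsable payloads fall back to the raw text with no explanation.
        return raw, None

print(parse_function_call_sketch('{"response": "relevant", "explanation": "on topic"}'))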
@@ -461,53 +169,46 @@ def llm_classify(
                 unrailed_label = response
                 explanation = None
         else:
-
-
-                unrailed_label = function_arguments.get(_RESPONSE)
-                explanation = function_arguments.get(_EXPLANATION)
-            except json.JSONDecodeError:
-                unrailed_label = response
-                explanation = None
-        return _snap_to_rail(unrailed_label, rails_per_response, verbose=verbose), explanation
+            unrailed_label, explanation = parse_openai_function_call(response)
+        return snap_to_rail(unrailed_label, rails, verbose=verbose), explanation

-    async def _run_llm_classification_async(
-        prompt_combo: Tuple[str, List[str]],
-    ) -> Tuple[str, Optional[str]]:
-        prompt, rails_per_prompt = prompt_combo
+    async def _run_llm_classification_async(prompt: str) -> ParsedLLMResponse:
         with set_verbosity(model, verbose) as verbose_model:
             response = await verbose_model._async_generate(
                 prompt, instruction=system_instruction, **model_kwargs
             )
-
-        return
+        inference, explanation = _process_response(response)
+        return inference, explanation, response

-    def _run_llm_classification_sync(
-        prompt_combo: Tuple[str, List[str]],
-    ) -> Tuple[str, Optional[str]]:
-        prompt, rails_per_prompt = prompt_combo
+    def _run_llm_classification_sync(prompt: str) -> ParsedLLMResponse:
         with set_verbosity(model, verbose) as verbose_model:
             response = verbose_model._generate(
                 prompt, instruction=system_instruction, **model_kwargs
             )
-
-        return
+        inference, explanation = _process_response(response)
+        return inference, explanation, response
+
+    fallback_return_value: ParsedLLMResponse = (None, None, "")

     executor = get_executor_on_sync_context(
         _run_llm_classification_sync,
         _run_llm_classification_async,
+        run_sync=run_sync,
         concurrency=concurrency,
         tqdm_bar_format=tqdm_bar_format,
         exit_on_error=True,
-        fallback_return_value=
+        fallback_return_value=fallback_return_value,
     )
-
-    results = executor.run(
-    labels, explanations = zip(*results)
+
+    results = executor.run(prompts.tolist())
+    labels, explanations, responses = zip(*results)

     return pd.DataFrame(
         data={
             "label": labels,
             **({"explanation": explanations} if provide_explanation else {}),
+            **({"prompt": prompts} if include_prompt else {}),
+            **({"response": responses} if include_response else {}),
         },
         index=dataframe.index,
     )
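For readers unfamiliar with rail snapping: snap_to_rail (now imported from utils) coerces the model's free-text label onto one of the caller-supplied rails, and anything it cannot match becomes NOT_PARSABLE, which is how the `label` column can be filtered for failures. A deliberately simplified stand-in is shown below; the packaged function is more tolerant of surrounding text.

from typing import List, Optional

NOT_PARSABLE = "NOT_PARSABLE"

def snap_to_rail_sketch(raw: Optional[str], rails: List[str]) -> str:
    # Normalize, then require an exact match against one of the rails.
    if raw is None:
        return NOT_PARSABLE
    label = raw.strip().lower()
    return label if label in {rail.lower() for rail in rails} else NOT_PARSABLE

print(snap_to_rail_sketch("  Relevant ", ["relevant", "irrelevant"]))  # relevant
print(snap_to_rail_sketch("no idea", ["relevant", "irrelevant"]))      # NOT_PARSABLE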
@@ -664,60 +365,67 @@ def _default_openai_function(


 class RunEvalsPayload(NamedTuple):
+    evaluator_index: EvaluatorIndex
+    row_index: RowIndex
     evaluator: LLMEvaluator
     record: Record
-    row_index: RowIndex


 def run_evals(
     dataframe: DataFrame,
     evaluators: List[LLMEvaluator],
+    provide_explanation: bool = False,
+    verbose: bool = False,
     concurrency: int = 20,
-) -> DataFrame:
+) -> List[DataFrame]:
     """
-    Applies a list of evaluators to
-
-
+    Applies a list of evaluators to a dataframe. Outputs a list of dataframes in
+    which each dataframe contains the outputs of the corresponding evaluator
+    applied to the input dataframe.

     Args:
-        dataframe (
-
-
-
+        dataframe (DataFrame): A pandas dataframe in which each row represents a
+            record to be evaluated. All template variable names must appear as
+            column names in the dataframe (extra columns unrelated to the template
+            are permitted).
+
+        evaluators (List[LLMEvaluator]): A list of evaluators.

-
+        provide_explanation (bool, optional): If True, provides an explanation
+            for each evaluation. A column named "explanation" is added to each
+            output dataframe.

-
-        to
+        verbose (bool, optional): If True, prints detailed info to stdout such
+            as model invocation parameters and details about retries and snapping to
+            rails.
+
+        concurrency (int, optional): The number of concurrent evals if async
+            submission is possible.

     Returns:
-        DataFrame: A
-
-            the column names match the names of the evaluators. The index of the
-            dataframe is the same as the index of the input dataframe.
+        List[DataFrame]: A list of dataframes, one for each evaluator, all of
+            which have the same number of rows as the input dataframe.
     """
-    if len(set(evaluator.name for evaluator in evaluators)) != len(evaluators):
-        raise ValueError("Evaluators must have unique names.")

-    async def
+    async def _arun_eval(
+        payload: RunEvalsPayload,
+    ) -> Tuple[EvaluatorIndex, RowIndex, Label, Explanation]:
+        label, explanation = await payload.evaluator.aevaluate(
+            payload.record, provide_explanation=provide_explanation
+        )
+        return payload.evaluator_index, payload.row_index, label, explanation
+
+    def _run_eval(
         payload: RunEvalsPayload,
-    ) -> Tuple[RowIndex,
-
-
-
-
-        return row_index, evaluator.name, eval_result
-
-    def _run_eval_sync(payload: RunEvalsPayload) -> Tuple[RowIndex, EvalName, EvalPrediction]:
-        row_index = payload.row_index
-        evaluator = payload.evaluator
-        record = payload.record
-        eval_result = evaluator.evaluate(record)
-        return row_index, evaluator.name, eval_result
+    ) -> Tuple[EvaluatorIndex, RowIndex, Label, Explanation]:
+        label, explanation = payload.evaluator.evaluate(
+            payload.record, provide_explanation=provide_explanation
+        )
+        return payload.evaluator_index, payload.row_index, label, explanation

     executor = get_executor_on_sync_context(
-
-
+        _run_eval,
+        _arun_eval,
         concurrency=concurrency,
         tqdm_bar_format=get_tqdm_progress_bar_formatter("run_evals"),
         exit_on_error=True,
@@ -725,15 +433,23 @@ def run_evals(
     )
     payloads = [
         RunEvalsPayload(
+            evaluator_index=evaluator_index,
             row_index=row_index,
             evaluator=evaluator,
             record=row.to_dict(),
         )
         for row_index, row in dataframe.iterrows()
-        for evaluator in evaluators
+        for evaluator_index, evaluator in enumerate(evaluators)
+    ]
+    eval_results: List[DefaultDict[RowIndex, Dict[ColumnName, Union[Label, Explanation]]]] = [
+        defaultdict(dict) for _ in range(len(evaluators))
     ]
-
-
-
-
-
+    for evaluator_index, row_index, label, explanation in executor.run(payloads):
+        eval_results[evaluator_index][row_index]["label"] = label
+        if explanation is not None:
+            eval_results[evaluator_index][row_index]["explanation"] = explanation
+    eval_dataframes: List[DataFrame] = []
+    for eval_result in eval_results:
+        index, eval_data = zip(*eval_result.items())
+        eval_dataframes.append(DataFrame(eval_data, index=index))
+    return eval_dataframes
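A hedged sketch of calling the reworked run_evals: the LLMEvaluator construction below is an assumption (its constructor lives in evaluators.py, also rewritten in this release), while the call shape and the list-of-dataframes return come from the hunks above.

import pandas as pd
from phoenix.experimental.evals import OpenAIModel
from phoenix.experimental.evals.evaluators import LLMEvaluator
from phoenix.experimental.evals.functions.classify import run_evals
from phoenix.experimental.evals.templates import RAG_RELEVANCY_PROMPT_TEMPLATE

# Assumed constructor: a model plus a classification template.
evaluator = LLMEvaluator(OpenAIModel(model_name="gpt-4"), RAG_RELEVANCY_PROMPT_TEMPLATE)

df = pd.DataFrame({"input": ["a query"], "reference": ["a document"]})

# 2.0.0 returns one row-aligned dataframe per evaluator instead of a single
# wide dataframe keyed by evaluator name (the name-uniqueness check was dropped).
(relevance_df,) = run_evals(
    dataframe=df,
    evaluators=[evaluator],
    provide_explanation=True,  # adds an "explanation" column to each output
)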