python-flexeval 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexeval/__init__.py +11 -0
- flexeval/__main__.py +11 -0
- flexeval/classes/__init__.py +15 -0
- flexeval/classes/base.py +32 -0
- flexeval/classes/dataset.py +82 -0
- flexeval/classes/eval_runner.py +158 -0
- flexeval/classes/eval_set_run.py +32 -0
- flexeval/classes/message.py +183 -0
- flexeval/classes/metric.py +55 -0
- flexeval/classes/thread.py +79 -0
- flexeval/classes/tool_call.py +51 -0
- flexeval/classes/turn.py +206 -0
- flexeval/cli.py +104 -0
- flexeval/completions.py +147 -0
- flexeval/compute_metrics.py +788 -0
- flexeval/config.yaml +23 -0
- flexeval/configuration/__init__.py +1 -0
- flexeval/configuration/completion_functions.py +231 -0
- flexeval/configuration/evals.yaml +864 -0
- flexeval/configuration/function_metrics.py +650 -0
- flexeval/configuration/rubric_metrics.yaml +194 -0
- flexeval/data_loader.py +513 -0
- flexeval/db_utils.py +38 -0
- flexeval/dependency_graph.py +234 -0
- flexeval/eval_schema.json +256 -0
- flexeval/function_types.py +173 -0
- flexeval/helpers.py +52 -0
- flexeval/io/__init__.py +1 -0
- flexeval/io/parsers/yaml_parser.py +69 -0
- flexeval/log_utils.py +34 -0
- flexeval/metrics/__init__.py +8 -0
- flexeval/metrics/access.py +28 -0
- flexeval/metrics/save.py +39 -0
- flexeval/rubric.py +62 -0
- flexeval/run_utils.py +65 -0
- flexeval/runner.py +132 -0
- flexeval/schema/__init__.py +11 -0
- flexeval/schema/config_schema.py +46 -0
- flexeval/schema/eval_schema.py +163 -0
- flexeval/schema/evalrun_schema.py +97 -0
- flexeval/schema/rubric_schema.py +40 -0
- flexeval/schema/schema_utils.py +26 -0
- python_flexeval-0.1.5.dist-info/METADATA +118 -0
- python_flexeval-0.1.5.dist-info/RECORD +47 -0
- python_flexeval-0.1.5.dist-info/WHEEL +4 -0
- python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
- python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0
flexeval/config.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
# paths are relative to the root of the repo
|
|
4
|
+
rubric_metrics_path:
|
|
5
|
+
- src/flexeval/configuration/rubric_metrics.yaml
|
|
6
|
+
- example_project/example_specific_rubrics.yaml
|
|
7
|
+
evals_path: src/flexeval/configuration/evals.yaml
|
|
8
|
+
env_file: .env  # located in the same directory as main.py
|
|
9
|
+
logs_path: logs/
|
|
10
|
+
eval_schema_path: src/flexeval/eval_schema.json
|
|
11
|
+
|
|
12
|
+
database_path: data/results/results.db
|
|
13
|
+
|
|
14
|
+
max_workers: 1
|
|
15
|
+
|
|
16
|
+
random_seed_conversation_sampling: 42
|
|
17
|
+
max_n_conversation_threads: 50
|
|
18
|
+
nb_evaluations_per_thread: 1
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# any additional environment variables
|
|
23
|
+
env:
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Built-in completion functions, function metrics, and rubric metrics."""
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""This file contains a list of Python functions that accept conversations as input
|
|
2
|
+
and produce conversational turns (aka completions) as output.
|
|
3
|
+
|
|
4
|
+
When writing a new function, the arguments must include, at minimum:
|
|
5
|
+
* conversation_history - list of dictionaries with keys ("role","content"), whose values are strings
|
|
6
|
+
* kwargs - dictionary of optional extra values; a function may ignore any entries it does not use
|
|
7
|
+
Other arguments can be added, but then must also be specified
|
|
8
|
+
in the "completion_llm" section of the evals.yaml config.
|
|
9
|
+
|
|
10
|
+
The outputs must conform to the structure described here:
|
|
11
|
+
https://platform.openai.com/docs/guides/text-generation/chat-completions-api
|
|
12
|
+
with the following format:
|
|
13
|
+
completion = {
|
|
14
|
+
"choices": [
|
|
15
|
+
{
|
|
16
|
+
"message":{
|
|
17
|
+
"content": MY_CONTENT_HERE,
|
|
18
|
+
"role":"assistant"
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
]
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import json
|
|
27
|
+
import logging
|
|
28
|
+
import os
|
|
29
|
+
from typing import Any, Dict, List
|
|
30
|
+
|
|
31
|
+
import litellm
|
|
32
|
+
import requests
|
|
33
|
+
import tiktoken
|
|
34
|
+
from openai import OpenAI
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def echo_completion(
    conversation_history: List[Dict[str, Any]],
    **kwargs: Any,
) -> Dict[str, Any]:
    """Build a completion that echoes a message without calling any model.

    The echoed text is the content of the first message in
    ``conversation_history``, or ``"No messages yet."`` when the history is
    empty. A ``response`` entry in ``kwargs`` overrides the echoed text.

    Args:
        conversation_history: Messages as dictionaries; the ``"content"`` value
            of each message is read. Content that is missing or not a string
            (for example ``None``) is treated as empty text instead of raising.
        **kwargs: Optional values; only ``"response"`` is read.

    Returns:
        A completion dictionary with ``"choices"``, ``"model"`` (always
        ``"echo"``) and a ``"usage"`` section whose token counts are estimated
        with the ``cl100k_base`` tiktoken encoding.
    """
    if conversation_history:
        # NOTE(review): index 0 echoes the FIRST message, although the original
        # local was named "prev_message" -- confirm whether [-1] was intended.
        response = conversation_history[0].get("content")
        if response is None:
            # Messages may carry no text content; echo an empty string rather
            # than failing later inside the tokenizer.
            response = ""
    else:
        response = "No messages yet."

    if "response" in kwargs:
        response = kwargs["response"]

    # Estimate token usage using a default tokenizer.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    completion_tokens = len(tokenizer.encode(response))

    # Only string contents contribute to the prompt estimate; joining None or
    # other non-string values would raise a TypeError.
    contents = (message.get("content") for message in conversation_history)
    prompt_text = "".join(text for text in contents if isinstance(text, str))
    prompt_tokens = len(tokenizer.encode(prompt_text))

    return {
        "choices": [{"message": {"content": response, "role": "assistant"}}],
        "model": "echo",
        "usage": {
            "completion_tokens": completion_tokens,
            "prompt_tokens": prompt_tokens,
        },
    }
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def litellm_completion(
    conversation_history: list[dict[str, Any]],
    model: str,
    n: int = 1,
    **kwargs,
) -> dict[str, Any]:
    """Request a completion for a conversation through ``litellm.completion``.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries; passed to LiteLLM as ``messages``.
        model: Name of the model to use for the completion.
        n: Number of completion choices to generate. Defaults to 1.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``litellm.completion``; accepted values depend on the model.

    Returns:
        The response dumped to a dictionary with unset fields excluded.
    """
    result = litellm.completion(
        messages=conversation_history, model=model, n=n, **kwargs
    )
    dumped = result.model_dump(exclude_unset=True)
    return dumped
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def open_ai_completion(
    conversation_history: List[Dict[str, Any]],
    model_name: str,
    api_key_name: str,
    n: int = 1,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Request a chat completion from the OpenAI API for a conversation.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries; sent as ``messages``.
        model_name: Name of the OpenAI model to use.
        api_key_name: Name of the environment variable that holds the API key.
        n: Number of completion choices to generate; converted with ``int()``
            before being sent. Defaults to 1.
        **kwargs: Extra keyword arguments forwarded to
            ``chat.completions.create``.

    Returns:
        The API response dumped to a dictionary with unset fields excluded.
    """
    # The key is looked up by variable name so it never appears in config.
    api_key = os.getenv(api_key_name)
    completions_api = OpenAI(api_key=api_key).chat.completions

    result = completions_api.create(
        model=model_name,
        messages=conversation_history,
        n=int(n),
        **kwargs,
    )
    return result.model_dump(exclude_unset=True)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def open_ai_completion_async(
    conversation_history: List[Dict[str, Any]],
    model_name: str,
    api_key_name: str,
    n: int = 1,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Request a chat completion from the OpenAI API for a conversation.

    Despite its name, this function is a regular blocking function, not a
    coroutine: it returns the finished response directly. It takes the same
    arguments and returns the same result as ``open_ai_completion`` and simply
    delegates to it, so the two entry points cannot drift apart.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries.
        model_name: Name of the OpenAI model to use.
        api_key_name: Name of the environment variable that holds the API key.
        n: Number of completion choices to generate. Defaults to 1.
        **kwargs: Extra keyword arguments forwarded to the OpenAI API client.

    Returns:
        The API response dumped to a dictionary with unset fields excluded.
    """
    return open_ai_completion(
        conversation_history,
        model_name,
        api_key_name,
        n=n,
        **kwargs,
    )
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def jan_completion(conversation_history, model_name, endpoint, **kwargs):
    """Request a chat completion from an OpenAI-compatible server at ``endpoint``.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries; sent as ``messages``.
        model_name: Name of the model to request from the server.
        endpoint: Base URL of the server, passed to the OpenAI client as
            ``base_url``.
        **kwargs: Extra keyword arguments forwarded to
            ``chat.completions.create``. ``temperature`` defaults to 0.7 but
            may be overridden here.

    Returns:
        The response dumped to a dictionary with unset fields excluded.
    """
    # The client is given a placeholder API key string instead of a real one.
    client = OpenAI(base_url=endpoint, api_key="not-needed")

    # Merge the default temperature with the caller's options. Passing
    # temperature=0.7 next to **kwargs raised "got multiple values for keyword
    # argument 'temperature'" whenever the caller supplied its own value.
    request_options = {"temperature": 0.7, **kwargs}

    raw_response = client.chat.completions.create(
        model=model_name, messages=conversation_history, **request_options
    )
    return raw_response.model_dump(exclude_unset=True)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def lm_studio_completion(conversation_history, model_name, endpoint, **kwargs):
    """Request a chat completion from an OpenAI-compatible server at ``endpoint``.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries; sent as ``messages``.
        model_name: Name of the model to request from the server.
        endpoint: Base URL of the server, passed to the OpenAI client as
            ``base_url``.
        **kwargs: Accepted for interface compatibility but not forwarded to
            the API call.

    Returns:
        The response dumped to a dictionary with unset fields excluded.
    """
    # The client is given a placeholder API key string instead of a real one.
    chat_api = OpenAI(base_url=endpoint, api_key="not-needed").chat.completions

    # The sampling temperature is fixed at 0.7 for every request.
    reply = chat_api.create(
        model=model_name,
        messages=conversation_history,
        temperature=0.7,
    )
    return reply.model_dump(exclude_unset=True)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def generic_rest_api_completion(
    conversation_history,
    api_key_name: str,
    endpoint: str,
    **kwargs,
):
    """POST the conversation to a REST endpoint and wrap its reply as a completion.

    The conversation is sent as a JSON body with a bearer token read from the
    environment. The JSON reply must contain a ``"completion"`` key, whose
    value becomes the assistant message content.

    Args:
        conversation_history: The conversation so far; serialized with
            ``json.dumps`` and sent as the request body.
        api_key_name: Name of the environment variable that holds the bearer
            token.
        endpoint: URL that receives the POST request.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        A dictionary of the form
        ``{"choices": [{"message": {"content": ..., "role": "assistant"}}]}``.

    Raises:
        KeyError: If the JSON reply has no ``"completion"`` key.
    """
    # NOTE(review): when the variable is unset this sends "Bearer None".
    auth_key = os.environ.get(api_key_name, None)

    # Headers including the authentication key and content type.
    headers = {
        "Authorization": f"Bearer {auth_key}",
        "Content-Type": "application/json",
    }

    response = requests.post(
        endpoint, data=json.dumps(conversation_history), headers=headers
    )

    # Log the outcome BEFORE parsing, so a failing response is recorded even
    # when its body cannot be parsed below. The previous calls passed
    # response.text as a format argument with no %s placeholder, which made
    # logging report a formatting error instead of emitting the message.
    if response.status_code == 200:
        logger.info("Success: %s", response.text)
    else:
        logger.error("Error (HTTP %s): %s", response.status_code, response.text)

    response_data = response.json()
    content = response_data["completion"]

    return {"choices": [{"message": {"content": content, "role": "assistant"}}]}
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# when no completion function is needed
|
|
211
|
+
def no_completion_fn(
    conversation_history,
    **kwargs,
):
    """Return a fixed completion without using the conversation or any model.

    Args:
        conversation_history: Ignored.
        **kwargs: Ignored.

    Returns:
        A completion dictionary whose single choice is an assistant message
        with the content ``"hello world"``.
    """
    fixed_message = {"content": "hello world", "role": "assistant"}
    return {"choices": [{"message": fixed_message}]}
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def gpt_3p5_turbo(conversation_history, model_name, key_name, **kwargs):
    """Request a chat completion from OpenAI's ``gpt-3.5-turbo`` model.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries; sent as ``messages``.
        model_name: Not used; the model is always ``"gpt-3.5-turbo"``.
        key_name: Name of the environment variable that holds the API key.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        The API response dumped to a dictionary with unset fields excluded.
    """
    api_key = os.getenv(key_name)
    chat_api = OpenAI(api_key=api_key).chat.completions
    reply = chat_api.create(
        model="gpt-3.5-turbo",
        messages=conversation_history,
    )
    return reply.model_dump(exclude_unset=True)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def placeholder_completion(conversation_history, model_name, **kwargs):
    """Return a canned completion for testing; the content is always 'hi'.

    Args:
        conversation_history: Ignored.
        model_name: Ignored.
        **kwargs: Ignored.

    Returns:
        A completion dictionary whose single choice is an assistant message
        with the content ``"hi"``.
    """
    canned_message = {"content": "hi", "role": "assistant"}
    return {"choices": [{"message": canned_message}]}
|