python-flexeval 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. flexeval/__init__.py +11 -0
  2. flexeval/__main__.py +11 -0
  3. flexeval/classes/__init__.py +15 -0
  4. flexeval/classes/base.py +32 -0
  5. flexeval/classes/dataset.py +82 -0
  6. flexeval/classes/eval_runner.py +158 -0
  7. flexeval/classes/eval_set_run.py +32 -0
  8. flexeval/classes/message.py +183 -0
  9. flexeval/classes/metric.py +55 -0
  10. flexeval/classes/thread.py +79 -0
  11. flexeval/classes/tool_call.py +51 -0
  12. flexeval/classes/turn.py +206 -0
  13. flexeval/cli.py +104 -0
  14. flexeval/completions.py +147 -0
  15. flexeval/compute_metrics.py +788 -0
  16. flexeval/config.yaml +23 -0
  17. flexeval/configuration/__init__.py +1 -0
  18. flexeval/configuration/completion_functions.py +231 -0
  19. flexeval/configuration/evals.yaml +864 -0
  20. flexeval/configuration/function_metrics.py +650 -0
  21. flexeval/configuration/rubric_metrics.yaml +194 -0
  22. flexeval/data_loader.py +513 -0
  23. flexeval/db_utils.py +38 -0
  24. flexeval/dependency_graph.py +234 -0
  25. flexeval/eval_schema.json +256 -0
  26. flexeval/function_types.py +173 -0
  27. flexeval/helpers.py +52 -0
  28. flexeval/io/__init__.py +1 -0
  29. flexeval/io/parsers/yaml_parser.py +69 -0
  30. flexeval/log_utils.py +34 -0
  31. flexeval/metrics/__init__.py +8 -0
  32. flexeval/metrics/access.py +28 -0
  33. flexeval/metrics/save.py +39 -0
  34. flexeval/rubric.py +62 -0
  35. flexeval/run_utils.py +65 -0
  36. flexeval/runner.py +132 -0
  37. flexeval/schema/__init__.py +11 -0
  38. flexeval/schema/config_schema.py +46 -0
  39. flexeval/schema/eval_schema.py +163 -0
  40. flexeval/schema/evalrun_schema.py +97 -0
  41. flexeval/schema/rubric_schema.py +40 -0
  42. flexeval/schema/schema_utils.py +26 -0
  43. python_flexeval-0.1.5.dist-info/METADATA +118 -0
  44. python_flexeval-0.1.5.dist-info/RECORD +47 -0
  45. python_flexeval-0.1.5.dist-info/WHEEL +4 -0
  46. python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
  47. python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0
flexeval/config.yaml ADDED
@@ -0,0 +1,23 @@
1
+
2
+
3
+ # paths are relative to the root of the repo
4
+ rubric_metrics_path:
5
+ - src/flexeval/configuration/rubric_metrics.yaml
6
+ - example_project/example_specific_rubrics.yaml
7
+ evals_path: src/flexeval/configuration/evals.yaml
8
+ env_file: .env  # located in the same directory as main.py
9
+ logs_path: logs/
10
+ eval_schema_path: src/flexeval/eval_schema.json
11
+
12
+ database_path: data/results/results.db
13
+
14
+ max_workers: 1
15
+
16
+ random_seed_conversation_sampling: 42
17
+ max_n_conversation_threads: 50
18
+ nb_evaluations_per_thread: 1
19
+
20
+
21
+
22
+ # any additional environment variables
23
+ env:
@@ -0,0 +1 @@
1
+ """Built-in completion functions, function metrics, and rubric metrics."""
@@ -0,0 +1,231 @@
1
+ """This file contains a list of Python functions that accept conversations as input
2
+ and produce conversational turns (aka completions) as output.
3
+
4
+ When writing a new function, the arguments must include, at minimum:
5
+ * conversation_history - list of dictionaries with keys ("role","content"), whose values are strings
6
+ * kwargs - dictionary of optional values that can probably be ignored
7
+ Other arguments can be added, but then must also be specified
8
+ in the "completion_llm" section of the evals.yaml config.
9
+
10
+ The outputs must conform to the structure described here:
11
+ https://platform.openai.com/docs/guides/text-generation/chat-completions-api
12
+ with the following format:
13
+ completion = {
14
+ "choices": [
15
+ {
16
+ "message":{
17
+ "content": MY_CONTENT_HERE,
18
+ "role":"assistant"
19
+ }
20
+ }
21
+ ]
22
+ }
23
+
24
+ """
25
+
26
+ import json
27
+ import logging
28
+ import os
29
+ from typing import Any, Dict, List
30
+
31
+ import litellm
32
+ import requests
33
+ import tiktoken
34
+ from openai import OpenAI
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
def echo_completion(
    conversation_history: List[Dict[str, Any]],
    **kwargs: Any,
) -> Dict[str, Any]:
    """Build a completion locally, without calling any model.

    The reply is ``kwargs["response"]`` when that key is supplied. Otherwise
    it is the content of the *first* message in ``conversation_history``, or
    the literal "No messages yet." when the history is empty.

    Args:
        conversation_history: Messages, each a dict with a "content" entry.
        **kwargs: Optional settings; only "response" is consulted.

    Returns:
        A completion dict with "choices", "model" (always "echo") and a
        "usage" section holding estimated completion/prompt token counts.
    """
    # The fallback is resolved first, even if "response" overrides it below.
    if len(conversation_history) > 0:
        fallback = conversation_history[0]["content"]
    else:
        fallback = "No messages yet."
    reply = kwargs.get("response", fallback)

    # Token usage is an estimate based on a default tokenizer.
    encoding = tiktoken.get_encoding("cl100k_base")
    n_completion = len(encoding.encode(reply))
    prompt_text = "".join(turn["content"] for turn in conversation_history)
    n_prompt = len(encoding.encode(prompt_text))

    usage = {
        "completion_tokens": n_completion,
        "prompt_tokens": n_prompt,
    }
    return {
        "choices": [{"message": {"content": reply, "role": "assistant"}}],
        "model": "echo",
        "usage": usage,
    }
68
+
69
+
70
def litellm_completion(
    conversation_history: list[dict[str, Any]],
    model: str,
    n: int = 1,
    **kwargs,
) -> dict[str, Any]:
    """Request a completion through LiteLLM's ``completion()`` entry point.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries.
        model: Name of the model LiteLLM should route the request to.
        n: How many completion choices to ask for. Defaults to 1.
        **kwargs: Extra keyword arguments handed to ``completion()`` as-is;
            which ones are accepted depends on the chosen model.

    Returns:
        The response dumped to a plain dict, with unset fields excluded.
    """
    llm_response = litellm.completion(
        model=model,
        messages=conversation_history,
        n=n,
        **kwargs,
    )
    as_dict = llm_response.model_dump(exclude_unset=True)
    return as_dict
95
+
96
+
97
def open_ai_completion(
    conversation_history: List[Dict[str, Any]],
    model_name: str,
    api_key_name: str,
    n: int = 1,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Request a chat completion from OpenAI for the given conversation.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries.
        model_name: Name of the OpenAI model to query.
        api_key_name: Name of the environment variable that stores the API key.
        n: How many completion choices to ask for; coerced with ``int()``.
            Defaults to 1.
        **kwargs: Extra keyword arguments forwarded to
            ``chat.completions.create``.

    Returns:
        The API response dumped to a plain dict, with unset fields excluded.
    """
    api_key = os.getenv(api_key_name)
    openai_client = OpenAI(api_key=api_key)

    n_choices = int(n)
    api_response = openai_client.chat.completions.create(
        model=model_name,
        messages=conversation_history,
        n=n_choices,
        **kwargs,
    )
    return api_response.model_dump(exclude_unset=True)
124
+
125
+
126
def open_ai_completion_async(
    conversation_history: List[Dict[str, Any]],
    model_name: str,
    api_key_name: str,
    n: int = 1,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Request a chat completion from OpenAI for the given conversation.

    NOTE: despite the name, this is a regular (blocking) function that uses
    the synchronous ``OpenAI`` client; it performs the same steps as
    ``open_ai_completion``.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries.
        model_name: Name of the OpenAI model to query.
        api_key_name: Name of the environment variable that stores the API key.
        n: How many completion choices to ask for; coerced with ``int()``.
            Defaults to 1.
        **kwargs: Extra keyword arguments forwarded to
            ``chat.completions.create``.

    Returns:
        The API response dumped to a plain dict, with unset fields excluded.
    """
    openai_client = OpenAI(api_key=os.getenv(api_key_name))

    request_args = dict(
        model=model_name,
        messages=conversation_history,
        n=int(n),
    )
    api_response = openai_client.chat.completions.create(**request_args, **kwargs)

    return api_response.model_dump(exclude_unset=True)
153
+
154
+
155
def jan_completion(conversation_history, model_name, endpoint, **kwargs):
    """Request a chat completion from a Jan server via its OpenAI-compatible API.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries.
        model_name: Name of the model to query.
        endpoint: Base URL of the OpenAI-compatible server.
        **kwargs: Extra keyword arguments forwarded to
            ``chat.completions.create``. ``temperature`` defaults to 0.7 but
            may be overridden here.

    Returns:
        The response dumped to a plain dict, with unset fields excluded.
    """
    # Fold the default temperature into the forwarded options instead of
    # passing it as a separate keyword: a caller-supplied "temperature" would
    # otherwise raise "got multiple values for keyword argument".
    request_options = {"temperature": 0.7, **kwargs}

    # The client requires some api_key value; a placeholder is passed here.
    client = OpenAI(base_url=endpoint, api_key="not-needed")
    raw_response = client.chat.completions.create(
        model=model_name, messages=conversation_history, **request_options
    )
    return raw_response.model_dump(exclude_unset=True)
163
+
164
+
165
def lm_studio_completion(conversation_history, model_name, endpoint, **kwargs):
    """Request a chat completion from an LM Studio server (OpenAI-compatible).

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries.
        model_name: Name of the model to query.
        endpoint: Base URL of the OpenAI-compatible server.
        **kwargs: Accepted for interface compatibility; not forwarded to the
            API call.

    Returns:
        The response dumped to a plain dict, with unset fields excluded.
        The request always uses a temperature of 0.7.
    """
    # The client requires some api_key value; a placeholder is passed here.
    local_client = OpenAI(api_key="not-needed", base_url=endpoint)

    api_response = local_client.chat.completions.create(
        temperature=0.7,
        messages=conversation_history,
        model=model_name,
    )
    result = api_response.model_dump(exclude_unset=True)
    return result
176
+
177
+
178
def generic_rest_api_completion(
    conversation_history,
    api_key_name: str,
    endpoint: str,
    **kwargs,
):
    """Fetch a completion by POSTing the conversation to a REST endpoint.

    The conversation is sent as a JSON body with a bearer-token
    ``Authorization`` header. The JSON response must contain a
    "completion" entry, which becomes the assistant message content.

    Args:
        conversation_history: The conversation so far; must be
            JSON-serializable.
        api_key_name: Name of the environment variable holding the bearer
            token.
        endpoint: URL to POST the conversation to.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        A completion dict of the form
        ``{"choices": [{"message": {"content": ..., "role": "assistant"}}]}``.

    Raises:
        KeyError: If the JSON response has no "completion" entry.
        Exception: Whatever ``response.json()`` raises when the body is not
            valid JSON.
    """
    auth_key = os.environ.get(api_key_name, None)

    # Headers including the authentication key and content type
    headers = {
        "Authorization": f"Bearer {auth_key}",
        "Content-Type": "application/json",
    }

    # Sending a POST request
    response = requests.post(
        endpoint, data=json.dumps(conversation_history), headers=headers
    )

    # Record the outcome *before* parsing the body, so a failed request is
    # logged even when the body is not JSON or lacks "completion". Use lazy
    # %-style arguments: an extra positional argument with no placeholder in
    # the message makes the logging module report a formatting error.
    if response.status_code == 200:
        logger.info("Success: %s", response.text)
    else:
        logger.error("Error (HTTP %s): %s", response.status_code, response.text)

    response_data = response.json()
    content = response_data["completion"]

    return {"choices": [{"message": {"content": content, "role": "assistant"}}]}
208
+
209
+
210
# Used when no completion function is needed.
def no_completion_fn(
    conversation_history,
    **kwargs,
):
    """Return a fixed "hello world" assistant message.

    Args:
        conversation_history: Ignored.
        **kwargs: Ignored.

    Returns:
        A completion dict with a single choice whose content is
        "hello world".
    """
    fixed_message = {"content": "hello world", "role": "assistant"}
    return {"choices": [{"message": fixed_message}]}
219
+
220
+
221
def gpt_3p5_turbo(conversation_history, model_name, key_name, **kwargs):
    """Request a chat completion from OpenAI's "gpt-3.5-turbo" model.

    Args:
        conversation_history: The conversation so far, as a list of message
            dictionaries.
        model_name: Not used; the model is fixed to "gpt-3.5-turbo".
        key_name: Name of the environment variable that stores the API key.
        **kwargs: Accepted for interface compatibility; not forwarded.

    Returns:
        The API response dumped to a plain dict, with unset fields excluded.
    """
    api_key = os.getenv(key_name)
    openai_client = OpenAI(api_key=api_key)
    api_response = openai_client.chat.completions.create(
        messages=conversation_history,
        model="gpt-3.5-turbo",
    )
    return api_response.model_dump(exclude_unset=True)
227
+
228
+
229
def placeholder_completion(conversation_history, model_name, **kwargs):
    """Testing stub: always answers 'hi', whatever the inputs are."""
    stub_message = {"content": "hi", "role": "assistant"}
    stub_completion = {"choices": [{"message": stub_message}]}
    return stub_completion