python-flexeval 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexeval/__init__.py +11 -0
- flexeval/__main__.py +11 -0
- flexeval/classes/__init__.py +15 -0
- flexeval/classes/base.py +32 -0
- flexeval/classes/dataset.py +82 -0
- flexeval/classes/eval_runner.py +158 -0
- flexeval/classes/eval_set_run.py +32 -0
- flexeval/classes/message.py +183 -0
- flexeval/classes/metric.py +55 -0
- flexeval/classes/thread.py +79 -0
- flexeval/classes/tool_call.py +51 -0
- flexeval/classes/turn.py +206 -0
- flexeval/cli.py +104 -0
- flexeval/completions.py +147 -0
- flexeval/compute_metrics.py +788 -0
- flexeval/config.yaml +23 -0
- flexeval/configuration/__init__.py +1 -0
- flexeval/configuration/completion_functions.py +231 -0
- flexeval/configuration/evals.yaml +864 -0
- flexeval/configuration/function_metrics.py +650 -0
- flexeval/configuration/rubric_metrics.yaml +194 -0
- flexeval/data_loader.py +513 -0
- flexeval/db_utils.py +38 -0
- flexeval/dependency_graph.py +234 -0
- flexeval/eval_schema.json +256 -0
- flexeval/function_types.py +173 -0
- flexeval/helpers.py +52 -0
- flexeval/io/__init__.py +1 -0
- flexeval/io/parsers/yaml_parser.py +69 -0
- flexeval/log_utils.py +34 -0
- flexeval/metrics/__init__.py +8 -0
- flexeval/metrics/access.py +28 -0
- flexeval/metrics/save.py +39 -0
- flexeval/rubric.py +62 -0
- flexeval/run_utils.py +65 -0
- flexeval/runner.py +132 -0
- flexeval/schema/__init__.py +11 -0
- flexeval/schema/config_schema.py +46 -0
- flexeval/schema/eval_schema.py +163 -0
- flexeval/schema/evalrun_schema.py +97 -0
- flexeval/schema/rubric_schema.py +40 -0
- flexeval/schema/schema_utils.py +26 -0
- python_flexeval-0.1.5.dist-info/METADATA +118 -0
- python_flexeval-0.1.5.dist-info/RECORD +47 -0
- python_flexeval-0.1.5.dist-info/WHEEL +4 -0
- python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
- python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""Determines how configured metrics depend on each other."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import networkx as nx
|
|
7
|
+
|
|
8
|
+
from flexeval.helpers import generate_hash
|
|
9
|
+
from flexeval.schema import eval_schema
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def create_metrics_graph(metrics: eval_schema.Metrics) -> list[Any]:
    """Return the configured metrics in an order that respects their dependencies.

    Every entry of ``metrics.function`` and ``metrics.rubric`` is dumped to a
    dictionary and given a generated ``id``. Each ``depends_on`` entry is then
    resolved to exactly one parent metric (see ``get_parent_metrics``) and
    annotated with that parent's id, so children and parents are easy to match
    later. Finally the metrics are sorted topologically.

    Args:
        metrics: Object with ``function`` and ``rubric`` attributes, each either
            ``None`` or a list of items that support ``model_dump()``.

    Returns:
        A list of metric dictionaries, parents before children. Each dictionary
        holds ``evaluation_name`` and ``evaluation_type`` plus every dumped
        field except ``function_name``, ``rubric_name``, ``type`` and ``name``.

    Raises:
        ValueError: If the dependencies form a cycle. ``get_parent_metrics``
            raises it as well when a ``depends_on`` entry matches no metric or
            more than one.
    """
    evaluation_types = ["function", "rubric"]
    # Fields that are replaced by evaluation_name / evaluation_type in the output.
    excluded_keys = ("function_name", "rubric_name", "type", "name")

    # Step 1: give every listed evaluation an ID.
    user_metrics_with_ids = {}
    for evaluation_type in evaluation_types:
        item_list: list[eval_schema.MetricItem] = getattr(metrics, evaluation_type)
        user_metrics_with_ids[evaluation_type] = []
        if item_list is not None:
            for item in item_list:
                user_metrics_with_ids[evaluation_type].append(
                    {"id": generate_hash(), **item.model_dump()}
                )

    # Step 2: resolve the parents of EVERY metric before any metric is
    # serialized. Resolution annotates the `depends_on` entries with parent
    # ids, which changes the JSON string used as the graph node key; doing this
    # up front guarantees a metric has the same key whether it is seen as a
    # child or as somebody else's parent.
    resolved = []
    for evaluation_type in evaluation_types:
        for metric_dict in user_metrics_with_ids[evaluation_type]:
            parent_metrics, depends_on_with_parent_ids = get_parent_metrics(
                all_metrics=user_metrics_with_ids, child=metric_dict
            )
            metric_dict["depends_on"] = depends_on_with_parent_ids
            resolved.append((evaluation_type, metric_dict, parent_metrics))

    # Step 3: build the directed graph (edges point from parent to child).
    graph = nx.DiGraph()
    metric_graph_dict = {}
    for evaluation_type, metric_dict, parent_metrics in resolved:
        child_metric_str, evaluation_name = get_metric_info(metric_dict)
        if parent_metrics:
            # Adding an edge implicitly adds nodes where necessary.
            for parent_metric_dict in parent_metrics:
                parent_metric_str, _ = get_metric_info(parent_metric_dict)
                graph.add_edge(parent_metric_str, child_metric_str)
        else:
            # No parent: the metric is a node by itself.
            graph.add_node(child_metric_str)

        # 'Canonical' representation of the metric, returned to the caller.
        canonical = {
            "evaluation_name": evaluation_name,  # function or rubric name
            "evaluation_type": evaluation_type,
        }
        for key, value in metric_dict.items():
            if key not in excluded_keys:
                canonical[key] = value
        metric_graph_dict[child_metric_str] = canonical

    if not nx.is_directed_acyclic_graph(graph):
        # String representation of all edges, printed with the error.
        graph_string = "Metric Dependencies:"
        for edge in graph.edges():
            graph_string += f"\n{'' if edge[1] == 'root' else edge[1]} -> {edge[0]}"
        raise ValueError(
            f"The set of metric dependencies must be acyclic! You have cyclical dependencies. {graph_string}"
        )

    # The topological order is the order in which metrics will be evaluated.
    topological_order = list(nx.topological_sort(graph))
    return [metric_graph_dict[node] for node in topological_order]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def get_metric_info(single_metric: dict) -> tuple[str, str]:
    """Describe a single metric dictionary.

    Returns a pair made of:
    - the JSON string produced by ``json.dumps`` for the whole dictionary
    - the value stored under the ``name`` key (``None`` when the key is absent)
    """
    serialized_metric = json.dumps(single_metric)
    evaluation_name = single_metric.get("name")
    return serialized_metric, evaluation_name
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_parent_metrics(all_metrics: dict, child: dict) -> tuple[list, list]:
    """Find the immediate parent metric for every ``depends_on`` entry of ``child``.

    A metric in ``all_metrics`` qualifies as the parent for a requirement when
    every one of ``metric_level``, ``context_only``, ``name`` and ``kwargs``
    that the requirement specifies (i.e. is not ``None``) equals the
    candidate's value. If the requirement has a non-``None`` ``type``, only
    candidates listed under that type are considered; otherwise both
    ``function`` and ``rubric`` candidates are.

    This does not decide whether the child should actually run (the child may
    have further requirements on the parent's output); it only establishes
    that the child must run AFTER the parent.

    Args:
        all_metrics: Maps ``"function"`` / ``"rubric"`` to lists of metric
            dictionaries, each of which has an ``id``.
        child: The metric dictionary whose ``depends_on`` entries are resolved.

    Returns:
        ``(parents, depends_on_with_id_added)``: the matched parent
        dictionaries and the requirement dictionaries, in ``depends_on``
        order. Each requirement is annotated in place with ``parent_id``.

    Raises:
        ValueError: If a requirement matches no candidate, or more than one.
    """
    # Only conditionals explicitly specified (not None) in a requirement are compared.
    conditionals = ("metric_level", "context_only", "name", "kwargs")

    parents = []
    depends_on_with_id_added = []
    for requirement in child.get("depends_on", []):
        # If the requirement narrows the type down, narrow the search down too.
        requested_type = requirement.get("type")
        if requested_type is None:
            allowed_types = ["function", "rubric"]
        else:
            allowed_types = [requested_type]

        candidate_parents = [
            candidate
            for candidate_type in allowed_types
            for candidate in all_metrics.get(candidate_type, [])
            if not any(
                requirement.get(conditional) is not None
                and requirement.get(conditional) != candidate.get(conditional)
                for conditional in conditionals
            )
        ]

        # Validate before touching the requirement, so a failed lookup leaves
        # the caller's data (and the error message) free of a bogus parent_id.
        if not candidate_parents:
            raise ValueError(
                f"We were unable to locate any match for the `depends_on` entry `{json.dumps(requirement, indent=4)}` in the metric `{json.dumps(child, indent=4)}`. The full set of parent candidates is `{json.dumps(all_metrics, indent=4)}`."
            )
        if len(candidate_parents) > 1:
            raise ValueError(
                f"We located more than one match for the `depends_on` entry `{json.dumps(requirement, indent=4)}` in the metric `{json.dumps(child, indent=4)}`. The matches were `{json.dumps(candidate_parents, indent=4)}`. Please add another criterion to disambiguate."
            )

        parent = candidate_parents[0]
        requirement["parent_id"] = parent["id"]
        depends_on_with_id_added.append(requirement)
        parents.append(parent)

    return parents, depends_on_with_id_added
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def apply_defaults(schema, data, path=None):
    """Recursively fill ``data`` with the ``default`` values declared in ``schema``.

    Args:
        schema: A JSON-schema-like dictionary. Only its ``default``,
            ``properties`` and ``items`` keys are read.
        data: The value to complete. Dictionaries are updated in place; lists
            are rebuilt; anything else is returned unchanged.
        path: Property names followed so far from the schema root. List items
            share the path of the list that contains them.

    Returns:
        ``data`` with defaults applied. When ``data`` is ``None`` the schema's
        ``default`` (or ``None`` if there is none) is returned instead.
    """
    # Local import so the function stays self-contained within this module.
    import copy

    # path stores the navigation path in the schema; start empty at the root.
    if path is None:
        path = []

    if data is None:
        # Defaults are copied so callers never share (and cannot mutate) the
        # objects stored in the schema itself.
        return copy.deepcopy(schema.get("default"))

    if isinstance(data, dict):
        # Pick the property definitions that apply to this dictionary.
        if "properties" in schema:
            properties = schema["properties"]
        elif "items" in schema and "properties" in schema["items"]:
            properties = schema["items"]["properties"]
        else:
            properties = {}

        for key, subschema in properties.items():
            if key in data:
                # Recurse, extending the path with the current property.
                data[key] = apply_defaults(subschema, data[key], path + [key])
            elif "default" in subschema:
                # Key is missing: apply a private copy of the default.
                data[key] = copy.deepcopy(subschema["default"])

        # Metric entries are tagged with the list they were declared in.
        if path == ["metrics", "function"]:
            data["type"] = "function"
        if path == ["metrics", "rubric"]:
            data["type"] = "rubric"

        return data

    if isinstance(data, list) and "items" in schema:
        # Apply the item schema to each element, passing the list's path along.
        item_schema = schema["items"]
        return [apply_defaults(item_schema, item, path) for item in data]

    return data
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# for verify installation
|
|
229
|
+
# if function_name is defined, rubric
|
|
230
|
+
# make sure "function" and "rubric" default to empty lists
|
|
231
|
+
# TODO - don't set defaults in "depends_on" to make matching more flexible
|
|
232
|
+
# evaluation_name: my_rubric
|
|
233
|
+
# evaluation_type: rubric
|
|
234
|
+
# metric_name: <
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"type": "object",
|
|
4
|
+
"properties": {
|
|
5
|
+
"data": {
|
|
6
|
+
"type": "array",
|
|
7
|
+
"items": {
|
|
8
|
+
"type": "string"
|
|
9
|
+
},
|
|
10
|
+
"description": "List of absolute or relative paths to data files. Each file must be in *.jsonl format, with one conversation per line.",
|
|
11
|
+
"default":[]
|
|
12
|
+
},
|
|
13
|
+
"do_completion": {
|
|
14
|
+
"type": "boolean",
|
|
15
|
+
"description": "Flag to determine if completions should be done for each conversation. Set to 'true' if you are testing a new API and want to evaluate the API responses. Set to 'false' (default) if you are evaluating past conversations and do not need to generate new completions.",
|
|
16
|
+
"default": false
|
|
17
|
+
},
|
|
18
|
+
"name": {
|
|
19
|
+
"type": "string",
|
|
20
|
+
"description": "Name of the test suite. Used as metadata only. Does not need to match the key of the entry in the evals.yaml file.",
|
|
21
|
+
"default": ""
|
|
22
|
+
},
|
|
23
|
+
"notes": {
|
|
24
|
+
"type": "string",
|
|
25
|
+
"description": "Additional notes regarding the configuration. Used as metadata only.",
|
|
26
|
+
"default": ""
|
|
27
|
+
},
|
|
28
|
+
"config": {
|
|
29
|
+
"type": "object",
|
|
30
|
+
"properties": {
|
|
31
|
+
"max_workers": {
|
|
32
|
+
"type": "integer",
|
|
33
|
+
"description": "The maximum number of worker threads allowed when computing metrics."
|
|
34
|
+
}
|
|
35
|
+
},
|
|
36
|
+
"description": "Specific configuration settings that may override default settings. Look in `src/llm-evals/config.yaml` for other fun things to put here.",
|
|
37
|
+
"additionalProperties": true
|
|
38
|
+
},
|
|
39
|
+
"metrics": {
|
|
40
|
+
"type": "object",
|
|
41
|
+
"properties":{
|
|
42
|
+
"function":{
|
|
43
|
+
"type": "array",
|
|
44
|
+
"description": "List of function-based metrics to be evaluated.",
|
|
45
|
+
"items": {
|
|
46
|
+
"type": "object",
|
|
47
|
+
"properties": {
|
|
48
|
+
"name": {
|
|
49
|
+
"type": "string",
|
|
50
|
+
"description": "The function to call to compute this metric."
|
|
51
|
+
},
|
|
52
|
+
"kwargs": {
|
|
53
|
+
"type": "object",
|
|
54
|
+
"description": "Keyword arguments for the function. Each key must correspond to an argument in the function as implemented in `function_metrics.py`. Extra keys will cause an error.",
|
|
55
|
+
"additionalProperties": true,
|
|
56
|
+
"default": {}
|
|
57
|
+
},
|
|
58
|
+
"depends_on": {
|
|
59
|
+
"type": "array",
|
|
60
|
+
"default": [],
|
|
61
|
+
"description": "List of dependencies that must be satisfied for this metric to be computed.",
|
|
62
|
+
"items": {
|
|
63
|
+
"type": "object",
|
|
64
|
+
"properties": {
|
|
65
|
+
"name": {
|
|
66
|
+
"type": "string",
|
|
67
|
+
"description": "Name of the dependency function or rubric."
|
|
68
|
+
},
|
|
69
|
+
"type": {
|
|
70
|
+
"type": "string",
|
|
71
|
+
"description": "One of 'function' or 'rubric' indicating the type of the dependency.",
|
|
72
|
+
"pattern": "^((function)|(rubric))$"
|
|
73
|
+
},
|
|
74
|
+
"kwargs": {
|
|
75
|
+
"type": "object",
|
|
76
|
+
"description": "The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
|
|
77
|
+
"additionalProperties": true
|
|
78
|
+
},
|
|
79
|
+
"context_only": {
|
|
80
|
+
"type": "boolean",
|
|
81
|
+
"description": "The context_only value for the dependency. If provided, used to match which evaluation this dependency is for."
|
|
82
|
+
},
|
|
83
|
+
"last_turn_only": {
|
|
84
|
+
"type": "boolean",
|
|
85
|
+
"description": "The last_turn_only value for the dependency. If provided, used to match which evaluation this dependency is for."
|
|
86
|
+
},
|
|
87
|
+
"metric_name": {
|
|
88
|
+
"type": "string",
|
|
89
|
+
"description": "Name of the metric dependency. This may be different than function_name if the metric function returns a key/value pair - in which case, this will match the key."
|
|
90
|
+
},
|
|
91
|
+
"metric_min_value": {
|
|
92
|
+
"type": "number",
|
|
93
|
+
"description": "Minimum value of the dependency to consider it as satisfied.",
|
|
94
|
+
"default": -1e20
|
|
95
|
+
},
|
|
96
|
+
"metric_max_value": {
|
|
97
|
+
"type": "number",
|
|
98
|
+
"description": "Maximum value of the dependency to consider it as satisfied.",
|
|
99
|
+
"default": 1e20
|
|
100
|
+
}
|
|
101
|
+
},
|
|
102
|
+
"additionalProperties": false
|
|
103
|
+
},
|
|
104
|
+
"required": ["name"]
|
|
105
|
+
},
|
|
106
|
+
"metric_level": {
|
|
107
|
+
"type": "string",
|
|
108
|
+
"description": "What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
|
|
109
|
+
"default": "Turn"
|
|
110
|
+
},
|
|
111
|
+
"context_only": {
|
|
112
|
+
"type": "boolean",
|
|
113
|
+
"description": "If true, only the context (that is, the previous messages) will be evaluated, not the current object. Cannot be done with only thread",
|
|
114
|
+
"default": false
|
|
115
|
+
},
|
|
116
|
+
"last_instance_only": {
|
|
117
|
+
"type": "boolean",
|
|
118
|
+
"description": "If true, the object will only be evaluated if it's the last instance (i.e., turn or message depending on metric_level) in an existing conversation, or if it's a new completion.",
|
|
119
|
+
"default": false
|
|
120
|
+
}
|
|
121
|
+
},
|
|
122
|
+
"required": ["name"]
|
|
123
|
+
}
|
|
124
|
+
},
|
|
125
|
+
"rubric":{
|
|
126
|
+
"type": "array",
|
|
127
|
+
"description": "List of rubrics to be evaluated",
|
|
128
|
+
"items": {
|
|
129
|
+
"type": "object",
|
|
130
|
+
"properties": {
|
|
131
|
+
"name": {
|
|
132
|
+
"type": "string",
|
|
133
|
+
"description": "The rubric to use to evaluate this metric."
|
|
134
|
+
},
|
|
135
|
+
"kwargs": {
|
|
136
|
+
"type": "object",
|
|
137
|
+
"description": "Keyword arguments for the function. Each key must correspond to an argument in the function as implemented in `function_metrics.py`. Extra keys will cause an error.",
|
|
138
|
+
"additionalProperties": true,
|
|
139
|
+
"default":{}
|
|
140
|
+
},
|
|
141
|
+
"metric_level": {
|
|
142
|
+
"type": "string",
|
|
143
|
+
"description": "What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
|
|
144
|
+
"default": "Turn"
|
|
145
|
+
},
|
|
146
|
+
"context_only": {
|
|
147
|
+
"type": "boolean",
|
|
148
|
+
"description": "If true, only the context (that is, the previous messages) will be evaluated, not the current object. Cannot be done with only thread",
|
|
149
|
+
"default": false
|
|
150
|
+
},
|
|
151
|
+
"last_instance_only": {
|
|
152
|
+
"type": "boolean",
|
|
153
|
+
"description": "If true, the object will only be evaluated if it's the last instance (i.e., turn or message depending on metric_level) in an existing conversation, or if it's a new completion.",
|
|
154
|
+
"default": false
|
|
155
|
+
},
|
|
156
|
+
"depends_on": {
|
|
157
|
+
"type": "array",
|
|
158
|
+
"description": "List of dependencies that must be satisfied for this metric to be computed.",
|
|
159
|
+
"default":[],
|
|
160
|
+
"items": {
|
|
161
|
+
"type": "object",
|
|
162
|
+
"properties": {
|
|
163
|
+
"name": {
|
|
164
|
+
"type": "string",
|
|
165
|
+
"description": "Name of the dependency function or rubric."
|
|
166
|
+
},
|
|
167
|
+
"type": {
|
|
168
|
+
"type": "string",
|
|
169
|
+
"description": "One of 'function' or 'rubric' indicating the type of the dependency.",
|
|
170
|
+
"pattern": "^((function)|(rubric))$"
|
|
171
|
+
},
|
|
172
|
+
"kwargs": {
|
|
173
|
+
"type": "object",
|
|
174
|
+
"description": "The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
|
|
175
|
+
"additionalProperties": true
|
|
176
|
+
},
|
|
177
|
+
"context_only": {
|
|
178
|
+
"type": "boolean",
|
|
179
|
+
"description": "The context_only value for the dependency. If provided, used to match which evaluation this dependency is for."
|
|
180
|
+
},
|
|
181
|
+
"last_turn_only": {
|
|
182
|
+
"type": "boolean",
|
|
183
|
+
"description": "The last_turn_only value for the dependency. If provided, used to match which evaluation this dependency is for."
|
|
184
|
+
},
|
|
185
|
+
"metric_name": {
|
|
186
|
+
"type": "string",
|
|
187
|
+
"description": "Name of the metric dependency. This may be different than function_name if the metric function returns a key/value pair - in which case, this will match the key."
|
|
188
|
+
},
|
|
189
|
+
"metric_min_value": {
|
|
190
|
+
"type": "number",
|
|
191
|
+
"description": "Minimum value of the dependency to consider it as satisfied.",
|
|
192
|
+
"default": -1e20
|
|
193
|
+
},
|
|
194
|
+
"metric_max_value": {
|
|
195
|
+
"type": "number",
|
|
196
|
+
"description": "Maximum value of the dependency to consider it as satisfied.",
|
|
197
|
+
"default": 1e20
|
|
198
|
+
}
|
|
199
|
+
},
|
|
200
|
+
"additionalProperties": false
|
|
201
|
+
},
|
|
202
|
+
"required": ["name"]
|
|
203
|
+
}
|
|
204
|
+
},
|
|
205
|
+
"required": ["name"]
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
},
|
|
210
|
+
"completion_llm": {
|
|
211
|
+
"type": "object",
|
|
212
|
+
"description":"Specification of the LLM or API used to perform new completions. Must be defined if `do_completions: true` is set.",
|
|
213
|
+
"properties": {
|
|
214
|
+
"function_name": {
|
|
215
|
+
"type": "string",
|
|
216
|
+
"description": "Completion function defined in `completion_functions.py`. Must be specified."
|
|
217
|
+
},
|
|
218
|
+
"include_system_prompt": {
|
|
219
|
+
"type": "boolean",
|
|
220
|
+
"default": false
|
|
221
|
+
},
|
|
222
|
+
"kwargs": {
|
|
223
|
+
"type": "object",
|
|
224
|
+
"description": "Additional arguments that will be passed to the completion function. Must correspond to arguments in tne named function.",
|
|
225
|
+
"default": {},
|
|
226
|
+
"additionalProperties": true
|
|
227
|
+
}
|
|
228
|
+
},
|
|
229
|
+
"required": ["function_name"],
|
|
230
|
+
"additionalProperties": false
|
|
231
|
+
},
|
|
232
|
+
"grader_llm": {
|
|
233
|
+
"type": "object",
|
|
234
|
+
"description":"Specification of the LLM or API used to grade rubrics. Must be defined if any rubric_metrics are specified.",
|
|
235
|
+
"properties": {
|
|
236
|
+
"function_name": {
|
|
237
|
+
"type": "string",
|
|
238
|
+
"description": "Function defined in `completion_functions.py`. We're not really completing a conversation, but we ARE asking an LLM to provide a response to an input - in this case, the rubric."
|
|
239
|
+
},
|
|
240
|
+
"kwargs": {
|
|
241
|
+
"type": "object",
|
|
242
|
+
"description": "Additional arguments that will be passed to the completion function. Must correspond to arguments in tne named function.",
|
|
243
|
+
"default": {},
|
|
244
|
+
"additionalProperties": true
|
|
245
|
+
}
|
|
246
|
+
},
|
|
247
|
+
"required": ["function_name"],
|
|
248
|
+
"optional": ["kwargs"],
|
|
249
|
+
"additionalProperties": false
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
},
|
|
253
|
+
"required": ["data", "metrics"],
|
|
254
|
+
"additionalProperties": true
|
|
255
|
+
}
|
|
256
|
+
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Inspection utilities that use type hints to determine the appropriate object to pass to a function metric.
|
|
2
|
+
|
|
3
|
+
See :mod:`~flexeval.schema.eval_schema`.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import inspect
|
|
7
|
+
import logging
|
|
8
|
+
import types
|
|
9
|
+
import typing
|
|
10
|
+
from collections.abc import Callable, Iterable
|
|
11
|
+
|
|
12
|
+
from flexeval.classes import message, thread, tool_call, turn
|
|
13
|
+
from flexeval.schema import eval_schema
|
|
14
|
+
|
|
15
|
+
# Union of the FlexEval classes referenced by function-metric inputs.
AnyFunctionObjectInput = typing.Union[
    turn.Turn,
    message.Message,
    thread.Thread,
    tool_call.ToolCall,
]
# The members of the union above, as a set for membership tests.
FLEXEVAL_TYPE_SET: set[type] = set(typing.get_args(AnyFunctionObjectInput))

logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def is_callable_valid_for_metric_level(
    metric_function: Callable, metric_level: eval_schema.MetricLevel
) -> bool:
    """Tell whether ``metric_level`` is among the levels valid for ``metric_function``.

    Args:
        metric_function (Callable): The callable to inspect.
        metric_level (eval_schema.MetricLevel): The level to look for.

    Returns:
        bool: True if ``metric_level`` is in the set returned by
        ``get_valid_levels_for_callable`` for this callable.
    """
    return metric_level in get_valid_levels_for_callable(metric_function)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def get_valid_levels_for_callable(metric_function: Callable) -> set[str]:
    """Derive the valid metric_level values from the first parameter's type annotation.

    A FlexEval class accepted by the first parameter contributes its own class
    name. The builtin types contribute fixed levels: ``str`` adds Message,
    Turn and Thread; ``list`` adds Turn and Thread; ``dict`` adds ToolCall.

    Args:
        metric_function (Callable): A callable, probably one available via EvalRun

    Returns:
        set[str]: Valid values for MetricItem.metric_level
    """
    accepted = get_first_parameter_types(metric_function)

    # FlexEval classes map directly onto the level that carries their name.
    levels = {
        flexeval_class.__name__
        for flexeval_class in FLEXEVAL_TYPE_SET
        if flexeval_class in accepted
    }

    # Builtin parameter types and the levels each of them makes available.
    levels_by_builtin = (
        (str, ("Message", "Turn", "Thread")),
        (list, ("Turn", "Thread")),
        (dict, ("ToolCall",)),
    )
    for builtin_type, builtin_levels in levels_by_builtin:
        if builtin_type in accepted:
            levels.update(builtin_levels)
    return levels
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def get_first_parameter_types(metric_function: Callable) -> set[type]:
    """Return the types accepted by the first parameter of ``metric_function``.

    Args:
        metric_function (Callable): The callable whose signature is inspected.

    Returns:
        set[type]: The types derived from the first parameter's annotation by
        ``get_acceptable_arg_types``. Empty when the callable declares no
        parameters or when its first parameter has no annotation.
    """
    first_parameter = next(
        iter(inspect.signature(metric_function).parameters.values()), None
    )
    if first_parameter is None:
        # Without this guard, next() would leak a StopIteration to the caller.
        logger.debug(f"Function '{metric_function}' declares no parameters.")
        return set()

    input_type = first_parameter.annotation
    if input_type is inspect.Parameter.empty:
        logger.debug(
            f"Function '{metric_function}' has a first parameter with no type annotation."
        )
        return set()

    if isinstance(input_type, str):
        # String annotations (forward references, or every annotation under
        # `from __future__ import annotations`) must be evaluated before they
        # can be compared with real types.
        try:
            evaluated_signature = inspect.signature(metric_function, eval_str=True)
            input_type = next(iter(evaluated_signature.parameters.values())).annotation
        except Exception:
            # Evaluating arbitrary annotation strings can raise anything; keep
            # the unresolved annotation rather than failing the inspection.
            logger.debug(
                f"Could not evaluate the string annotation '{input_type}' of function '{metric_function}'."
            )
    return get_acceptable_arg_types(input_type)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_acceptable_arg_types(input_type: type) -> set[type]:
|
|
76
|
+
# Note: we don't support NewType annotations yet
|
|
77
|
+
origin_type = typing.get_origin(input_type)
|
|
78
|
+
if origin_type is typing.Annotated:
|
|
79
|
+
# unpack Annotated types
|
|
80
|
+
input_type = typing.get_args(input_type)[0]
|
|
81
|
+
origin_type = typing.get_origin(input_type)
|
|
82
|
+
if origin_type in (typing.Union, types.UnionType):
|
|
83
|
+
union_arg_type_sets = [
|
|
84
|
+
get_acceptable_arg_types(type_arg)
|
|
85
|
+
for type_arg in typing.get_args(input_type)
|
|
86
|
+
]
|
|
87
|
+
return set.union(*union_arg_type_sets)
|
|
88
|
+
else: # not a union type
|
|
89
|
+
if origin_type is not None:
|
|
90
|
+
# e.g. input_type=list[str], origin_type=list
|
|
91
|
+
return {origin_type}
|
|
92
|
+
else:
|
|
93
|
+
# e.g. input_type=list, origin_type=list
|
|
94
|
+
if input_type is list or input_type is Iterable:
|
|
95
|
+
logger.warning(
|
|
96
|
+
"Type hint {input_type} lacks the detail that would allow us to determine the specific objects it accepts."
|
|
97
|
+
)
|
|
98
|
+
return {input_type}
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def get_function_input(
    metric_function: Callable,
    metric_level: eval_schema.MetricLevel,
    input_object: AnyFunctionObjectInput,
    context_only: bool,
) -> AnyFunctionObjectInput | str | dict | list:
    """Coerce input_object to a type accepted by metric_function at this metric_level.

    Args:
        metric_function (Callable): Function to invoke with the returned input.
        metric_level (eval_schema.MetricLevel): The metric level at which metric_function is being invoked.
        input_object (AnyFunctionObjectInput): The input_object to be coerced, or passed as-is if accepted by metric_function.
        context_only (bool): Selects ``input_object.get_context()`` instead of
            ``input_object.get_content()`` as the source for strings and lists.

    Raises:
        ValueError: If metric_level is not a valid level, or if the function
            accepts at least one declared type, but
            it's a type we don't support at all e.g. set or
            it's a type we don't support at this metric_level.

    Returns:
        AnyFunctionObjectInput | str | dict | list: The coerced input for metric_function.
    """
    if metric_level not in eval_schema.VALID_METRIC_LEVELS:
        raise ValueError(
            f"metric_level '{metric_level}' not one of the valid levels: {eval_schema.VALID_METRIC_LEVELS}"
        )

    input_type = type(input_object)
    accepted_parameter_types = get_first_parameter_types(metric_function)

    if len(accepted_parameter_types) == 0:
        logger.debug(
            f"Metric function '{metric_function}' has a first parameter with no type hint, so we can't determine if a type transformation needs to be applied."
        )
        return input_object

    if input_type in accepted_parameter_types:
        # No transformation necessary; the function accepts the type we already have.
        return input_object

    if dict in accepted_parameter_types and metric_level == "ToolCall":
        return input_object.get_dict_representation()

    if list in accepted_parameter_types and metric_level in ["Turn", "Thread"]:
        if context_only:
            return input_object.get_context()
        # Not context-only: pass in the object's own content.
        return input_object.get_content()

    if str in accepted_parameter_types:
        if metric_level == "ToolCall":
            raise ValueError(
                "Functions that accept strings can't be used for tool calls. Accept a dict (or a flexeval.classes.tool_call.ToolCall) instead."
            )
        if context_only:
            # Join together everything returned as context.
            return join_all_contents_to_string(input_object.get_context())
        # Current object only.
        return join_all_contents_to_string(input_object.get_content())

    # The function accepts at least one declared type, but either:
    # - it's a type we don't support at all e.g. set
    # - it's a type we don't support at this metric_level
    # An accepted "type" may lack __name__ (e.g. an unresolved string
    # annotation); fall back to repr() so the intended ValueError is raised.
    accepted_names = ", ".join(
        getattr(accepted_type, "__name__", repr(accepted_type))
        for accepted_type in accepted_parameter_types
    )
    raise ValueError(
        f"For metric level '{metric_level}', can't coerce '{input_type.__name__}' for function '{metric_function}' to accepted parameter type(s) '{accepted_names}'."
    )
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def join_all_contents_to_string(content: list[dict] | typing.Any) -> str:
|
|
166
|
+
"""
|
|
167
|
+
content is a list of dictionaries whose keys include 'content'.
|
|
168
|
+
Returns a string with all the 'content' entries concatenated together,
|
|
169
|
+
separated by newline.
|
|
170
|
+
"""
|
|
171
|
+
if isinstance(content, list):
|
|
172
|
+
content = "\n".join([item.get("content", "") for item in content])
|
|
173
|
+
return content
|