python-flexeval 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. flexeval/__init__.py +11 -0
  2. flexeval/__main__.py +11 -0
  3. flexeval/classes/__init__.py +15 -0
  4. flexeval/classes/base.py +32 -0
  5. flexeval/classes/dataset.py +82 -0
  6. flexeval/classes/eval_runner.py +158 -0
  7. flexeval/classes/eval_set_run.py +32 -0
  8. flexeval/classes/message.py +183 -0
  9. flexeval/classes/metric.py +55 -0
  10. flexeval/classes/thread.py +79 -0
  11. flexeval/classes/tool_call.py +51 -0
  12. flexeval/classes/turn.py +206 -0
  13. flexeval/cli.py +104 -0
  14. flexeval/completions.py +147 -0
  15. flexeval/compute_metrics.py +788 -0
  16. flexeval/config.yaml +23 -0
  17. flexeval/configuration/__init__.py +1 -0
  18. flexeval/configuration/completion_functions.py +231 -0
  19. flexeval/configuration/evals.yaml +864 -0
  20. flexeval/configuration/function_metrics.py +650 -0
  21. flexeval/configuration/rubric_metrics.yaml +194 -0
  22. flexeval/data_loader.py +513 -0
  23. flexeval/db_utils.py +38 -0
  24. flexeval/dependency_graph.py +234 -0
  25. flexeval/eval_schema.json +256 -0
  26. flexeval/function_types.py +173 -0
  27. flexeval/helpers.py +52 -0
  28. flexeval/io/__init__.py +1 -0
  29. flexeval/io/parsers/yaml_parser.py +69 -0
  30. flexeval/log_utils.py +34 -0
  31. flexeval/metrics/__init__.py +8 -0
  32. flexeval/metrics/access.py +28 -0
  33. flexeval/metrics/save.py +39 -0
  34. flexeval/rubric.py +62 -0
  35. flexeval/run_utils.py +65 -0
  36. flexeval/runner.py +132 -0
  37. flexeval/schema/__init__.py +11 -0
  38. flexeval/schema/config_schema.py +46 -0
  39. flexeval/schema/eval_schema.py +163 -0
  40. flexeval/schema/evalrun_schema.py +97 -0
  41. flexeval/schema/rubric_schema.py +40 -0
  42. flexeval/schema/schema_utils.py +26 -0
  43. python_flexeval-0.1.5.dist-info/METADATA +118 -0
  44. python_flexeval-0.1.5.dist-info/RECORD +47 -0
  45. python_flexeval-0.1.5.dist-info/WHEEL +4 -0
  46. python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
  47. python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,234 @@
1
+ """Determines how configured metrics depend on each other."""
2
+
3
+ import json
4
+ from typing import Any
5
+
6
+ import networkx as nx
7
+
8
+ from flexeval.helpers import generate_hash
9
+ from flexeval.schema import eval_schema
10
+
11
+
12
def create_metrics_graph(metrics: eval_schema.Metrics) -> list[Any]:
    """Order the configured metrics so that every metric follows its dependencies.

    Each function and rubric metric is dumped to a plain dict and tagged with a
    generated ``id``. Every ``depends_on`` entry is then resolved to the single
    metric it refers to (see ``get_parent_metrics``), a directed graph with
    parent -> child edges is built, and the metrics are returned in
    topological order.

    Args:
        metrics: Object whose ``function`` and ``rubric`` attributes are each
            either ``None`` or a list of items exposing ``model_dump()``.

    Returns:
        One dict per metric, ordered so that parents precede their children.
        Each dict holds ``evaluation_name`` (the item's ``name``),
        ``evaluation_type`` (``"function"`` or ``"rubric"``) and every other
        dumped field except ``function_name``, ``rubric_name``, ``type`` and
        ``name``. Its ``depends_on`` entries carry the resolved ``parent_id``.

    Raises:
        ValueError: If the dependencies form a cycle, or (from
            ``get_parent_metrics``) if a ``depends_on`` entry matches zero or
            several metrics.
    """
    evaluation_types = ("function", "rubric")
    # Keys that are replaced by evaluation_name / evaluation_type in the output.
    excluded_keys = {"function_name", "rubric_name", "type", "name"}

    # Intermediate structure: every listed metric as a dict carrying an ID, so
    # that a dependency can be tied to one specific entry.
    user_metrics_with_ids: dict[str, list[dict]] = {}
    for evaluation_type in evaluation_types:
        item_list = getattr(metrics, evaluation_type)
        user_metrics_with_ids[evaluation_type] = [
            {"id": generate_hash(), **item.model_dump()}
            for item in ([] if item_list is None else item_list)
        ]

    # Now that all potential parents have IDs, find parents for each child and
    # build the graph. Nodes are keyed by the metric's ID rather than by a
    # serialization of its dict: the dict is mutated below (``depends_on`` gains
    # ``parent_id`` values), so a serialized key would differ depending on
    # whether a parent had already been processed.
    graph = nx.DiGraph()
    metric_graph_dict: dict[Any, dict] = {}
    for evaluation_type in evaluation_types:
        for metric_dict in user_metrics_with_ids[evaluation_type]:
            parent_metrics, depends_on_with_parent_ids = get_parent_metrics(
                all_metrics=user_metrics_with_ids, child=metric_dict
            )
            metric_dict["depends_on"] = depends_on_with_parent_ids

            node = metric_dict["id"]
            if parent_metrics:
                # Adding an edge implicitly adds both nodes where necessary.
                for parent_metric_dict in parent_metrics:
                    graph.add_edge(parent_metric_dict["id"], node)
            else:
                # No parent: add the node by itself.
                graph.add_node(node)

            # 'Canonical' representation of the metric that is handed back.
            canonical = {
                "evaluation_name": metric_dict.get("name"),
                "evaluation_type": evaluation_type,
            }
            canonical.update(
                (k, v) for k, v in metric_dict.items() if k not in excluded_keys
            )
            metric_graph_dict[node] = canonical

    if not nx.is_directed_acyclic_graph(graph):
        # Only build the (potentially large) description when it is needed.
        labels = {
            metric_dict["id"]: json.dumps(metric_dict, default=str)
            for metric_list in user_metrics_with_ids.values()
            for metric_dict in metric_list
        }
        graph_string = "Metric Dependencies:" + "".join(
            f"\n{labels[child]} -> {labels[parent]}"
            for parent, child in graph.edges()
        )
        raise ValueError(
            f"The set of metric dependencies must be acyclic! You have cyclical dependencies. {graph_string}"
        )

    # The topological order is the order in which metrics will be evaluated.
    return [metric_graph_dict[node] for node in nx.topological_sort(graph)]
103
+
104
+
105
def get_metric_info(single_metric: dict) -> tuple[str, str]:
    """Describe a single metric dictionary.

    Returns a pair made of
    - the JSON serialization of the whole dictionary (via ``json.dumps``)
    - the value stored under the dictionary's ``"name"`` key, or ``None``
      when that key is absent
    """
    serialized = json.dumps(single_metric)
    evaluation_name = single_metric.get("name")
    return serialized, evaluation_name
112
+
113
+
114
def get_parent_metrics(all_metrics: dict, child: dict) -> tuple[list, list]:
    """Resolve every ``depends_on`` entry of ``child`` to the metric it refers to.

    A metric in ``all_metrics`` qualifies as the parent for a requirement when
    it is of the requirement's ``type`` (if one is given) and agrees with the
    requirement on each of ``metric_level``, ``context_only``, ``name`` and
    ``kwargs`` that the requirement specifies with a non-``None`` value.

    This is not enough to decide whether the child should be run (the child
    may place additional requirements on the parent's output), but it is
    enough to know that the child must be run AFTER the parent.

    Args:
        all_metrics: Maps ``"function"`` / ``"rubric"`` to lists of metric
            dicts, each carrying an ``"id"``.
        child: Metric dict whose ``depends_on`` list (possibly missing or
            ``None``) is resolved.

    Returns:
        ``(parents, depends_on_with_id_added)``: the matched parent dicts, one
        per requirement, and the requirement dicts themselves, each updated in
        place with the ``parent_id`` of its match.

    Raises:
        ValueError: If a requirement matches no metric, or more than one.
    """
    # Defaults are deliberately not assumed for these keys: only values the
    # requirement states explicitly are allowed to rule a candidate out.
    conditionals = ("metric_level", "context_only", "name", "kwargs")

    parents = []
    depends_on_with_id_added = []
    for requirement in child.get("depends_on") or []:
        # If the requirement narrows the type down, only search that type.
        required_type = requirement.get("type")
        if required_type is None:
            allowed_types = ["function", "rubric"]
        else:
            allowed_types = [required_type]

        constraints = {
            key: requirement[key]
            for key in conditionals
            if requirement.get(key) is not None
        }
        candidate_parents = [
            candidate
            for candidate_type in allowed_types
            for candidate in all_metrics.get(candidate_type, [])
            if all(
                candidate.get(key) == value for key, value in constraints.items()
            )
        ]

        if len(candidate_parents) == 0:
            raise ValueError(
                f"We were unable to locate any match for the `depends_on` entry `{json.dumps(requirement, indent=4)}` in the metric `{json.dumps(child, indent=4)}`. The full set of parent candidates is `{json.dumps(all_metrics, indent=4)}`."
            )
        if len(candidate_parents) > 1:
            raise ValueError(
                f"We located more than one match for the `depends_on` entry `{json.dumps(requirement, indent=4)}` in the metric `{json.dumps(child, indent=4)}`. The matches were `{json.dumps(candidate_parents, indent=4)}`. Please add another criterion to disambiguate."
            )

        # Record the parent only once the match is known to be unique, so the
        # error messages above never show a misleading parent_id.
        parent = candidate_parents[0]
        requirement["parent_id"] = parent["id"]
        depends_on_with_id_added.append(requirement)
        parents.append(parent)

    return parents, depends_on_with_id_added
173
+
174
+
175
def apply_defaults(schema, data, path=None):
    """Fill in values missing from ``data`` using the ``default`` entries of ``schema``.

    Args:
        schema: JSON-schema-like mapping; the keys ``default``, ``properties``
            and ``items`` are the only ones consulted.
        data: The value to complete. Dictionaries are updated in place and
            returned; lists are returned as new lists of completed items.
        path: Property names walked so far in the schema. Callers normally
            omit it; it is threaded through the recursion.

    Returns:
        ``data`` with defaults applied. When ``data`` is ``None`` the schema's
        own ``default`` (or ``None`` if it has none) is returned.

    Dictionaries reached at path ``["metrics", "function"]`` or
    ``["metrics", "rubric"]`` additionally get a ``"type"`` key set to
    ``"function"`` or ``"rubric"`` respectively.
    """
    import copy

    if path is None:
        path = []

    if data is None:
        # Copy, so the caller never ends up holding (and possibly mutating)
        # the default object stored inside the schema itself.
        return copy.deepcopy(schema.get("default"))

    if isinstance(data, dict):
        # A dict is matched against the schema's own properties or, failing
        # that, against the properties of its ``items`` schema.
        if "properties" in schema:
            properties = schema["properties"]
        elif "items" in schema and "properties" in schema["items"]:
            properties = schema["items"]["properties"]
        else:
            properties = {}

        for key, subschema in properties.items():
            if key in data:
                # Recurse, extending the path with the current property.
                data[key] = apply_defaults(subschema, data[key], path + [key])
            elif "default" in subschema:
                # Each item gets its own copy of mutable defaults ({} / []).
                data[key] = copy.deepcopy(subschema["default"])

        # List items are visited with their list's path, so these paths
        # identify the individual entries of metrics.function / metrics.rubric.
        if path == ["metrics", "function"]:
            data["type"] = "function"
        if path == ["metrics", "rubric"]:
            data["type"] = "rubric"

        return data

    if isinstance(data, list) and "items" in schema:
        # Apply the item schema to every element, keeping the list's path.
        item_schema = schema["items"]
        return [apply_defaults(item_schema, item, path) for item in data]

    return data
226
+
227
+
228
+ # for verify installation
229
+ # if function_name is defined, rubric
230
+ # make sure "function" and "rubric" default to empty lists
231
+ # TODO - don't set defaults in "depends_on" to make matching more flexible
232
+ # evaluation_name: my_rubric
233
+ # evaluation_type: rubric
234
+ # metric_name: <
@@ -0,0 +1,256 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "type": "object",
4
+ "properties": {
5
+ "data": {
6
+ "type": "array",
7
+ "items": {
8
+ "type": "string"
9
+ },
10
+ "description": "List of absolute or relative paths to data files. Each file must be in *.jsonl format, with one conversation per line.",
11
+ "default":[]
12
+ },
13
+ "do_completion": {
14
+ "type": "boolean",
15
+ "description": "Flag to determine if completions should be done for each conversation. Set to 'true' if you are testing a new API and want to evaluate the API responses. Set to 'false' (default) if you are evaluating past conversations and do not need to generate new completions.",
16
+ "default": false
17
+ },
18
+ "name": {
19
+ "type": "string",
20
+ "description": "Name of the test suite. Used as metadata only. Does not need to match the key of the entry in the evals.yaml file.",
21
+ "default": ""
22
+ },
23
+ "notes": {
24
+ "type": "string",
25
+ "description": "Additional notes regarding the configuration. Used as metadata only.",
26
+ "default": ""
27
+ },
28
+ "config": {
29
+ "type": "object",
30
+ "properties": {
31
+ "max_workers": {
32
+ "type": "integer",
33
+ "description": "The maximum number of worker threads allowed when computing metrics."
34
+ }
35
+ },
36
+ "description": "Specific configuration settings that may override default settings. Look in `src/llm-evals/config.yaml` for other fun things to put here.",
37
+ "additionalProperties": true
38
+ },
39
+ "metrics": {
40
+ "type": "object",
41
+ "properties":{
42
+ "function":{
43
+ "type": "array",
44
+ "description": "List of function-based metrics to be evaluated.",
45
+ "items": {
46
+ "type": "object",
47
+ "properties": {
48
+ "name": {
49
+ "type": "string",
50
+ "description": "The function to call to compute this metric."
51
+ },
52
+ "kwargs": {
53
+ "type": "object",
54
+ "description": "Keyword arguments for the function. Each key must correspond to an argument in the function as implemented in `function_metrics.py`. Extra keys will cause an error.",
55
+ "additionalProperties": true,
56
+ "default": {}
57
+ },
58
+ "depends_on": {
59
+ "type": "array",
60
+ "default": [],
61
+ "description": "List of dependencies that must be satisfied for this metric to be computed.",
62
+ "items": {
63
+ "type": "object",
64
+ "properties": {
65
+ "name": {
66
+ "type": "string",
67
+ "description": "Name of the dependency function or rubric."
68
+ },
69
+ "type": {
70
+ "type": "string",
71
+ "description": "One of 'function' or 'rubric' indicating the type of the dependency.",
72
+ "pattern": "^((function)|(rubric))$"
73
+ },
74
+ "kwargs": {
75
+ "type": "object",
76
+ "description": "The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
77
+ "additionalProperties": true
78
+ },
79
+ "context_only": {
80
+ "type": "boolean",
81
+ "description": "The context_only value for the dependency. If provided, used to match which evaluation this dependency is for."
82
+ },
83
+ "last_turn_only": {
84
+ "type": "boolean",
85
+ "description": "The last_turn_only value for the dependency. If provided, used to match which evaluation this dependency is for."
86
+ },
87
+ "metric_name": {
88
+ "type": "string",
89
+ "description": "Name of the metric dependency. This may be different than function_name if the metric function returns a key/value pair - in which case, this will match the key."
90
+ },
91
+ "metric_min_value": {
92
+ "type": "number",
93
+ "description": "Minimum value of the dependency to consider it as satisfied.",
94
+ "default": -1e20
95
+ },
96
+ "metric_max_value": {
97
+ "type": "number",
98
+ "description": "Maximum value of the dependency to consider it as satisfied.",
99
+ "default": 1e20
100
+ }
101
+ },
102
+ "additionalProperties": false
103
+ },
104
+ "required": ["name"]
105
+ },
106
+ "metric_level": {
107
+ "type": "string",
108
+ "description": "What level of granularity (ToolCall, Message, Turn, or Thread) this function metric should be applied to",
109
+ "default": "Turn"
110
+ },
111
+ "context_only": {
112
+ "type": "boolean",
113
+ "description": "If true, only the context (that is, the previous messages) will be evaluated, not the current object. Cannot be done with only thread",
114
+ "default": false
115
+ },
116
+ "last_instance_only": {
117
+ "type": "boolean",
118
+ "description": "If true, the object will only be evaluated if it's the last instance (i.e., turn or message depending on metric_level) in an existing conversation, or if it's a new completion.",
119
+ "default": false
120
+ }
121
+ },
122
+ "required": ["name"]
123
+ }
124
+ },
125
+ "rubric":{
126
+ "type": "array",
127
+ "description": "List of rubrics to be evaluated",
128
+ "items": {
129
+ "type": "object",
130
+ "properties": {
131
+ "name": {
132
+ "type": "string",
133
+ "description": "The rubric to use to evaluate this metric."
134
+ },
135
+ "kwargs": {
136
+ "type": "object",
137
+ "description": "Keyword arguments for the function. Each key must correspond to an argument in the function as implemented in `function_metrics.py`. Extra keys will cause an error.",
138
+ "additionalProperties": true,
139
+ "default":{}
140
+ },
141
+ "metric_level": {
142
+ "type": "string",
143
+ "description": "What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
144
+ "default": "Turn"
145
+ },
146
+ "context_only": {
147
+ "type": "boolean",
148
+ "description": "If true, only the context (that is, the previous messages) will be evaluated, not the current object. Cannot be done with only thread",
149
+ "default": false
150
+ },
151
+ "last_instance_only": {
152
+ "type": "boolean",
153
+ "description": "If true, the object will only be evaluated if it's the last instance (i.e., turn or message depending on metric_level) in an existing conversation, or if it's a new completion.",
154
+ "default": false
155
+ },
156
+ "depends_on": {
157
+ "type": "array",
158
+ "description": "List of dependencies that must be satisfied for this metric to be computed.",
159
+ "default":[],
160
+ "items": {
161
+ "type": "object",
162
+ "properties": {
163
+ "name": {
164
+ "type": "string",
165
+ "description": "Name of the dependency function or rubric."
166
+ },
167
+ "type": {
168
+ "type": "string",
169
+ "description": "One of 'function' or 'rubric' indicating the type of the dependency.",
170
+ "pattern": "^((function)|(rubric))$"
171
+ },
172
+ "kwargs": {
173
+ "type": "object",
174
+ "description": "The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
175
+ "additionalProperties": true
176
+ },
177
+ "context_only": {
178
+ "type": "boolean",
179
+ "description": "The context_only value for the dependency. If provided, used to match which evaluation this dependency is for."
180
+ },
181
+ "last_turn_only": {
182
+ "type": "boolean",
183
+ "description": "The last_turn_only value for the dependency. If provided, used to match which evaluation this dependency is for."
184
+ },
185
+ "metric_name": {
186
+ "type": "string",
187
+ "description": "Name of the metric dependency. This may be different than function_name if the metric function returns a key/value pair - in which case, this will match the key."
188
+ },
189
+ "metric_min_value": {
190
+ "type": "number",
191
+ "description": "Minimum value of the dependency to consider it as satisfied.",
192
+ "default": -1e20
193
+ },
194
+ "metric_max_value": {
195
+ "type": "number",
196
+ "description": "Maximum value of the dependency to consider it as satisfied.",
197
+ "default": 1e20
198
+ }
199
+ },
200
+ "additionalProperties": false
201
+ },
202
+ "required": ["name"]
203
+ }
204
+ },
205
+ "required": ["name"]
206
+ }
207
+ }
208
+ }
209
+ },
210
+ "completion_llm": {
211
+ "type": "object",
212
+ "description":"Specification of the LLM or API used to perform new completions. Must be defined if `do_completion: true` is set.",
213
+ "properties": {
214
+ "function_name": {
215
+ "type": "string",
216
+ "description": "Completion function defined in `completion_functions.py`. Must be specified."
217
+ },
218
+ "include_system_prompt": {
219
+ "type": "boolean",
220
+ "default": false
221
+ },
222
+ "kwargs": {
223
+ "type": "object",
224
+ "description": "Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
225
+ "default": {},
226
+ "additionalProperties": true
227
+ }
228
+ },
229
+ "required": ["function_name"],
230
+ "additionalProperties": false
231
+ },
232
+ "grader_llm": {
233
+ "type": "object",
234
+ "description":"Specification of the LLM or API used to grade rubrics. Must be defined if any rubric_metrics are specified.",
235
+ "properties": {
236
+ "function_name": {
237
+ "type": "string",
238
+ "description": "Function defined in `completion_functions.py`. We're not really completing a conversation, but we ARE asking an LLM to provide a response to an input - in this case, the rubric."
239
+ },
240
+ "kwargs": {
241
+ "type": "object",
242
+ "description": "Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
243
+ "default": {},
244
+ "additionalProperties": true
245
+ }
246
+ },
247
+ "required": ["function_name"],
248
+ "optional": ["kwargs"],
249
+ "additionalProperties": false
250
+ }
251
+
252
+ },
253
+ "required": ["data", "metrics"],
254
+ "additionalProperties": true
255
+ }
256
+
@@ -0,0 +1,173 @@
1
+ """Inspection utilities that use type hints to determine the appropriate object to pass to a function metric.
2
+
3
+ See :mod:`~flexeval.schema.eval_schema`.
4
+ """
5
+
6
+ import inspect
7
+ import logging
8
+ import types
9
+ import typing
10
+ from collections.abc import Callable, Iterable
11
+
12
+ from flexeval.classes import message, thread, tool_call, turn
13
+ from flexeval.schema import eval_schema
14
+
15
# Union of the FlexEval classes used in this module as the annotation for an
# un-coerced metric input (see get_function_input).
AnyFunctionObjectInput = typing.Union[
    turn.Turn,
    message.Message,
    thread.Thread,
    tool_call.ToolCall,
]
# The same four classes as a set, used for membership tests against the types
# a metric function's first parameter accepts.
FLEXEVAL_TYPE_SET: set[type] = {
    turn.Turn,
    message.Message,
    thread.Thread,
    tool_call.ToolCall,
}

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
29
+
30
+
31
def is_callable_valid_for_metric_level(
    metric_function: Callable, metric_level: eval_schema.MetricLevel
) -> bool:
    """Report whether ``metric_function`` may be used at ``metric_level``.

    The answer is ``True`` exactly when ``metric_level`` is among the levels
    that ``get_valid_levels_for_callable`` derives for the function.
    """
    return metric_level in get_valid_levels_for_callable(metric_function)
36
+
37
+
38
def get_valid_levels_for_callable(metric_function: Callable) -> set[str]:
    """Derive the usable metric_level values from a callable's first parameter.

    Args:
        metric_function (Callable): A callable, probably one available via EvalRun

    Returns:
        set[str]: Valid values for MetricItem.metric_level. Empty when the
            first parameter's accepted types include none of the FlexEval
            classes, ``str``, ``list`` or ``dict``.
    """
    accepted = get_first_parameter_types(metric_function)

    # A FlexEval class accepted directly enables the level named after it.
    levels = {cls.__name__ for cls in FLEXEVAL_TYPE_SET if cls in accepted}

    # Builtin types enable a fixed group of levels each.
    levels_by_builtin = (
        (str, ("Message", "Turn", "Thread")),
        (list, ("Turn", "Thread")),
        (dict, ("ToolCall",)),
    )
    for builtin_type, enabled_levels in levels_by_builtin:
        if builtin_type in accepted:
            levels.update(enabled_levels)
    return levels
61
+
62
+
63
def get_first_parameter_types(metric_function: Callable) -> set[type]:
    """Return the types accepted by the first parameter of ``metric_function``.

    Args:
        metric_function (Callable): The callable whose signature is inspected.

    Returns:
        set[type]: The types derived from the first parameter's annotation by
            ``get_acceptable_arg_types``. Empty when the callable takes no
            parameters or its first parameter has no annotation.
    """
    parameters = inspect.signature(metric_function).parameters
    # Default of None avoids a bare StopIteration for zero-parameter callables.
    first_parameter = next(iter(parameters.values()), None)
    if first_parameter is None:
        logger.debug(f"Function '{metric_function}' accepts no parameters.")
        return set()

    input_type = first_parameter.annotation
    # inspect.Parameter.empty is the public name of the "no annotation" marker.
    if input_type is inspect.Parameter.empty:
        logger.debug(
            f"Function '{metric_function}' has a first parameter with no type annotation."
        )
        return set()
    return get_acceptable_arg_types(input_type)
73
+
74
+
75
+ def get_acceptable_arg_types(input_type: type) -> set[type]:
76
+ # Note: we don't support NewType annotations yet
77
+ origin_type = typing.get_origin(input_type)
78
+ if origin_type is typing.Annotated:
79
+ # unpack Annotated types
80
+ input_type = typing.get_args(input_type)[0]
81
+ origin_type = typing.get_origin(input_type)
82
+ if origin_type in (typing.Union, types.UnionType):
83
+ union_arg_type_sets = [
84
+ get_acceptable_arg_types(type_arg)
85
+ for type_arg in typing.get_args(input_type)
86
+ ]
87
+ return set.union(*union_arg_type_sets)
88
+ else: # not a union type
89
+ if origin_type is not None:
90
+ # e.g. input_type=list[str], origin_type=list
91
+ return {origin_type}
92
+ else:
93
+ # e.g. input_type=list, origin_type=list
94
+ if input_type is list or input_type is Iterable:
95
+ logger.warning(
96
+ "Type hint {input_type} lacks the detail that would allow us to determine the specific objects it accepts."
97
+ )
98
+ return {input_type}
99
+
100
+
101
def get_function_input(
    metric_function: Callable,
    metric_level: eval_schema.MetricLevel,
    input_object: AnyFunctionObjectInput,
    context_only: bool,
) -> AnyFunctionObjectInput | str | dict | list:
    """Produce the argument to pass to metric_function for input_object.

    The object is passed through unchanged when the function's first parameter
    is unannotated or already accepts the object's exact type. Otherwise it is
    converted, in this order of preference, to a dict (ToolCall level only),
    a list (Turn and Thread levels) or a string (any level but ToolCall).

    Args:
        metric_function (Callable): Function to invoke with the returned input.
        metric_level (eval_schema.MetricLevel): The metric level at which metric_function is being invoked.
        input_object (AnyFunctionObjectInput): The object to coerce, or to pass as-is if accepted by metric_function.
        context_only (bool): When true, lists and strings are built from
            ``input_object.get_context()``; otherwise from ``get_content()``.

    Raises:
        ValueError: If metric_level is not a valid level, or if the function
            declares accepted types but none can be produced at this
            metric_level (including string-accepting functions at the
            ToolCall level).

    Returns:
        AnyFunctionObjectInput | str | dict | list: The coerced input for metric_function.
    """
    if metric_level not in eval_schema.VALID_METRIC_LEVELS:
        raise ValueError(
            f"metric_level '{metric_level}' not one of the valid levels: {eval_schema.VALID_METRIC_LEVELS}"
        )

    object_type = type(input_object)
    accepted = get_first_parameter_types(metric_function)

    if not accepted:
        logger.debug(
            f"Metric function '{metric_function}' has a first parameter with no type hint, so we can't determine if a type transformation needs to be applied."
        )
        return input_object

    if object_type in accepted:
        # The function takes what we already have; nothing to convert.
        return input_object

    if dict in accepted and metric_level == "ToolCall":
        return input_object.get_dict_representation()

    def select_content():
        # Previous turns when context_only is set, else the current object.
        if context_only:
            return input_object.get_context()
        return input_object.get_content()

    if list in accepted and metric_level in ("Turn", "Thread"):
        return select_content()

    if str in accepted:
        if metric_level == "ToolCall":
            raise ValueError(
                "Functions that accept strings can't be used for tool calls. Accept a dict (or a flexeval.classes.tool_call.ToolCall) instead."
            )
        return join_all_contents_to_string(select_content())

    # The function declares at least one type, but it is either unsupported
    # altogether (e.g. set) or unsupported at this metric_level.
    accepted_names = ", ".join([accepted_type.__name__ for accepted_type in accepted])
    raise ValueError(
        f"For metric level '{metric_level}', can't coerce '{object_type.__name__}' for function '{metric_function}' to accepted parameter type(s) '{accepted_names}'."
    )
163
+
164
+
165
+ def join_all_contents_to_string(content: list[dict] | typing.Any) -> str:
166
+ """
167
+ content is a list of dictionaries whose keys include 'content'.
168
+ Returns a string with all the 'content' entries concatenated together,
169
+ separated by newline.
170
+ """
171
+ if isinstance(content, list):
172
+ content = "\n".join([item.get("content", "") for item in content])
173
+ return content