python-flexeval 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexeval/__about__.py +1 -1
- flexeval/classes/dataset.py +12 -72
- flexeval/classes/eval_set_run.py +18 -7
- flexeval/classes/jsonview.py +10 -5
- flexeval/classes/message.py +11 -5
- flexeval/classes/metric.py +0 -8
- flexeval/classes/thread.py +0 -2
- flexeval/classes/tool_call.py +0 -2
- flexeval/classes/turn.py +7 -5
- flexeval/completions.py +8 -5
- flexeval/compute_metrics.py +45 -32
- flexeval/configuration/evals.yaml +2 -25
- flexeval/data_loader.py +219 -317
- flexeval/db_utils.py +11 -2
- flexeval/dependency_graph.py +3 -3
- flexeval/eval_schema.json +0 -18
- flexeval/function_types.py +2 -13
- flexeval/metrics/save.py +12 -8
- flexeval/run_utils.py +163 -17
- flexeval/runner.py +6 -14
- flexeval/schema/config_schema.py +12 -0
- flexeval/schema/eval_schema.py +3 -0
- flexeval/schema/evalrun_schema.py +41 -10
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.1.dist-info}/METADATA +3 -3
- python_flexeval-0.4.1.dist-info/RECORD +49 -0
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.1.dist-info}/WHEEL +1 -1
- python_flexeval-0.3.0.dist-info/RECORD +0 -49
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.1.dist-info}/entry_points.txt +0 -0
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.1.dist-info}/licenses/LICENSE +0 -0
flexeval/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.3.0"
|
|
1
|
+
__version__ = "0.4.1"
|
flexeval/classes/dataset.py
CHANGED
|
@@ -1,82 +1,22 @@
|
|
|
1
|
-
import os
|
|
1
|
+
import logging
|
|
2
|
+
from datetime import datetime
|
|
2
3
|
|
|
3
4
|
import peewee as pw
|
|
4
5
|
|
|
5
6
|
from flexeval.classes.base import BaseModel
|
|
6
|
-
from flexeval.classes.
|
|
7
|
+
from flexeval.classes.jsonview import JsonView
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
7
10
|
|
|
8
11
|
|
|
9
12
|
class Dataset(BaseModel):
|
|
10
13
|
"""Holds a dataset, e.g. a jsonl file"""
|
|
11
14
|
|
|
12
15
|
id = pw.IntegerField(primary_key=True)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
# In line with LangGraph expectations, we assume n=1 for all outputs of LLMs
|
|
22
|
-
# However, each node can append list with length 2+ to the message queue
|
|
23
|
-
|
|
24
|
-
# Thread - conversation
|
|
25
|
-
# Turn - adjacent messages from the same agent
|
|
26
|
-
# Message -
|
|
27
|
-
# role - human or ai, user or assistant
|
|
28
|
-
# text - empty string or non-empty
|
|
29
|
-
# list of 0+ Tool Calls
|
|
30
|
-
# post-processing - add a turn_id
|
|
31
|
-
# additional_kwargs JSON
|
|
32
|
-
# ToolUse
|
|
33
|
-
# foreign keys to "invoker" message and "function output" message
|
|
34
|
-
# message that invoked it - foreign key
|
|
35
|
-
# parameters of the input
|
|
36
|
-
# result of tool call
|
|
37
|
-
# Metric
|
|
38
|
-
# granularity type
|
|
39
|
-
# foreign key to the object
|
|
40
|
-
|
|
41
|
-
# **each entry from LangGraph is a LIST of completions - usually with length 1
|
|
42
|
-
|
|
43
|
-
# Completion - has one bit of text content, and 0+ ToolCalls
|
|
44
|
-
# ToolCall - tool call (and response!) associated with the completion
|
|
45
|
-
# completion_id
|
|
46
|
-
# message_id
|
|
47
|
-
# turn_id
|
|
48
|
-
|
|
49
|
-
def load_data(self):
|
|
50
|
-
from flexeval import (
|
|
51
|
-
data_loader,
|
|
52
|
-
) # Local import as this needs to happen after the module is fully loaded
|
|
53
|
-
|
|
54
|
-
if self.filename.endswith(".jsonl"):
|
|
55
|
-
self.datatype = "json"
|
|
56
|
-
data_loader.load_jsonl(
|
|
57
|
-
dataset=self,
|
|
58
|
-
filename=self.filename,
|
|
59
|
-
max_n_conversation_threads=self.max_n_conversation_threads,
|
|
60
|
-
nb_evaluations_per_thread=self.nb_evaluations_per_thread,
|
|
61
|
-
)
|
|
62
|
-
|
|
63
|
-
elif is_sqlite_file(self.filename):
|
|
64
|
-
self.datatype = "sqlite"
|
|
65
|
-
data_loader.load_langgraph_sqlite(
|
|
66
|
-
dataset=self,
|
|
67
|
-
filename=self.filename,
|
|
68
|
-
max_n_conversation_threads=self.max_n_conversation_threads,
|
|
69
|
-
nb_evaluations_per_thread=self.nb_evaluations_per_thread,
|
|
70
|
-
)
|
|
71
|
-
else:
|
|
72
|
-
raise ValueError(
|
|
73
|
-
f"Unsupported format '{os.path.splitext(self.filename)[-1]}'. Each Data File must be either a jsonl or sqlite file. You provided the file: '{self.filename}'"
|
|
74
|
-
)
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def is_sqlite_file(filepath):
|
|
78
|
-
# Open the file in binary mode
|
|
79
|
-
with open(filepath, "rb") as file:
|
|
80
|
-
header = file.read(16)
|
|
81
|
-
# Check if the header matches the SQLite format header
|
|
82
|
-
return header == b"SQLite format 3\x00"
|
|
16
|
+
timestamp = pw.DateTimeField(default=datetime.now)
|
|
17
|
+
datasource_type = pw.TextField(null=False)
|
|
18
|
+
name = pw.TextField(default=None, null=True)
|
|
19
|
+
notes = pw.TextField(default=None, null=True)
|
|
20
|
+
is_loaded = pw.BooleanField(default=False)
|
|
21
|
+
metadata = pw.TextField(default="{}", null=False)
|
|
22
|
+
metadata_dict = JsonView("metadata")
|
flexeval/classes/eval_set_run.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import json
|
|
2
1
|
from datetime import datetime
|
|
3
2
|
|
|
4
3
|
import peewee as pw
|
|
5
4
|
|
|
6
5
|
from flexeval.classes.base import BaseModel
|
|
6
|
+
from flexeval.classes.dataset import Dataset
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class EvalSetRun(BaseModel):
|
|
@@ -12,7 +12,6 @@ class EvalSetRun(BaseModel):
|
|
|
12
12
|
id = pw.IntegerField(primary_key=True)
|
|
13
13
|
name = pw.CharField(null=True)
|
|
14
14
|
notes = pw.TextField(null=True)
|
|
15
|
-
dataset_files = pw.TextField() # JSON string
|
|
16
15
|
metrics = pw.TextField()
|
|
17
16
|
metrics_graph_ordered_list = pw.TextField()
|
|
18
17
|
do_completion = pw.BooleanField()
|
|
@@ -25,8 +24,20 @@ class EvalSetRun(BaseModel):
|
|
|
25
24
|
default=datetime.now
|
|
26
25
|
) # Automatically set to current date and time
|
|
27
26
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
27
|
+
@property
|
|
28
|
+
def dataset_list(self) -> list[Dataset]:
|
|
29
|
+
"""Returns the actual Dataset objects linked to this EvalSetRun via the join table."""
|
|
30
|
+
return list(
|
|
31
|
+
Dataset.select()
|
|
32
|
+
.join(EvalSetRunDatasets)
|
|
33
|
+
.where(EvalSetRunDatasets.evalsetrun == self)
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class EvalSetRunDatasets(BaseModel):
|
|
38
|
+
"""Datasets used by an EvalSetRun."""
|
|
39
|
+
|
|
40
|
+
id = pw.IntegerField(primary_key=True)
|
|
41
|
+
timestamp = pw.DateTimeField(default=datetime.now)
|
|
42
|
+
evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="dataset_links")
|
|
43
|
+
dataset = pw.ForeignKeyField(Dataset, backref="evalsetrun_links")
|
flexeval/classes/jsonview.py
CHANGED
|
@@ -23,7 +23,7 @@ class JsonViewDict(UserDict):
|
|
|
23
23
|
|
|
24
24
|
def _sync_to_model(self):
|
|
25
25
|
"""Sync the current data back to the model field."""
|
|
26
|
-
json_str = self.
|
|
26
|
+
json_str = self.json_dumps_fn(self.data)
|
|
27
27
|
setattr(self.model_instance, self.text_field_attr_name, json_str)
|
|
28
28
|
|
|
29
29
|
# Override mutating methods to trigger sync
|
|
@@ -58,6 +58,14 @@ class JsonViewDict(UserDict):
|
|
|
58
58
|
super().update(*args, **kwargs)
|
|
59
59
|
self._sync_to_model()
|
|
60
60
|
|
|
61
|
+
def refresh_from_model(self):
|
|
62
|
+
"""If the text attribute has been mutated in the model, this method brings the view back in sync.
|
|
63
|
+
|
|
64
|
+
If you're going to use the JsonView, avoid mutating the text attribute directly.
|
|
65
|
+
"""
|
|
66
|
+
text_value = getattr(self.model_instance, self.text_field_attr_name)
|
|
67
|
+
self.update(self.json_loads_fn(text_value))
|
|
68
|
+
|
|
61
69
|
|
|
62
70
|
class JsonView:
|
|
63
71
|
"""Descriptor that provides dict-like access to a JSON text field.
|
|
@@ -66,9 +74,6 @@ class JsonView:
|
|
|
66
74
|
class SomeModel(pw.Model):
|
|
67
75
|
some_field = pw.TextField(default="{}")
|
|
68
76
|
some_field_dict = JsonView(text_field_attr_name="some_field")
|
|
69
|
-
|
|
70
|
-
m = SomeModel()
|
|
71
|
-
m.some_field_dict["chosen_mistake"] = "whatever"
|
|
72
77
|
"""
|
|
73
78
|
|
|
74
79
|
def __init__(self, text_field_attr_name):
|
|
@@ -79,7 +84,7 @@ class JsonView:
|
|
|
79
84
|
"""Called when the descriptor is assigned to a class attribute."""
|
|
80
85
|
self.attr_name = f"_{name}_dict"
|
|
81
86
|
|
|
82
|
-
def __get__(self, instance, owner):
|
|
87
|
+
def __get__(self, instance, owner) -> JsonViewDict:
|
|
83
88
|
if instance is None:
|
|
84
89
|
return self
|
|
85
90
|
|
flexeval/classes/message.py
CHANGED
|
@@ -7,7 +7,6 @@ from playhouse.shortcuts import model_to_dict
|
|
|
7
7
|
|
|
8
8
|
from flexeval.classes.base import BaseModel
|
|
9
9
|
from flexeval.classes.dataset import Dataset
|
|
10
|
-
from flexeval.classes.eval_set_run import EvalSetRun
|
|
11
10
|
from flexeval.classes.thread import Thread
|
|
12
11
|
from flexeval.classes.turn import Turn
|
|
13
12
|
from flexeval.classes.jsonview import JsonView
|
|
@@ -24,7 +23,6 @@ class Message(BaseModel):
|
|
|
24
23
|
|
|
25
24
|
id = pw.IntegerField(primary_key=True)
|
|
26
25
|
|
|
27
|
-
evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="messages")
|
|
28
26
|
dataset = pw.ForeignKeyField(Dataset, backref="messages")
|
|
29
27
|
thread = pw.ForeignKeyField(Thread, backref="messages")
|
|
30
28
|
index_in_thread = pw.IntegerField()
|
|
@@ -71,10 +69,18 @@ class Message(BaseModel):
|
|
|
71
69
|
super().__init__(**kwargs)
|
|
72
70
|
self.metrics_to_evaluate = []
|
|
73
71
|
|
|
74
|
-
def get_completion(
|
|
72
|
+
def get_completion(
|
|
73
|
+
self,
|
|
74
|
+
include_system_prompt=False,
|
|
75
|
+
completion_config: dict | None = None,
|
|
76
|
+
evalsetrun=None,
|
|
77
|
+
):
|
|
75
78
|
# only get a completion if this is the final turn - we probably don't want to branch from mid-conversation
|
|
76
79
|
if self.is_final_turn_in_input:
|
|
77
|
-
completion_config
|
|
80
|
+
if completion_config is None:
|
|
81
|
+
raise ValueError(
|
|
82
|
+
"completion_config must be provided to get_completion()"
|
|
83
|
+
)
|
|
78
84
|
completion_fn_name = completion_config.get("function_name", None)
|
|
79
85
|
completion_function_kwargs = completion_config.get("kwargs", None)
|
|
80
86
|
|
|
@@ -104,7 +110,7 @@ class Message(BaseModel):
|
|
|
104
110
|
# which generally means it'll have a structure like this
|
|
105
111
|
# {"choices": [{"message": {"content": "hi", "role": "assistant"}}]}
|
|
106
112
|
result = model_to_dict(self, exclude=[self.id])
|
|
107
|
-
result["evalsetrun"] =
|
|
113
|
+
result["evalsetrun"] = evalsetrun
|
|
108
114
|
result["dataset"] = self.dataset
|
|
109
115
|
result["datasetrow"] = self.datasetrow
|
|
110
116
|
result["turn_number"] = self.turn_number + 1
|
flexeval/classes/metric.py
CHANGED
|
@@ -37,14 +37,6 @@ class Metric(BaseModel):
|
|
|
37
37
|
null=True
|
|
38
38
|
) # necessary if rubric result is INVALID or e.g. latency doesn't apply to the very first message
|
|
39
39
|
kwargs = pw.TextField()
|
|
40
|
-
# context_only allows us to create another kind of dependency
|
|
41
|
-
# where we can quantify something about the previous conversation
|
|
42
|
-
# and then use that quantity in a downstream analysis
|
|
43
|
-
# e.g. 'would a plot be pedagogically appropriate here' is really a question about the PAST of the conversation
|
|
44
|
-
# NOTE: but we have gotten rid of context_only for rubrics, where only {context} is used so technically here 'context_only' is False
|
|
45
|
-
# or 'was the conversation ever flagged by the moderation api' would be a question about the previous turns that might
|
|
46
|
-
# allow to have better context for the properties of this turn
|
|
47
|
-
# context_only = pw.BooleanField(default=False)
|
|
48
40
|
source = pw.TextField() # TODO - make another table for this? But maybe not, because this also contains filled-in rubrics
|
|
49
41
|
depends_on = pw.TextField()
|
|
50
42
|
rubric_prompt = pw.TextField(null=True)
|
flexeval/classes/thread.py
CHANGED
|
@@ -2,7 +2,6 @@ import peewee as pw
|
|
|
2
2
|
|
|
3
3
|
from flexeval.classes.base import BaseModel
|
|
4
4
|
from flexeval.classes.dataset import Dataset
|
|
5
|
-
from flexeval.classes.eval_set_run import EvalSetRun
|
|
6
5
|
from flexeval.classes.jsonview import JsonView
|
|
7
6
|
|
|
8
7
|
|
|
@@ -13,7 +12,6 @@ class Thread(BaseModel):
|
|
|
13
12
|
|
|
14
13
|
id = pw.IntegerField(primary_key=True)
|
|
15
14
|
dataset = pw.ForeignKeyField(Dataset, backref="threads")
|
|
16
|
-
evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="threads")
|
|
17
15
|
|
|
18
16
|
langgraph_thread_id = pw.TextField(null=True)
|
|
19
17
|
eval_run_thread_id = pw.TextField(null=True)
|
flexeval/classes/tool_call.py
CHANGED
|
@@ -2,7 +2,6 @@ import peewee as pw
|
|
|
2
2
|
|
|
3
3
|
from flexeval.classes.base import BaseModel
|
|
4
4
|
from flexeval.classes.dataset import Dataset
|
|
5
|
-
from flexeval.classes.eval_set_run import EvalSetRun
|
|
6
5
|
from flexeval.classes.message import Message
|
|
7
6
|
from flexeval.classes.thread import Thread
|
|
8
7
|
from flexeval.classes.turn import Turn
|
|
@@ -16,7 +15,6 @@ class ToolCall(BaseModel):
|
|
|
16
15
|
|
|
17
16
|
id = pw.IntegerField(primary_key=True)
|
|
18
17
|
|
|
19
|
-
evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="toolcalls")
|
|
20
18
|
dataset = pw.ForeignKeyField(Dataset, backref="toolcalls")
|
|
21
19
|
thread = pw.ForeignKeyField(Thread, backref="toolcalls")
|
|
22
20
|
message = pw.ForeignKeyField(Message, backref="toolcalls")
|
flexeval/classes/turn.py
CHANGED
|
@@ -7,7 +7,6 @@ from playhouse.shortcuts import model_to_dict
|
|
|
7
7
|
|
|
8
8
|
from flexeval.classes.base import BaseModel
|
|
9
9
|
from flexeval.classes.dataset import Dataset
|
|
10
|
-
from flexeval.classes.eval_set_run import EvalSetRun
|
|
11
10
|
from flexeval.classes.thread import Thread
|
|
12
11
|
from flexeval.configuration import completion_functions
|
|
13
12
|
|
|
@@ -22,7 +21,6 @@ class Turn(BaseModel):
|
|
|
22
21
|
|
|
23
22
|
id = pw.IntegerField(primary_key=True)
|
|
24
23
|
|
|
25
|
-
evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="turns")
|
|
26
24
|
dataset = pw.ForeignKeyField(Dataset, backref="turns")
|
|
27
25
|
thread = pw.ForeignKeyField(Thread, backref="turns")
|
|
28
26
|
index_in_thread = pw.IntegerField()
|
|
@@ -32,10 +30,13 @@ class Turn(BaseModel):
|
|
|
32
30
|
super().__init__(**kwargs)
|
|
33
31
|
self.metrics_to_evaluate = []
|
|
34
32
|
|
|
35
|
-
def get_completion(self):
|
|
33
|
+
def get_completion(self, completion_config: dict | None = None, evalsetrun=None):
|
|
36
34
|
# only get a completion if this is the final turn - we probably don't want to branch from mid-conversation
|
|
37
35
|
if self.is_final_turn_in_input:
|
|
38
|
-
completion_config
|
|
36
|
+
if completion_config is None:
|
|
37
|
+
raise ValueError(
|
|
38
|
+
"completion_config must be provided to get_completion()"
|
|
39
|
+
)
|
|
39
40
|
completion_fn_name = completion_config.get("function_name", None)
|
|
40
41
|
completion_function_kwargs = completion_config.get("kwargs", None)
|
|
41
42
|
|
|
@@ -69,7 +70,7 @@ class Turn(BaseModel):
|
|
|
69
70
|
# - make the completion function just return content?
|
|
70
71
|
# {"choices": [{"message": {"content": "hi", "role": "assistant"}}]}
|
|
71
72
|
result = model_to_dict(self, exclude=[self.id])
|
|
72
|
-
result["evalsetrun"] =
|
|
73
|
+
result["evalsetrun"] = evalsetrun
|
|
73
74
|
result["dataset"] = self.dataset
|
|
74
75
|
result["datasetrow"] = self.datasetrow
|
|
75
76
|
result["turn_number"] = self.turn_number + 1
|
|
@@ -108,6 +109,7 @@ class Turn(BaseModel):
|
|
|
108
109
|
"""
|
|
109
110
|
context = ""
|
|
110
111
|
for message in self.messages:
|
|
112
|
+
# TODO why not just use message.get_context(include_system_prompt=include_system_prompt) here?
|
|
111
113
|
context = message.context
|
|
112
114
|
break
|
|
113
115
|
context = json.loads(context)
|
flexeval/completions.py
CHANGED
|
@@ -55,10 +55,15 @@ def get_completion(turn: classes.turn.Turn, completion_llm: CompletionLlm):
|
|
|
55
55
|
return completion
|
|
56
56
|
|
|
57
57
|
|
|
58
|
-
def get_completions(eval_run: EvalRun, evalsetrun: classes.eval_set_run.EvalSetRun):
|
|
58
|
+
def get_completions(
|
|
59
|
+
eval_run: EvalRun,
|
|
60
|
+
evalsetrun: classes.eval_set_run.EvalSetRun,
|
|
61
|
+
datasets: list[classes.dataset.Dataset],
|
|
62
|
+
):
|
|
59
63
|
n_workers = eval_run.config.max_workers
|
|
64
|
+
threads = [thread for dataset in datasets for thread in dataset.threads]
|
|
60
65
|
if n_workers == 1:
|
|
61
|
-
for thread in
|
|
66
|
+
for thread in threads:
|
|
62
67
|
# select last turn in thread
|
|
63
68
|
if len(thread.turns) == 0:
|
|
64
69
|
continue
|
|
@@ -75,7 +80,7 @@ def get_completions(eval_run: EvalRun, evalsetrun: classes.eval_set_run.EvalSetR
|
|
|
75
80
|
else:
|
|
76
81
|
with ThreadPoolExecutor(max_workers=n_workers) as executor:
|
|
77
82
|
futures: dict[Future, classes.turn.Turn] = {}
|
|
78
|
-
for thread in
|
|
83
|
+
for thread in threads:
|
|
79
84
|
if len(thread.turns) == 0:
|
|
80
85
|
continue
|
|
81
86
|
turn = (
|
|
@@ -113,7 +118,6 @@ def save_completion(
|
|
|
113
118
|
new_turn = turn
|
|
114
119
|
else:
|
|
115
120
|
new_turn = classes.turn.Turn.create(
|
|
116
|
-
evalsetrun=evalsetrun,
|
|
117
121
|
dataset=turn.dataset,
|
|
118
122
|
thread=turn.thread,
|
|
119
123
|
index_in_thread=turn.index_in_thread + 1,
|
|
@@ -129,7 +133,6 @@ def save_completion(
|
|
|
129
133
|
{"role": prev_message.role, "content": prev_message.content}
|
|
130
134
|
)
|
|
131
135
|
classes.message.Message.create(
|
|
132
|
-
evalsetrun=evalsetrun,
|
|
133
136
|
dataset=turn.dataset,
|
|
134
137
|
thread=turn.thread,
|
|
135
138
|
turn=new_turn,
|
flexeval/compute_metrics.py
CHANGED
|
@@ -14,6 +14,7 @@ from typing import Iterable, Union
|
|
|
14
14
|
import networkx as nx
|
|
15
15
|
|
|
16
16
|
from flexeval import function_types
|
|
17
|
+
from flexeval.classes.dataset import Dataset
|
|
17
18
|
from flexeval.classes.eval_set_run import EvalSetRun
|
|
18
19
|
from flexeval.classes.message import Message
|
|
19
20
|
from flexeval.classes.thread import Thread
|
|
@@ -159,8 +160,8 @@ class MetricGraphBuilder:
|
|
|
159
160
|
metric = self.metric_id_map[metric_id]
|
|
160
161
|
return self.get_or_create_object_metric(dependency_metric_level, object, metric)
|
|
161
162
|
|
|
162
|
-
def build_thread_task_graphs(self,
|
|
163
|
-
threads =
|
|
163
|
+
def build_thread_task_graphs(self, dataset: Dataset) -> Iterable[nx.DiGraph]:
|
|
164
|
+
threads = dataset.threads
|
|
164
165
|
for thread in threads:
|
|
165
166
|
yield self.build_thread_task_graph(thread)
|
|
166
167
|
|
|
@@ -208,28 +209,35 @@ class MetricGraphBuilder:
|
|
|
208
209
|
return g
|
|
209
210
|
|
|
210
211
|
|
|
211
|
-
def compute_metrics(
|
|
212
|
+
def compute_metrics(
|
|
213
|
+
evalrun: EvalRun, evalsetrun: EvalSetRun, datasets: list[Dataset]
|
|
214
|
+
) -> list[dict]:
|
|
212
215
|
n_workers = evalrun.config.max_workers
|
|
213
216
|
raise_on_error = evalrun.config.raise_on_metric_error
|
|
214
217
|
mgb = MetricGraphBuilder()
|
|
215
218
|
mgb.build_metric_structures(evalsetrun)
|
|
216
|
-
graphs = mgb.build_thread_task_graphs(evalsetrun)
|
|
217
219
|
mc = MetricComputer.from_evalrun(evalrun, evalsetrun)
|
|
218
220
|
metrics = []
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
metrics.extend(graph_metrics)
|
|
223
|
-
else:
|
|
224
|
-
with ThreadPoolExecutor(max_workers=n_workers) as executor:
|
|
225
|
-
futures = []
|
|
221
|
+
for dataset in datasets:
|
|
222
|
+
graphs = mgb.build_thread_task_graphs(dataset)
|
|
223
|
+
if n_workers == 1:
|
|
226
224
|
for graph in graphs:
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
metrics.extend(
|
|
231
|
-
|
|
232
|
-
|
|
225
|
+
graph_metrics = mc.process_thread_dependency_graph(
|
|
226
|
+
graph, raise_on_error
|
|
227
|
+
)
|
|
228
|
+
metrics.extend(graph_metrics)
|
|
229
|
+
else:
|
|
230
|
+
with ThreadPoolExecutor(max_workers=n_workers) as executor:
|
|
231
|
+
futures = []
|
|
232
|
+
for graph in graphs:
|
|
233
|
+
future = executor.submit(mc.process_thread_dependency_graph, graph)
|
|
234
|
+
futures.append(future)
|
|
235
|
+
for i, future in enumerate(futures):
|
|
236
|
+
metrics.extend(future.result())
|
|
237
|
+
if i % 100 == 0:
|
|
238
|
+
logger.info(
|
|
239
|
+
f"Metrics futures resulted: {i + 1} / {len(futures)}"
|
|
240
|
+
)
|
|
233
241
|
return metrics
|
|
234
242
|
|
|
235
243
|
|
|
@@ -296,10 +304,18 @@ class MetricComputer:
|
|
|
296
304
|
self.rubrics: dict | None = (
|
|
297
305
|
self.load_rubrics(evalsetrun) if evalsetrun is not None else None
|
|
298
306
|
)
|
|
307
|
+
self.do_completion: bool = (
|
|
308
|
+
evalsetrun.do_completion if evalsetrun is not None else False
|
|
309
|
+
)
|
|
310
|
+
self.grader_llm: str | None = (
|
|
311
|
+
evalsetrun.grader_llm if evalsetrun is not None else None
|
|
312
|
+
)
|
|
299
313
|
|
|
300
|
-
def load_rubrics(self, evalsetrun: EvalSetRun):
|
|
301
|
-
"""
|
|
302
|
-
|
|
314
|
+
def load_rubrics(self, evalsetrun: EvalSetRun) -> dict:
|
|
315
|
+
"""Load and return rubrics from the given EvalSetRun."""
|
|
316
|
+
rubrics = json.loads(evalsetrun.rubrics)
|
|
317
|
+
self.rubrics = rubrics
|
|
318
|
+
return rubrics
|
|
303
319
|
|
|
304
320
|
def process_thread_dependency_graphs(
|
|
305
321
|
self, graph_list: Iterable[nx.DiGraph]
|
|
@@ -467,7 +483,6 @@ class MetricComputer:
|
|
|
467
483
|
evaluation_type: str,
|
|
468
484
|
metric_level: str,
|
|
469
485
|
kwargs: dict,
|
|
470
|
-
context_only: bool = None,
|
|
471
486
|
depends_on: list = None,
|
|
472
487
|
id: int = None,
|
|
473
488
|
notes: str = None, # just a placeholder
|
|
@@ -477,7 +492,6 @@ class MetricComputer:
|
|
|
477
492
|
function_name=evaluation_name,
|
|
478
493
|
metric_kwargs=kwargs,
|
|
479
494
|
metric_level=metric_level,
|
|
480
|
-
context_only=context_only,
|
|
481
495
|
input_object=object,
|
|
482
496
|
depends_on=depends_on,
|
|
483
497
|
id=id,
|
|
@@ -515,10 +529,9 @@ class MetricComputer:
|
|
|
515
529
|
metric_level: eval_schema.MetricLevel,
|
|
516
530
|
input_object: function_types.AnyFunctionObjectInput,
|
|
517
531
|
metric_kwargs: dict,
|
|
518
|
-
context_only: bool,
|
|
519
532
|
):
|
|
520
533
|
function_input = function_types.get_function_input(
|
|
521
|
-
metric_function, metric_level, input_object
|
|
534
|
+
metric_function, metric_level, input_object
|
|
522
535
|
)
|
|
523
536
|
metrics_result = metric_function(function_input, **metric_kwargs)
|
|
524
537
|
return metrics_result
|
|
@@ -541,7 +554,6 @@ class MetricComputer:
|
|
|
541
554
|
metric_kwargs: dict,
|
|
542
555
|
input_object: Union[Thread, Turn, Message, ToolCall],
|
|
543
556
|
metric_level: eval_schema.MetricLevel,
|
|
544
|
-
context_only: bool,
|
|
545
557
|
depends_on: list,
|
|
546
558
|
id: int,
|
|
547
559
|
):
|
|
@@ -552,7 +564,7 @@ class MetricComputer:
|
|
|
552
564
|
# Check if the function exists in any of the function namespaces
|
|
553
565
|
metric_function, metric_source = self.find_function(function_name)
|
|
554
566
|
metrics_result = self.invoke_function(
|
|
555
|
-
metric_function, metric_level, input_object, metric_kwargs
|
|
567
|
+
metric_function, metric_level, input_object, metric_kwargs
|
|
556
568
|
)
|
|
557
569
|
|
|
558
570
|
base_result = {
|
|
@@ -562,7 +574,6 @@ class MetricComputer:
|
|
|
562
574
|
"metric_level": metric_level,
|
|
563
575
|
"kwargs": metric_kwargs,
|
|
564
576
|
"source": metric_source, # TODO - put this back?
|
|
565
|
-
"context_only": context_only,
|
|
566
577
|
"depends_on": depends_on,
|
|
567
578
|
"id": id,
|
|
568
579
|
}
|
|
@@ -611,7 +622,9 @@ class MetricComputer:
|
|
|
611
622
|
if self.rubrics is not None:
|
|
612
623
|
rubrics = self.rubrics
|
|
613
624
|
else:
|
|
614
|
-
|
|
625
|
+
raise ValueError(
|
|
626
|
+
"No rubrics loaded. Rubrics must be loaded via MetricComputer.from_evalrun() before computing rubric metrics."
|
|
627
|
+
)
|
|
615
628
|
if rubric_name not in rubrics:
|
|
616
629
|
raise ValueError(
|
|
617
630
|
f"You requested a rubric called '{rubric_name}', but only these were found: {rubrics.keys()}."
|
|
@@ -643,7 +656,7 @@ class MetricComputer:
|
|
|
643
656
|
"Your rubric should not have both {content} and {completion}. Please check the README file for more information about how to write FlexEval rubrics."
|
|
644
657
|
)
|
|
645
658
|
|
|
646
|
-
if "{completion}" in prompt and not
|
|
659
|
+
if "{completion}" in prompt and not self.do_completion:
|
|
647
660
|
raise Exception(
|
|
648
661
|
"Your rubric has {completion}, but in your test specification for this rubric evaluation, do_completion is not True. Please check the README file for more information about how to write FlexEval rubrics."
|
|
649
662
|
)
|
|
@@ -656,7 +669,7 @@ class MetricComputer:
|
|
|
656
669
|
)
|
|
657
670
|
|
|
658
671
|
# with do_completion == True, only the completion is evaluated with or without the context.
|
|
659
|
-
if
|
|
672
|
+
if self.do_completion and "{completion}" in prompt:
|
|
660
673
|
# TODO revisit this logic
|
|
661
674
|
# also included object.is_completion, which only works for Message rubrics
|
|
662
675
|
# but we can in principle check for a message in either a turn or a thread with is_flexeval_completion true
|
|
@@ -665,11 +678,11 @@ class MetricComputer:
|
|
|
665
678
|
choice_scores = rubrics.get(rubric_name).get("choice_scores")
|
|
666
679
|
|
|
667
680
|
# get rubric grader
|
|
668
|
-
if
|
|
681
|
+
if self.grader_llm is None or self.grader_llm == "":
|
|
669
682
|
raise ValueError(
|
|
670
683
|
"Attempting to evaluate a rubric metric, but no grader LLM defined."
|
|
671
684
|
)
|
|
672
|
-
grader_completion_function = json.loads(
|
|
685
|
+
grader_completion_function = json.loads(self.grader_llm)
|
|
673
686
|
if grader_completion_function is None or len(grader_completion_function) == 0:
|
|
674
687
|
raise ValueError(
|
|
675
688
|
"Attempting to evaluate a rubric metric, but no grader LLM defined."
|