python-flexeval 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
flexeval/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.0"
1
+ __version__ = "0.4.1"
@@ -1,82 +1,22 @@
1
- import os.path
1
+ import logging
2
+ from datetime import datetime
2
3
 
3
4
  import peewee as pw
4
5
 
5
6
  from flexeval.classes.base import BaseModel
6
- from flexeval.classes.eval_set_run import EvalSetRun
7
+ from flexeval.classes.jsonview import JsonView
8
+
9
+ logger = logging.getLogger(__name__)
7
10
 
8
11
 
9
12
  class Dataset(BaseModel):
10
13
  """Holds a dataset, e.g. a jsonl file"""
11
14
 
12
15
  id = pw.IntegerField(primary_key=True)
13
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="datasets")
14
- filename = pw.TextField()
15
- datatype = pw.TextField(null=True)
16
- contents = pw.TextField(null=True) # raw contents
17
-
18
- max_n_conversation_threads = pw.IntegerField(null=True)
19
- nb_evaluations_per_thread = pw.IntegerField(null=True, default=1)
20
-
21
- # In line with LangGraph expectations, we assume n=1 for all outputs of LLMs
22
- # However, each node can append list with length 2+ to the message queue
23
-
24
- # Thread - conversation
25
- # Turn - adjacent messages from the same agent
26
- # Message -
27
- # role - human or ai, user or assistant
28
- # text - empty string or non-empty
29
- # list of 0+ Tool Calls
30
- # post-processing - add a turn_id
31
- # additional_kwargs JSON
32
- # ToolUse
33
- # foreign keys to "invoker" message and "function output" message
34
- # message that invoked it - foreign key
35
- # parameters of the input
36
- # result of tool call
37
- # Metric
38
- # granularity type
39
- # foreign key to the object
40
-
41
- # **each entry from LangGraph is a LIST of completions - usually with length 1
42
-
43
- # Completion - has one bit of text content, and 0+ ToolCalls
44
- # ToolCall - tool call (and response!) associated with the completion
45
- # completion_id
46
- # message_id
47
- # turn_id
48
-
49
- def load_data(self):
50
- from flexeval import (
51
- data_loader,
52
- ) # Local import as this needs to happen after the module is fully loaded
53
-
54
- if self.filename.endswith(".jsonl"):
55
- self.datatype = "json"
56
- data_loader.load_jsonl(
57
- dataset=self,
58
- filename=self.filename,
59
- max_n_conversation_threads=self.max_n_conversation_threads,
60
- nb_evaluations_per_thread=self.nb_evaluations_per_thread,
61
- )
62
-
63
- elif is_sqlite_file(self.filename):
64
- self.datatype = "sqlite"
65
- data_loader.load_langgraph_sqlite(
66
- dataset=self,
67
- filename=self.filename,
68
- max_n_conversation_threads=self.max_n_conversation_threads,
69
- nb_evaluations_per_thread=self.nb_evaluations_per_thread,
70
- )
71
- else:
72
- raise ValueError(
73
- f"Unsupported format '{os.path.splitext(self.filename)[-1]}'. Each Data File must be either a jsonl or sqlite file. You provided the file: '{self.filename}'"
74
- )
75
-
76
-
77
- def is_sqlite_file(filepath):
78
- # Open the file in binary mode
79
- with open(filepath, "rb") as file:
80
- header = file.read(16)
81
- # Check if the header matches the SQLite format header
82
- return header == b"SQLite format 3\x00"
16
+ timestamp = pw.DateTimeField(default=datetime.now)
17
+ datasource_type = pw.TextField(null=False)
18
+ name = pw.TextField(default=None, null=True)
19
+ notes = pw.TextField(default=None, null=True)
20
+ is_loaded = pw.BooleanField(default=False)
21
+ metadata = pw.TextField(default="{}", null=False)
22
+ metadata_dict = JsonView("metadata")
@@ -1,9 +1,9 @@
1
- import json
2
1
  from datetime import datetime
3
2
 
4
3
  import peewee as pw
5
4
 
6
5
  from flexeval.classes.base import BaseModel
6
+ from flexeval.classes.dataset import Dataset
7
7
 
8
8
 
9
9
  class EvalSetRun(BaseModel):
@@ -12,7 +12,6 @@ class EvalSetRun(BaseModel):
12
12
  id = pw.IntegerField(primary_key=True)
13
13
  name = pw.CharField(null=True)
14
14
  notes = pw.TextField(null=True)
15
- dataset_files = pw.TextField() # JSON string
16
15
  metrics = pw.TextField()
17
16
  metrics_graph_ordered_list = pw.TextField()
18
17
  do_completion = pw.BooleanField()
@@ -25,8 +24,20 @@ class EvalSetRun(BaseModel):
25
24
  default=datetime.now
26
25
  ) # Automatically set to current date and time
27
26
 
28
- def get_datasets(self) -> list[str]:
29
- # TODO Turn these into DataSource instances instead, returning list[DataSource]
30
- temp = json.loads(self.dataset_files)
31
- assert isinstance(temp, list), "The `data` entry in evals.yaml must be a list."
32
- return temp
27
+ @property
28
+ def dataset_list(self) -> list[Dataset]:
29
+ """Returns the actual Dataset objects linked to this EvalSetRun via the join table."""
30
+ return list(
31
+ Dataset.select()
32
+ .join(EvalSetRunDatasets)
33
+ .where(EvalSetRunDatasets.evalsetrun == self)
34
+ )
35
+
36
+
37
+ class EvalSetRunDatasets(BaseModel):
38
+ """Datasets used by an EvalSetRun."""
39
+
40
+ id = pw.IntegerField(primary_key=True)
41
+ timestamp = pw.DateTimeField(default=datetime.now)
42
+ evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="dataset_links")
43
+ dataset = pw.ForeignKeyField(Dataset, backref="evalsetrun_links")
@@ -23,7 +23,7 @@ class JsonViewDict(UserDict):
23
23
 
24
24
  def _sync_to_model(self):
25
25
  """Sync the current data back to the model field."""
26
- json_str = self.json_loads_fn(self.data)
26
+ json_str = self.json_dumps_fn(self.data)
27
27
  setattr(self.model_instance, self.text_field_attr_name, json_str)
28
28
 
29
29
  # Override mutating methods to trigger sync
@@ -58,6 +58,14 @@ class JsonViewDict(UserDict):
58
58
  super().update(*args, **kwargs)
59
59
  self._sync_to_model()
60
60
 
61
+ def refresh_from_model(self):
62
+ """If the text attribute has been mutated in the model, this method brings the view back in sync.
63
+
64
+ If you're going to use the JsonView, avoid mutating the text attribute directly.
65
+ """
66
+ text_value = getattr(self.model_instance, self.text_field_attr_name)
67
+ self.update(self.json_loads_fn(text_value))
68
+
61
69
 
62
70
  class JsonView:
63
71
  """Descriptor that provides dict-like access to a JSON text field.
@@ -66,9 +74,6 @@ class JsonView:
66
74
  class SomeModel(pw.Model):
67
75
  some_field = pw.TextField(default="{}")
68
76
  some_field_dict = JsonView(text_field_attr_name="some_field")
69
-
70
- m = SomeModel()
71
- m.some_field_dict["chosen_mistake"] = "whatever"
72
77
  """
73
78
 
74
79
  def __init__(self, text_field_attr_name):
@@ -79,7 +84,7 @@ class JsonView:
79
84
  """Called when the descriptor is assigned to a class attribute."""
80
85
  self.attr_name = f"_{name}_dict"
81
86
 
82
- def __get__(self, instance, owner):
87
+ def __get__(self, instance, owner) -> JsonViewDict:
83
88
  if instance is None:
84
89
  return self
85
90
 
@@ -7,7 +7,6 @@ from playhouse.shortcuts import model_to_dict
7
7
 
8
8
  from flexeval.classes.base import BaseModel
9
9
  from flexeval.classes.dataset import Dataset
10
- from flexeval.classes.eval_set_run import EvalSetRun
11
10
  from flexeval.classes.thread import Thread
12
11
  from flexeval.classes.turn import Turn
13
12
  from flexeval.classes.jsonview import JsonView
@@ -24,7 +23,6 @@ class Message(BaseModel):
24
23
 
25
24
  id = pw.IntegerField(primary_key=True)
26
25
 
27
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="messages")
28
26
  dataset = pw.ForeignKeyField(Dataset, backref="messages")
29
27
  thread = pw.ForeignKeyField(Thread, backref="messages")
30
28
  index_in_thread = pw.IntegerField()
@@ -71,10 +69,18 @@ class Message(BaseModel):
71
69
  super().__init__(**kwargs)
72
70
  self.metrics_to_evaluate = []
73
71
 
74
- def get_completion(self, include_system_prompt=False):
72
+ def get_completion(
73
+ self,
74
+ include_system_prompt=False,
75
+ completion_config: dict | None = None,
76
+ evalsetrun=None,
77
+ ):
75
78
  # only get a completion if this is the final turn - we probably don't want to branch from mid-conversation
76
79
  if self.is_final_turn_in_input:
77
- completion_config = json.loads(self.evalsetrun.completion_llm)
80
+ if completion_config is None:
81
+ raise ValueError(
82
+ "completion_config must be provided to get_completion()"
83
+ )
78
84
  completion_fn_name = completion_config.get("function_name", None)
79
85
  completion_function_kwargs = completion_config.get("kwargs", None)
80
86
 
@@ -104,7 +110,7 @@ class Message(BaseModel):
104
110
  # which generally means it'll have a structure like this
105
111
  # {"choices": [{"message": {"content": "hi", "role": "assistant"}}]}
106
112
  result = model_to_dict(self, exclude=[self.id])
107
- result["evalsetrun"] = self.evalsetrun
113
+ result["evalsetrun"] = evalsetrun
108
114
  result["dataset"] = self.dataset
109
115
  result["datasetrow"] = self.datasetrow
110
116
  result["turn_number"] = self.turn_number + 1
@@ -37,14 +37,6 @@ class Metric(BaseModel):
37
37
  null=True
38
38
  ) # necessary if rubric result is INVALID or e.g. latency doesn't apply to the very first message
39
39
  kwargs = pw.TextField()
40
- # context_only allows us to create another kind of dependency
41
- # where we can quantify something about the previous conversation
42
- # and then use that quantity in a downstream analysis
43
- # e.g. 'would a plot be pedagogically appropriate here' is really a question about the PAST of the conversation
44
- # NOTE: but we have gotten rid of context_only for rubrics, where only {context} is used so technically here 'context_only' is False
45
- # or 'was the conversation ever flagged by the moderation api' would be a question about the previous turns that might
46
- # allow to have better context for the properties of this turn
47
- # context_only = pw.BooleanField(default=False)
48
40
  source = pw.TextField() # TODO - make another table for this? But maybe not, because this also contains filled-in rubrics
49
41
  depends_on = pw.TextField()
50
42
  rubric_prompt = pw.TextField(null=True)
@@ -2,7 +2,6 @@ import peewee as pw
2
2
 
3
3
  from flexeval.classes.base import BaseModel
4
4
  from flexeval.classes.dataset import Dataset
5
- from flexeval.classes.eval_set_run import EvalSetRun
6
5
  from flexeval.classes.jsonview import JsonView
7
6
 
8
7
 
@@ -13,7 +12,6 @@ class Thread(BaseModel):
13
12
 
14
13
  id = pw.IntegerField(primary_key=True)
15
14
  dataset = pw.ForeignKeyField(Dataset, backref="threads")
16
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="threads")
17
15
 
18
16
  langgraph_thread_id = pw.TextField(null=True)
19
17
  eval_run_thread_id = pw.TextField(null=True)
@@ -2,7 +2,6 @@ import peewee as pw
2
2
 
3
3
  from flexeval.classes.base import BaseModel
4
4
  from flexeval.classes.dataset import Dataset
5
- from flexeval.classes.eval_set_run import EvalSetRun
6
5
  from flexeval.classes.message import Message
7
6
  from flexeval.classes.thread import Thread
8
7
  from flexeval.classes.turn import Turn
@@ -16,7 +15,6 @@ class ToolCall(BaseModel):
16
15
 
17
16
  id = pw.IntegerField(primary_key=True)
18
17
 
19
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="toolcalls")
20
18
  dataset = pw.ForeignKeyField(Dataset, backref="toolcalls")
21
19
  thread = pw.ForeignKeyField(Thread, backref="toolcalls")
22
20
  message = pw.ForeignKeyField(Message, backref="toolcalls")
flexeval/classes/turn.py CHANGED
@@ -7,7 +7,6 @@ from playhouse.shortcuts import model_to_dict
7
7
 
8
8
  from flexeval.classes.base import BaseModel
9
9
  from flexeval.classes.dataset import Dataset
10
- from flexeval.classes.eval_set_run import EvalSetRun
11
10
  from flexeval.classes.thread import Thread
12
11
  from flexeval.configuration import completion_functions
13
12
 
@@ -22,7 +21,6 @@ class Turn(BaseModel):
22
21
 
23
22
  id = pw.IntegerField(primary_key=True)
24
23
 
25
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="turns")
26
24
  dataset = pw.ForeignKeyField(Dataset, backref="turns")
27
25
  thread = pw.ForeignKeyField(Thread, backref="turns")
28
26
  index_in_thread = pw.IntegerField()
@@ -32,10 +30,13 @@ class Turn(BaseModel):
32
30
  super().__init__(**kwargs)
33
31
  self.metrics_to_evaluate = []
34
32
 
35
- def get_completion(self):
33
+ def get_completion(self, completion_config: dict | None = None, evalsetrun=None):
36
34
  # only get a completion if this is the final turn - we probably don't want to branch from mid-conversation
37
35
  if self.is_final_turn_in_input:
38
- completion_config = json.loads(self.evalsetrun.completion_llm)
36
+ if completion_config is None:
37
+ raise ValueError(
38
+ "completion_config must be provided to get_completion()"
39
+ )
39
40
  completion_fn_name = completion_config.get("function_name", None)
40
41
  completion_function_kwargs = completion_config.get("kwargs", None)
41
42
 
@@ -69,7 +70,7 @@ class Turn(BaseModel):
69
70
  # - make the completion function just return content?
70
71
  # {"choices": [{"message": {"content": "hi", "role": "assistant"}}]}
71
72
  result = model_to_dict(self, exclude=[self.id])
72
- result["evalsetrun"] = self.evalsetrun
73
+ result["evalsetrun"] = evalsetrun
73
74
  result["dataset"] = self.dataset
74
75
  result["datasetrow"] = self.datasetrow
75
76
  result["turn_number"] = self.turn_number + 1
@@ -108,6 +109,7 @@ class Turn(BaseModel):
108
109
  """
109
110
  context = ""
110
111
  for message in self.messages:
112
+ # TODO why not just use message.get_context(include_system_prompt=include_system_prompt) here?
111
113
  context = message.context
112
114
  break
113
115
  context = json.loads(context)
flexeval/completions.py CHANGED
@@ -55,10 +55,15 @@ def get_completion(turn: classes.turn.Turn, completion_llm: CompletionLlm):
55
55
  return completion
56
56
 
57
57
 
58
- def get_completions(eval_run: EvalRun, evalsetrun: classes.eval_set_run.EvalSetRun):
58
+ def get_completions(
59
+ eval_run: EvalRun,
60
+ evalsetrun: classes.eval_set_run.EvalSetRun,
61
+ datasets: list[classes.dataset.Dataset],
62
+ ):
59
63
  n_workers = eval_run.config.max_workers
64
+ threads = [thread for dataset in datasets for thread in dataset.threads]
60
65
  if n_workers == 1:
61
- for thread in evalsetrun.threads:
66
+ for thread in threads:
62
67
  # select last turn in thread
63
68
  if len(thread.turns) == 0:
64
69
  continue
@@ -75,7 +80,7 @@ def get_completions(eval_run: EvalRun, evalsetrun: classes.eval_set_run.EvalSetR
75
80
  else:
76
81
  with ThreadPoolExecutor(max_workers=n_workers) as executor:
77
82
  futures: dict[Future, classes.turn.Turn] = {}
78
- for thread in evalsetrun.threads:
83
+ for thread in threads:
79
84
  if len(thread.turns) == 0:
80
85
  continue
81
86
  turn = (
@@ -113,7 +118,6 @@ def save_completion(
113
118
  new_turn = turn
114
119
  else:
115
120
  new_turn = classes.turn.Turn.create(
116
- evalsetrun=evalsetrun,
117
121
  dataset=turn.dataset,
118
122
  thread=turn.thread,
119
123
  index_in_thread=turn.index_in_thread + 1,
@@ -129,7 +133,6 @@ def save_completion(
129
133
  {"role": prev_message.role, "content": prev_message.content}
130
134
  )
131
135
  classes.message.Message.create(
132
- evalsetrun=evalsetrun,
133
136
  dataset=turn.dataset,
134
137
  thread=turn.thread,
135
138
  turn=new_turn,
@@ -14,6 +14,7 @@ from typing import Iterable, Union
14
14
  import networkx as nx
15
15
 
16
16
  from flexeval import function_types
17
+ from flexeval.classes.dataset import Dataset
17
18
  from flexeval.classes.eval_set_run import EvalSetRun
18
19
  from flexeval.classes.message import Message
19
20
  from flexeval.classes.thread import Thread
@@ -159,8 +160,8 @@ class MetricGraphBuilder:
159
160
  metric = self.metric_id_map[metric_id]
160
161
  return self.get_or_create_object_metric(dependency_metric_level, object, metric)
161
162
 
162
- def build_thread_task_graphs(self, evalsetrun: EvalSetRun) -> Iterable[nx.DiGraph]:
163
- threads = evalsetrun.threads
163
+ def build_thread_task_graphs(self, dataset: Dataset) -> Iterable[nx.DiGraph]:
164
+ threads = dataset.threads
164
165
  for thread in threads:
165
166
  yield self.build_thread_task_graph(thread)
166
167
 
@@ -208,28 +209,35 @@ class MetricGraphBuilder:
208
209
  return g
209
210
 
210
211
 
211
- def compute_metrics(evalrun: EvalRun, evalsetrun: EvalSetRun) -> list[dict]:
212
+ def compute_metrics(
213
+ evalrun: EvalRun, evalsetrun: EvalSetRun, datasets: list[Dataset]
214
+ ) -> list[dict]:
212
215
  n_workers = evalrun.config.max_workers
213
216
  raise_on_error = evalrun.config.raise_on_metric_error
214
217
  mgb = MetricGraphBuilder()
215
218
  mgb.build_metric_structures(evalsetrun)
216
- graphs = mgb.build_thread_task_graphs(evalsetrun)
217
219
  mc = MetricComputer.from_evalrun(evalrun, evalsetrun)
218
220
  metrics = []
219
- if n_workers == 1:
220
- for graph in graphs:
221
- graph_metrics = mc.process_thread_dependency_graph(graph, raise_on_error)
222
- metrics.extend(graph_metrics)
223
- else:
224
- with ThreadPoolExecutor(max_workers=n_workers) as executor:
225
- futures = []
221
+ for dataset in datasets:
222
+ graphs = mgb.build_thread_task_graphs(dataset)
223
+ if n_workers == 1:
226
224
  for graph in graphs:
227
- future = executor.submit(mc.process_thread_dependency_graph, graph)
228
- futures.append(future)
229
- for i, future in enumerate(futures):
230
- metrics.extend(future.result())
231
- if i % 100 == 0:
232
- logger.info(f"Metrics futures resulted: {i + 1} / {len(futures)}")
225
+ graph_metrics = mc.process_thread_dependency_graph(
226
+ graph, raise_on_error
227
+ )
228
+ metrics.extend(graph_metrics)
229
+ else:
230
+ with ThreadPoolExecutor(max_workers=n_workers) as executor:
231
+ futures = []
232
+ for graph in graphs:
233
+ future = executor.submit(mc.process_thread_dependency_graph, graph)
234
+ futures.append(future)
235
+ for i, future in enumerate(futures):
236
+ metrics.extend(future.result())
237
+ if i % 100 == 0:
238
+ logger.info(
239
+ f"Metrics futures resulted: {i + 1} / {len(futures)}"
240
+ )
233
241
  return metrics
234
242
 
235
243
 
@@ -296,10 +304,18 @@ class MetricComputer:
296
304
  self.rubrics: dict | None = (
297
305
  self.load_rubrics(evalsetrun) if evalsetrun is not None else None
298
306
  )
307
+ self.do_completion: bool = (
308
+ evalsetrun.do_completion if evalsetrun is not None else False
309
+ )
310
+ self.grader_llm: str | None = (
311
+ evalsetrun.grader_llm if evalsetrun is not None else None
312
+ )
299
313
 
300
- def load_rubrics(self, evalsetrun: EvalSetRun):
301
- """Set the rubrics to be used by this MetricComputer from the given EvalSetRun."""
302
- self.rubrics = json.loads(evalsetrun.rubrics)
314
+ def load_rubrics(self, evalsetrun: EvalSetRun) -> dict:
315
+ """Load and return rubrics from the given EvalSetRun."""
316
+ rubrics = json.loads(evalsetrun.rubrics)
317
+ self.rubrics = rubrics
318
+ return rubrics
303
319
 
304
320
  def process_thread_dependency_graphs(
305
321
  self, graph_list: Iterable[nx.DiGraph]
@@ -467,7 +483,6 @@ class MetricComputer:
467
483
  evaluation_type: str,
468
484
  metric_level: str,
469
485
  kwargs: dict,
470
- context_only: bool = None,
471
486
  depends_on: list = None,
472
487
  id: int = None,
473
488
  notes: str = None, # just a placeholder
@@ -477,7 +492,6 @@ class MetricComputer:
477
492
  function_name=evaluation_name,
478
493
  metric_kwargs=kwargs,
479
494
  metric_level=metric_level,
480
- context_only=context_only,
481
495
  input_object=object,
482
496
  depends_on=depends_on,
483
497
  id=id,
@@ -515,10 +529,9 @@ class MetricComputer:
515
529
  metric_level: eval_schema.MetricLevel,
516
530
  input_object: function_types.AnyFunctionObjectInput,
517
531
  metric_kwargs: dict,
518
- context_only: bool,
519
532
  ):
520
533
  function_input = function_types.get_function_input(
521
- metric_function, metric_level, input_object, context_only
534
+ metric_function, metric_level, input_object
522
535
  )
523
536
  metrics_result = metric_function(function_input, **metric_kwargs)
524
537
  return metrics_result
@@ -541,7 +554,6 @@ class MetricComputer:
541
554
  metric_kwargs: dict,
542
555
  input_object: Union[Thread, Turn, Message, ToolCall],
543
556
  metric_level: eval_schema.MetricLevel,
544
- context_only: bool,
545
557
  depends_on: list,
546
558
  id: int,
547
559
  ):
@@ -552,7 +564,7 @@ class MetricComputer:
552
564
  # Check if the function exists in any of the function namespaces
553
565
  metric_function, metric_source = self.find_function(function_name)
554
566
  metrics_result = self.invoke_function(
555
- metric_function, metric_level, input_object, metric_kwargs, context_only
567
+ metric_function, metric_level, input_object, metric_kwargs
556
568
  )
557
569
 
558
570
  base_result = {
@@ -562,7 +574,6 @@ class MetricComputer:
562
574
  "metric_level": metric_level,
563
575
  "kwargs": metric_kwargs,
564
576
  "source": metric_source, # TODO - put this back?
565
- "context_only": context_only,
566
577
  "depends_on": depends_on,
567
578
  "id": id,
568
579
  }
@@ -611,7 +622,9 @@ class MetricComputer:
611
622
  if self.rubrics is not None:
612
623
  rubrics = self.rubrics
613
624
  else:
614
- rubrics = json.loads(object.evalsetrun.rubrics)
625
+ raise ValueError(
626
+ "No rubrics loaded. Rubrics must be loaded via MetricComputer.from_evalrun() before computing rubric metrics."
627
+ )
615
628
  if rubric_name not in rubrics:
616
629
  raise ValueError(
617
630
  f"You requested a rubric called '{rubric_name}', but only these were found: {rubrics.keys()}."
@@ -643,7 +656,7 @@ class MetricComputer:
643
656
  "Your rubric should not have both {content} and {completion}. Please check the README file for more information about how to write FlexEval rubrics."
644
657
  )
645
658
 
646
- if "{completion}" in prompt and not object.evalsetrun.do_completion:
659
+ if "{completion}" in prompt and not self.do_completion:
647
660
  raise Exception(
648
661
  "Your rubric has {completion}, but in your test specification for this rubric evaluation, do_completion is not True. Please check the README file for more information about how to write FlexEval rubrics."
649
662
  )
@@ -656,7 +669,7 @@ class MetricComputer:
656
669
  )
657
670
 
658
671
  # with do_completion == True, only the completion is evaluated with or without the context.
659
- if object.evalsetrun.do_completion and "{completion}" in prompt:
672
+ if self.do_completion and "{completion}" in prompt:
660
673
  # TODO revisit this logic
661
674
  # also included object.is_completion, which only works for Message rubrics
662
675
  # but we can in principle check for a message in either a turn or a thread with is_flexeval_completion true
@@ -665,11 +678,11 @@ class MetricComputer:
665
678
  choice_scores = rubrics.get(rubric_name).get("choice_scores")
666
679
 
667
680
  # get rubric grader
668
- if object.evalsetrun.grader_llm is None or object.evalsetrun.grader_llm == "":
681
+ if self.grader_llm is None or self.grader_llm == "":
669
682
  raise ValueError(
670
683
  "Attempting to evaluate a rubric metric, but no grader LLM defined."
671
684
  )
672
- grader_completion_function = json.loads(object.evalsetrun.grader_llm)
685
+ grader_completion_function = json.loads(self.grader_llm)
673
686
  if grader_completion_function is None or len(grader_completion_function) == 0:
674
687
  raise ValueError(
675
688
  "Attempting to evaluate a rubric metric, but no grader LLM defined."