PyPI - python-flexeval - Versions diffs - 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl - Mend

python-flexeval 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

flexeval/__about__.py +1 -1
flexeval/classes/dataset.py +12 -72
flexeval/classes/eval_set_run.py +18 -7
flexeval/classes/jsonview.py +10 -5
flexeval/classes/message.py +11 -5
flexeval/classes/metric.py +0 -8
flexeval/classes/thread.py +0 -2
flexeval/classes/tool_call.py +0 -2
flexeval/classes/turn.py +7 -5
flexeval/completions.py +8 -5
flexeval/compute_metrics.py +45 -32
flexeval/configuration/evals.yaml +2 -25
flexeval/data_loader.py +219 -317
flexeval/db_utils.py +11 -2
flexeval/dependency_graph.py +3 -3
flexeval/eval_schema.json +0 -18
flexeval/function_types.py +2 -13
flexeval/metrics/save.py +12 -8
flexeval/run_utils.py +163 -17
flexeval/runner.py +6 -14
flexeval/schema/config_schema.py +12 -0
flexeval/schema/eval_schema.py +3 -0
flexeval/schema/evalrun_schema.py +41 -10
{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.1.dist-info}/METADATA +3 -3
python_flexeval-0.4.1.dist-info/RECORD +49 -0
{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.1.dist-info}/WHEEL +1 -1
python_flexeval-0.3.0.dist-info/RECORD +0 -49
{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.1.dist-info}/entry_points.txt +0 -0
{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.1.dist-info}/licenses/LICENSE +0 -0

flexeval/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.3.0"
1	+ __version__ = "0.4.1"

flexeval/classes/dataset.py CHANGED Viewed

@@ -1,82 +1,22 @@
-import os.path
+import logging
+from datetime import datetime
 import peewee as pw
 from flexeval.classes.base import BaseModel
-from flexeval.classes.eval_set_run import EvalSetRun
+from flexeval.classes.jsonview import JsonView
+logger = logging.getLogger(__name__)
 class Dataset(BaseModel):
     """Holds a dataset, e.g. a jsonl file"""
     id = pw.IntegerField(primary_key=True)
-    evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="datasets")
-    filename = pw.TextField()
-    datatype = pw.TextField(null=True)
-    contents = pw.TextField(null=True)  # raw contents
-    max_n_conversation_threads = pw.IntegerField(null=True)
-    nb_evaluations_per_thread = pw.IntegerField(null=True, default=1)
-    # In line with LangGraph expectations, we assume n=1 for all outputs of LLMs
-    # However, each node can append list with length 2+ to the message queue
-    # Thread - conversation
-    # Turn - adjacent messages from the same agent
-    # Message -
-    #   role - human or ai, user or assistant
-    #   text - empty string or non-empty
-    #   list of 0+ Tool Calls
-    #   post-processing - add a turn_id
-    #   additional_kwargs JSON
-    # ToolUse
-    #   foreign keys to "invoker" message and "function output" message
-    #   message that invoked it - foreign key
-    #   parameters of the input
-    #   result of tool call
-    # Metric
-    #   granularity type
-    #   foreign key to the object
-    # **each entry from LangGraph is a LIST of completions - usually with length 1
-    # Completion - has one bit of text content, and 0+ ToolCalls
-    # ToolCall - tool call (and response!) associated with the completion
-    #   completion_id
-    #   message_id
-    #   turn_id
-    def load_data(self):
-        from flexeval import (
-            data_loader,
-        )  # Local import as this needs to happen after the module is fully loaded
-        if self.filename.endswith(".jsonl"):
-            self.datatype = "json"
-            data_loader.load_jsonl(
-                dataset=self,
-                filename=self.filename,
-                max_n_conversation_threads=self.max_n_conversation_threads,
-                nb_evaluations_per_thread=self.nb_evaluations_per_thread,
-            )
-        elif is_sqlite_file(self.filename):
-            self.datatype = "sqlite"
-            data_loader.load_langgraph_sqlite(
-                dataset=self,
-                filename=self.filename,
-                max_n_conversation_threads=self.max_n_conversation_threads,
-                nb_evaluations_per_thread=self.nb_evaluations_per_thread,
-            )
-        else:
-            raise ValueError(
-                f"Unsupported format '{os.path.splitext(self.filename)[-1]}'. Each Data File must be either a jsonl or sqlite file. You provided the file: '{self.filename}'"
-            )
-def is_sqlite_file(filepath):
-    # Open the file in binary mode
-    with open(filepath, "rb") as file:
-        header = file.read(16)
-    # Check if the header matches the SQLite format header
-    return header == b"SQLite format 3\x00"
+    timestamp = pw.DateTimeField(default=datetime.now)
+    datasource_type = pw.TextField(null=False)
+    name = pw.TextField(default=None, null=True)
+    notes = pw.TextField(default=None, null=True)
+    is_loaded = pw.BooleanField(default=False)
+    metadata = pw.TextField(default="{}", null=False)
+    metadata_dict = JsonView("metadata")

flexeval/classes/eval_set_run.py CHANGED Viewed

@@ -1,9 +1,9 @@
-import json
 from datetime import datetime
 import peewee as pw
 from flexeval.classes.base import BaseModel
+from flexeval.classes.dataset import Dataset
 class EvalSetRun(BaseModel):
@@ -12,7 +12,6 @@ class EvalSetRun(BaseModel):
     id = pw.IntegerField(primary_key=True)
     name = pw.CharField(null=True)
     notes = pw.TextField(null=True)
-    dataset_files = pw.TextField()  # JSON string
     metrics = pw.TextField()
     metrics_graph_ordered_list = pw.TextField()
     do_completion = pw.BooleanField()
@@ -25,8 +24,20 @@ class EvalSetRun(BaseModel):
         default=datetime.now
     )  # Automatically set to current date and time
-    def get_datasets(self) -> list[str]:
-        # TODO Turn these into DataSource instances instead, returning list[DataSource]
-        temp = json.loads(self.dataset_files)
-        assert isinstance(temp, list), "The `data` entry in evals.yaml must be a list."
-        return temp
+    @property
+    def dataset_list(self) -> list[Dataset]:
+        """Returns the actual Dataset objects linked to this EvalSetRun via the join table."""
+        return list(
+            Dataset.select()
+            .join(EvalSetRunDatasets)
+            .where(EvalSetRunDatasets.evalsetrun == self)
+        )
+class EvalSetRunDatasets(BaseModel):
+    """Datasets used by an EvalSetRun."""
+    id = pw.IntegerField(primary_key=True)
+    timestamp = pw.DateTimeField(default=datetime.now)
+    evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="dataset_links")
+    dataset = pw.ForeignKeyField(Dataset, backref="evalsetrun_links")

flexeval/classes/jsonview.py CHANGED Viewed

@@ -23,7 +23,7 @@ class JsonViewDict(UserDict):
     def _sync_to_model(self):
         """Sync the current data back to the model field."""
-        json_str = self.json_loads_fn(self.data)
+        json_str = self.json_dumps_fn(self.data)
         setattr(self.model_instance, self.text_field_attr_name, json_str)
     # Override mutating methods to trigger sync
@@ -58,6 +58,14 @@ class JsonViewDict(UserDict):
         super().update(*args, **kwargs)
         self._sync_to_model()
+    def refresh_from_model(self):
+        """If the text attribute has been mutated in the model, this method brings the view back in sync.
+        If you're going to use the JsonView, avoid mutating the text attribute directly.
+        """
+        text_value = getattr(self.model_instance, self.text_field_attr_name)
+        self.update(self.json_loads_fn(text_value))
 class JsonView:
     """Descriptor that provides dict-like access to a JSON text field.
@@ -66,9 +74,6 @@ class JsonView:
     class SomeModel(pw.Model):
         some_field = pw.TextField(default="{}")
         some_field_dict = JsonView(text_field_attr_name="some_field")
-    m = SomeModel()
-    m.some_field_dict["chosen_mistake"] = "whatever"
     """
     def __init__(self, text_field_attr_name):
@@ -79,7 +84,7 @@ class JsonView:
         """Called when the descriptor is assigned to a class attribute."""
         self.attr_name = f"_{name}_dict"
-    def __get__(self, instance, owner):
+    def __get__(self, instance, owner) -> JsonViewDict:
         if instance is None:
             return self

flexeval/classes/message.py CHANGED Viewed

@@ -7,7 +7,6 @@ from playhouse.shortcuts import model_to_dict
 from flexeval.classes.base import BaseModel
 from flexeval.classes.dataset import Dataset
-from flexeval.classes.eval_set_run import EvalSetRun
 from flexeval.classes.thread import Thread
 from flexeval.classes.turn import Turn
 from flexeval.classes.jsonview import JsonView
@@ -24,7 +23,6 @@ class Message(BaseModel):
     id = pw.IntegerField(primary_key=True)
-    evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="messages")
     dataset = pw.ForeignKeyField(Dataset, backref="messages")
     thread = pw.ForeignKeyField(Thread, backref="messages")
     index_in_thread = pw.IntegerField()
@@ -71,10 +69,18 @@ class Message(BaseModel):
         super().__init__(**kwargs)
         self.metrics_to_evaluate = []
-    def get_completion(self, include_system_prompt=False):
+    def get_completion(
+        self,
+        include_system_prompt=False,
+        completion_config: dict | None = None,
+        evalsetrun=None,
+    ):
         # only get a completion if this is the final turn - we probably don't want to branch from mid-conversation
         if self.is_final_turn_in_input:
-            completion_config = json.loads(self.evalsetrun.completion_llm)
+            if completion_config is None:
+                raise ValueError(
+                    "completion_config must be provided to get_completion()"
+                )
             completion_fn_name = completion_config.get("function_name", None)
             completion_function_kwargs = completion_config.get("kwargs", None)
@@ -104,7 +110,7 @@ class Message(BaseModel):
             # which generally means it'll have a structure like this
             # {"choices": [{"message": {"content": "hi", "role": "assistant"}}]}
             result = model_to_dict(self, exclude=[self.id])
-            result["evalsetrun"] = self.evalsetrun
+            result["evalsetrun"] = evalsetrun
             result["dataset"] = self.dataset
             result["datasetrow"] = self.datasetrow
             result["turn_number"] = self.turn_number + 1

flexeval/classes/metric.py CHANGED Viewed

@@ -37,14 +37,6 @@ class Metric(BaseModel):
         null=True
     )  # necessary if rubric result is INVALID or e.g. latency doesn't apply to the very first message
     kwargs = pw.TextField()
-    # context_only allows us to create another kind of dependency
-    # where we can quantify something about the previous conversation
-    # and then use that quantity in a downstream analysis
-    # e.g. 'would a plot be pedagogically appropriate here' is really a question about the PAST of the conversation
-    #      NOTE: but we have gotten rid of context_only for rubrics, where only {context} is used so technically here 'context_only' is False
-    # or 'was the conversation ever flagged by the moderation api' would be a question about the previous turns that might
-    #    allow to have better context for the properties of this turn
-    # context_only = pw.BooleanField(default=False)
     source = pw.TextField()  # TODO - make another table for this? But maybe not, because this also contains filled-in rubrics
     depends_on = pw.TextField()
     rubric_prompt = pw.TextField(null=True)

flexeval/classes/thread.py CHANGED Viewed

@@ -2,7 +2,6 @@ import peewee as pw
 from flexeval.classes.base import BaseModel
 from flexeval.classes.dataset import Dataset
-from flexeval.classes.eval_set_run import EvalSetRun
 from flexeval.classes.jsonview import JsonView
@@ -13,7 +12,6 @@ class Thread(BaseModel):
     id = pw.IntegerField(primary_key=True)
     dataset = pw.ForeignKeyField(Dataset, backref="threads")
-    evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="threads")
     langgraph_thread_id = pw.TextField(null=True)
     eval_run_thread_id = pw.TextField(null=True)

flexeval/classes/tool_call.py CHANGED Viewed

@@ -2,7 +2,6 @@ import peewee as pw
 from flexeval.classes.base import BaseModel
 from flexeval.classes.dataset import Dataset
-from flexeval.classes.eval_set_run import EvalSetRun
 from flexeval.classes.message import Message
 from flexeval.classes.thread import Thread
 from flexeval.classes.turn import Turn
@@ -16,7 +15,6 @@ class ToolCall(BaseModel):
     id = pw.IntegerField(primary_key=True)
-    evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="toolcalls")
     dataset = pw.ForeignKeyField(Dataset, backref="toolcalls")
     thread = pw.ForeignKeyField(Thread, backref="toolcalls")
     message = pw.ForeignKeyField(Message, backref="toolcalls")

flexeval/classes/turn.py CHANGED Viewed

@@ -7,7 +7,6 @@ from playhouse.shortcuts import model_to_dict
 from flexeval.classes.base import BaseModel
 from flexeval.classes.dataset import Dataset
-from flexeval.classes.eval_set_run import EvalSetRun
 from flexeval.classes.thread import Thread
 from flexeval.configuration import completion_functions
@@ -22,7 +21,6 @@ class Turn(BaseModel):
     id = pw.IntegerField(primary_key=True)
-    evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="turns")
     dataset = pw.ForeignKeyField(Dataset, backref="turns")
     thread = pw.ForeignKeyField(Thread, backref="turns")
     index_in_thread = pw.IntegerField()
@@ -32,10 +30,13 @@ class Turn(BaseModel):
         super().__init__(**kwargs)
         self.metrics_to_evaluate = []
-    def get_completion(self):
+    def get_completion(self, completion_config: dict | None = None, evalsetrun=None):
         # only get a completion if this is the final turn - we probably don't want to branch from mid-conversation
         if self.is_final_turn_in_input:
-            completion_config = json.loads(self.evalsetrun.completion_llm)
+            if completion_config is None:
+                raise ValueError(
+                    "completion_config must be provided to get_completion()"
+                )
             completion_fn_name = completion_config.get("function_name", None)
             completion_function_kwargs = completion_config.get("kwargs", None)
@@ -69,7 +70,7 @@ class Turn(BaseModel):
             #       - make the completion function just return content?
             # {"choices": [{"message": {"content": "hi", "role": "assistant"}}]}
             result = model_to_dict(self, exclude=[self.id])
-            result["evalsetrun"] = self.evalsetrun
+            result["evalsetrun"] = evalsetrun
             result["dataset"] = self.dataset
             result["datasetrow"] = self.datasetrow
             result["turn_number"] = self.turn_number + 1
@@ -108,6 +109,7 @@ class Turn(BaseModel):
         """
         context = ""
         for message in self.messages:
+            # TODO why not just use message.get_context(include_system_prompt=include_system_prompt) here?
             context = message.context
             break
         context = json.loads(context)

flexeval/completions.py CHANGED Viewed

@@ -55,10 +55,15 @@ def get_completion(turn: classes.turn.Turn, completion_llm: CompletionLlm):
     return completion
-def get_completions(eval_run: EvalRun, evalsetrun: classes.eval_set_run.EvalSetRun):
+def get_completions(
+    eval_run: EvalRun,
+    evalsetrun: classes.eval_set_run.EvalSetRun,
+    datasets: list[classes.dataset.Dataset],
+):
     n_workers = eval_run.config.max_workers
+    threads = [thread for dataset in datasets for thread in dataset.threads]
     if n_workers == 1:
-        for thread in evalsetrun.threads:
+        for thread in threads:
             # select last turn in thread
             if len(thread.turns) == 0:
                 continue
@@ -75,7 +80,7 @@ def get_completions(eval_run: EvalRun, evalsetrun: classes.eval_set_run.EvalSetR
     else:
         with ThreadPoolExecutor(max_workers=n_workers) as executor:
             futures: dict[Future, classes.turn.Turn] = {}
-            for thread in evalsetrun.threads:
+            for thread in threads:
                 if len(thread.turns) == 0:
                     continue
                 turn = (
@@ -113,7 +118,6 @@ def save_completion(
         new_turn = turn
     else:
         new_turn = classes.turn.Turn.create(
-            evalsetrun=evalsetrun,
             dataset=turn.dataset,
             thread=turn.thread,
             index_in_thread=turn.index_in_thread + 1,
@@ -129,7 +133,6 @@ def save_completion(
         {"role": prev_message.role, "content": prev_message.content}
     )
     classes.message.Message.create(
-        evalsetrun=evalsetrun,
         dataset=turn.dataset,
         thread=turn.thread,
         turn=new_turn,

flexeval/compute_metrics.py CHANGED Viewed

@@ -14,6 +14,7 @@ from typing import Iterable, Union
 import networkx as nx
 from flexeval import function_types
+from flexeval.classes.dataset import Dataset
 from flexeval.classes.eval_set_run import EvalSetRun
 from flexeval.classes.message import Message
 from flexeval.classes.thread import Thread
@@ -159,8 +160,8 @@ class MetricGraphBuilder:
         metric = self.metric_id_map[metric_id]
         return self.get_or_create_object_metric(dependency_metric_level, object, metric)
-    def build_thread_task_graphs(self, evalsetrun: EvalSetRun) -> Iterable[nx.DiGraph]:
-        threads = evalsetrun.threads
+    def build_thread_task_graphs(self, dataset: Dataset) -> Iterable[nx.DiGraph]:
+        threads = dataset.threads
         for thread in threads:
             yield self.build_thread_task_graph(thread)
@@ -208,28 +209,35 @@ class MetricGraphBuilder:
         return g
-def compute_metrics(evalrun: EvalRun, evalsetrun: EvalSetRun) -> list[dict]:
+def compute_metrics(
+    evalrun: EvalRun, evalsetrun: EvalSetRun, datasets: list[Dataset]
+) -> list[dict]:
     n_workers = evalrun.config.max_workers
     raise_on_error = evalrun.config.raise_on_metric_error
     mgb = MetricGraphBuilder()
     mgb.build_metric_structures(evalsetrun)
-    graphs = mgb.build_thread_task_graphs(evalsetrun)
     mc = MetricComputer.from_evalrun(evalrun, evalsetrun)
     metrics = []
-    if n_workers == 1:
-        for graph in graphs:
-            graph_metrics = mc.process_thread_dependency_graph(graph, raise_on_error)
-            metrics.extend(graph_metrics)
-    else:
-        with ThreadPoolExecutor(max_workers=n_workers) as executor:
-            futures = []
+    for dataset in datasets:
+        graphs = mgb.build_thread_task_graphs(dataset)
+        if n_workers == 1:
             for graph in graphs:
-                future = executor.submit(mc.process_thread_dependency_graph, graph)
-                futures.append(future)
-            for i, future in enumerate(futures):
-                metrics.extend(future.result())
-                if i % 100 == 0:
-                    logger.info(f"Metrics futures resulted: {i + 1} / {len(futures)}")
+                graph_metrics = mc.process_thread_dependency_graph(
+                    graph, raise_on_error
+                )
+                metrics.extend(graph_metrics)
+        else:
+            with ThreadPoolExecutor(max_workers=n_workers) as executor:
+                futures = []
+                for graph in graphs:
+                    future = executor.submit(mc.process_thread_dependency_graph, graph)
+                    futures.append(future)
+                for i, future in enumerate(futures):
+                    metrics.extend(future.result())
+                    if i % 100 == 0:
+                        logger.info(
+                            f"Metrics futures resulted: {i + 1} / {len(futures)}"
+                        )
     return metrics
@@ -296,10 +304,18 @@ class MetricComputer:
         self.rubrics: dict | None = (
             self.load_rubrics(evalsetrun) if evalsetrun is not None else None
         )
+        self.do_completion: bool = (
+            evalsetrun.do_completion if evalsetrun is not None else False
+        )
+        self.grader_llm: str | None = (
+            evalsetrun.grader_llm if evalsetrun is not None else None
+        )
-    def load_rubrics(self, evalsetrun: EvalSetRun):
-        """Set the rubrics to be used by this MetricComputer from the given EvalSetRun."""
-        self.rubrics = json.loads(evalsetrun.rubrics)
+    def load_rubrics(self, evalsetrun: EvalSetRun) -> dict:
+        """Load and return rubrics from the given EvalSetRun."""
+        rubrics = json.loads(evalsetrun.rubrics)
+        self.rubrics = rubrics
+        return rubrics
     def process_thread_dependency_graphs(
         self, graph_list: Iterable[nx.DiGraph]
@@ -467,7 +483,6 @@ class MetricComputer:
         evaluation_type: str,
         metric_level: str,
         kwargs: dict,
-        context_only: bool = None,
         depends_on: list = None,
         id: int = None,
         notes: str = None,  # just a placeholder
@@ -477,7 +492,6 @@ class MetricComputer:
                 function_name=evaluation_name,
                 metric_kwargs=kwargs,
                 metric_level=metric_level,
-                context_only=context_only,
                 input_object=object,
                 depends_on=depends_on,
                 id=id,
@@ -515,10 +529,9 @@ class MetricComputer:
         metric_level: eval_schema.MetricLevel,
         input_object: function_types.AnyFunctionObjectInput,
         metric_kwargs: dict,
-        context_only: bool,
     ):
         function_input = function_types.get_function_input(
-            metric_function, metric_level, input_object, context_only
+            metric_function, metric_level, input_object
         )
         metrics_result = metric_function(function_input, **metric_kwargs)
         return metrics_result
@@ -541,7 +554,6 @@ class MetricComputer:
         metric_kwargs: dict,
         input_object: Union[Thread, Turn, Message, ToolCall],
         metric_level: eval_schema.MetricLevel,
-        context_only: bool,
         depends_on: list,
         id: int,
     ):
@@ -552,7 +564,7 @@ class MetricComputer:
         # Check if the function exists in any of the function namespaces
         metric_function, metric_source = self.find_function(function_name)
         metrics_result = self.invoke_function(
-            metric_function, metric_level, input_object, metric_kwargs, context_only
+            metric_function, metric_level, input_object, metric_kwargs
         )
         base_result = {
@@ -562,7 +574,6 @@ class MetricComputer:
             "metric_level": metric_level,
             "kwargs": metric_kwargs,
             "source": metric_source,  # TODO - put this back?
-            "context_only": context_only,
             "depends_on": depends_on,
             "id": id,
         }
@@ -611,7 +622,9 @@ class MetricComputer:
         if self.rubrics is not None:
             rubrics = self.rubrics
         else:
-            rubrics = json.loads(object.evalsetrun.rubrics)
+            raise ValueError(
+                "No rubrics loaded. Rubrics must be loaded via MetricComputer.from_evalrun() before computing rubric metrics."
+            )
         if rubric_name not in rubrics:
             raise ValueError(
                 f"You requested a rubric called '{rubric_name}', but only these were found: {rubrics.keys()}."
@@ -643,7 +656,7 @@ class MetricComputer:
                 "Your rubric should not have both {content} and {completion}. Please check the README file for more information about how to write FlexEval rubrics."
             )
-        if "{completion}" in prompt and not object.evalsetrun.do_completion:
+        if "{completion}" in prompt and not self.do_completion:
             raise Exception(
                 "Your rubric has {completion}, but in your test specification for this rubric evaluation, do_completion is not True. Please check the README file for more information about how to write FlexEval rubrics."
             )
@@ -656,7 +669,7 @@ class MetricComputer:
         )
         # with do_completion == True, only the completion is evaluated with or without the context.
-        if object.evalsetrun.do_completion and "{completion}" in prompt:
+        if self.do_completion and "{completion}" in prompt:
             # TODO revisit this logic
             # also included object.is_completion, which only works for Message rubrics
             # but we can in principle check for a message in either a turn or a thread with is_flexeval_completion true
@@ -665,11 +678,11 @@ class MetricComputer:
         choice_scores = rubrics.get(rubric_name).get("choice_scores")
         # get rubric grader
-        if object.evalsetrun.grader_llm is None or object.evalsetrun.grader_llm == "":
+        if self.grader_llm is None or self.grader_llm == "":
             raise ValueError(
                 "Attempting to evaluate a rubric metric, but no grader LLM defined."
             )
-        grader_completion_function = json.loads(object.evalsetrun.grader_llm)
+        grader_completion_function = json.loads(self.grader_llm)
         if grader_completion_function is None or len(grader_completion_function) == 0:
             raise ValueError(
                 "Attempting to evaluate a rubric metric, but no grader LLM defined."

python-flexeval 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

python-flexeval 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl