PyPI - python-flexeval - Versions diffs - 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

python-flexeval: 0.3.0 (py3-none-any.whl) → 0.4.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

flexeval/__about__.py +1 -1
flexeval/classes/dataset.py +12 -72
flexeval/classes/eval_set_run.py +18 -7
flexeval/classes/jsonview.py +10 -5
flexeval/classes/message.py +11 -5
flexeval/classes/metric.py +0 -8
flexeval/classes/thread.py +0 -2
flexeval/classes/tool_call.py +0 -2
flexeval/classes/turn.py +7 -5
flexeval/completions.py +8 -5
flexeval/compute_metrics.py +45 -32
flexeval/configuration/evals.yaml +2 -25
flexeval/data_loader.py +219 -317
flexeval/db_utils.py +11 -2
flexeval/dependency_graph.py +3 -3
flexeval/eval_schema.json +0 -18
flexeval/function_types.py +2 -13
flexeval/metrics/save.py +12 -8
flexeval/run_utils.py +163 -17
flexeval/runner.py +6 -14
flexeval/schema/config_schema.py +12 -0
flexeval/schema/eval_schema.py +3 -0
flexeval/schema/evalrun_schema.py +41 -10
{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/METADATA +3 -3
python_flexeval-0.4.0.dist-info/RECORD +49 -0
{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/WHEEL +1 -1
python_flexeval-0.3.0.dist-info/RECORD +0 -49
{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/entry_points.txt +0 -0
{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/licenses/LICENSE +0 -0

flexeval/dependency_graph.py CHANGED Viewed

@@ -115,9 +115,9 @@ def get_parent_metrics(all_metrics: dict, child: dict) -> tuple[list, list]:
     """metrics_graph_ordered_list will be a list of metrics in order in which they should be run
     This function takes the eval represented by "child" and finds ALL evals in "all_metrics"
-    that quality as the child's immediate parent
+    that qualify as the child's immediate parent
-    An eval can qualify as a parent by having a matching name, type, context_only
+    An eval can qualify as a parent by having a matching name, type, etc.
     At this point, we won't have enough information to decide whether the child should be run
     (since the child might have additional requirements on the output of the parent)
     but this is enough to tell us that the child should be run AFTER the parent.
@@ -145,7 +145,7 @@ def get_parent_metrics(all_metrics: dict, child: dict) -> tuple[list, list]:
                 # if the conditionals are listed in the depends_on entry but don't match...
                 # Only check conditionals that are explicitly specified (not None) in the requirement
-                conditionals = ["metric_level", "context_only", "name", "kwargs"]
+                conditionals = ["metric_level", "name", "kwargs"]
                 for conditional in conditionals:
                     if (
                         conditional in requirement

flexeval/eval_schema.json CHANGED Viewed

@@ -76,10 +76,6 @@
                             "description": "The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
                             "additionalProperties": true
                           },
-                          "context_only": {
-                            "type": "boolean",
-                            "description": "The context_only value for the dependency. If provided, used to match which evaluation this dependency is for."
-                          },
                           "last_turn_only": {
                               "type": "boolean",
                               "description": "The last_turn_only value for the dependency. If provided, used to match which evaluation this dependency is for."
@@ -108,11 +104,6 @@
                       "description": "What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
                       "default": "Turn"
                     },
-                    "context_only": {
-                      "type": "boolean",
-                      "description": "If true, only the context (that is, the previous messages) will be evaluated, not the current object. Cannot be done with only thread",
-                      "default": false
-                    },
                     "last_instance_only": {
                       "type": "boolean",
                       "description": "If true, the object will only be evaluated if it's the last instance (i.e., turn or message depending on metric_level) in an existing conversation, or if it's a new completion.",
@@ -143,11 +134,6 @@
                         "description": "What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
                         "default": "Turn"
                       },
-                      "context_only": {
-                          "type": "boolean",
-                          "description": "If true, only the context (that is, the previous messages) will be evaluated, not the current object. Cannot be done with only thread",
-                          "default": false
-                      },
                       "last_instance_only": {
                           "type": "boolean",
                           "description": "If true, the object will only be evaluated if it's the last instance (i.e., turn or message depending on metric_level) in an existing conversation, or if it's a new completion.",
@@ -174,10 +160,6 @@
                                     "description": "The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
                                     "additionalProperties": true
                                   },
-                                  "context_only": {
-                                    "type": "boolean",
-                                    "description": "The context_only value for the dependency. If provided, used to match which evaluation this dependency is for."
-                                  },
                                   "last_turn_only": {
                                       "type": "boolean",
                                       "description": "The last_turn_only value for the dependency. If provided, used to match which evaluation this dependency is for."

flexeval/function_types.py CHANGED Viewed

@@ -102,7 +102,6 @@ def get_function_input(
     metric_function: Callable,
     metric_level: eval_schema.MetricLevel,
     input_object: AnyFunctionObjectInput,
-    context_only: bool,
 ) -> AnyFunctionObjectInput | str | dict | list:
     """Coerce input_object to a type accepted by metric_function at this metric_level.
@@ -110,7 +109,6 @@ def get_function_input(
         metric_function (Callable): Function to invoke with the returned input.
         metric_level (eval_schema.MetricLevel): The metric level at which metric_function is being invoked.
         input_object (AnyFunctionObjectInput): The input_object to be coerced, or passed as-is if accepted by metric_function.
-        context_only (bool): Determines how strings and lists are converted. See schema documentation.
     Raises:
         ValueError: If the function accepts at least one declared type, but
@@ -137,22 +135,13 @@ def get_function_input(
     elif dict in accepted_parameter_types and metric_level == "ToolCall":
         return input_object.get_dict_representation()
     elif list in accepted_parameter_types and metric_level in ["Turn", "Thread"]:
-        if context_only:
-            return input_object.get_context()
-        else:
-            # this is on a single turn - pass in the parsed list
-            return input_object.get_content()
+        return input_object.get_content()
     elif str in accepted_parameter_types:
         if metric_level == "ToolCall":
             raise ValueError(
                 "Functions that accept strings can't be used for tool calls. Accept a dict (or a flexeval.classes.tool_call.ToolCall) instead."
             )
-        if context_only:
-            # join together all previous turns
-            return join_all_contents_to_string(input_object.get_context())
-        else:
-            # current turn only
-            return join_all_contents_to_string(input_object.get_content())
+        return join_all_contents_to_string(input_object.get_content())
     else:
         # the function accepts at least one declared type, but either:
         # - it's a type we don't support at all e.g. set

flexeval/metrics/save.py CHANGED Viewed

@@ -1,25 +1,30 @@
 import json
 from typing import Iterable
+from flexeval.classes.dataset import Dataset
+from flexeval.classes.eval_set_run import EvalSetRun
 from flexeval.classes.metric import Metric
-def save_metrics(metrics: Iterable[Metric]):
+def save_metrics(
+    metrics: Iterable[Metric], evalsetrun: EvalSetRun, datasets: list[Dataset]
+):
+    # Build a mapping from dataset id to dataset for quick lookup
+    dataset_by_id = {d.id: d for d in datasets}
     for metric in metrics:
         # TODO - speed this up somehow
         thread = metric.get("thread")
         if thread is None:
             thread = metric[metric["metric_level"].lower()].thread
+        # Determine the dataset from the metric's object
+        metric_object = metric[metric["metric_level"].lower()]
+        dataset = dataset_by_id.get(metric_object.dataset_id)
         Metric.create(
             message=metric.get("message", None),
             turn=metric.get("turn", None),
             toolcall=metric.get("toolcall", None),
-            evalsetrun=metric[
-                metric["metric_level"].lower()
-            ].evalsetrun,  # metric["turn"].evalsetrun,
-            dataset=metric[
-                metric["metric_level"].lower()
-            ].dataset,  # metric["turn"].dataset,
+            evalsetrun=evalsetrun,
+            dataset=dataset,
             thread=thread,
             evaluation_name=metric["evaluation_name"],
             evaluation_type=metric["evaluation_type"],
@@ -28,7 +33,6 @@ def save_metrics(metrics: Iterable[Metric]):
             metric_level=metric["metric_level"],
             kwargs=metric["kwargs"],
             depends_on=json.dumps(metric["depends_on"]),
-            context_only=metric.get("context_only", False),
             source=metric["source"],
             rubric_prompt=metric.get("rubric_prompt", None),
             rubric_completion=metric.get("rubric_completion", None),

flexeval/run_utils.py CHANGED Viewed

@@ -6,7 +6,9 @@ import logging
 from flexeval import rubric
 from flexeval.classes.dataset import Dataset
 from flexeval.classes.eval_runner import EvalRunner
-from flexeval.classes.eval_set_run import EvalSetRun
+from flexeval.classes.eval_set_run import EvalSetRun, EvalSetRunDatasets
+from flexeval.schema import evalrun_schema
+from flexeval import data_loader
 logger = logging.getLogger(__name__)
@@ -16,17 +18,11 @@ def build_eval_set_run(runner: EvalRunner) -> EvalSetRun:
     # TODO this code uses a model_name that does not appear in the Eval schema; should look into this
     model_name = json.dumps(None)
-    # model_name = json.dumps(
-    #        runner.eval.get("completion_llm", {}).get("model_name", None)
-    #    )
     evalsetrun = EvalSetRun.create(
         name=runner.evalrun.eval.name,
         notes=runner.evalrun.eval.notes,
         metrics=runner.evalrun.eval.metrics.model_dump_json(),
         metrics_graph_ordered_list=json.dumps(runner.metrics_graph_ordered_list),
-        dataset_files=json.dumps(
-            [str(data_source.path) for data_source in runner.evalrun.data_sources]
-        ),
         do_completion=runner.evalrun.eval.do_completion,
         completion_llm=(
             runner.evalrun.eval.completion_llm.model_dump_json()
@@ -51,15 +47,165 @@ def build_eval_set_run(runner: EvalRunner) -> EvalSetRun:
     return evalsetrun
-def build_datasets(runner: EvalRunner, evalsetrun: EvalSetRun):
-    for filename in evalsetrun.get_datasets():
-        # these will automatically be saved as a property of evalsetrun
-        Dataset.create(
-            evalsetrun=evalsetrun,
-            filename=filename,
-            max_n_conversation_threads=runner.evalrun.config.max_n_conversation_threads,
-            nb_evaluations_per_thread=runner.evalrun.config.nb_evaluations_per_thread,
+def find_dataset_by_name(name: str) -> Dataset | None:
+    """Return the loaded Dataset with this name, or None if no such dataset exists.
+    If a Dataset with this name exists but is not marked is_loaded (the remnant
+    of a crashed prior load), it is treated as stale: cleaned up via
+    :func:`_cleanup_stale_dataset` and None is returned, so the caller can
+    proceed as if no dataset existed.
+    Raises:
+        ValueError: If more than one Dataset has this name, or if a stale
+            unloaded Dataset has derived rows (metrics or eval-run links) that
+            suggest a genuine integrity problem — see _cleanup_stale_dataset.
+    """
+    # LIMIT 2: we only need to know 0, 1, or >1
+    results = list(Dataset.select().where(Dataset.name == name).limit(2))
+    if len(results) == 0:
+        return None
+    if len(results) > 1:
+        raise ValueError(f"Multiple datasets with name '{name}'.")
+    dataset = results[0]
+    if not dataset.is_loaded:
+        _cleanup_stale_dataset(dataset)
+        return None
+    return dataset
+def _cleanup_stale_dataset(dataset: Dataset) -> None:
+    """Delete a partially-loaded Dataset and its child rows.
+    A Dataset with ``is_loaded=False`` is the remnant of a prior load that
+    crashed between the Dataset row being committed and the final
+    ``is_loaded=True`` save — its Thread/Turn/Message/ToolCall rows (if any)
+    are partial and unusable.
+    Derived rows (Metric, EvalSetRunDatasets) should never exist for an
+    unloaded Dataset — they're only created after a successful load. If they
+    do, something bypassed the normal flow and we refuse to touch it.
+    """
+    if dataset.metrics_list.exists() or dataset.evalsetrun_links.exists():
+        raise ValueError(
+            f"Dataset '{dataset.name}' (ID={dataset.id}) has is_loaded=False but "
+            "has metrics or eval-run links — refusing to clean up (possible integrity error)."
         )
-        runner.logger.info(
-            f"Created dataset from '{filename}'. Max number of conversation threads: '{runner.evalrun.config.max_n_conversation_threads}' - Nb of evaluations per thread: '{runner.evalrun.config.nb_evaluations_per_thread}'"
+    counts = {
+        "threads": dataset.threads.count(),
+        "turns": dataset.turns.count(),
+        "messages": dataset.messages.count(),
+        "toolcalls": dataset.toolcalls.count(),
+    }
+    logger.warning(
+        f"Dropping unloaded dataset '{dataset.name}' (ID={dataset.id}); "
+        f"partial rows from a prior failed load: {counts}. Reloading from scratch."
+    )
+    dataset.delete_instance(recursive=True)
+def create_dataset(data_source: evalrun_schema.DataSource) -> Dataset:
+    dataset = Dataset.create(
+        datasource_type=type(data_source).__name__,
+        name=data_source.name,
+        notes=data_source.notes,
+    )
+    return dataset
+def load_datasets(
+    evalrun: evalrun_schema.EvalRun,
+) -> list[Dataset]:
+    datasets = []
+    config = evalrun.config
+    for data_source in evalrun.data_sources:
+        datasource_type = type(data_source).__name__
+        # Auto-name unnamed IterableDataSources so same-instance reuse works
+        if (
+            isinstance(data_source, evalrun_schema.IterableDataSource)
+            and not data_source.name
+        ):
+            data_source.name = f"_iterable_{id(data_source)}"
+        # 1. Validate naming constraints
+        if config.raise_on_unnamed_dataset and (
+            data_source.name is None or data_source.name.strip() == ""
+        ):
+            raise ValueError(
+                f"Configuration requires named datasets, but a {datasource_type} was unnamed."
+            )
+        # 2. Look up existing dataset by name (if named)
+        existing_dataset = None
+        if data_source.name:
+            existing_dataset = find_dataset_by_name(data_source.name)
+        # 3. Dispatch by DataSource type
+        if isinstance(data_source, evalrun_schema.NamedDataSource):
+            # NamedDataSource MUST match an existing dataset
+            if existing_dataset is None:
+                raise ValueError(
+                    f"NamedDataSource requires an existing dataset with name '{data_source.name}', but none was found."
+                )
+            dataset = existing_dataset
+        elif isinstance(
+            data_source,
+            (evalrun_schema.FileDataSource, evalrun_schema.IterableDataSource),
+        ):
+            # Reuse if configured and existing dataset matches (checked first, takes priority)
+            if config.reuse_dataset_by_name and existing_dataset is not None:
+                if existing_dataset.datasource_type != datasource_type:
+                    logger.warning(
+                        f"Reusing dataset '{existing_dataset.name}' (ID={existing_dataset.id}) "
+                        f"but datasource type differs: existing={existing_dataset.datasource_type}, new={datasource_type}."
+                    )
+                logger.info(
+                    f"Reusing existing dataset '{existing_dataset.name}' (ID={existing_dataset.id})."
+                )
+                dataset = existing_dataset
+            else:
+                # Check for duplicate name conflict (only when not reusing)
+                if (
+                    config.raise_on_duplicate_dataset_name
+                    and existing_dataset is not None
+                ):
+                    raise ValueError(
+                        f"Configuration requires unique dataset names, but '{data_source.name}' already exists (ID={existing_dataset.id})."
+                    )
+                # Create and load new dataset
+                dataset = create_dataset(data_source)
+                if isinstance(data_source, evalrun_schema.IterableDataSource):
+                    data_loader.load_iterable(dataset, data_source.contents)
+                elif isinstance(data_source, evalrun_schema.FileDataSource):
+                    data_loader.load_file(
+                        dataset,
+                        data_source,
+                        max_n_conversation_threads=config.max_n_conversation_threads,
+                        nb_evaluations_per_thread=config.nb_evaluations_per_thread,
+                    )
+                    dataset.metadata_dict["imported_path"] = str(data_source.path)
+                    dataset.metadata_dict["imported_format"] = data_source.format.value
+                dataset.is_loaded = True
+                dataset.save()
+        else:
+            raise ValueError(f"Unsupported DataSource type: {datasource_type}")
+        datasets.append(dataset)
+    return datasets
+def set_datasets_for_evalsetrun(datasets: list[Dataset], evalsetrun: EvalSetRun):
+    for dataset in datasets:
+        EvalSetRunDatasets.create(
+            evalsetrun=evalsetrun,
+            dataset=dataset,
         )
+def build_evalsetrun_datasets(
+    evalrun: evalrun_schema.EvalRun, evalsetrun: EvalSetRun
+) -> list[Dataset]:
+    datasets = load_datasets(evalrun)
+    set_datasets_for_evalsetrun(datasets, evalsetrun)
+    return datasets

flexeval/runner.py CHANGED Viewed

@@ -86,26 +86,18 @@ def run(eval_run: EvalRun) -> EvalRunner:
         rd.seed(rd_seed)
         runner.logger.info(f"Set random seed to '{rd_seed}'.")
-        run_utils.build_datasets(runner, evalsetrun)
-    except Exception:
-        runner.logger.exception(
-            "An error occurred creating dataset metadata.", exc_info=True
-        )
-    try:
-        runner.logger.info("Parsing data files")
-        for dataset in evalsetrun.datasets:
-            runner.logger.debug(f"Loading data from '{dataset.filename}'.")
-            dataset.load_data()
+        datasets = run_utils.build_evalsetrun_datasets(runner.evalrun, evalsetrun)
     except Exception:
         runner.logger.exception("An error occurred loading data.", exc_info=True)
+        runner.shutdown_logging()
+        raise
     # Do completions, if necessary
     try:
         if evalsetrun.do_completion:
             # We do this by creating new turns
             runner.logger.info("Generating completions")
-            completions.get_completions(eval_run, evalsetrun)
+            completions.get_completions(eval_run, evalsetrun, datasets)
     except Exception:
         runner.logger.exception(
             "An error occurred generating completions.", exc_info=True
@@ -118,9 +110,9 @@ def run(eval_run: EvalRun) -> EvalRunner:
     #################  Compute Metrics  ###################
     #######################################################
     try:
-        metrics = compute_metrics.compute_metrics(eval_run, evalsetrun)
+        metrics = compute_metrics.compute_metrics(eval_run, evalsetrun, datasets)
         runner.logger.info(f"Saving '{len(metrics)}' metrics to database.")
-        flexeval.metrics.save.save_metrics(metrics)
+        flexeval.metrics.save.save_metrics(metrics, evalsetrun, datasets)
     except Exception:
         runner.logger.exception("An error occurred computing metrics.", exc_info=True)
         if eval_run.config.raise_on_metric_error:

flexeval/schema/config_schema.py CHANGED Viewed

@@ -44,3 +44,15 @@ class Config(BaseModel):
         False,
         description="If False (default), no exception will be thrown if a metric function raises an exception.",
     )
+    raise_on_duplicate_dataset_name: bool = Field(
+        False,
+        description="If True, throw an exception if two datasets would be created with the same name. Ignored when reuse_dataset_by_name is True.",
+    )
+    raise_on_unnamed_dataset: bool = Field(
+        False,
+        description="If True, throw an exception if any dataset is unnamed.",
+    )
+    reuse_dataset_by_name: bool = Field(
+        True,
+        description="If True (default), reuse a previously loaded dataset with the same name instead of creating a new one. This avoids redundant data loading and prevents iterator-based data sources from being consumed twice.",
+    )

flexeval/schema/eval_schema.py CHANGED Viewed

@@ -60,6 +60,9 @@ class DependsOnItem(BaseModel):
 class MetricItem(BaseModel):
     "Defines a metric."
+    class Config:
+        extra = "forbid"
     name: str = Field(
         ...,
         description="The function to call or name of rubric to use to compute this metric.",

flexeval/schema/evalrun_schema.py CHANGED Viewed

@@ -1,39 +1,70 @@
 """The top-level :class:`~flexeval.schema.evalrun_schema.EvalRun` schema and associated sub-schema."""
+import enum
 from pathlib import Path
-from typing import Annotated, Callable, Iterable, Literal
+from typing import Annotated, Callable, Iterable, Literal, Union
 from annotated_types import Len
-from pydantic import BaseModel, Field, FilePath
+from pydantic import BaseModel, Discriminator, Field, FilePath, Tag
 from flexeval.configuration import function_metrics
 from flexeval.schema import config_schema, eval_schema, rubric_schema, schema_utils
 class DataSource(BaseModel):
-    # TODO support more generic DataSource interface
-    # for now, we need to use FileDataSource because we path the JSONL paths along
-    name: str | None = Field(None, description="")
-    notes: str | None = Field(None, description="")
+    """Represents a source of data that can be used in evaluations."""
+    name: str | None = Field(
+        None, description="Used as metadata. No uniqueness requirement."
+    )
+    notes: str | None = Field(
+        None, description="Used as metadata; put whatever you want here."
+    )
+class NamedDataSource(DataSource):
+    """Look up a previously loaded DataSource by name. Must have a unique name."""
+    type: Literal["named"] = "named"
+    name: str = Field(description="The name to match on.")
 class IterableDataSource(DataSource):
-    """Not yet implemented."""
+    """Iterable of data items."""
+    type: Literal["iterable"] = "iterable"
     contents: Iterable = Field(
         default_factory=list,
-        description="Iterable of data items, presumably in the jsonl format (for now).",
+        description="Iterable of data items. For now, each item must be a dictionary with role and content keys.",
     )
+class FileFormatEnum(str, enum.Enum):
+    jsonl = "jsonl"
+    langgraph_sqlite = "langgraph_sqlite"
 class FileDataSource(DataSource):
     """File to be used as a data source."""
+    type: Literal["file"] = "file"
     # TODO in the future, we could use cloudpathlib to support cloud paths
     path: FilePath = Field(
         description="Absolute or relative path to data file. Each file must be in jsonl format, with one conversation per line."
     )
-    format: Literal["jsonl"] = Field("jsonl", description="Format of the data file.")
+    format: FileFormatEnum = Field(
+        FileFormatEnum.jsonl, description="Format of the data file. Default: JSONL"
+    )
+DataSourceType = Annotated[
+    Union[
+        Annotated[NamedDataSource, Tag("named")],
+        Annotated[FileDataSource, Tag("file")],
+        Annotated[IterableDataSource, Tag("iterable")],
+    ],
+    Discriminator("type"),
+]
 class FunctionsCollection(BaseModel):
@@ -68,7 +99,7 @@ class EvalRun(BaseModel):
     Read more in the :ref:`user_guide`."""
-    data_sources: Annotated[list[FileDataSource], Len(min_length=1)] = Field(
+    data_sources: Annotated[list[DataSourceType], Len(min_length=1)] = Field(
         description="List of data sources.",
     )
     database_path: Path = Field(

{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: python-flexeval
-Version: 0.3.0
+Version: 0.4.0
 Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
 Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
 Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
@@ -21,8 +21,8 @@ Requires-Dist: flatten-json>=0.1.14
 Requires-Dist: jsonschema>=4.23.0
 Requires-Dist: langchain-openai>=0.3.8
 Requires-Dist: langchain>=0.3.20
-Requires-Dist: langgraph-checkpoint-sqlite>=2.0.6
-Requires-Dist: langgraph>=0.3.6
+Requires-Dist: langgraph-checkpoint-sqlite>=3.0.0
+Requires-Dist: langgraph>=1.0.0
 Requires-Dist: litellm>=1.74.3
 Requires-Dist: msgpack>=1.1.0
 Requires-Dist: networkx>=3.4.2

python_flexeval-0.4.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,49 @@
+flexeval/__about__.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
+flexeval/__init__.py,sha256=UXI_xdSxnGAK2plDODBbPF3df-N7E9YJ418QHK7XN-Q,391
+flexeval/__main__.py,sha256=c9NQqsea3e-_6b736gBeIO3O_zdXQ1wtY3-Scj5NiPg,126
+flexeval/cli.py,sha256=RwtRk121OivbLQyYpYxJ7PugPIYQ8J4qXHFN2SxxPy4,2985
+flexeval/completions.py,sha256=8PwpWXawARiSngeE2bRzTRXmPyXmxUjPKNFv4zCuAzE,5731
+flexeval/compute_metrics.py,sha256=SNhPpe5ol7Cqr2kjaBdeTIWIYqVlGjd9ZVDl9Qq90y0,37636
+flexeval/config.yaml,sha256=dpkFdW0rKf7StGoVeIGaCNw0n0yOfYWig0xmIfsDdbg,530
+flexeval/data_loader.py,sha256=jptI0tG2YYk40xNYiZzfSqcqmYzw9pIBt_rFtpw3T4o,17099
+flexeval/db_utils.py,sha256=xz97uZbUMQaTyGoR-7lKrMDs8SGdHy09SCvfCkxB36A,1687
+flexeval/dependency_graph.py,sha256=dUQp0WQ9G2FskorUMLYOKFQ9_JwIrMR_DpVrqh4n0xg,10515
+flexeval/eval_schema.json,sha256=pAS3vPLBEyH3Yjglos6aB0aNMTEUFbV-3Rf6wuSVtR4,11881
+flexeval/function_types.py,sha256=rz8AcsHJOkFfAEEocN3HX5EgEh70Oze5dSlNMdaihVU,6420
+flexeval/helpers.py,sha256=gX-6Hx4_wOiqbfY8c8_kL3XbkdV8mpEjPmaAe44lOSk,1605
+flexeval/log_utils.py,sha256=E3RloPQZbtd8sEIg7mfN5fAku-TeNGqWy03SmwRllIE,923
+flexeval/rubric.py,sha256=UwtJOxIxFJcQVrDXXuCA3tF_FFTcvLPqo2F9lq8gPcM,2167
+flexeval/run_utils.py,sha256=z9ISQlthcLUUsGiIyaGHI1IwICBy_JLN_Efg4TNv8Mk,8536
+flexeval/runner.py,sha256=RuQYQgafD0p4qlVK-IxDRKBJPic40YJItrJg8-M9Shw,4110
+flexeval/classes/__init__.py,sha256=fywDMYX8W-nXFKRXolzn-RWd_7tiJr6FlouQJvYSoyE,347
+flexeval/classes/base.py,sha256=xxkTa8joPe39CFwveeTPW56LW-x7rsi5oBAIxrvM5iI,944
+flexeval/classes/dataset.py,sha256=10t4_1Jyg9rYe00VqOYt_biQYnSZrKbJ3nEIRMCF4O8,656
+flexeval/classes/eval_runner.py,sha256=ZvCpyaD7lorDK_mYJSZqQbvI6FfLbIWRFHNarWTAMQU,6270
+flexeval/classes/eval_set_run.py,sha256=n15zMu-KANEDc2K3sqs-KEI12bWpkhSrF0EkEAiBPV4,1449
+flexeval/classes/jsonview.py,sha256=9HQfEY7BH9D58EnR4N9R5oMQsCMjJxsMcPHdzOBLj2w,3773
+flexeval/classes/message.py,sha256=fiW0JhXKt5IiLw7zA4XVKjpY1rObVGvoBtUTXjOXWhs,7741
+flexeval/classes/metric.py,sha256=yXwRx8ECsEYXKg24r0Y0e8B81XGZma1xOcYp4Zi86pM,2109
+flexeval/classes/thread.py,sha256=3gwiLwe3xP0atzsyCG3SKd2G3QtY21vk1Gif4p9ZwI8,2802
+flexeval/classes/tool_call.py,sha256=qBWTAjEKl35Za4BU-sVRPuTxkgVPRTcefQBynUjGEqI,1626
+flexeval/classes/turn.py,sha256=eN_8mPDJa5x4bGbuiDrEPFbGE6Cs9F4vGoJO17ZaSMI,8771
+flexeval/configuration/__init__.py,sha256=wP_gpYyaEp5DxCSH8-4KHchH07JMZZOk8eCFMfd5LBw,75
+flexeval/configuration/completion_functions.py,sha256=-N0iFAfcYcm35S78M3ES4MBkLXpDeEfy2Qq1ORHGBXE,7491
+flexeval/configuration/evals.yaml,sha256=2cApBbwSQr3C4pil0yfZJRkeWviVwaHH13tLmZNoRaI,21924
+flexeval/configuration/function_metrics.py,sha256=SGCxCAfG5NfKop-d3_uJgF83nPrlfHAhd-TU0GpEPFY,22427
+flexeval/configuration/rubric_metrics.yaml,sha256=JfE6gPj4LtM2v0b5-Zge3NwM17YgJEBZXzTVn9UL7zk,9424
+flexeval/io/__init__.py,sha256=MqdgcPzkFpSnOEz-e2GNNd8XOI_DbyNjIP8AT5eqUqI,101
+flexeval/io/parsers/yaml_parser.py,sha256=2yE6j_RM_YG5nkNUWZckrymh61n28AG46lqnPSlWitk,1818
+flexeval/metrics/__init__.py,sha256=qrgUhTXzezAOoABhck3hMVN-c2Bwn7CTg-e_P2w7PlA,134
+flexeval/metrics/access.py,sha256=mP89IUNTWpHguMEdjjh_deMxdiyClb61hg3k7Jcus-o,1299
+flexeval/metrics/save.py,sha256=nquTUmcUuiCkj5VY0vFonEflo4ZHZN-Xbc_Lvy2AC2k,1837
+flexeval/schema/__init__.py,sha256=4OA6Q7Dguz-uaulwoRsrtaoReFmyNsKqyi_CvfDV4-c,379
+flexeval/schema/config_schema.py,sha256=cH2iE-bj-8Rs1-CEUP-xVn1S0r2wtRmI6kWqfQ4M_Y4,2272
+flexeval/schema/eval_schema.py,sha256=8idEhxogqzUPwojBcfyNIH8yGWX74oa0NhUB3vabwlc,6651
+flexeval/schema/evalrun_schema.py,sha256=nF3GCNlzxhJvu-V2h4-RkX5xWhBA9mQIR_ofR3T6de0,4315
+flexeval/schema/rubric_schema.py,sha256=uxcf7MHWKW3EmABUnWeCinGUP6LBjskiq7zkEPHmAvU,1615
+flexeval/schema/schema_utils.py,sha256=Fg1foqRA-9X-hl_vqIF3bpYdE51hNEgdw739Q-s3iQc,698
+python_flexeval-0.4.0.dist-info/METADATA,sha256=isVFK5bnXc7iBmzvrALqo7OYOYi653_UZOjY_TXBkqE,5599
+python_flexeval-0.4.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+python_flexeval-0.4.0.dist-info/entry_points.txt,sha256=wSyluqXhrX3xySVYAtM-Kv23p4OauKQCSBuNNfzEGtI,52
+python_flexeval-0.4.0.dist-info/licenses/LICENSE,sha256=OlAu_c13gw6-fJ9UdhZBMeNr5STLrnWG_0Hv0SCXtu4,1082
+python_flexeval-0.4.0.dist-info/RECORD,,

{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.27.0
+Generator: hatchling 1.29.0
 Root-Is-Purelib: true
 Tag: py3-none-any

python_flexeval-0.3.0.dist-info/RECORD DELETED Viewed

@@ -1,49 +0,0 @@
-flexeval/__about__.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
-flexeval/__init__.py,sha256=UXI_xdSxnGAK2plDODBbPF3df-N7E9YJ418QHK7XN-Q,391
-flexeval/__main__.py,sha256=c9NQqsea3e-_6b736gBeIO3O_zdXQ1wtY3-Scj5NiPg,126
-flexeval/cli.py,sha256=RwtRk121OivbLQyYpYxJ7PugPIYQ8J4qXHFN2SxxPy4,2985
-flexeval/completions.py,sha256=pi_tYK4m3vKSqAC1ym9Jc3e4srcQSXfx-mX4qI5qisQ,5686
-flexeval/compute_metrics.py,sha256=4X6XFk0qUKcaCDllNeJreuhlnDHmfRPlsf0f8fWFOxA,37277
-flexeval/config.yaml,sha256=dpkFdW0rKf7StGoVeIGaCNw0n0yOfYWig0xmIfsDdbg,530
-flexeval/data_loader.py,sha256=UP-HWqh5o_euqT2GvTbUYmA-yJcbTKtmug4w63w2CbA,26153
-flexeval/db_utils.py,sha256=2jgqexLCAqShvgPrImZz12UkMZtfERhP8iXjratXYok,1612
-flexeval/dependency_graph.py,sha256=SaG9gjkw2Q0NykqQWs4JzPkv5sMj2aXXmhjJ7yRkV4Q,10539
-flexeval/eval_schema.json,sha256=BQetj8O0_4rorj3Mpqk-sj_SCaRkGMrvBUcxhuw6zLE,13111
-flexeval/function_types.py,sha256=eH8NadQRw7XAOXAOKWYN6b7urjr57J5WzdiVyzh0Wb4,6898
-flexeval/helpers.py,sha256=gX-6Hx4_wOiqbfY8c8_kL3XbkdV8mpEjPmaAe44lOSk,1605
-flexeval/log_utils.py,sha256=E3RloPQZbtd8sEIg7mfN5fAku-TeNGqWy03SmwRllIE,923
-flexeval/rubric.py,sha256=UwtJOxIxFJcQVrDXXuCA3tF_FFTcvLPqo2F9lq8gPcM,2167
-flexeval/run_utils.py,sha256=cNFVRsFNYY9gpzbIUc-H4Gk7TWC64GXsYowQHoG7ZVU,2597
-flexeval/runner.py,sha256=X6ZfjfwIM3ymN_kHfRt_JSKPxpDxs_MWQPrvWhl2L7I,4340
-flexeval/classes/__init__.py,sha256=fywDMYX8W-nXFKRXolzn-RWd_7tiJr6FlouQJvYSoyE,347
-flexeval/classes/base.py,sha256=xxkTa8joPe39CFwveeTPW56LW-x7rsi5oBAIxrvM5iI,944
-flexeval/classes/dataset.py,sha256=Y_EdEIuhx526SSvkqk2tFBzkOgBkVY-5FeraYMtU5lo,2913
-flexeval/classes/eval_runner.py,sha256=ZvCpyaD7lorDK_mYJSZqQbvI6FfLbIWRFHNarWTAMQU,6270
-flexeval/classes/eval_set_run.py,sha256=fq_wBOaxuq7dLxiZIw76WGIwhRBNbQWDUhpiK0wDG_A,1116
-flexeval/classes/jsonview.py,sha256=3XJTh46ODfqdNbrXYDEV6kRO8KbeiHJo5pb4aJrbHRY,3459
-flexeval/classes/message.py,sha256=gDejDfaHGQKgS_CpJqjPAVzpiRD2JddKo17Yi1wVeiw,7676
-flexeval/classes/metric.py,sha256=d8l39_QwnQDmTJvy9TIulU4p0jqD7ldMUi4m5zfK2Es,2806
-flexeval/classes/thread.py,sha256=cFQu3Mwzk8-Def8xccB8F6zKv64Srvhz5n83yLELvKo,2922
-flexeval/classes/tool_call.py,sha256=CteT2Hajor0PlHEEn7apfZux5_mremSIDrQmZ0iB7K0,1748
-flexeval/classes/turn.py,sha256=kLmgnYQ-4a8sydzGK1HTQRyUDXZIedmt_NFR3shLJFE,8635
-flexeval/configuration/__init__.py,sha256=wP_gpYyaEp5DxCSH8-4KHchH07JMZZOk8eCFMfd5LBw,75
-flexeval/configuration/completion_functions.py,sha256=-N0iFAfcYcm35S78M3ES4MBkLXpDeEfy2Qq1ORHGBXE,7491
-flexeval/configuration/evals.yaml,sha256=3mbD3gEccTDotm8kj4doYTujqRD_PkGhCVhjQaSEqSs,22651
-flexeval/configuration/function_metrics.py,sha256=SGCxCAfG5NfKop-d3_uJgF83nPrlfHAhd-TU0GpEPFY,22427
-flexeval/configuration/rubric_metrics.yaml,sha256=JfE6gPj4LtM2v0b5-Zge3NwM17YgJEBZXzTVn9UL7zk,9424
-flexeval/io/__init__.py,sha256=MqdgcPzkFpSnOEz-e2GNNd8XOI_DbyNjIP8AT5eqUqI,101
-flexeval/io/parsers/yaml_parser.py,sha256=2yE6j_RM_YG5nkNUWZckrymh61n28AG46lqnPSlWitk,1818
-flexeval/metrics/__init__.py,sha256=qrgUhTXzezAOoABhck3hMVN-c2Bwn7CTg-e_P2w7PlA,134
-flexeval/metrics/access.py,sha256=mP89IUNTWpHguMEdjjh_deMxdiyClb61hg3k7Jcus-o,1299
-flexeval/metrics/save.py,sha256=8x9ifRiHtQT7_WeMP0XmYK1zfourXMnHkGZy_iR0Xcc,1643
-flexeval/schema/__init__.py,sha256=4OA6Q7Dguz-uaulwoRsrtaoReFmyNsKqyi_CvfDV4-c,379
-flexeval/schema/config_schema.py,sha256=LkmtiOLfPsX1u_6Ey6gFbRr8tQwxqcuLcyf-xYcBf9o,1619
-flexeval/schema/eval_schema.py,sha256=iHMbanW4Ef_sp51KiaZKeP3Dn4Z6pWCGa7N2SPvsFK0,6607
-flexeval/schema/evalrun_schema.py,sha256=M7JY01DhlLzwZc2jJTIeGPs9vt6TFMPir51MFhtRllA,3526
-flexeval/schema/rubric_schema.py,sha256=uxcf7MHWKW3EmABUnWeCinGUP6LBjskiq7zkEPHmAvU,1615
-flexeval/schema/schema_utils.py,sha256=Fg1foqRA-9X-hl_vqIF3bpYdE51hNEgdw739Q-s3iQc,698
-python_flexeval-0.3.0.dist-info/METADATA,sha256=xBbeZrF4aEdl94pg-L2P_Di6cxtxA3aZnu6fxFjUf-8,5599
-python_flexeval-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-python_flexeval-0.3.0.dist-info/entry_points.txt,sha256=wSyluqXhrX3xySVYAtM-Kv23p4OauKQCSBuNNfzEGtI,52
-python_flexeval-0.3.0.dist-info/licenses/LICENSE,sha256=OlAu_c13gw6-fJ9UdhZBMeNr5STLrnWG_0Hv0SCXtu4,1082
-python_flexeval-0.3.0.dist-info/RECORD,,

{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

python-flexeval 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

python-flexeval 0.3.0py3-none-any.whl → 0.4.0py3-none-any.whl