python-flexeval 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexeval/__init__.py +11 -0
- flexeval/__main__.py +11 -0
- flexeval/classes/__init__.py +15 -0
- flexeval/classes/base.py +32 -0
- flexeval/classes/dataset.py +82 -0
- flexeval/classes/eval_runner.py +158 -0
- flexeval/classes/eval_set_run.py +32 -0
- flexeval/classes/message.py +183 -0
- flexeval/classes/metric.py +55 -0
- flexeval/classes/thread.py +79 -0
- flexeval/classes/tool_call.py +51 -0
- flexeval/classes/turn.py +206 -0
- flexeval/cli.py +104 -0
- flexeval/completions.py +147 -0
- flexeval/compute_metrics.py +788 -0
- flexeval/config.yaml +23 -0
- flexeval/configuration/__init__.py +1 -0
- flexeval/configuration/completion_functions.py +231 -0
- flexeval/configuration/evals.yaml +864 -0
- flexeval/configuration/function_metrics.py +650 -0
- flexeval/configuration/rubric_metrics.yaml +194 -0
- flexeval/data_loader.py +513 -0
- flexeval/db_utils.py +38 -0
- flexeval/dependency_graph.py +234 -0
- flexeval/eval_schema.json +256 -0
- flexeval/function_types.py +173 -0
- flexeval/helpers.py +52 -0
- flexeval/io/__init__.py +1 -0
- flexeval/io/parsers/yaml_parser.py +69 -0
- flexeval/log_utils.py +34 -0
- flexeval/metrics/__init__.py +8 -0
- flexeval/metrics/access.py +28 -0
- flexeval/metrics/save.py +39 -0
- flexeval/rubric.py +62 -0
- flexeval/run_utils.py +65 -0
- flexeval/runner.py +132 -0
- flexeval/schema/__init__.py +11 -0
- flexeval/schema/config_schema.py +46 -0
- flexeval/schema/eval_schema.py +163 -0
- flexeval/schema/evalrun_schema.py +97 -0
- flexeval/schema/rubric_schema.py +40 -0
- flexeval/schema/schema_utils.py +26 -0
- python_flexeval-0.1.5.dist-info/METADATA +118 -0
- python_flexeval-0.1.5.dist-info/RECORD +47 -0
- python_flexeval-0.1.5.dist-info/WHEEL +4 -0
- python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
- python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0
flexeval/helpers.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Generic utility functions."""
|
|
2
|
+
|
|
3
|
+
import datetime
|
|
4
|
+
import hashlib
|
|
5
|
+
|
|
6
|
+
import networkx as nx
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def generate_hash():
    """Create a random 8-character hexadecimal id.

    The id is the first 8 hex digits of a SHA-256 digest computed over the
    current timestamp plus 16 cryptographically random bytes. The timestamp
    alone is not enough: two calls that land on the same clock tick would
    otherwise produce the same id.

    Returns:
        str: 8 lowercase hexadecimal characters.
    """
    # Imported locally so this helper does not depend on the module's import block.
    import secrets

    hash_object = hashlib.sha256()

    # Keep the timestamp in the digest input, as before ...
    hash_object.update(datetime.datetime.now().isoformat().encode())
    # ... and add random bytes so ids generated in the same instant differ.
    hash_object.update(secrets.token_bytes(16))

    # Only the first 8 hex digits of the 64-digit digest are used as the id.
    return hash_object.hexdigest()[:8]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def visualize_graph(graph: nx.DiGraph, output_path: str | None = None):
    """Draw a metric graph and optionally write the drawing to a file.

    Meant for graphs produced by
    :class:`~flexeval.compute_metrics.MetricGraphBuilder`. Each node must
    expose an ``object`` attribute (with an ``id``) and a dict-like ``metric``
    attribute; both are used to build the node's label.

    Args:
        graph (nx.DiGraph): The graph to draw.
        output_path (str | None, optional): If not None, the figure is saved
            to this path with the figure's ``savefig`` method.

    Raises:
        ImportError: If matplotlib is not installed.
    """
    try:
        from matplotlib import pyplot as plt
    except ImportError:
        raise ImportError("matplotlib must be installed to use this helper function.")

    def describe(om):
        # Two-line label: the wrapped object's class and id, then the metric's
        # id together with its level and the 'response' kwarg.
        return f"{om.object.__class__.__name__} {om.object.id}\n{om.metric.get('id')}(l={om.metric.get('metric_level')},r={om.metric.get('kwargs', {}).get('response')})"

    figure, axes = plt.subplots(1, 1, figsize=(12, 5))
    layout = nx.spring_layout(graph)
    nx.draw(graph, ax=axes, pos=layout)

    node_labels = {om: describe(om) for om in graph}
    nx.draw_networkx_labels(
        graph,
        font_size=8,
        ax=axes,
        pos=layout,
        labels=node_labels,
    )

    if output_path is None:
        return
    figure.savefig(output_path)
|
flexeval/io/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Input/output utilities, primarily for reading and writing the :mod:`~flexeval.schema` objects."""
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Type, TypeVar
|
|
4
|
+
|
|
5
|
+
import pydantic
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
from flexeval.schema import Config, Eval, EvalRun
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
T = TypeVar("T", bound=pydantic.BaseModel)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def load_config_from_yaml(
    filename: Path | str,
) -> Config:
    """Load a single :class:`Config` model from a YAML file.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        Config: The model built from the file's contents.

    Raises:
        ValueError: If the file cannot be read, parsed, or validated.
    """
    return load_model_from_yaml(filename=filename, model_type=Config)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def load_evals_from_yaml(
    filename: Path | str,
) -> dict[str, Eval]:
    """Load every :class:`Eval` defined in a YAML file.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        dict[str, Eval]: One model per top-level key of the file.

    Raises:
        ValueError: If the file cannot be read, parsed, or validated.
    """
    return load_models_from_yaml_filepath(filename=filename, model_type=Eval)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def load_eval_run_from_yaml(
    filename: Path | str,
) -> EvalRun:
    """Load a single :class:`EvalRun` model from a YAML file.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        EvalRun: The model built from the file's contents.

    Raises:
        ValueError: If the file cannot be read, parsed, or validated.
    """
    return load_model_from_yaml(filename=filename, model_type=EvalRun)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def load_models_from_yaml_filepath(
    filename: Path | str,
    model_type: Type[T],
) -> dict[str, T]:
    """Load a mapping of named models from a YAML file.

    Args:
        filename: Path of the YAML file to read.
        model_type: Pydantic model class to build from each top-level value.

    Returns:
        dict[str, T]: One ``model_type`` instance per top-level key.

    Raises:
        ValueError: If the file cannot be opened or its contents cannot be
            loaded as ``model_type`` models. The original error is chained
            as ``__cause__``.
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with open(filename, encoding="utf-8") as file:
            return load_models_from_yaml_stream(file, model_type)
    except (OSError, ValueError) as ex:
        raise ValueError(
            f"Failed to load '{filename}' as a list of '{model_type.__name__}' models: {ex}"
        ) from ex
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def load_models_from_yaml_stream(
    stream,
    model_type: Type[T],
) -> dict[str, T]:
    """Load a mapping of named models from a YAML stream.

    Args:
        stream: Anything ``yaml.safe_load`` accepts (an open file or a string).
        model_type: Pydantic model class to build from each top-level value.

    Returns:
        dict[str, T]: One ``model_type`` instance per top-level key.

    Raises:
        ValueError: If the YAML cannot be parsed, the document is not a
            mapping (for example, it is empty), a value cannot be expanded
            into keyword arguments, or validation fails. The original error
            is chained as ``__cause__``.
    """
    try:
        contents = yaml.safe_load(stream)
        if not isinstance(contents, dict):
            # An empty document parses to None; without this check the
            # failure would surface as an AttributeError on `.items()`.
            raise TypeError(
                f"expected a top-level mapping of names to models, found '{type(contents).__name__}'"
            )
        return {key: model_type(**value) for key, value in contents.items()}
    except (OSError, TypeError, yaml.YAMLError, pydantic.ValidationError) as ex:
        raise ValueError(
            f"Failed to load YAML stream as a list of '{model_type.__name__}' models: {ex}"
        ) from ex
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def load_model_from_yaml(
    filename: Path | str,
    model_type: Type[T],
) -> T:
    """Load a single model from a YAML file.

    Args:
        filename: Path of the YAML file to read.
        model_type: Pydantic model class to build from the document.

    Returns:
        T: A ``model_type`` instance built from the file's top-level mapping.

    Raises:
        ValueError: If the file cannot be opened or parsed, the document is
            not a mapping (for example, it is empty), or validation fails.
            The original error is chained as ``__cause__``.
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with open(filename, encoding="utf-8") as file:
            contents = yaml.safe_load(file)
        if not isinstance(contents, dict):
            # An empty file parses to None, which cannot be expanded with **.
            raise TypeError(
                f"expected a top-level mapping, found '{type(contents).__name__}'"
            )
        return model_type(**contents)
    except (OSError, TypeError, yaml.YAMLError, pydantic.ValidationError) as ex:
        raise ValueError(
            f"Failed to load '{filename}' as a '{model_type.__name__}': {ex}"
        ) from ex
|
flexeval/log_utils.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Logging utilities."""
|
|
2
|
+
|
|
3
|
+
import enum
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class LogLevel(str, enum.Enum):
    # Lower-case names for the five standard logging levels.
    CRITICAL = "critical"
    ERROR = "error"
    WARNING = "warning"
    INFO = "info"
    DEBUG = "debug"

    @classmethod
    def get_logging_constant(cls, level_str: str) -> int:
        """Translate a level name into the matching :mod:`logging` constant.

        Args:
            level_str: One of the member values, e.g. ``"info"``. The
                comparison is exact, so it is case-sensitive.

        Returns:
            The corresponding ``logging`` level. If ``level_str`` matches no
            member value, ``None`` is returned.
        """
        level_table = (
            (cls.CRITICAL, logging.CRITICAL),
            (cls.ERROR, logging.ERROR),
            (cls.WARNING, logging.WARNING),
            (cls.INFO, logging.INFO),
            (cls.DEBUG, logging.DEBUG),
        )
        for member, constant in level_table:
            if level_str == member.value:
                return constant
        return None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def set_up_logging(log_level: int = logging.INFO):
    """Configure logging through :func:`logging.basicConfig`.

    Each record is formatted with its timestamp, level, source file, line
    number, and function name, followed by the message.

    Args:
        log_level: Level passed to ``basicConfig``; defaults to ``logging.INFO``.
    """
    record_format = "%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d (%(funcName)s) - %(message)s"
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    logging.basicConfig(
        level=log_level,
        format=record_format,
        datefmt=timestamp_format,
    )
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
|
|
3
|
+
from flexeval.classes import metric
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def count_dict_values(lst: list[dict]) -> dict[str, Counter]:
    """Tally, per key, how often each value occurs across a list of dicts.

    Args:
        lst (list[dict]): Dictionaries to scan. Their values must be hashable.

    Returns:
        dict[str, Counter]: For every key seen in any dict, a Counter mapping
        each value of that key to the number of times it occurred.
    """
    tallies = {}
    for record in lst:
        for key, value in record.items():
            # setdefault creates the Counter the first time a key is seen.
            tallies.setdefault(key, Counter())[value] += 1
    return tallies
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_all_metrics() -> list:
    """Return every stored metric as a plain dictionary.

    Returns:
        list: One copy of the ``__data__`` dict for each row yielded by
        ``Metric.select()``.
    """
    return [row.__data__.copy() for row in metric.Metric.select()]
|
flexeval/metrics/save.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Iterable
|
|
3
|
+
|
|
4
|
+
from flexeval.classes.metric import Metric
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def save_metrics(metrics: Iterable[Metric]):
    """Persist computed metrics by creating one ``Metric`` row per entry.

    Each entry is accessed like a dictionary (``entry[...]`` / ``entry.get``).
    The object stored under the lower-cased ``metric_level`` key supplies the
    ``evalsetrun`` and ``dataset`` for the row, and also the ``thread`` when
    the entry has no ``"thread"`` value of its own.

    Args:
        metrics: The metric entries to save.
    """
    optional_rubric_fields = (
        "rubric_prompt",
        "rubric_completion",
        "rubric_model",
        "rubric_completion_tokens",
        "rubric_prompt_tokens",
        "rubric_score",
    )
    for metric in metrics:
        # TODO - speed this up somehow
        thread = metric.get("thread")
        # The object the metric was computed on, keyed by its level name.
        level_object = metric[metric["metric_level"].lower()]
        if thread is None:
            thread = level_object.thread
        row = {
            "message": metric.get("message", None),
            "turn": metric.get("turn", None),
            "toolcall": metric.get("toolcall", None),
            "evalsetrun": level_object.evalsetrun,
            "dataset": level_object.dataset,
            "thread": thread,
            "evaluation_name": metric["evaluation_name"],
            "evaluation_type": metric["evaluation_type"],
            "metric_name": metric["metric_name"],
            "metric_value": metric["metric_value"],
            "metric_level": metric["metric_level"],
            "kwargs": metric["kwargs"],
            "depends_on": json.dumps(metric["depends_on"]),
            "context_only": metric.get("context_only", False),
            "source": metric["source"],
            # Rubric details are optional and default to None.
            **{name: metric.get(name, None) for name in optional_rubric_fields},
        }
        Metric.create(**row)
|
flexeval/rubric.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Rubric metric IO utilities. Should maybe be moved to :mod:`~flexeval.io`."""
|
|
2
|
+
|
|
3
|
+
import importlib.resources
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import flexeval.configuration
|
|
8
|
+
from flexeval.io.parsers import yaml_parser
|
|
9
|
+
from flexeval.schema import Rubric, RubricsCollection
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Module-level cache for the packaged default rubrics; it starts empty.
default_rubric_collection = (
    None  # don't access directly, use get_default_rubric_collection
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def load_rubrics_from_yaml_file(path: Path) -> dict[str, Rubric]:
    """Load every :class:`Rubric` defined in a YAML file.

    Args:
        path: Location of the YAML file.

    Returns:
        dict[str, Rubric]: Rubrics keyed by their top-level YAML key.
    """
    return yaml_parser.load_models_from_yaml_filepath(filename=path, model_type=Rubric)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def load_rubrics_from_yaml_stream(
    stream,
) -> dict[str, Rubric]:
    """Load every :class:`Rubric` defined in a YAML stream.

    Args:
        stream: YAML content, passed through to the YAML parser unchanged.

    Returns:
        dict[str, Rubric]: Rubrics keyed by their top-level YAML key.
    """
    return yaml_parser.load_models_from_yaml_stream(stream=stream, model_type=Rubric)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_default_rubric_collection() -> RubricsCollection:
    """Return the rubrics shipped with the package, loading them on first use.

    The collection is read from ``rubric_metrics.yaml`` inside the
    ``flexeval.configuration`` package and cached in the module-level
    ``default_rubric_collection``, so later calls return the same object.

    Returns:
        RubricsCollection: The default rubric collection.
    """
    global default_rubric_collection
    if default_rubric_collection is not None:
        return default_rubric_collection
    logger.debug("Attempting to load from default rubric_metrics.yaml.")
    metrics_path = (
        importlib.resources.files(flexeval.configuration) / "rubric_metrics.yaml"
    )
    # Decode as UTF-8 explicitly so the packaged file reads the same on every
    # platform, whatever the locale's default encoding is.
    metrics_string = metrics_path.read_text(encoding="utf-8")
    rubric_metrics = load_rubrics_from_yaml_stream(metrics_string)
    default_rubric_collection = RubricsCollection(rubrics=rubric_metrics)
    return default_rubric_collection
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def load_rubric_metrics(
    rubric_collections: list[Path | RubricsCollection], add_defaults: bool = False
) -> dict[str, Rubric]:
    """Merge rubrics from several collections into a single mapping.

    Collections are processed in order and the first definition of a rubric
    key wins; later definitions of the same key are ignored with a warning.
    A kept rubric whose ``name`` is None gets its key as its name.

    Args:
        rubric_collections: Sources to merge. A ``Path`` is loaded as a YAML
            file of rubrics; anything else is read through its ``rubrics``
            attribute. The list itself is not modified.
        add_defaults: If True, the packaged default rubrics are merged last,
            so they never override a user-supplied rubric.

    Returns:
        dict[str, Rubric]: The merged rubrics keyed by rubric key.
    """
    # Work on a copy: appending the defaults to the caller's list would make
    # that list grow by one entry on every call.
    collections = list(rubric_collections)
    if add_defaults:
        collections.append(get_default_rubric_collection())

    rubric_metrics = {}
    for rubric_collection in collections:
        if isinstance(rubric_collection, Path):
            rubric_dict = load_rubrics_from_yaml_file(rubric_collection)
        else:
            rubric_dict = rubric_collection.rubrics
        for key, rubric in rubric_dict.items():
            if key not in rubric_metrics:
                rubric_metrics[key] = rubric
                if rubric.name is None:
                    # infer rubric name from parent key
                    rubric.name = key
            else:
                logger.warning(f"Ignoring second (or later) version of rubric '{key}'.")
    return rubric_metrics
|
flexeval/run_utils.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Utilities for :mod:`~flexeval.runner`."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
from flexeval import rubric
|
|
7
|
+
from flexeval.classes.dataset import Dataset
|
|
8
|
+
from flexeval.classes.eval_runner import EvalRunner
|
|
9
|
+
from flexeval.classes.eval_set_run import EvalSetRun
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def build_eval_set_run(runner: EvalRunner) -> EvalSetRun:
    """Create the ``EvalSetRun`` record describing the runner's eval run.

    Most fields are stored as JSON strings: the metrics, the ordered metric
    graph, the data source paths, the completion and grader LLM settings, and
    the rubrics referenced by rubric-type entries of the metric graph.

    Args:
        runner: Runner holding the eval run and its ordered metric graph.

    Returns:
        EvalSetRun: The newly created record.
    """

    def dump_optional(model):
        # JSON for an optional pydantic model; a JSON null when it is absent.
        if model is not None:
            return model.model_dump_json()
        return json.dumps(None)

    eval_run = runner.evalrun
    eval_spec = eval_run.eval
    rubrics = rubric.load_rubric_metrics(eval_run.rubric_paths)

    # TODO this code uses a model_name that does not appear in the Eval schema; should look into this
    model_name = json.dumps(None)

    ordered_metrics = runner.metrics_graph_ordered_list
    fields = {
        "name": eval_spec.name,
        "notes": eval_spec.notes,
        "metrics": eval_spec.metrics.model_dump_json(),
        "metrics_graph_ordered_list": json.dumps(ordered_metrics),
        "dataset_files": json.dumps(
            [str(data_source.path) for data_source in eval_run.data_sources]
        ),
        "do_completion": eval_spec.do_completion,
        "completion_llm": dump_optional(eval_spec.completion_llm),
        "model_name": model_name,
        "grader_llm": dump_optional(eval_spec.grader_llm),
        # only save rubrics that will actually be used
        "rubrics": json.dumps(
            {
                entry["evaluation_name"]: rubrics[entry["evaluation_name"]].model_dump()
                for entry in ordered_metrics
                if entry["evaluation_type"] == "rubric"
            }
        ),
    }
    return EvalSetRun.create(**fields)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def build_datasets(runner: EvalRunner, evalsetrun: EvalSetRun):
    """Create one ``Dataset`` record for each dataset file of the eval set run.

    Args:
        runner: Runner whose config supplies the sampling limits and whose
            logger reports each created dataset.
        evalsetrun: Record the new datasets are attached to.
    """
    for filename in evalsetrun.get_datasets():
        sampling_config = runner.evalrun.config
        dataset_fields = {
            "evalsetrun": evalsetrun,
            "filename": filename,
            "max_n_conversation_threads": sampling_config.max_n_conversation_threads,
            "nb_evaluations_per_thread": sampling_config.nb_evaluations_per_thread,
        }
        # these will automatically be saved as a property of evalsetrun
        Dataset.create(**dataset_fields)
        runner.logger.info(
            f"Created dataset from '{filename}'. Max number of conversation threads: '{runner.evalrun.config.max_n_conversation_threads}' - Nb of evaluations per thread: '{runner.evalrun.config.nb_evaluations_per_thread}'"
        )
|
flexeval/runner.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Convenience functions for running an Eval Run."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import random as rd
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import flexeval.metrics
|
|
8
|
+
from flexeval import completions, compute_metrics, run_utils
|
|
9
|
+
from flexeval.classes.eval_runner import EvalRunner
|
|
10
|
+
from flexeval.io.parsers import yaml_parser
|
|
11
|
+
from flexeval.schema import EvalRun, FileDataSource
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Levels of abstraction -
|
|
17
|
+
# Dataset
|
|
18
|
+
# Thread
|
|
19
|
+
# Turn
|
|
20
|
+
# Message
|
|
21
|
+
# ToolCall
|
|
22
|
+
# Metric
|
|
23
|
+
|
|
24
|
+
# Features to add:
|
|
25
|
+
# - allow comparison with 'ideal' responses
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def run_from_name_args(
    input_data: list[Path],
    database_path: Path,
    eval_name: str,
    config_path: str,
    evals_path: str,
    **kwargs,
):
    """Assemble an :class:`EvalRun` from file paths and an eval name, then run it.

    Args:
        input_data: Data files; each becomes a ``FileDataSource``.
        database_path: Database path passed on to the ``EvalRun``.
        eval_name: Key of the eval to select from the evals YAML file. It also
            becomes the eval's name when the eval has no non-blank name.
        config_path: YAML file holding the ``Config``.
        evals_path: YAML file holding the named evals.
        **kwargs: Overrides assigned onto the loaded config as attributes.

    Returns:
        The runner returned by :func:`run`.

    Raises:
        ValueError: If ``eval_name`` is not defined in the evals file.
    """
    data_sources = []
    for input_path in input_data:
        data_sources.append(FileDataSource(path=input_path))

    config = yaml_parser.load_config_from_yaml(config_path)
    evals = yaml_parser.load_evals_from_yaml(evals_path)
    if eval_name not in evals:
        raise ValueError(
            f"Eval name '{eval_name}' not in defined evals: {list(evals.keys())}"
        )

    selected_eval = evals[eval_name]
    # A missing or blank name falls back to the key the eval was found under.
    if not (selected_eval.name or "").strip():
        selected_eval.name = eval_name

    for option, override in kwargs.items():
        setattr(config, option, override)

    return run(
        EvalRun(
            data_sources=data_sources,
            database_path=database_path,
            eval=selected_eval,
            config=config,
        )
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def run(eval_run: EvalRun) -> EvalRunner:
    """Run the evaluations described by ``eval_run``.

    The run proceeds in stages, each with its own error policy:

    1. Create the ``EvalSetRun``; a failure is logged and re-raised.
    2. Seed :mod:`random` and create the dataset records; a failure is
       logged and the run continues.
    3. Load the data of each dataset; a failure is logged and the run
       continues.
    4. Generate completions if ``evalsetrun.do_completion`` is set; a failure
       is logged, and re-raised only if ``config.raise_on_completion_error``.
    5. Compute and save metrics; a failure is logged, and re-raised only if
       ``config.raise_on_metric_error``.

    The runner's logging is shut down before any re-raise and at the end.

    Args:
        eval_run: The eval run configuration.

    Returns:
        EvalRunner: The runner that was created for this run.
    """
    runner = EvalRunner(eval_run)

    #######################################################
    ############  Create Test Run  ########################
    #######################################################
    try:
        runner.logger.info("Creating EvalSetRun")
        # TODO instead of raw 'metrics', pass in graph created when setting up the runner

        evalsetrun = run_utils.build_eval_set_run(runner)
        runner.logger.info(f"Metric graph: {evalsetrun.metrics_graph_ordered_list}")
    except Exception:
        # Nothing else can proceed without the EvalSetRun, so this is fatal.
        runner.logger.exception(
            "An error occurred creating the EvalSetRun.", exc_info=True
        )
        runner.shutdown_logging()
        raise

    #######################################################
    ############  Load and Parse Data  ####################
    #######################################################

    try:
        runner.logger.debug("Loading data")

        # set random seed
        rd_seed = runner.evalrun.config.random_seed_conversation_sampling
        rd.seed(rd_seed)
        runner.logger.info(f"Set random seed to '{rd_seed}'.")

        run_utils.build_datasets(runner, evalsetrun)
    except Exception:
        # Logged only: the run continues with the following stages.
        runner.logger.exception(
            "An error occurred creating dataset metadata.", exc_info=True
        )

    try:
        runner.logger.info("Parsing data files")
        for dataset in evalsetrun.datasets:
            runner.logger.debug(f"Loading data from '{dataset.filename}'.")
            dataset.load_data()
    except Exception:
        # Logged only: the first failing dataset ends the loop, and the run
        # continues with the following stages.
        runner.logger.exception("An error occurred loading data.", exc_info=True)

    # Do completions, if necessary
    try:
        if evalsetrun.do_completion:
            # We do this by creating new turns
            runner.logger.info("Generating completions")
            completions.get_completions(eval_run, evalsetrun)
    except Exception:
        runner.logger.exception(
            "An error occurred generating completions.", exc_info=True
        )
        # Re-raise only when the config asks for it; otherwise metrics still run.
        if eval_run.config.raise_on_completion_error:
            runner.shutdown_logging()
            raise

    #######################################################
    #################  Compute Metrics  ###################
    #######################################################
    try:
        metrics = compute_metrics.compute_metrics(eval_run, evalsetrun)
        runner.logger.info(f"Saving '{len(metrics)}' metrics to database.")
        flexeval.metrics.save.save_metrics(metrics)
    except Exception:
        # This handler also covers failures while saving, not only computing.
        runner.logger.exception("An error occurred computing metrics.", exc_info=True)
        # Re-raise only when the config asks for it.
        if eval_run.config.raise_on_metric_error:
            runner.shutdown_logging()
            raise

    runner.logger.info("Evaluation run complete.")
    runner.shutdown_logging()
    return runner
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Pydantic schema for the core configuration options used for FlexEval.
|
|
2
|
+
|
|
3
|
+
The top-level schema object is :class:`~flexeval.schema.evalrun_schema.EvalRun`.
|
|
4
|
+
|
|
5
|
+
See :mod:`~flexeval.classes` for the internal Peewee objects produced from the Pydantic configuration.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .config_schema import *
|
|
9
|
+
from .eval_schema import *
|
|
10
|
+
from .evalrun_schema import *
|
|
11
|
+
from .rubric_schema import *
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from flexeval.schema import schema_utils
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Config(BaseModel):
    # Run-level settings for an evaluation run: logging and environment
    # handling, database table handling, worker count, conversation sampling,
    # and whether completion/metric errors are raised.

    class Config:
        # Unknown keys in the input are ignored.
        extra = "ignore"
        # validate_assignment will ensure that fields never take an invalid value e.g. if copied from an eval
        validate_assignment = True

    logs_path: Path | None = Field(
        default=None,
        description="Log directory path.",
    )
    env_filepath: Path | None = Field(
        default=None,
        description="A .env file to be processed by python-dotenv before running evals with this config.",
    )
    env: schema_utils.OptionalDict = Field(
        default_factory=dict,
        description="Any additional environment variables.",
    )
    clear_tables: bool = Field(
        default=False,
        description="Clear any existing tables, if the output SQLite database already exists.",
    )
    max_workers: int = Field(
        default=1,
        description="Max worker count. Multiple threads will be used if set to > 1. This may have usage limit implications if you are calling APIs.",
    )

    # Seed passed to random.seed() before the datasets are built.
    random_seed_conversation_sampling: int = 42
    # TODO allow setting this to None, and set it to None by default
    max_n_conversation_threads: int = 50
    nb_evaluations_per_thread: int = 1

    raise_on_completion_error: bool = Field(
        default=False,
        description="If False (default), metrics will be run even if one or more completions fails.",
    )
    raise_on_metric_error: bool = Field(
        default=False,
        description="If False (default), no exception will be thrown if a metric function raises an exception.",
    )
|