python-flexeval 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. flexeval/__init__.py +11 -0
  2. flexeval/__main__.py +11 -0
  3. flexeval/classes/__init__.py +15 -0
  4. flexeval/classes/base.py +32 -0
  5. flexeval/classes/dataset.py +82 -0
  6. flexeval/classes/eval_runner.py +158 -0
  7. flexeval/classes/eval_set_run.py +32 -0
  8. flexeval/classes/message.py +183 -0
  9. flexeval/classes/metric.py +55 -0
  10. flexeval/classes/thread.py +79 -0
  11. flexeval/classes/tool_call.py +51 -0
  12. flexeval/classes/turn.py +206 -0
  13. flexeval/cli.py +104 -0
  14. flexeval/completions.py +147 -0
  15. flexeval/compute_metrics.py +788 -0
  16. flexeval/config.yaml +23 -0
  17. flexeval/configuration/__init__.py +1 -0
  18. flexeval/configuration/completion_functions.py +231 -0
  19. flexeval/configuration/evals.yaml +864 -0
  20. flexeval/configuration/function_metrics.py +650 -0
  21. flexeval/configuration/rubric_metrics.yaml +194 -0
  22. flexeval/data_loader.py +513 -0
  23. flexeval/db_utils.py +38 -0
  24. flexeval/dependency_graph.py +234 -0
  25. flexeval/eval_schema.json +256 -0
  26. flexeval/function_types.py +173 -0
  27. flexeval/helpers.py +52 -0
  28. flexeval/io/__init__.py +1 -0
  29. flexeval/io/parsers/yaml_parser.py +69 -0
  30. flexeval/log_utils.py +34 -0
  31. flexeval/metrics/__init__.py +8 -0
  32. flexeval/metrics/access.py +28 -0
  33. flexeval/metrics/save.py +39 -0
  34. flexeval/rubric.py +62 -0
  35. flexeval/run_utils.py +65 -0
  36. flexeval/runner.py +132 -0
  37. flexeval/schema/__init__.py +11 -0
  38. flexeval/schema/config_schema.py +46 -0
  39. flexeval/schema/eval_schema.py +163 -0
  40. flexeval/schema/evalrun_schema.py +97 -0
  41. flexeval/schema/rubric_schema.py +40 -0
  42. flexeval/schema/schema_utils.py +26 -0
  43. python_flexeval-0.1.5.dist-info/METADATA +118 -0
  44. python_flexeval-0.1.5.dist-info/RECORD +47 -0
  45. python_flexeval-0.1.5.dist-info/WHEEL +4 -0
  46. python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
  47. python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0
flexeval/helpers.py ADDED
@@ -0,0 +1,52 @@
1
+ """Generic utility functions."""
2
+
3
+ import datetime
4
+ import hashlib
5
+
6
+ import networkx as nx
7
+
8
+
9
def generate_hash():
    """Create a random 8-character hexadecimal id.

    The id is the first 8 hex digits of a SHA-256 digest computed over the
    current timestamp plus 16 bytes of OS-provided randomness. The random
    bytes keep two calls that land on the same clock tick from producing the
    same id, which a timestamp-only digest cannot guarantee.

    Returns:
        str: 8 lowercase hexadecimal characters.
    """
    # Local import so the block does not depend on a new file-level import.
    import os

    hash_object = hashlib.sha256()
    # The timestamp is kept as part of the hashed material.
    hash_object.update(datetime.datetime.now().isoformat().encode())
    # Extra entropy: makes same-tick calls produce different digests.
    hash_object.update(os.urandom(16))

    # Only the first 8 hex digits are used as the id.
    return hash_object.hexdigest()[:8]
22
+
23
+
24
def visualize_graph(graph: nx.DiGraph, output_path: str | None = None):
    """Draw a graph produced by :class:`~flexeval.compute_metrics.MetricGraphBuilder`.

    Every node is labelled with the class name and ``id`` of its ``object``
    attribute plus the ``id``, ``metric_level`` and ``kwargs['response']``
    entries of its ``metric`` mapping.

    Args:
        graph (nx.DiGraph): The graph to draw. Nodes must expose ``object``
            and ``metric`` attributes.
        output_path (str | None, optional): If not None, the figure is saved
            to this path using :meth:`matplotlib.pyplot.Figure.savefig`.

    Raises:
        ImportError: If matplotlib is not installed.
    """
    # matplotlib is an optional dependency, so it is imported lazily.
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError("matplotlib must be installed to use this helper function.")

    figure, axes = plt.subplots(1, 1, figsize=(12, 5))
    layout = nx.spring_layout(graph)
    nx.draw(graph, ax=axes, pos=layout)

    # One two-line label per node: the object on the first line, the metric on the second.
    node_labels = {}
    for om in graph:
        node_labels[om] = (
            f"{om.object.__class__.__name__} {om.object.id}\n{om.metric.get('id')}(l={om.metric.get('metric_level')},r={om.metric.get('kwargs', {}).get('response')})"
        )
    nx.draw_networkx_labels(
        graph,
        font_size=8,
        ax=axes,
        pos=layout,
        labels=node_labels,
    )

    if output_path is None:
        return
    figure.savefig(output_path)
@@ -0,0 +1 @@
1
+ """Input/output utilities, primarily for reading and writing the :mod:`~flexeval.schema` objects."""
@@ -0,0 +1,69 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Type, TypeVar
4
+
5
+ import pydantic
6
+ import yaml
7
+
8
+ from flexeval.schema import Config, Eval, EvalRun
9
+
10
+ logger = logging.getLogger(__name__)
11
+ T = TypeVar("T", bound=pydantic.BaseModel)
12
+
13
+
14
def load_config_from_yaml(filename: Path | str) -> Config:
    """Load a single :class:`Config` model from a YAML file.

    Args:
        filename (Path | str): Path of the YAML file to read.

    Returns:
        Config: The model built by :func:`load_model_from_yaml`.
    """
    return load_model_from_yaml(filename=filename, model_type=Config)
18
+
19
+
20
def load_evals_from_yaml(filename: Path | str) -> dict[str, Eval]:
    """Load a mapping of named :class:`Eval` models from a YAML file.

    Args:
        filename (Path | str): Path of the YAML file to read.

    Returns:
        dict[str, Eval]: The mapping built by
        :func:`load_models_from_yaml_filepath`.
    """
    return load_models_from_yaml_filepath(filename=filename, model_type=Eval)
24
+
25
+
26
def load_eval_run_from_yaml(filename: Path | str) -> EvalRun:
    """Load a single :class:`EvalRun` model from a YAML file.

    Args:
        filename (Path | str): Path of the YAML file to read.

    Returns:
        EvalRun: The model built by :func:`load_model_from_yaml`.
    """
    return load_model_from_yaml(filename=filename, model_type=EvalRun)
30
+
31
+
32
def load_models_from_yaml_filepath(
    filename: Path | str,
    model_type: Type[T],
) -> dict[str, T]:
    """Open a YAML file and load it as a mapping of named models.

    Args:
        filename (Path | str): Path of the YAML file to read.
        model_type (Type[T]): Pydantic model class used to build each value.

    Returns:
        dict[str, T]: One ``model_type`` instance per top-level key.

    Raises:
        ValueError: If the file cannot be opened or its contents cannot be
            loaded. The original exception is attached as ``__cause__``.
    """
    try:
        with open(filename) as file:
            return load_models_from_yaml_stream(file, model_type)
    except (OSError, ValueError) as ex:
        # Chain explicitly so the traceback shows the root cause as the
        # direct cause rather than as an error raised while handling another.
        raise ValueError(
            f"Failed to load '{filename}' as a list of '{model_type.__name__}' models: {ex}"
        ) from ex
43
+
44
+
45
def load_models_from_yaml_stream(
    stream,
    model_type: Type[T],
) -> dict[str, T]:
    """Parse a YAML stream into a mapping of named models.

    The YAML document must be a mapping whose values are themselves mappings
    of keyword arguments for ``model_type``.

    Args:
        stream: Anything accepted by :func:`yaml.safe_load` (the callers in
            this module pass an open file or a string).
        model_type (Type[T]): Pydantic model class used to build each value.

    Returns:
        dict[str, T]: One ``model_type`` instance per top-level key.

    Raises:
        ValueError: If the stream cannot be read or parsed, if the document is
            not a mapping (including an empty document), or if an entry
            cannot be turned into a ``model_type``.
    """
    try:
        contents = yaml.safe_load(stream)
    except (OSError, yaml.YAMLError) as ex:
        raise ValueError(
            f"Failed to load YAML stream as a list of '{model_type.__name__}' models: {ex}"
        ) from ex

    # An empty document parses to None and a scalar/list document has no
    # .items(); report both as a ValueError instead of an AttributeError.
    if not isinstance(contents, dict):
        raise ValueError(
            f"Failed to load YAML stream as a list of '{model_type.__name__}' models: "
            f"expected a top-level mapping, found '{type(contents).__name__}'"
        )

    try:
        return {key: model_type(**value) for key, value in contents.items()}
    except (TypeError, pydantic.ValidationError) as ex:
        # TypeError covers entries that are not mappings (e.g. ``key:`` with
        # no value), which cannot be expanded with ``**``.
        raise ValueError(
            f"Failed to load YAML stream as a list of '{model_type.__name__}' models: {ex}"
        ) from ex
56
+
57
+
58
def load_model_from_yaml(
    filename: Path | str,
    model_type: Type[T],
) -> T:
    """Load a single model from a YAML file.

    The YAML document must be a mapping of keyword arguments for
    ``model_type``.

    Args:
        filename (Path | str): Path of the YAML file to read.
        model_type (Type[T]): Pydantic model class to instantiate.

    Returns:
        T: The ``model_type`` instance built from the file contents.

    Raises:
        ValueError: If the file cannot be opened or parsed, if the document is
            not a mapping (including an empty file), or if validation fails.
            The original exception, when there is one, is attached as
            ``__cause__``.
    """
    try:
        with open(filename) as file:
            contents = yaml.safe_load(file)
        # An empty file parses to None, which cannot be expanded with ``**``;
        # report it through the same ValueError as every other load failure.
        if not isinstance(contents, dict):
            raise TypeError(
                f"expected a top-level mapping, found '{type(contents).__name__}'"
            )
        return model_type(**contents)
    except (OSError, TypeError, yaml.YAMLError, pydantic.ValidationError) as ex:
        raise ValueError(
            f"Failed to load '{filename}' as a '{model_type.__name__}': {ex}"
        ) from ex
flexeval/log_utils.py ADDED
@@ -0,0 +1,34 @@
1
+ """Logging utilities."""
2
+
3
+ import enum
4
+ import logging
5
+
6
+
7
class LogLevel(str, enum.Enum):
    """Supported log level names, stored as lowercase strings."""

    CRITICAL = "critical"
    ERROR = "error"
    WARNING = "warning"
    INFO = "info"
    DEBUG = "debug"

    @classmethod
    def get_logging_constant(cls, level_str: str) -> int:
        """Translate a level name into the matching :mod:`logging` constant.

        Args:
            level_str (str): A level name such as ``"info"`` (matched
                case-insensitively) or a :class:`LogLevel` member.

        Returns:
            int: The corresponding constant, e.g. ``logging.INFO``.

        Raises:
            ValueError: If ``level_str`` does not name a known level. This
                replaces the earlier silent ``None`` result, which violated
                the declared ``int`` return type.
        """
        # str.lower() also works on LogLevel members because they are str
        # subclasses; non-string input is passed through and rejected below.
        key = level_str.lower() if isinstance(level_str, str) else level_str
        try:
            member = cls(key)
        except ValueError:
            valid = ", ".join(repr(level.value) for level in cls)
            raise ValueError(
                f"Unknown log level {level_str!r}; expected one of: {valid}."
            ) from None
        # Member names are identical to the logging module's constant names.
        return getattr(logging, member.name)
26
+
27
+
28
def set_up_logging(log_level: int = logging.INFO):
    """Configure logging through :func:`logging.basicConfig`.

    Args:
        log_level (int, optional): Level passed to ``basicConfig``. Defaults
            to ``logging.INFO``.
    """
    # Record layout: timestamp, level, source location, then the message.
    record_format = "%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d (%(funcName)s) - %(message)s"
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    logging.basicConfig(
        level=log_level,
        format=record_format,
        datefmt=timestamp_format,
    )
@@ -0,0 +1,8 @@
1
+ """Utility functions for accessing metrics."""
2
+
3
+ from flexeval.metrics import access, save
4
+
5
+ __all__ = [
6
+ "access",
7
+ "save",
8
+ ]
@@ -0,0 +1,28 @@
1
+ from collections import Counter
2
+
3
+ from flexeval.classes import metric
4
+
5
+
6
def count_dict_values(lst: list[dict]) -> dict[str, Counter]:
    """Tally, per key, how often each value occurs across a list of dicts.

    Args:
        lst (list[dict]): Dictionaries to scan. Values must be hashable
            because they are used as ``Counter`` keys.

    Returns:
        dict[str, Counter]: One ``Counter`` for every key seen in any of the
        dictionaries, mapping each value to its number of occurrences.
    """
    tallies = {}
    for record in lst:
        for key, value in record.items():
            # setdefault creates the per-key Counter the first time a key is seen.
            tallies.setdefault(key, Counter())[value] += 1
    return tallies
22
+
23
+
24
def get_all_metrics() -> list:
    """Return the field data of every row yielded by ``metric.Metric.select()``.

    Returns:
        list: One shallow copy of each row's ``__data__`` mapping, so the
        returned items can be changed without touching the row objects.
    """
    return [row.__data__.copy() for row in metric.Metric.select()]
@@ -0,0 +1,39 @@
1
+ import json
2
+ from typing import Iterable
3
+
4
+ from flexeval.classes.metric import Metric
5
+
6
+
7
def save_metrics(metrics: Iterable[Metric]):
    """Create one ``Metric`` row for every computed metric.

    Each item is read like a mapping (``item[...]`` and ``item.get(...)``).
    The entry stored under the lowercased ``metric_level`` key supplies the
    ``evalsetrun`` and ``dataset`` values, and also the ``thread`` when the
    item has no ``thread`` entry of its own.

    Args:
        metrics (Iterable[Metric]): The computed metrics to persist.
    """
    # Optional rubric-related entries; each is stored as None when absent.
    rubric_fields = (
        "rubric_prompt",
        "rubric_completion",
        "rubric_model",
        "rubric_completion_tokens",
        "rubric_prompt_tokens",
        "rubric_score",
    )
    for metric in metrics:
        # TODO - speed this up somehow
        thread = metric.get("thread")
        if thread is None:
            thread = metric[metric["metric_level"].lower()].thread

        row = {
            "message": metric.get("message", None),
            "turn": metric.get("turn", None),
            "toolcall": metric.get("toolcall", None),
        }
        # The object this metric was computed on (e.g. metric["turn"]).
        level_object = metric[metric["metric_level"].lower()]
        row["evalsetrun"] = level_object.evalsetrun
        row["dataset"] = level_object.dataset
        row["thread"] = thread
        row["evaluation_name"] = metric["evaluation_name"]
        row["evaluation_type"] = metric["evaluation_type"]
        row["metric_name"] = metric["metric_name"]
        row["metric_value"] = metric["metric_value"]
        row["metric_level"] = metric["metric_level"]
        row["kwargs"] = metric["kwargs"]
        # depends_on is stored as a JSON string.
        row["depends_on"] = json.dumps(metric["depends_on"])
        row["context_only"] = metric.get("context_only", False)
        row["source"] = metric["source"]
        for field_name in rubric_fields:
            row[field_name] = metric.get(field_name, None)

        Metric.create(**row)
flexeval/rubric.py ADDED
@@ -0,0 +1,62 @@
1
+ """Rubric metric IO utilities. Should maybe be moved to :mod:`~flexeval.io`."""
2
+
3
+ import importlib.resources
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ import flexeval.configuration
8
+ from flexeval.io.parsers import yaml_parser
9
+ from flexeval.schema import Rubric, RubricsCollection
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
# Module-level cache for the packaged default rubrics; it stays None until
# get_default_rubric_collection() populates it on first use.
default_rubric_collection = (
    None  # don't access directly, use get_default_rubric_collection
)
17
+
18
+
19
def load_rubrics_from_yaml_file(path: Path) -> dict[str, Rubric]:
    """Load named :class:`Rubric` models from a YAML file.

    Args:
        path (Path): Path of the YAML file to read.

    Returns:
        dict[str, Rubric]: The mapping built by
        ``yaml_parser.load_models_from_yaml_filepath``.
    """
    return yaml_parser.load_models_from_yaml_filepath(filename=path, model_type=Rubric)
21
+
22
+
23
def load_rubrics_from_yaml_stream(stream) -> dict[str, Rubric]:
    """Load named :class:`Rubric` models from a YAML stream.

    Args:
        stream: YAML source handed to
            ``yaml_parser.load_models_from_yaml_stream``.

    Returns:
        dict[str, Rubric]: The mapping built by the YAML parser helper.
    """
    return yaml_parser.load_models_from_yaml_stream(stream=stream, model_type=Rubric)
27
+
28
+
29
def get_default_rubric_collection() -> RubricsCollection:
    """Return the rubrics packaged in ``flexeval.configuration``.

    The ``rubric_metrics.yaml`` resource is read and parsed on the first call
    only; the result is kept in the module-level ``default_rubric_collection``
    and returned as-is on later calls.

    Returns:
        RubricsCollection: Collection wrapping the packaged default rubrics.
    """
    global default_rubric_collection
    if default_rubric_collection is None:
        logger.debug("Attempting to load from default rubric_metrics.yaml.")
        packaged_yaml = (
            importlib.resources.files(flexeval.configuration) / "rubric_metrics.yaml"
        )
        # The YAML text itself is passed to the parser as the stream.
        default_rubric_collection = RubricsCollection(
            rubrics=load_rubrics_from_yaml_stream(packaged_yaml.read_text())
        )
    return default_rubric_collection
41
+
42
+
43
def load_rubric_metrics(
    rubric_collections: list[Path | RubricsCollection], add_defaults: bool = False
) -> dict[str, Rubric]:
    """Merge rubrics from several sources into a single mapping.

    Sources are processed in order and the first definition of a rubric key
    wins; later definitions of the same key are ignored with a warning. A
    kept rubric whose ``name`` is None gets its key assigned as its name.

    Args:
        rubric_collections (list[Path | RubricsCollection]): Sources to merge.
            Paths (``Path`` or ``str``) are loaded as YAML files; any other
            item is expected to expose a ``rubrics`` mapping. The list itself
            is never modified.
        add_defaults (bool, optional): If True, the packaged default rubrics
            are consulted last. Defaults to False.

    Returns:
        dict[str, Rubric]: Rubrics keyed by their YAML key.
    """
    # Work on a copy: appending the defaults to the caller's list would leak
    # them into the caller's configuration and add them again on every call.
    sources = list(rubric_collections)
    if add_defaults:
        sources.append(get_default_rubric_collection())

    rubric_metrics = {}
    for source in sources:
        if isinstance(source, (str, Path)):
            rubric_dict = load_rubrics_from_yaml_file(Path(source))
        else:
            rubric_dict = source.rubrics
        for key, rubric in rubric_dict.items():
            if key in rubric_metrics:
                logger.warning(f"Ignoring second (or later) version of rubric '{key}'.")
                continue
            if rubric.name is None:
                # infer rubric name from parent key
                rubric.name = key
            rubric_metrics[key] = rubric
    return rubric_metrics
flexeval/run_utils.py ADDED
@@ -0,0 +1,65 @@
1
+ """Utilities for :mod:`~flexeval.runner`."""
2
+
3
+ import json
4
+ import logging
5
+
6
+ from flexeval import rubric
7
+ from flexeval.classes.dataset import Dataset
8
+ from flexeval.classes.eval_runner import EvalRunner
9
+ from flexeval.classes.eval_set_run import EvalSetRun
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def build_eval_set_run(runner: EvalRunner) -> EvalSetRun:
    """Create the ``EvalSetRun`` record describing the runner's evaluation.

    Structured values (metrics, metric graph, data source paths, LLM
    settings and rubrics) are serialized to JSON strings before being handed
    to ``EvalSetRun.create``.

    Args:
        runner (EvalRunner): Runner providing ``evalrun`` and
            ``metrics_graph_ordered_list``.

    Returns:
        EvalSetRun: The newly created record.
    """

    def _dump_optional(model):
        # Optional pydantic sub-models are stored as JSON, with None as "null".
        return model.model_dump_json() if model is not None else json.dumps(None)

    evalrun = runner.evalrun
    rubrics = rubric.load_rubric_metrics(evalrun.rubric_paths)

    # TODO this code uses a model_name that does not appear in the Eval schema; should look into this
    # (it was previously read from eval["completion_llm"]["model_name"]).
    model_name = json.dumps(None)

    eval_spec = evalrun.eval
    name = eval_spec.name
    notes = eval_spec.notes
    metrics_json = eval_spec.metrics.model_dump_json()
    graph_json = json.dumps(runner.metrics_graph_ordered_list)

    dataset_paths = []
    for data_source in evalrun.data_sources:
        dataset_paths.append(str(data_source.path))
    dataset_files_json = json.dumps(dataset_paths)

    do_completion = eval_spec.do_completion
    completion_llm_json = _dump_optional(eval_spec.completion_llm)
    grader_llm_json = _dump_optional(eval_spec.grader_llm)

    # only save rubrics that will actually be used
    used_rubrics = {}
    for entry in runner.metrics_graph_ordered_list:
        if entry["evaluation_type"] == "rubric":
            used_rubrics[entry["evaluation_name"]] = rubrics[
                entry["evaluation_name"]
            ].model_dump()

    return EvalSetRun.create(
        name=name,
        notes=notes,
        metrics=metrics_json,
        metrics_graph_ordered_list=graph_json,
        dataset_files=dataset_files_json,
        do_completion=do_completion,
        completion_llm=completion_llm_json,
        model_name=model_name,
        grader_llm=grader_llm_json,
        rubrics=json.dumps(used_rubrics),
    )
52
+
53
+
54
def build_datasets(runner: EvalRunner, evalsetrun: EvalSetRun):
    """Create one ``Dataset`` record per file listed by the eval set run.

    Args:
        runner (EvalRunner): Supplies the sampling limits from
            ``runner.evalrun.config`` and the logger used for reporting.
        evalsetrun (EvalSetRun): Run whose ``get_datasets()`` yields the
            filenames; every created dataset is linked to it.
    """
    for filename in evalsetrun.get_datasets():
        run_config = runner.evalrun.config
        thread_limit = run_config.max_n_conversation_threads
        evaluations_per_thread = run_config.nb_evaluations_per_thread
        # these will automatically be saved as a property of evalsetrun
        Dataset.create(
            evalsetrun=evalsetrun,
            filename=filename,
            max_n_conversation_threads=thread_limit,
            nb_evaluations_per_thread=evaluations_per_thread,
        )
        runner.logger.info(
            f"Created dataset from '{filename}'. Max number of conversation threads: '{thread_limit}' - Nb of evaluations per thread: '{evaluations_per_thread}'"
        )
flexeval/runner.py ADDED
@@ -0,0 +1,132 @@
1
+ """Convenience functions for running an Eval Run."""
2
+
3
+ import logging
4
+ import random as rd
5
+ from pathlib import Path
6
+
7
+ import flexeval.metrics
8
+ from flexeval import completions, compute_metrics, run_utils
9
+ from flexeval.classes.eval_runner import EvalRunner
10
+ from flexeval.io.parsers import yaml_parser
11
+ from flexeval.schema import EvalRun, FileDataSource
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ # Levels of abstraction -
17
+ # Dataset
18
+ # Thread
19
+ # Turn
20
+ # Message
21
+ # ToolCall
22
+ # Metric
23
+
24
+ # Features to add:
25
+ # - allow comparison with 'ideal' responses
26
+
27
+
28
def run_from_name_args(
    input_data: list[Path],
    database_path: Path,
    eval_name: str,
    config_path: str,
    evals_path: str,
    **kwargs,
):
    """Assemble an ``EvalRun`` from file paths and an eval name, then run it.

    Args:
        input_data (list[Path]): Input files; each becomes a ``FileDataSource``.
        database_path (Path): Passed to ``EvalRun`` as ``database_path``.
        eval_name (str): Key of the eval to select from ``evals_path``.
        config_path (str): YAML file holding the ``Config``.
        evals_path (str): YAML file holding the named evals.
        **kwargs: Overrides set as attributes on the loaded config.

    Returns:
        The result of :func:`run` for the assembled ``EvalRun``.

    Raises:
        ValueError: If ``eval_name`` is not defined in ``evals_path``.
    """
    data_sources = []
    for input_path in input_data:
        data_sources.append(FileDataSource(path=input_path))

    config = yaml_parser.load_config_from_yaml(config_path)
    evals = yaml_parser.load_evals_from_yaml(evals_path)
    if eval_name not in evals:
        raise ValueError(
            f"Eval name '{eval_name}' not in defined evals: {list(evals.keys())}"
        )

    selected_eval = evals[eval_name]
    # An eval without a usable name of its own takes the key it was stored under.
    has_name = selected_eval.name is not None and selected_eval.name.strip() != ""
    if not has_name:
        selected_eval.name = eval_name

    # Keyword arguments override values read from the config file.
    for option, override in kwargs.items():
        setattr(config, option, override)

    return run(
        EvalRun(
            data_sources=data_sources,
            database_path=database_path,
            eval=selected_eval,
            config=config,
        )
    )
55
+
56
+
57
def run(eval_run: EvalRun) -> EvalRunner:
    """Runs the evaluations.

    The run proceeds through fixed phases: create the EvalSetRun record,
    create dataset metadata, load the data files, optionally generate
    completions, then compute and save metrics. Every phase logs its own
    failures through ``runner.logger``.

    Args:
        eval_run (EvalRun): The evaluation run to execute.

    Returns:
        EvalRunner: The runner built for ``eval_run``, after its logging has
        been shut down.

    Raises:
        Exception: Re-raised when creating the EvalSetRun fails, when
            completions fail and ``config.raise_on_completion_error`` is set,
            or when metrics fail and ``config.raise_on_metric_error`` is set.
            Failures while creating dataset metadata or loading data are
            logged and the run continues.
    """
    runner = EvalRunner(eval_run)

    #######################################################
    ############  Create Test Run  ########################
    #######################################################
    try:
        runner.logger.info("Creating EvalSetRun")
        # TODO instead of raw 'metrics', pass in graph created when setting up the runner

        evalsetrun = run_utils.build_eval_set_run(runner)
        runner.logger.info(f"Metric graph: {evalsetrun.metrics_graph_ordered_list}")
    except Exception:
        # Without an EvalSetRun nothing else can proceed, so this failure is
        # always re-raised after logging is shut down.
        runner.logger.exception(
            "An error occurred creating the EvalSetRun.", exc_info=True
        )
        runner.shutdown_logging()
        raise

    #######################################################
    ############  Load and Parse Data  ####################
    #######################################################

    try:
        runner.logger.debug("Loading data")

        # set random seed
        rd_seed = runner.evalrun.config.random_seed_conversation_sampling
        rd.seed(rd_seed)
        runner.logger.info(f"Set random seed to '{rd_seed}'.")

        run_utils.build_datasets(runner, evalsetrun)
    except Exception:
        # Logged only: the run continues with the next phase.
        runner.logger.exception(
            "An error occurred creating dataset metadata.", exc_info=True
        )

    try:
        runner.logger.info("Parsing data files")
        for dataset in evalsetrun.datasets:
            runner.logger.debug(f"Loading data from '{dataset.filename}'.")
            dataset.load_data()
    except Exception:
        # Logged only: the run continues with the next phase.
        runner.logger.exception("An error occurred loading data.", exc_info=True)

    # Do completions, if necessary
    try:
        if evalsetrun.do_completion:
            # We do this by creating new turns
            runner.logger.info("Generating completions")
            completions.get_completions(eval_run, evalsetrun)
    except Exception:
        runner.logger.exception(
            "An error occurred generating completions.", exc_info=True
        )
        # Completion failures abort the run only when the config asks for it.
        if eval_run.config.raise_on_completion_error:
            runner.shutdown_logging()
            raise

    #######################################################
    #################  Compute Metrics  ###################
    #######################################################
    try:
        metrics = compute_metrics.compute_metrics(eval_run, evalsetrun)
        runner.logger.info(f"Saving '{len(metrics)}' metrics to database.")
        flexeval.metrics.save.save_metrics(metrics)
    except Exception:
        runner.logger.exception("An error occurred computing metrics.", exc_info=True)
        # Metric failures abort the run only when the config asks for it.
        if eval_run.config.raise_on_metric_error:
            runner.shutdown_logging()
            raise

    runner.logger.info("Evaluation run complete.")
    runner.shutdown_logging()
    return runner
@@ -0,0 +1,11 @@
1
+ """Pydantic schema for the core configuration options used for FlexEval.
2
+
3
+ The top-level schema object is :class:`~flexeval.schema.evalrun_schema.EvalRun`.
4
+
5
+ See :mod:`~flexeval.classes` for the internal Peewee objects produced from the Pydantic configuration.
6
+ """
7
+
8
+ from .config_schema import *
9
+ from .eval_schema import *
10
+ from .evalrun_schema import *
11
+ from .rubric_schema import *
@@ -0,0 +1,46 @@
1
+ from pathlib import Path
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from flexeval.schema import schema_utils
6
+
7
+
8
class Config(BaseModel):
    """Configuration options controlling how an evaluation run executes.

    Covers logging, environment variables, database handling, concurrency,
    conversation sampling and error policy.
    """

    # Pydantic v2 configuration. This replaces the nested class-based
    # ``Config``, which pydantic v2 deprecates and which shared its name with
    # this model. A plain dict is used so that no extra import is needed.
    model_config = {
        # Unknown keys in the input are dropped rather than rejected.
        "extra": "ignore",
        # validate_assignment will ensure that fields never take an invalid value e.g. if copied from an eval
        "validate_assignment": True,
    }

    logs_path: Path | None = Field(
        None,
        description="Log directory path.",
    )
    env_filepath: Path | None = Field(
        None,
        description="A .env file to be processed by python-dotenv before running evals with this config.",
    )
    env: schema_utils.OptionalDict = Field(
        default_factory=dict, description="Any additional environment variables."
    )
    clear_tables: bool = Field(
        False,
        description="Clear any existing tables, if the output SQLite database already exists.",
    )
    max_workers: int = Field(
        1,
        description="Max worker count. Multiple threads will be used if set to > 1. This may have usage limit implications if you are calling APIs.",
    )
    # Seed for the conversation sampling random number generator.
    random_seed_conversation_sampling: int = 42
    # Upper bound on conversation threads per dataset.
    max_n_conversation_threads: int = (
        50  # TODO allow setting this to None, and set it to None by default
    )
    # Number of evaluations per thread (assumed from the name; TODO confirm exact semantics).
    nb_evaluations_per_thread: int = 1

    raise_on_completion_error: bool = Field(
        False,
        description="If False (default), metrics will be run even if one or more completions fails.",
    )
    raise_on_metric_error: bool = Field(
        False,
        description="If False (default), no exception will be thrown if a metric function raises an exception.",
    )
+ )