python-flexeval 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexeval/__init__.py +11 -0
- flexeval/__main__.py +11 -0
- flexeval/classes/__init__.py +15 -0
- flexeval/classes/base.py +32 -0
- flexeval/classes/dataset.py +82 -0
- flexeval/classes/eval_runner.py +158 -0
- flexeval/classes/eval_set_run.py +32 -0
- flexeval/classes/message.py +183 -0
- flexeval/classes/metric.py +55 -0
- flexeval/classes/thread.py +79 -0
- flexeval/classes/tool_call.py +51 -0
- flexeval/classes/turn.py +206 -0
- flexeval/cli.py +104 -0
- flexeval/completions.py +147 -0
- flexeval/compute_metrics.py +788 -0
- flexeval/config.yaml +23 -0
- flexeval/configuration/__init__.py +1 -0
- flexeval/configuration/completion_functions.py +231 -0
- flexeval/configuration/evals.yaml +864 -0
- flexeval/configuration/function_metrics.py +650 -0
- flexeval/configuration/rubric_metrics.yaml +194 -0
- flexeval/data_loader.py +513 -0
- flexeval/db_utils.py +38 -0
- flexeval/dependency_graph.py +234 -0
- flexeval/eval_schema.json +256 -0
- flexeval/function_types.py +173 -0
- flexeval/helpers.py +52 -0
- flexeval/io/__init__.py +1 -0
- flexeval/io/parsers/yaml_parser.py +69 -0
- flexeval/log_utils.py +34 -0
- flexeval/metrics/__init__.py +8 -0
- flexeval/metrics/access.py +28 -0
- flexeval/metrics/save.py +39 -0
- flexeval/rubric.py +62 -0
- flexeval/run_utils.py +65 -0
- flexeval/runner.py +132 -0
- flexeval/schema/__init__.py +11 -0
- flexeval/schema/config_schema.py +46 -0
- flexeval/schema/eval_schema.py +163 -0
- flexeval/schema/evalrun_schema.py +97 -0
- flexeval/schema/rubric_schema.py +40 -0
- flexeval/schema/schema_utils.py +26 -0
- python_flexeval-0.1.5.dist-info/METADATA +118 -0
- python_flexeval-0.1.5.dist-info/RECORD +47 -0
- python_flexeval-0.1.5.dist-info/WHEEL +4 -0
- python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
- python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# originally generated by datamodel-codegen:
|
|
2
|
+
# filename: src/flexeval/eval_schema.json
|
|
3
|
+
# timestamp: 2025-05-19T21:42:39+00:00
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations

import sys
from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, ConfigDict, Field

from flexeval.schema import schema_utils
|
|
13
|
+
|
|
14
|
+
# Granularity levels at which a metric can be computed: as a static type
# (MetricLevel) and as a plain list for runtime checks (VALID_METRIC_LEVELS).
MetricLevel = Literal["Message", "Turn", "Thread", "ToolCall"]
VALID_METRIC_LEVELS = ["Message", "Turn", "Thread", "ToolCall"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DependsOnItem(BaseModel):
    # One dependency of a metric on another (function or rubric) metric.
    # A dependency is identified by name/type/kwargs/metric_name/metric_level
    # and is considered satisfied when the referenced metric's value lies in
    # [metric_min_value, metric_max_value] (see the field descriptions).

    # Reject unknown keys. Declared through ``model_config`` because the
    # class-based ``Config`` form is deprecated in Pydantic v2.
    model_config = ConfigDict(extra="forbid")

    name: Optional[str] = Field(
        None, description="Name of the dependency function or rubric."
    )
    type: Optional[Literal["function", "rubric"]] = Field(
        None,
        description="One of 'function' or 'rubric' indicating the type of the dependency.",
    )
    kwargs: Optional[Dict[str, Any]] = Field(
        None,
        description="The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
    )
    metric_name: Optional[str] = Field(
        None,
        description="Name of the metric dependency. This may be different than function_name if the metric function returns a key/value pair - in which case, this will match the key.",
    )
    metric_level: Optional[MetricLevel] = Field(
        None,
        description="The level of the metric to depend on, which must be equal to or 'greater' than the dependent metric's level. e.g. a Turn can depend on a Thread metric, but not the reverse.",
    )
    # Only 0 or negative offsets are accepted (le=0), and strict=True disables
    # type coercion so e.g. the string "-1" is rejected.
    relative_object_position: int = Field(
        0,
        le=0,
        strict=True,
        description="The position of the object within the Thread. If 0 (default), this is the metric value for the current object. If -1, this is the metric value for the most recent object before this one.",
    )
    # TODO we could consider adding an absolute_object_position
    # The defaults span the whole finite float range, i.e. no bound is applied
    # unless the user sets one.
    metric_min_value: Optional[float] = Field(
        -sys.float_info.max,
        description="Minimum value of the dependency to consider it as satisfied.",
    )
    metric_max_value: Optional[float] = Field(
        sys.float_info.max,
        description="Maximum value of the dependency to consider it as satisfied.",
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class MetricItem(BaseModel):
    # Fields common to every metric entry: what to compute (name), what it
    # depends on, and the granularity at which it is applied.
    name: str = Field(
        description="The function to call or name of rubric to use to compute this metric.",
    )
    depends_on: Optional[List[DependsOnItem]] = Field(
        description="List of dependencies that must be satisfied for this metric to be computed.",
        default_factory=list,
    )
    # TODO why is metric_level optional? Should likely make it required
    metric_level: Optional[MetricLevel] = Field(
        default="Turn",
        description="What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class FunctionItem(MetricItem):
    # A metric entry that carries keyword arguments for a function.
    # ``schema_utils.OptionalDict`` is the annotated dict type used for them.
    kwargs: schema_utils.OptionalDict = Field(
        description="Keyword arguments for the function. Each key must correspond to an argument in the function. Extra keys will cause an error.",
        default_factory=dict,
    )
    # TODO add the ability to provide a function source: Path | FunctionsCollection | schema_utils.ModuleType
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class RubricItem(MetricItem):
    # A metric entry that carries keyword arguments for a rubric evaluation.
    # TODO is RubricItem.kwargs actually used?
    kwargs: Optional[Dict[str, Any]] = Field(
        description="Keyword arguments for the rubric evaluation.",
        default_factory=dict,
    )
    # TODO add the ability to provide a rubric source: Path | RubricsCollection
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class Metrics(BaseModel):
    """Defines the metrics to be evaluated."""

    # Both lists are optional and default to None (nothing of that kind).
    function: Optional[List[FunctionItem]] = Field(
        default=None,
        description="List of function-based metrics to be evaluated.",
    )
    rubric: Optional[List[RubricItem]] = Field(
        default=None,
        description="List of rubrics to be evaluated.",
    )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class CompletionLlm(BaseModel):
    # Names the completion function used to generate new completions, plus
    # the keyword arguments to pass to it.

    # Reject unknown keys. Declared through ``model_config`` because the
    # class-based ``Config`` form is deprecated in Pydantic v2.
    model_config = ConfigDict(extra="forbid")

    function_name: str = Field(
        ...,
        description="Completion function defined in `completion_functions.py` or available in the global namespace.",
    )
    include_system_prompt: bool = True
    kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
    )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class GraderLlm(BaseModel):
    # Names the function used to grade rubrics, plus the keyword arguments to
    # pass to it.

    # Reject unknown keys. Declared through ``model_config`` because the
    # class-based ``Config`` form is deprecated in Pydantic v2.
    model_config = ConfigDict(extra="forbid")

    function_name: str = Field(
        ...,
        description="Function defined in `completion_functions.py`. We're not really completing a conversation, but we ARE asking an LLM to provide a response to an input - in this case, the rubric.",
    )
    kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
    )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class Eval(BaseModel):
    """Defines the evaluation that should be executed.

    The key fields are :attr:`metrics` and :attr:`grader_llm`.
    """

    # Unknown keys are currently accepted and kept on the model. Declared
    # through ``model_config`` because the class-based ``Config`` form is
    # deprecated in Pydantic v2.
    # TODO don't permit additional fields in Eval
    model_config = ConfigDict(extra="allow")

    do_completion: bool = Field(
        False,
        description="Flag to determine if completions should be done in each thread. Set to 'true' if you are testing a new API and want to evaluate the API responses. Set to 'false' (default) if you are evaluating past conversations and do not need to generate new completions.",
    )
    name: Optional[str] = Field(
        None,
        description="Name of the test suite. Used as metadata only. Does not need to match the key of the entry in the evals.yaml file.",
    )
    notes: str = Field(
        "",
        description="Additional notes regarding the configuration. Used as metadata only.",
    )
    metrics: Metrics = Field(
        default_factory=Metrics, description="Metrics to use in the evaluation."
    )
    # The description names the real field (``do_completion``): because extra
    # keys are allowed, a misspelled ``do_completions`` would be accepted
    # silently instead of enabling completions.
    completion_llm: Optional[CompletionLlm] = Field(
        None,
        description="Specification of the LLM or API used to perform new completions. Must be defined if `do_completion: true` is set.",
    )
    grader_llm: Optional[GraderLlm] = Field(
        None,
        description="Specification of the LLM or API used to grade rubrics. Must be defined if any rubric_metrics are specified.",
    )
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""The top-level :class:`~flexeval.schema.evalrun_schema.EvalRun` schema and associated sub-schema."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Annotated, Callable, Iterable, Literal
|
|
5
|
+
|
|
6
|
+
from annotated_types import Len
|
|
7
|
+
from pydantic import BaseModel, Field, FilePath
|
|
8
|
+
|
|
9
|
+
from flexeval.configuration import function_metrics
|
|
10
|
+
from flexeval.schema import config_schema, eval_schema, rubric_schema, schema_utils
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DataSource(BaseModel):
    # Base class for data sources; carries only optional descriptive metadata.
    # TODO support more generic DataSource interface
    # for now, we need to use FileDataSource because we pass the JSONL paths along
    name: str | None = Field(default=None, description="")
    notes: str | None = Field(default=None, description="")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class IterableDataSource(DataSource):
    """Not yet implemented."""

    # Defaults to a fresh empty list per instance.
    contents: Iterable = Field(
        description="Iterable of data items, presumably in the jsonl format (for now).",
        default_factory=list,
    )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class FileDataSource(DataSource):
    """File to be used as a data source."""

    # TODO in the future, we could use cloudpathlib to support cloud paths
    path: FilePath = Field(
        description="Absolute or relative path to data file. Each file must be in jsonl format, with one conversation per line.",
    )
    # "jsonl" is the only accepted value.
    format: Literal["jsonl"] = Field(
        default="jsonl",
        description="Format of the data file.",
    )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class FunctionsCollection(BaseModel):
    # Raw docstring: the reST plural marker ``\s`` is not a valid Python
    # escape and triggers a SyntaxWarning in a normal string literal.
    r"""Collection of functions that can be used as :class:`~flexeval.schema.eval_schema.FunctionItem`\s."""

    # Defaults to a fresh empty list per instance.
    functions: list[Callable] = Field(
        default_factory=list,
        description="Callables that can be used as functions for evaluation.",
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_default_rubrics() -> list[Path | rubric_schema.RubricsCollection]:
    """Return a one-element list holding the default rubric collection."""
    # Function-scope import, kept as in the original
    # (NOTE(review): presumably avoids a circular import - confirm).
    from flexeval import rubric

    default_collection = rubric.get_default_rubric_collection()
    return [default_collection]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_default_function_metrics() -> list[
    Path | FunctionsCollection | schema_utils.ModuleType
]:
    """Return a one-element list holding the built-in ``function_metrics`` module."""
    default_sources = [function_metrics]
    return default_sources
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class EvalRun(BaseModel):
    """EvalRun defines the schema that FlexEval expects.

    At a minimum, you need to provide a set of input data sources and an :class:`~flexeval.schema.eval_schema.Eval`.

    You can evaluate an EvalRun using :func:`~flexeval.runner.run`.

    Read more in the :ref:`user_guide`."""

    # Required; must contain at least one entry (Len(min_length=1)).
    data_sources: Annotated[list[FileDataSource], Len(min_length=1)] = Field(
        description="List of data sources.",
    )
    database_path: Path = Field(
        default=Path("flexeval/results/results.db"),
        description="Output database path.",
    )
    # Required: no default is supplied.
    eval: eval_schema.Eval = Field(
        description="The evaluation to apply to the data sources.",
    )
    config: config_schema.Config = Field(
        description="Configuration details.",
        default_factory=config_schema.Config,
    )
    # Defaults to the list returned by get_default_rubrics().
    rubric_paths: list[Path | rubric_schema.RubricsCollection] = Field(
        description="Additional sources for rubrics. If a Path, should be a YAML file in the expected format.",
        default_factory=get_default_rubrics,
    )
    # Defaults to the list returned by get_default_function_metrics().
    function_modules: list[
        FilePath | FunctionsCollection | schema_utils.ModuleType
    ] = Field(
        description="Additional sources for functions.",
        default_factory=get_default_function_metrics,
    )
    add_default_functions: bool = Field(
        default=True,
        description="If the default functions at :mod:`flexeval.configuration.function_metrics` should be made available.",
    )
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field, field_validator
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Rubric(BaseModel):
    # A rubric: a prompt (which may contain placeholders such as
    # ``{conversation}``) and a mapping from choice label to numeric score.
    prompt: str = Field(description="Prompt for the rubric.")
    choice_scores: dict[str, int | float] = Field(
        default_factory=dict, description="Choices."
    )
    name: str | None = Field(None, description="Optional name of the rubric.")
    notes: str | None = Field(None, description="Optional notes.")

    @field_validator("prompt")
    @classmethod
    def is_rubric_prompt_valid(cls, prompt: str) -> str:
        """Reject prompts that combine mutually exclusive placeholders.

        Args:
            prompt: The rubric prompt text.

        Returns:
            The prompt, unchanged.

        Raises:
            ValueError: If the prompt contains both ``{conversation}`` and
                ``{context}``, or both ``{completion}`` and ``{content}``.
        """
        if "{conversation}" in prompt and "{context}" in prompt:
            raise ValueError(
                "Your rubric should not have both {conversation} and {context}. Please check the README file for more information about how to write FlexEval rubrics."
            )

        if "{completion}" in prompt and "{content}" in prompt:
            raise ValueError(
                "Your rubric should not have both {content} and {completion}. Please check the README file for more information about how to write FlexEval rubrics."
            )
        return prompt

    @field_validator("choice_scores")
    @classmethod
    def check_non_empty(cls, v: dict[str, int | float]) -> dict[str, int | float]:
        """Reject an explicitly provided empty ``choice_scores`` mapping.

        Raises:
            ValueError: If ``v`` is empty.
        """
        # The check only rejects an empty mapping, so the message states
        # exactly that (it previously claimed a minimum of two).
        if not v:
            raise ValueError("Must provide at least one choice score.")
        return v
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class RubricsCollection(BaseModel):
    # Raw docstring: the reST plural marker ``\s`` is not a valid Python
    # escape and triggers a SyntaxWarning in a normal string literal.
    r"""Collection of rubrics that can be used as :class:`~flexeval.schema.eval_schema.RubricItem`\s."""

    # Defaults to a fresh empty dict per instance.
    rubrics: dict[str, Rubric] = Field(
        default_factory=dict,
        description="Mapping of rubric names to Rubrics. The rubric names are used for matching metrics to specific rubrics.",
    )
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import types
|
|
2
|
+
from typing import Annotated, Any
|
|
3
|
+
|
|
4
|
+
from pydantic import BeforeValidator, PlainSerializer, PlainValidator
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def validate_python_module(value: Any) -> Any:
    """Return ``value`` unchanged if it is a module object.

    Raises:
        ValueError: If ``value`` is not an instance of ``types.ModuleType``.
    """
    if isinstance(value, types.ModuleType):
        return value
    raise ValueError(f"Expected a module, got a '{type(value)}'.")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Annotated type for model fields that hold a Python module: validated with
# validate_python_module and serialized as the module's ``__name__``.
ModuleType = Annotated[
    types.ModuleType,
    PlainValidator(validate_python_module),
    PlainSerializer(lambda module: str(module.__name__)),
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def convert_none_or_empty_string_to_dict(value: Any):
    """Map ``None`` or a blank string to an empty dict.

    Any other value, including a non-blank string, is returned unchanged.
    """
    if value is None:
        return {}
    if isinstance(value, str) and not value.strip():
        # Empty or whitespace-only string.
        return {}
    return value
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# dict type whose input is first passed through
# convert_none_or_empty_string_to_dict, so None and blank strings become {}.
OptionalDict = Annotated[
    dict,
    BeforeValidator(convert_none_or_empty_string_to_dict),
]
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: python-flexeval
|
|
3
|
+
Version: 0.1.5
|
|
4
|
+
Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
|
|
5
|
+
Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
|
|
6
|
+
Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
|
|
7
|
+
Project-URL: Issues, https://github.com/DigitalHarborFoundation/FlexEval/issues
|
|
8
|
+
Author: S. Thomas Christie, Zachary Levonian, Baptiste Moreau-Pernet, Anna Rafferty, Terry Yu Tian
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: conversation,education,evaluation,large language models,learning engineering
|
|
12
|
+
Classifier: Intended Audience :: Education
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Natural Language :: English
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Requires-Dist: flatten-json>=0.1.14
|
|
21
|
+
Requires-Dist: jsonschema>=4.23.0
|
|
22
|
+
Requires-Dist: langchain-openai>=0.3.8
|
|
23
|
+
Requires-Dist: langchain>=0.3.20
|
|
24
|
+
Requires-Dist: langgraph-checkpoint-sqlite>=2.0.6
|
|
25
|
+
Requires-Dist: langgraph>=0.3.6
|
|
26
|
+
Requires-Dist: litellm>=1.74.3
|
|
27
|
+
Requires-Dist: msgpack>=1.1.0
|
|
28
|
+
Requires-Dist: networkx>=3.4.2
|
|
29
|
+
Requires-Dist: openai>=1.66.0
|
|
30
|
+
Requires-Dist: pandas>=2.2.3
|
|
31
|
+
Requires-Dist: peewee>=3.17.9
|
|
32
|
+
Requires-Dist: pydantic>=2.10.6
|
|
33
|
+
Requires-Dist: python-dotenv>=1.0.1
|
|
34
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
35
|
+
Requires-Dist: requests>=2.32.3
|
|
36
|
+
Requires-Dist: sympy>=1.13.3
|
|
37
|
+
Requires-Dist: textstat>=0.7.5
|
|
38
|
+
Requires-Dist: typer>=0.16.0
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
|
|
41
|
+
# FlexEval LLM Evals
|
|
42
|
+
|
|
43
|
+
[DOI: 10.5281/zenodo.12729993](https://doi.org/10.5281/zenodo.12729993)
|
|
44
|
+
[License: MIT](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/LICENSE)
|
|
45
|
+
|
|
46
|
+

|
|
47
|
+
|
|
48
|
+
FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
|
|
49
|
+
|
|
50
|
+
**Documentation:** <https://digitalharborfoundation.github.io/FlexEval>
|
|
51
|
+
|
|
52
|
+
Additional details about FlexEval can be found [in our paper](https://doi.org/10.5281/zenodo.12729993) at the _Educational Data Mining_ 2024 conference.
|
|
53
|
+
|
|
54
|
+
## Usage
|
|
55
|
+
|
|
56
|
+
Basic usage:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
import flexeval
|
|
60
|
+
from flexeval.schema import Eval, EvalRun, FileDataSource, Metrics, FunctionItem, Config
|
|
61
|
+
|
|
62
|
+
data_sources = [FileDataSource(path="vignettes/conversations.jsonl")]
|
|
63
|
+
eval = Eval(metrics=Metrics(function=[FunctionItem(name="flesch_reading_ease")]))
|
|
64
|
+
config = Config(clear_tables=True)
|
|
65
|
+
eval_run = EvalRun(
|
|
66
|
+
data_sources=data_sources,
|
|
67
|
+
database_path="eval_results.db",
|
|
68
|
+
eval=eval,
|
|
69
|
+
config=config,
|
|
70
|
+
)
|
|
71
|
+
flexeval.run(eval_run)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
This example computes [Flesch reading ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease) for every turn in a list of conversations provided in JSONL format. The metric values are stored in an SQLite database called `eval_results.db`.
|
|
75
|
+
|
|
76
|
+
See additional usage examples in the [vignettes](https://github.com/DigitalHarborFoundation/FlexEval/tree/main/vignettes).
|
|
77
|
+
|
|
78
|
+
## Installation
|
|
79
|
+
|
|
80
|
+
FlexEval is on PyPI as [`python-flexeval`](https://pypi.org/p/python-flexeval). See the [Installation](https://digitalharborfoundation.github.io/FlexEval/getting_started.html#Installation) section in the [Getting Started](https://digitalharborfoundation.github.io/FlexEval/getting_started.html) guide.
|
|
81
|
+
|
|
82
|
+
Using `pip`:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pip install python-flexeval
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Basic functionality
|
|
89
|
+
|
|
90
|
+
FlexEval is designed to be "batteries included" for many basic use cases. It supports the following out-of-the-box:
|
|
91
|
+
|
|
92
|
+
- scoring historical conversations - useful for monitoring live systems.
|
|
93
|
+
- scoring LLMs:
|
|
94
|
+
- locally hosted and served via an endpoint using something like [LM Studio](https://lmstudio.ai)
|
|
95
|
+
- LLMs exposed through a REST endpoint and reachable via a network call
|
|
96
|
+
- any OpenAI LLM
|
|
97
|
+
- a set of useful rubrics
|
|
98
|
+
- a set of useful Python functions
|
|
99
|
+
|
|
100
|
+
Evaluation results are saved in an SQLite database. See the [Metric Analysis](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/vignettes/metric_analysis.ipynb) vignette for a sample analysis demonstrating the structure and utility of the data saved by FlexEval.
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
Read more in the [Getting Started](https://digitalharborfoundation.github.io/FlexEval/getting_started.html) guide.
|
|
104
|
+
|
|
105
|
+
## Cite this work
|
|
106
|
+
|
|
107
|
+
If this work is useful to you, please cite [our EDM 2024 paper](https://educationaldatamining.org/edm2024/proceedings/2024.EDM-posters.107/2024.EDM-posters.107.pdf):
|
|
108
|
+
|
|
109
|
+
>S. Thomas Christie, Baptiste Moreau-Pernet, Yu Tian, & John Whitmer. (2024). FlexEval: a customizable tool for chatbot performance evaluation and dialogue analysis. _Proceedings of the 17th International Conference on Educational Data Mining_, 903-908. Atlanta, Georgia, USA, July 2024. <https://doi.org/10.5281/zenodo.12729993>
|
|
110
|
+
|
|
111
|
+
## Development
|
|
112
|
+
|
|
113
|
+
Pull requests are welcome. Feel free to contribute:
|
|
114
|
+
- New rubrics or functions
|
|
115
|
+
- Bug fixes
|
|
116
|
+
- New features
|
|
117
|
+
|
|
118
|
+
See [DEVELOPMENT.md](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/DEVELOPMENT.md).
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
flexeval/__init__.py,sha256=FIVIg06yxMU_RHPpx22QtX94hnS8Ce7gCjOcQ2pECMc,337
|
|
2
|
+
flexeval/__main__.py,sha256=c9NQqsea3e-_6b736gBeIO3O_zdXQ1wtY3-Scj5NiPg,126
|
|
3
|
+
flexeval/cli.py,sha256=RwtRk121OivbLQyYpYxJ7PugPIYQ8J4qXHFN2SxxPy4,2985
|
|
4
|
+
flexeval/completions.py,sha256=pi_tYK4m3vKSqAC1ym9Jc3e4srcQSXfx-mX4qI5qisQ,5686
|
|
5
|
+
flexeval/compute_metrics.py,sha256=elQZvuh2jyateWzwIPm8RLHASq-XqFMinEIA0rlMkj8,37277
|
|
6
|
+
flexeval/config.yaml,sha256=dpkFdW0rKf7StGoVeIGaCNw0n0yOfYWig0xmIfsDdbg,530
|
|
7
|
+
flexeval/data_loader.py,sha256=EKc6wdpQuhrB2ai2U_fQxojzt1RR716ELisiZXpfu58,25311
|
|
8
|
+
flexeval/db_utils.py,sha256=FKekqWAZ0oQbYNvw0bxuzHcZxlSsKKJkUhyfod-pMLg,1412
|
|
9
|
+
flexeval/dependency_graph.py,sha256=SaG9gjkw2Q0NykqQWs4JzPkv5sMj2aXXmhjJ7yRkV4Q,10539
|
|
10
|
+
flexeval/eval_schema.json,sha256=BQetj8O0_4rorj3Mpqk-sj_SCaRkGMrvBUcxhuw6zLE,13111
|
|
11
|
+
flexeval/function_types.py,sha256=eH8NadQRw7XAOXAOKWYN6b7urjr57J5WzdiVyzh0Wb4,6898
|
|
12
|
+
flexeval/helpers.py,sha256=gX-6Hx4_wOiqbfY8c8_kL3XbkdV8mpEjPmaAe44lOSk,1605
|
|
13
|
+
flexeval/log_utils.py,sha256=E3RloPQZbtd8sEIg7mfN5fAku-TeNGqWy03SmwRllIE,923
|
|
14
|
+
flexeval/rubric.py,sha256=UwtJOxIxFJcQVrDXXuCA3tF_FFTcvLPqo2F9lq8gPcM,2167
|
|
15
|
+
flexeval/run_utils.py,sha256=cNFVRsFNYY9gpzbIUc-H4Gk7TWC64GXsYowQHoG7ZVU,2597
|
|
16
|
+
flexeval/runner.py,sha256=X6ZfjfwIM3ymN_kHfRt_JSKPxpDxs_MWQPrvWhl2L7I,4340
|
|
17
|
+
flexeval/classes/__init__.py,sha256=fywDMYX8W-nXFKRXolzn-RWd_7tiJr6FlouQJvYSoyE,347
|
|
18
|
+
flexeval/classes/base.py,sha256=xxkTa8joPe39CFwveeTPW56LW-x7rsi5oBAIxrvM5iI,944
|
|
19
|
+
flexeval/classes/dataset.py,sha256=Y_EdEIuhx526SSvkqk2tFBzkOgBkVY-5FeraYMtU5lo,2913
|
|
20
|
+
flexeval/classes/eval_runner.py,sha256=-jkPlKhTWX0FpUDrzCaUIlIIlKsSAmDy06T4I1aB3Ds,6269
|
|
21
|
+
flexeval/classes/eval_set_run.py,sha256=fq_wBOaxuq7dLxiZIw76WGIwhRBNbQWDUhpiK0wDG_A,1116
|
|
22
|
+
flexeval/classes/message.py,sha256=zuDm_v1gmK49Fw5m-HTWiqndrI_xtLotlXD8nhRDDTg,7518
|
|
23
|
+
flexeval/classes/metric.py,sha256=d8l39_QwnQDmTJvy9TIulU4p0jqD7ldMUi4m5zfK2Es,2806
|
|
24
|
+
flexeval/classes/thread.py,sha256=LchsK9mmrY4K-zSTMAAmywlzPVwnpZ7rOHqBGPIlda8,2779
|
|
25
|
+
flexeval/classes/tool_call.py,sha256=CteT2Hajor0PlHEEn7apfZux5_mremSIDrQmZ0iB7K0,1748
|
|
26
|
+
flexeval/classes/turn.py,sha256=kLmgnYQ-4a8sydzGK1HTQRyUDXZIedmt_NFR3shLJFE,8635
|
|
27
|
+
flexeval/configuration/__init__.py,sha256=wP_gpYyaEp5DxCSH8-4KHchH07JMZZOk8eCFMfd5LBw,75
|
|
28
|
+
flexeval/configuration/completion_functions.py,sha256=-N0iFAfcYcm35S78M3ES4MBkLXpDeEfy2Qq1ORHGBXE,7491
|
|
29
|
+
flexeval/configuration/evals.yaml,sha256=3mbD3gEccTDotm8kj4doYTujqRD_PkGhCVhjQaSEqSs,22651
|
|
30
|
+
flexeval/configuration/function_metrics.py,sha256=UqCCl_xoG6kH6jRset0m1FQoAfUrqt9bqipxAshN5_A,22419
|
|
31
|
+
flexeval/configuration/rubric_metrics.yaml,sha256=JfE6gPj4LtM2v0b5-Zge3NwM17YgJEBZXzTVn9UL7zk,9424
|
|
32
|
+
flexeval/io/__init__.py,sha256=MqdgcPzkFpSnOEz-e2GNNd8XOI_DbyNjIP8AT5eqUqI,101
|
|
33
|
+
flexeval/io/parsers/yaml_parser.py,sha256=2yE6j_RM_YG5nkNUWZckrymh61n28AG46lqnPSlWitk,1818
|
|
34
|
+
flexeval/metrics/__init__.py,sha256=zBg-thOos5X1-YUH70PkdMqFnPdsrTM0Bt3fIjhfxDM,131
|
|
35
|
+
flexeval/metrics/access.py,sha256=U-IhG_dhC8HZ9BMnBKHiEvHretUuAnzuUWJ288XuPiA,681
|
|
36
|
+
flexeval/metrics/save.py,sha256=8x9ifRiHtQT7_WeMP0XmYK1zfourXMnHkGZy_iR0Xcc,1643
|
|
37
|
+
flexeval/schema/__init__.py,sha256=4OA6Q7Dguz-uaulwoRsrtaoReFmyNsKqyi_CvfDV4-c,379
|
|
38
|
+
flexeval/schema/config_schema.py,sha256=LkmtiOLfPsX1u_6Ey6gFbRr8tQwxqcuLcyf-xYcBf9o,1619
|
|
39
|
+
flexeval/schema/eval_schema.py,sha256=95kCkiGS67TfpVUfUaBdBMoKIpUJoY1beUgLWwg5Ljk,6373
|
|
40
|
+
flexeval/schema/evalrun_schema.py,sha256=LE6RmNHeRJIRye68xUMOaknWMNLcugfnQoUEkeP1JRs,3526
|
|
41
|
+
flexeval/schema/rubric_schema.py,sha256=9DaqU-Av6XMig7iIy3EObLhEkhjtYIxeCqpovKLYfYw,1615
|
|
42
|
+
flexeval/schema/schema_utils.py,sha256=Fg1foqRA-9X-hl_vqIF3bpYdE51hNEgdw739Q-s3iQc,698
|
|
43
|
+
python_flexeval-0.1.5.dist-info/METADATA,sha256=LPvBmYMMKpyxgStPchWxj1fhBYoNbbdb7-UgQX2b4CY,5095
|
|
44
|
+
python_flexeval-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
45
|
+
python_flexeval-0.1.5.dist-info/entry_points.txt,sha256=wSyluqXhrX3xySVYAtM-Kv23p4OauKQCSBuNNfzEGtI,52
|
|
46
|
+
python_flexeval-0.1.5.dist-info/licenses/LICENSE,sha256=OlAu_c13gw6-fJ9UdhZBMeNr5STLrnWG_0Hv0SCXtu4,1082
|
|
47
|
+
python_flexeval-0.1.5.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Digital Harbor Foundation
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|