python-flexeval 0.1.5__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.gitignore +1 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/Makefile +11 -1
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/PKG-INFO +7 -5
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/README.md +6 -4
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/conf.py +8 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/index.rst +9 -4
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/vignettes.py +13 -16
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/pyproject.toml +5 -1
- python_flexeval-0.3.0/src/flexeval/__about__.py +1 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/__init__.py +2 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/eval_runner.py +1 -1
- python_flexeval-0.3.0/src/flexeval/classes/jsonview.py +107 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/message.py +5 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/thread.py +4 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/compute_metrics.py +1 -1
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/function_metrics.py +2 -2
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/data_loader.py +26 -11
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/db_utils.py +8 -1
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/metrics/__init__.py +1 -1
- python_flexeval-0.3.0/src/flexeval/metrics/access.py +50 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/eval_schema.py +10 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/evalrun_schema.py +1 -1
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/rubric_schema.py +1 -1
- python_flexeval-0.3.0/tests/data/simple_metadata.jsonl +2 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_data_loader.py +33 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/uv.lock +1835 -1245
- python_flexeval-0.1.5/src/flexeval/metrics/access.py +0 -28
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.env-example +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.github/dependabot.yml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.github/workflows/deploy-to-pypi.yml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.github/workflows/github-pages.yml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.github/workflows/validate.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.pre-commit-config.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.python-version +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.vscode/settings.json +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/CITATION.bib +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/CITATION.cff +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/CLAUDE.md +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/DEVELOPMENT.md +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/Dockerfile +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/EDM_2024_FlexEval.pdf +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/LICENSE +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/data/metabase/.gitkeep +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docker-compose.yml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/_static/flexeval_banner.svg +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/_static/flexeval_favicon.svg +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/_static/flexeval_logo.png +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/_static/flexeval_logo2.png +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/_templates/footer.html +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/api.rst +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/getting_started.rst +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/sphinxext/__init__.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/sphinxext/github.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/abstractions.rst +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/cli.rst +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/index.rst +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/logging.rst +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/motivation.md +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/rubric_guide.md +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/vignettes.rst +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/example_project/example_specific_rubrics.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/logs/.gitkeep +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/make.bat +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/ruff.toml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/__main__.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/__init__.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/base.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/dataset.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/eval_set_run.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/metric.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/tool_call.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/turn.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/cli.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/completions.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/config.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/__init__.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/completion_functions.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/evals.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/rubric_metrics.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/dependency_graph.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/eval_schema.json +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/function_types.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/helpers.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/io/__init__.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/io/parsers/yaml_parser.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/log_utils.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/metrics/save.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/rubric.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/run_utils.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/runner.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/__init__.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/config_schema.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/schema_utils.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/metabase/Dockerfile +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/__init__.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/data/multiturn.jsonl +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/data/plot-convos.jsonl +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/data/simple.jsonl +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/data/simple_nosystem.jsonl +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/__init__.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/config-tests.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/data/multiturn.jsonl +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/data/plot-convos.jsonl +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/data/simple.jsonl +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/evals.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/functional_tests.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/langgraph_data.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/function_metric.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/functional_config.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/functional_evals.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/test_config.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/test_dataset.jsonl +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/test_evals.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/test_rubric_metrics.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/unittest.env +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/__init__.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/io/test_yaml_parser.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/mixins.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_completions.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_compute_metrics.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_db_utils.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_dependency_graph.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_eval_runner.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_function_metrics.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_function_types.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_functional.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_rubric.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_schema.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/.gitignore +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/basic.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/basic_cli.md +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/basic_rubric.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/conversations.jsonl +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/custom_functions.py +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/custom_rubric.md +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/custom_rubrics.yaml +0 -0
- {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/eval_run.yaml +0 -0
|
@@ -6,7 +6,7 @@ SPHINXBUILD ?= uv run sphinx-build
|
|
|
6
6
|
SOURCEDIR = docs
|
|
7
7
|
BUILDDIR = build
|
|
8
8
|
|
|
9
|
-
.PHONY: dochelp docautobuild docclean unittest Makefile
|
|
9
|
+
.PHONY: dochelp docautobuild docclean unittest get-version set-version Makefile
|
|
10
10
|
|
|
11
11
|
# Put it first so that "make" without argument is like "make dochelp".
|
|
12
12
|
dochelp:
|
|
@@ -23,6 +23,16 @@ docclean:
|
|
|
23
23
|
unittest:
|
|
24
24
|
@uv run python -m unittest discover -s tests.unit
|
|
25
25
|
|
|
26
|
+
# see: https://hatch.pypa.io/1.13/version/
|
|
27
|
+
get-version:
|
|
28
|
+
@uv run hatch version
|
|
29
|
+
|
|
30
|
+
set-version:
|
|
31
|
+
@if [ -z "$(VERSION)" ]; then \
|
|
32
|
+
echo "VERSION is not set. Usage: make set-version VERSION=x.y.z"; \
|
|
33
|
+
exit 1; \
|
|
34
|
+
fi
|
|
35
|
+
@uv run hatch version "$(VERSION)"
|
|
26
36
|
|
|
27
37
|
# Catch-all target: route all unknown targets to Sphinx using the new
|
|
28
38
|
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: python-flexeval
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
|
|
5
5
|
Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
|
|
6
6
|
Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
|
|
@@ -40,10 +40,12 @@ Description-Content-Type: text/markdown
|
|
|
40
40
|
|
|
41
41
|
# FlexEval LLM Evals
|
|
42
42
|
|
|
43
|
+
[](https://pypi.org/project/python-flexeval/)
|
|
43
44
|
[](https://doi.org/10.5281/zenodo.12729993)
|
|
44
45
|
[](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/LICENSE)
|
|
46
|
+
[](https://github.com/DigitalHarborFoundation/FlexEval/issues)
|
|
45
47
|
|
|
46
|
-

|
|
48
|
+

|
|
47
49
|
|
|
48
50
|
FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
|
|
49
51
|
|
|
@@ -73,7 +75,7 @@ flexeval.run(eval_run)
|
|
|
73
75
|
|
|
74
76
|
This example computes [Flesch reading ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease) for every turn in a list of conversations provided in JSONL format. The metric values are stored in an SQLite database called `eval_results.db`.
|
|
75
77
|
|
|
76
|
-
See additional usage examples in the [vignettes](/vignettes).
|
|
78
|
+
See additional usage examples in the [vignettes](https://github.com/DigitalHarborFoundation/FlexEval/tree/main/vignettes).
|
|
77
79
|
|
|
78
80
|
## Installation
|
|
79
81
|
|
|
@@ -97,7 +99,7 @@ FlexEval is designed to be "batteries included" for many basic use cases. It sup
|
|
|
97
99
|
- a set of useful rubrics
|
|
98
100
|
- a set of useful Python functions
|
|
99
101
|
|
|
100
|
-
Evaluation results are saved in an SQLite database. See the [Metric Analysis](/vignettes/metric_analysis.
|
|
102
|
+
Evaluation results are saved in an SQLite database. See the [Metric Analysis](https://digitalharborfoundation.github.io/FlexEval/generated/vignettes/metric_analysis.html) vignette for a sample analysis demonstrating the structure and utility of the data saved by FlexEval.
|
|
101
103
|
|
|
102
104
|
|
|
103
105
|
Read more in the [Getting Started](https://digitalharborfoundation.github.io/FlexEval/getting_started.html) guide.
|
|
@@ -115,4 +117,4 @@ Pull requests are welcome. Feel free to contribute:
|
|
|
115
117
|
- Bug fixes
|
|
116
118
|
- New features
|
|
117
119
|
|
|
118
|
-
See [DEVELOPMENT.md](DEVELOPMENT.md).
|
|
120
|
+
See [DEVELOPMENT.md](https://github.com/DigitalHarborFoundation/FlexEval/tree/main/DEVELOPMENT.md).
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
# FlexEval LLM Evals
|
|
2
2
|
|
|
3
|
+
[](https://pypi.org/project/python-flexeval/)
|
|
3
4
|
[](https://doi.org/10.5281/zenodo.12729993)
|
|
4
5
|
[](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/LICENSE)
|
|
6
|
+
[](https://github.com/DigitalHarborFoundation/FlexEval/issues)
|
|
5
7
|
|
|
6
|
-

|
|
8
|
+

|
|
7
9
|
|
|
8
10
|
FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
|
|
9
11
|
|
|
@@ -33,7 +35,7 @@ flexeval.run(eval_run)
|
|
|
33
35
|
|
|
34
36
|
This example computes [Flesch reading ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease) for every turn in a list of conversations provided in JSONL format. The metric values are stored in an SQLite database called `eval_results.db`.
|
|
35
37
|
|
|
36
|
-
See additional usage examples in the [vignettes](/vignettes).
|
|
38
|
+
See additional usage examples in the [vignettes](https://github.com/DigitalHarborFoundation/FlexEval/tree/main/vignettes).
|
|
37
39
|
|
|
38
40
|
## Installation
|
|
39
41
|
|
|
@@ -57,7 +59,7 @@ FlexEval is designed to be "batteries included" for many basic use cases. It sup
|
|
|
57
59
|
- a set of useful rubrics
|
|
58
60
|
- a set of useful Python functions
|
|
59
61
|
|
|
60
|
-
Evaluation results are saved in an SQLite database. See the [Metric Analysis](/vignettes/metric_analysis.
|
|
62
|
+
Evaluation results are saved in an SQLite database. See the [Metric Analysis](https://digitalharborfoundation.github.io/FlexEval/generated/vignettes/metric_analysis.html) vignette for a sample analysis demonstrating the structure and utility of the data saved by FlexEval.
|
|
61
63
|
|
|
62
64
|
|
|
63
65
|
Read more in the [Getting Started](https://digitalharborfoundation.github.io/FlexEval/getting_started.html) guide.
|
|
@@ -75,4 +77,4 @@ Pull requests are welcome. Feel free to contribute:
|
|
|
75
77
|
- Bug fixes
|
|
76
78
|
- New features
|
|
77
79
|
|
|
78
|
-
See [DEVELOPMENT.md](DEVELOPMENT.md).
|
|
80
|
+
See [DEVELOPMENT.md](https://github.com/DigitalHarborFoundation/FlexEval/tree/main/DEVELOPMENT.md).
|
|
@@ -89,6 +89,10 @@ intersphinx_mapping = {
|
|
|
89
89
|
"pydantic": ("https://docs.pydantic.dev/latest", None),
|
|
90
90
|
}
|
|
91
91
|
|
|
92
|
+
docutils_conf = {
|
|
93
|
+
"line-length-limit": None, # Disable docutils line-length limit
|
|
94
|
+
}
|
|
95
|
+
|
|
92
96
|
|
|
93
97
|
def linkcode_resolve(domain, info):
|
|
94
98
|
"""
|
|
@@ -159,6 +163,10 @@ myst_enable_extensions = [
|
|
|
159
163
|
]
|
|
160
164
|
myst_url_schemes = ("http", "https", "mailto")
|
|
161
165
|
|
|
166
|
+
# myst-nb configuration
|
|
167
|
+
nb_execution_mode = "off" # Don't re-execute, use existing outputs
|
|
168
|
+
nb_merge_streams = True
|
|
169
|
+
|
|
162
170
|
autosummary_generate = True
|
|
163
171
|
autodoc_typehints = "signature"
|
|
164
172
|
autodoc_default_options = {
|
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
.. FlexEval documentation master file, created by
|
|
2
|
-
sphinx-quickstart on Thu Jul 3 12:21:33 2025.
|
|
3
|
-
You can adapt this file completely to your liking, but it should at least
|
|
4
|
-
contain the root `toctree` directive.
|
|
1
|
+
.. FlexEval documentation master file, originally created by sphinx-quickstart on 2025 July 3 12:21:33.
|
|
5
2
|
|
|
6
3
|
FlexEval documentation
|
|
7
4
|
======================
|
|
8
5
|
|
|
6
|
+
.. image:: https://img.shields.io/pypi/v/python-flexeval
|
|
7
|
+
:target: https://pypi.org/project/python-flexeval/
|
|
8
|
+
:alt: PyPI
|
|
9
|
+
|
|
9
10
|
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.12729993.svg
|
|
10
11
|
:target: https://doi.org/10.5281/zenodo.12729993
|
|
11
12
|
:alt: Zenodo DOI
|
|
@@ -14,6 +15,10 @@ FlexEval documentation
|
|
|
14
15
|
:target: https://github.com/DigitalHarborFoundation/FlexEval/blob/main/LICENSE
|
|
15
16
|
:alt: FlexEval license
|
|
16
17
|
|
|
18
|
+
.. image:: https://img.shields.io/badge/issue_tracking-github-blue.svg
|
|
19
|
+
:target: https://github.com/DigitalHarborFoundation/FlexEval/issues
|
|
20
|
+
:alt: Issue tracking on GitHub
|
|
21
|
+
|
|
17
22
|
.. raw:: html
|
|
18
23
|
|
|
19
24
|
<br>
|
|
@@ -44,7 +44,7 @@ def extract_vignette_path_strings(file_contents: str) -> list[str]:
|
|
|
44
44
|
|
|
45
45
|
def write_if_changed(path: Path, old_content: str, new_content: str):
|
|
46
46
|
if path.exists():
|
|
47
|
-
|
|
47
|
+
# check if the file contents are different
|
|
48
48
|
if old_content == new_content:
|
|
49
49
|
return # No change — don't touch file
|
|
50
50
|
path.write_text(new_content)
|
|
@@ -67,33 +67,30 @@ def generate_custom_stubs(app):
|
|
|
67
67
|
|
|
68
68
|
for vignette_file in vignettes_dir.glob("*"):
|
|
69
69
|
stem = vignette_file.stem
|
|
70
|
-
|
|
70
|
+
target_file = output_dir / f"{stem}.rst"
|
|
71
|
+
if vignette_file.suffix == ".ipynb":
|
|
72
|
+
target_file = output_dir / f"{stem}.ipynb"
|
|
71
73
|
|
|
72
74
|
current_contents = ""
|
|
73
|
-
if
|
|
74
|
-
current_contents =
|
|
75
|
+
if target_file.exists():
|
|
76
|
+
current_contents = target_file.read_text()
|
|
75
77
|
|
|
76
78
|
if vignette_file.suffix == ".py":
|
|
77
|
-
generate_python_stub(src_dir, vignette_file,
|
|
79
|
+
generate_python_stub(src_dir, vignette_file, target_file, current_contents)
|
|
78
80
|
elif vignette_file.suffix == ".ipynb":
|
|
79
|
-
generate_ipynb_stub(
|
|
81
|
+
generate_ipynb_stub(vignette_file, target_file, current_contents)
|
|
80
82
|
elif vignette_file.suffix == ".md":
|
|
81
|
-
generate_md_stub(src_dir, vignette_file,
|
|
83
|
+
generate_md_stub(src_dir, vignette_file, target_file, current_contents)
|
|
82
84
|
else:
|
|
83
85
|
logger.info(
|
|
84
86
|
f"Unsupported file type {vignette_file.suffix}; skipping while creating vignette stubs."
|
|
85
87
|
)
|
|
86
88
|
|
|
87
89
|
|
|
88
|
-
def generate_ipynb_stub(
|
|
89
|
-
|
|
90
|
-
)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
.. include:: ../../../{ipynb_file.relative_to(src_dir.parent)}
|
|
94
|
-
:parser: myst_nb.docutils_
|
|
95
|
-
"""
|
|
96
|
-
write_if_changed(rst_file, current_contents, new_contents)
|
|
90
|
+
def generate_ipynb_stub(ipynb_file: Path, target_file: Path, current_contents: str):
    """Mirror a notebook vignette into ``target_file``.

    Unlike the other vignette types, no wrapper stub is generated for
    notebooks: the notebook text is copied as-is.

    Args:
        ipynb_file: Path of the source notebook.
        target_file: Path the notebook text is written to.
        current_contents: Text previously read from ``target_file`` (empty
            string when it did not exist); forwarded to ``write_if_changed``
            so it can decide whether a write is needed.
    """
    write_if_changed(target_file, current_contents, ipynb_file.read_text())
|
|
97
94
|
|
|
98
95
|
|
|
99
96
|
def generate_md_stub(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "python-flexeval"
|
|
3
|
-
|
|
3
|
+
dynamic = ["version"]
|
|
4
4
|
description = "FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name="S. Thomas Christie" },
|
|
@@ -52,6 +52,7 @@ Issues = "https://github.com/DigitalHarborFoundation/FlexEval/issues"
|
|
|
52
52
|
|
|
53
53
|
[dependency-groups]
|
|
54
54
|
dev = [
|
|
55
|
+
"hatch>=1.14.1",
|
|
55
56
|
"jupyter>=1.1.1",
|
|
56
57
|
"matplotlib>=3.10.3",
|
|
57
58
|
"pre-commit>=4.2.0",
|
|
@@ -81,3 +82,6 @@ build-backend = "hatchling.build"
|
|
|
81
82
|
|
|
82
83
|
[tool.hatch.build.targets.wheel]
|
|
83
84
|
packages = ["src/flexeval"]
|
|
85
|
+
|
|
86
|
+
[tool.hatch.version]
|
|
87
|
+
path = "src/flexeval/__about__.py"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.0"
|
|
@@ -111,7 +111,7 @@ class EvalRunner:
|
|
|
111
111
|
|
|
112
112
|
def load_evaluation_settings(self):
|
|
113
113
|
"""This function parses our eval suite and puts it in the data structure we'll need
|
|
114
|
-
for easy use at run-time
|
|
114
|
+
for easy use at run-time.
|
|
115
115
|
"""
|
|
116
116
|
# if the current eval has a 'config' entry, overwrite configuration options with its entries
|
|
117
117
|
if (
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from collections import UserDict
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class JsonViewDict(UserDict):
    """Dictionary view of a JSON text attribute that writes changes back to it.

    The view is populated by decoding the text stored in
    ``getattr(model_instance, text_field_attr_name)``. Every mutating
    operation re-encodes the whole dictionary and assigns the resulting
    string back to that attribute, so the text always mirrors the dictionary.

    Only mutations made through this object are tracked: changing a nested
    value in place (e.g. ``view["tags"].append("x")``) does not trigger a sync.

    Args:
        model_instance: Object that owns the text attribute.
        text_field_attr_name: Name of the attribute holding the JSON text.
        json_dumps_fn: Callable used to encode the dictionary to text.
        json_loads_fn: Callable used to decode the text to a dictionary.
    """

    def __init__(
        self,
        model_instance,
        text_field_attr_name,
        json_dumps_fn=json.dumps,
        json_loads_fn=json.loads,
    ):
        self.model_instance = model_instance
        self.text_field_attr_name = text_field_attr_name
        self.json_dumps_fn = json_dumps_fn
        self.json_loads_fn = json_loads_fn

        text_value = getattr(model_instance, text_field_attr_name)
        # An unset or empty field is treated as an empty mapping rather than
        # being passed to the decoder, which would raise on None / "".
        initial_data = self.json_loads_fn(text_value) if text_value else {}

        # Fill the backing dict directly. UserDict.__init__(data) would route
        # through the overridden update() and rewrite the model field while we
        # are merely reading it.
        super().__init__()
        self.data.update(initial_data)

    def _sync_to_model(self):
        """Encode the current data and store it on the model attribute."""
        # Must *encode* here: the model attribute holds text, not a dict.
        json_str = self.json_dumps_fn(self.data)
        setattr(self.model_instance, self.text_field_attr_name, json_str)

    # Override mutating methods to trigger sync
    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        self._sync_to_model()

    def __delitem__(self, key):
        super().__delitem__(key)
        self._sync_to_model()

    def __ior__(self, other):
        # UserDict implements ``|=`` directly on ``self.data``, bypassing
        # __setitem__, so it needs its own sync.
        result = super().__ior__(other)
        self._sync_to_model()
        return result

    def clear(self):
        # Clear the backing dict in one step so the field is encoded once,
        # not once per removed key.
        self.data.clear()
        self._sync_to_model()

    def pop(self, key, *args):
        result = super().pop(key, *args)
        self._sync_to_model()
        return result

    def popitem(self):
        result = super().popitem()
        self._sync_to_model()
        return result

    def setdefault(self, key, default=None):
        result = super().setdefault(key, default)
        self._sync_to_model()
        return result

    def update(self, *args, **kwargs):
        # Apply the whole update to the backing dict, then encode once.
        self.data.update(*args, **kwargs)
        self._sync_to_model()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class JsonView:
    """Descriptor that provides dict-like access to a JSON text field.

    On first access for an instance, a ``JsonViewDict`` bound to the linked
    text attribute is created and cached on the instance under a private
    attribute name; later accesses return the cached view.

    NOTE(review): the cached view is not refreshed if the linked text
    attribute is reassigned directly after the first access.

    Example:
        class SomeModel(pw.Model):
            some_field = pw.TextField(default="{}")
            some_field_dict = JsonView(text_field_attr_name="some_field")

        m = SomeModel()
        m.some_field_dict["chosen_mistake"] = "whatever"

    Args:
        text_field_attr_name: Name of the attribute holding the JSON text.
    """

    def __init__(self, text_field_attr_name):
        self.text_field_attr_name = text_field_attr_name
        self.attr_name = None

    def __set_name__(self, owner, name):
        """Called when the descriptor is assigned to a class attribute."""
        # Name of the per-instance attribute used to cache the view.
        self.attr_name = f"_{name}_dict"

    def __get__(self, instance, owner):
        """Return the cached view for ``instance``, creating it if needed.

        Accessing the descriptor on the class itself returns the descriptor.

        Raises:
            ValueError: If the linked text attribute does not exist on
                ``instance``.
        """
        if instance is None:
            return self

        # Check if we already have a cached JsonViewDict
        if not hasattr(instance, self.attr_name):
            if not hasattr(instance, self.text_field_attr_name):
                raise ValueError(
                    f"Failed to link this JsonView to field '{self.text_field_attr_name}' because it doesn't exist on this model instance."
                )
            # Cache a new JsonViewDict
            json_dict = JsonViewDict(instance, self.text_field_attr_name)
            setattr(instance, self.attr_name, json_dict)

        return getattr(instance, self.attr_name)

    def __set__(self, instance, value):
        """Replace the entire dict (and therefore the linked text field).

        Args:
            instance: Object owning the linked text attribute.
            value: A ``dict`` (or ``UserDict``, e.g. another view) holding
                the new content.

        Raises:
            ValueError: If ``value`` is not a dictionary.
        """
        if not isinstance(value, (dict, UserDict)):
            raise ValueError(
                f"This JsonView must be a dictionary to set linked field '{self.text_field_attr_name}' correctly."
            )
        # Snapshot first so assigning an instance's own view back to itself
        # cannot be affected by the clear() below.
        new_items = dict(value)
        json_dict = JsonViewDict(instance, self.text_field_attr_name)
        # Drop whatever the field held before: assignment replaces, it does
        # not merge, so stale keys must not survive.
        json_dict.clear()
        json_dict.update(new_items)
        setattr(instance, self.attr_name, json_dict)
|
|
@@ -10,6 +10,7 @@ from flexeval.classes.dataset import Dataset
|
|
|
10
10
|
from flexeval.classes.eval_set_run import EvalSetRun
|
|
11
11
|
from flexeval.classes.thread import Thread
|
|
12
12
|
from flexeval.classes.turn import Turn
|
|
13
|
+
from flexeval.classes.jsonview import JsonView
|
|
13
14
|
from flexeval.configuration import completion_functions
|
|
14
15
|
|
|
15
16
|
logger = logging.getLogger(__name__)
|
|
@@ -34,6 +35,10 @@ class Message(BaseModel):
|
|
|
34
35
|
content = pw.TextField()
|
|
35
36
|
context = pw.TextField(null=True) # Previous messages
|
|
36
37
|
|
|
38
|
+
# metadata
|
|
39
|
+
metadata = pw.TextField(default="{}", null=False)
|
|
40
|
+
metadata_dict = JsonView("metadata")
|
|
41
|
+
|
|
37
42
|
# helpers
|
|
38
43
|
system_prompt = pw.TextField(null=True)
|
|
39
44
|
is_flexeval_completion = pw.BooleanField(null=True)
|
|
@@ -3,6 +3,7 @@ import peewee as pw
|
|
|
3
3
|
from flexeval.classes.base import BaseModel
|
|
4
4
|
from flexeval.classes.dataset import Dataset
|
|
5
5
|
from flexeval.classes.eval_set_run import EvalSetRun
|
|
6
|
+
from flexeval.classes.jsonview import JsonView
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class Thread(BaseModel):
|
|
@@ -20,6 +21,9 @@ class Thread(BaseModel):
|
|
|
20
21
|
|
|
21
22
|
system_prompt = pw.TextField(null=True)
|
|
22
23
|
|
|
24
|
+
metadata = pw.TextField(default="{}", null=False)
|
|
25
|
+
metadata_dict = JsonView("metadata")
|
|
26
|
+
|
|
23
27
|
def __init__(self, **kwargs):
|
|
24
28
|
super().__init__(**kwargs)
|
|
25
29
|
self.metrics_to_evaluate = []
|
|
@@ -42,7 +42,7 @@ class ObjectMetric:
|
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
class MetricGraphBuilder:
|
|
45
|
-
"""Builds :class:`networkx.DiGraph
|
|
45
|
+
"""Builds :class:`networkx.DiGraph` s of :class:`~flexeval.compute_metrics.ObjectMetric` instances that reflect any computational dependencies between them."""
|
|
46
46
|
|
|
47
47
|
def __init__(self):
|
|
48
48
|
# key: tuple(metric_level, metric_id, object_id)
|
{python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/function_metrics.py
RENAMED
|
@@ -122,8 +122,8 @@ def is_role(object: Union[Turn, Message], role: str) -> dict:
|
|
|
122
122
|
and 0 otherwise.
|
|
123
123
|
|
|
124
124
|
Args:
|
|
125
|
-
|
|
126
|
-
|
|
125
|
+
object: the Turn or Message
|
|
126
|
+
role: a string with the desired role to check against
|
|
127
127
|
"""
|
|
128
128
|
return {role: int(object.role == role)}
|
|
129
129
|
|
|
@@ -54,18 +54,13 @@ def load_jsonl(
|
|
|
54
54
|
max(1, nb_evaluations_per_thread)
|
|
55
55
|
): # duplicate stored threads for averaged evaluation results
|
|
56
56
|
if thread_id in selected_thread_ids:
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
eval_run_thread_id=str(thread_id)
|
|
62
|
-
+ "_"
|
|
63
|
-
+ str(thread_eval_run_id),
|
|
64
|
-
)
|
|
57
|
+
thread_json = json.loads(thread)
|
|
58
|
+
# extract any metadata
|
|
59
|
+
thread_metadata = thread_json.copy()
|
|
60
|
+
del thread_metadata["input"]
|
|
65
61
|
|
|
66
|
-
# Context
|
|
67
62
|
context = []
|
|
68
|
-
thread_input =
|
|
63
|
+
thread_input = thread_json["input"]
|
|
69
64
|
|
|
70
65
|
# Get system prompt used in the thread - assuming only 1
|
|
71
66
|
for message in thread_input:
|
|
@@ -78,15 +73,35 @@ def load_jsonl(
|
|
|
78
73
|
# Add the system prompt as context
|
|
79
74
|
context.append({"role": "system", "content": system_prompt})
|
|
80
75
|
|
|
76
|
+
thread_object: Thread = Thread.create(
|
|
77
|
+
evalsetrun=dataset.evalsetrun,
|
|
78
|
+
dataset=dataset,
|
|
79
|
+
jsonl_thread_id=thread_id,
|
|
80
|
+
eval_run_thread_id=str(thread_id)
|
|
81
|
+
+ "_"
|
|
82
|
+
+ str(thread_eval_run_id),
|
|
83
|
+
system_prompt=system_prompt,
|
|
84
|
+
metadata=json.dumps(thread_metadata),
|
|
85
|
+
)
|
|
86
|
+
|
|
81
87
|
# Create messages
|
|
82
88
|
index_in_thread = 0
|
|
83
89
|
for message in thread_input:
|
|
90
|
+
if not isinstance(message, dict):
|
|
91
|
+
raise ValueError(
|
|
92
|
+
f"Can't load unknown object type; expected dict. Check JSONL format: {message}"
|
|
93
|
+
)
|
|
84
94
|
role = message.get("role", None)
|
|
85
95
|
if role != "system":
|
|
86
96
|
# System message shouldn't be added as a separate message
|
|
87
97
|
system_prompt_for_this_message = ""
|
|
88
98
|
if role != "user":
|
|
89
99
|
system_prompt_for_this_message = system_prompt
|
|
100
|
+
message_metadata = message.copy()
|
|
101
|
+
if "content" in message_metadata:
|
|
102
|
+
del message_metadata["content"]
|
|
103
|
+
if "role" in message_metadata:
|
|
104
|
+
del message_metadata["role"]
|
|
90
105
|
Message.create(
|
|
91
106
|
evalsetrun=dataset.evalsetrun,
|
|
92
107
|
dataset=dataset,
|
|
@@ -95,9 +110,9 @@ def load_jsonl(
|
|
|
95
110
|
role=role,
|
|
96
111
|
content=message.get("content", None),
|
|
97
112
|
context=json.dumps(context),
|
|
98
|
-
metadata=message.get("metadata", None),
|
|
99
113
|
is_flexeval_completion=False,
|
|
100
114
|
system_prompt=system_prompt_for_this_message,
|
|
115
|
+
metadata=json.dumps(message_metadata),
|
|
101
116
|
)
|
|
102
117
|
# Update context
|
|
103
118
|
context.append(
|
|
@@ -14,6 +14,11 @@ from flexeval.classes.turn import Turn
|
|
|
14
14
|
DATABASE_TABLES = [EvalSetRun, Dataset, Thread, Turn, Message, ToolCall, Metric]
|
|
15
15
|
|
|
16
16
|
|
|
17
|
+
def ensure_database(database_path: str):
    """Initialize the database unless its connection is already usable.

    Args:
        database_path (str): Path passed through to ``initialize_database``
            when the current connection is not usable.
    """
    if classes_base.database.is_connection_usable():
        # Nothing to do: the existing connection can be used as-is.
        return
    initialize_database(database_path)
|
|
20
|
+
|
|
21
|
+
|
|
17
22
|
def initialize_database(database_path: str, clear_tables: bool = False):
|
|
18
23
|
classes_base.database.init(database_path)
|
|
19
24
|
# classes_base.database.start()
|
|
@@ -34,5 +39,7 @@ def bind_to_database(database_path: str) -> pw.Database:
|
|
|
34
39
|
new_database = classes_base.create_sqlite_database(database_path)
|
|
35
40
|
new_database.bind(DATABASE_TABLES)
|
|
36
41
|
# Verify the binding worked by checking one of the models
|
|
37
|
-
assert classes_base.BaseModel._meta.database == new_database
|
|
42
|
+
assert classes_base.BaseModel._meta.database == new_database, (
|
|
43
|
+
f"Binding to '{database_path}' failed."
|
|
44
|
+
)
|
|
38
45
|
return new_database
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Utility functions for accessing metrics."""
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
|
|
5
|
+
from flexeval.classes import metric, message, turn, thread
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def count_dict_values(lst: list[dict]) -> dict[str, Counter]:
    """Tally how often each value occurs under each key.

    Args:
        lst (list[dict]): The dictionaries to scan.

    Returns:
        dict[str, Counter]: One Counter per key seen in any dict in lst,
            mapping each observed value to its number of occurrences.
    """
    tallies = {}
    for record in lst:
        for key, value in record.items():
            # Create the per-key Counter on first sight, then bump the value.
            tallies.setdefault(key, Counter())[value] += 1
    return tallies
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_all_metrics() -> list[dict]:
    """Return the data of every selected Metric row.

    Returns:
        list[dict]: A copy of the ``__data__`` mapping of each row yielded
            by ``metric.Metric.select()``.
    """
    return [row.__data__.copy() for row in metric.Metric.select()]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_first_user_message_for_threads(thread_ids: set) -> list[dict]:
    """Get the first user message in each thread.

    Selects messages with role "user" whose thread ID is in thread_ids and
    whose joined turn has ``index_in_thread == 0``.

    Args:
        thread_ids (set): The set of thread IDs to retrieve messages for.

    Returns:
        list[dict]: An iterable of messages. The value returned is the
            result of ``.dicts()`` on the query; it is not wrapped in
            ``list()`` here.
    """
    # NOTE(review): the join onto Turn follows a join onto Thread; confirm
    # that it resolves against the intended model.
    query = message.Message.select().join(thread.Thread)
    query = query.where(thread.Thread.id.in_(thread_ids))
    query = query.where(message.Message.role == "user")
    query = query.join(turn.Turn)
    query = query.where(turn.Turn.index_in_thread == 0)
    return query.dicts()
|
|
@@ -16,6 +16,8 @@ MetricLevel = Literal["Message", "Turn", "Thread", "ToolCall"]
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class DependsOnItem(BaseModel):
|
|
19
|
+
"""Defines a metric dependency."""
|
|
20
|
+
|
|
19
21
|
class Config:
|
|
20
22
|
extra = "forbid"
|
|
21
23
|
|
|
@@ -56,6 +58,8 @@ class DependsOnItem(BaseModel):
|
|
|
56
58
|
|
|
57
59
|
|
|
58
60
|
class MetricItem(BaseModel):
|
|
61
|
+
"Defines a metric."
|
|
62
|
+
|
|
59
63
|
name: str = Field(
|
|
60
64
|
...,
|
|
61
65
|
description="The function to call or name of rubric to use to compute this metric.",
|
|
@@ -72,6 +76,8 @@ class MetricItem(BaseModel):
|
|
|
72
76
|
|
|
73
77
|
|
|
74
78
|
class FunctionItem(MetricItem):
|
|
79
|
+
"""Defines a metric computed from a Python function."""
|
|
80
|
+
|
|
75
81
|
kwargs: schema_utils.OptionalDict = Field(
|
|
76
82
|
default_factory=dict,
|
|
77
83
|
description="Keyword arguments for the function. Each key must correspond to an argument in the function. Extra keys will cause an error.",
|
|
@@ -80,6 +86,8 @@ class FunctionItem(MetricItem):
|
|
|
80
86
|
|
|
81
87
|
|
|
82
88
|
class RubricItem(MetricItem):
|
|
89
|
+
"""Defines a metric computed from a rubric."""
|
|
90
|
+
|
|
83
91
|
# TODO is RubricItem.kwargs actually used?
|
|
84
92
|
kwargs: Optional[Dict[str, Any]] = Field(
|
|
85
93
|
default_factory=dict,
|
|
@@ -115,6 +123,8 @@ class CompletionLlm(BaseModel):
|
|
|
115
123
|
|
|
116
124
|
|
|
117
125
|
class GraderLlm(BaseModel):
|
|
126
|
+
"""Defines the LLM used for evaluating rubrics."""
|
|
127
|
+
|
|
118
128
|
class Config:
|
|
119
129
|
extra = "forbid"
|
|
120
130
|
|
|
@@ -37,7 +37,7 @@ class FileDataSource(DataSource):
|
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
class FunctionsCollection(BaseModel):
|
|
40
|
-
"""Collection of functions that can be used as :class:`~flexeval.schema.eval_schema.FunctionItem
|
|
40
|
+
"""Collection of functions that can be used as :class:`~flexeval.schema.eval_schema.FunctionItem` s."""
|
|
41
41
|
|
|
42
42
|
functions: list[Callable] = Field(
|
|
43
43
|
default_factory=list,
|
|
@@ -32,7 +32,7 @@ class Rubric(BaseModel):
|
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
class RubricsCollection(BaseModel):
|
|
35
|
-
"""Collection of rubrics that can be used as :class:`~flexeval.schema.eval_schema.RubricItem
|
|
35
|
+
"""Collection of rubrics that can be used as :class:`~flexeval.schema.eval_schema.RubricItem` s."""
|
|
36
36
|
|
|
37
37
|
rubrics: dict[str, Rubric] = Field(
|
|
38
38
|
default_factory=dict,
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
{"key_1": "value_1", "key_2": {"nested_key": "nested_value"}, "input":[{ "role": "system", "content": "my system prompt" }, {"role":"user", "content": "I need help.", "index": 0}, {"role":"assistant", "content": "Help with what?", "index": 1}, {"role":"user", "content": "My homework.", "index": 2}]}
|
|
2
|
+
{"input": [{ "role": "system", "content": "my system prompt" }, {"role": "user", "content": "Hi, Nice to meet you!"}, {"role": "assistant", "content": "Nice to meet you, too! How can I help you today?"}, {"role": "user", "content": "How do I find cube roots by hand?"}]}
|