python-flexeval 0.1.5__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.gitignore +1 -0
  2. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/Makefile +11 -1
  3. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/PKG-INFO +7 -5
  4. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/README.md +6 -4
  5. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/conf.py +8 -0
  6. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/index.rst +9 -4
  7. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/vignettes.py +13 -16
  8. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/pyproject.toml +5 -1
  9. python_flexeval-0.3.0/src/flexeval/__about__.py +1 -0
  10. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/__init__.py +2 -0
  11. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/eval_runner.py +1 -1
  12. python_flexeval-0.3.0/src/flexeval/classes/jsonview.py +107 -0
  13. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/message.py +5 -0
  14. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/thread.py +4 -0
  15. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/compute_metrics.py +1 -1
  16. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/function_metrics.py +2 -2
  17. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/data_loader.py +26 -11
  18. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/db_utils.py +8 -1
  19. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/metrics/__init__.py +1 -1
  20. python_flexeval-0.3.0/src/flexeval/metrics/access.py +50 -0
  21. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/eval_schema.py +10 -0
  22. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/evalrun_schema.py +1 -1
  23. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/rubric_schema.py +1 -1
  24. python_flexeval-0.3.0/tests/data/simple_metadata.jsonl +2 -0
  25. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_data_loader.py +33 -0
  26. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/uv.lock +1835 -1245
  27. python_flexeval-0.1.5/src/flexeval/metrics/access.py +0 -28
  28. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.env-example +0 -0
  29. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.github/dependabot.yml +0 -0
  30. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.github/workflows/deploy-to-pypi.yml +0 -0
  31. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.github/workflows/github-pages.yml +0 -0
  32. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.github/workflows/validate.yaml +0 -0
  33. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.pre-commit-config.yaml +0 -0
  34. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.python-version +0 -0
  35. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/.vscode/settings.json +0 -0
  36. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/CITATION.bib +0 -0
  37. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/CITATION.cff +0 -0
  38. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/CLAUDE.md +0 -0
  39. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/DEVELOPMENT.md +0 -0
  40. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/Dockerfile +0 -0
  41. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/EDM_2024_FlexEval.pdf +0 -0
  42. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/LICENSE +0 -0
  43. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/data/metabase/.gitkeep +0 -0
  44. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docker-compose.yml +0 -0
  45. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/_static/flexeval_banner.svg +0 -0
  46. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/_static/flexeval_favicon.svg +0 -0
  47. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/_static/flexeval_logo.png +0 -0
  48. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/_static/flexeval_logo2.png +0 -0
  49. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/_templates/footer.html +0 -0
  50. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/api.rst +0 -0
  51. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/getting_started.rst +0 -0
  52. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/sphinxext/__init__.py +0 -0
  53. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/sphinxext/github.py +0 -0
  54. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/abstractions.rst +0 -0
  55. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/cli.rst +0 -0
  56. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/index.rst +0 -0
  57. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/logging.rst +0 -0
  58. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/motivation.md +0 -0
  59. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/user_guide/rubric_guide.md +0 -0
  60. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/docs/vignettes.rst +0 -0
  61. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/example_project/example_specific_rubrics.yaml +0 -0
  62. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/logs/.gitkeep +0 -0
  63. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/make.bat +0 -0
  64. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/ruff.toml +0 -0
  65. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/__main__.py +0 -0
  66. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/__init__.py +0 -0
  67. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/base.py +0 -0
  68. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/dataset.py +0 -0
  69. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/eval_set_run.py +0 -0
  70. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/metric.py +0 -0
  71. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/tool_call.py +0 -0
  72. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/classes/turn.py +0 -0
  73. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/cli.py +0 -0
  74. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/completions.py +0 -0
  75. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/config.yaml +0 -0
  76. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/__init__.py +0 -0
  77. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/completion_functions.py +0 -0
  78. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/evals.yaml +0 -0
  79. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/configuration/rubric_metrics.yaml +0 -0
  80. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/dependency_graph.py +0 -0
  81. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/eval_schema.json +0 -0
  82. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/function_types.py +0 -0
  83. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/helpers.py +0 -0
  84. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/io/__init__.py +0 -0
  85. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/io/parsers/yaml_parser.py +0 -0
  86. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/log_utils.py +0 -0
  87. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/metrics/save.py +0 -0
  88. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/rubric.py +0 -0
  89. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/run_utils.py +0 -0
  90. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/runner.py +0 -0
  91. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/__init__.py +0 -0
  92. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/config_schema.py +0 -0
  93. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/flexeval/schema/schema_utils.py +0 -0
  94. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/src/metabase/Dockerfile +0 -0
  95. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/__init__.py +0 -0
  96. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/data/multiturn.jsonl +0 -0
  97. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/data/plot-convos.jsonl +0 -0
  98. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/data/simple.jsonl +0 -0
  99. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/data/simple_nosystem.jsonl +0 -0
  100. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/__init__.py +0 -0
  101. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/config-tests.yaml +0 -0
  102. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/data/multiturn.jsonl +0 -0
  103. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/data/plot-convos.jsonl +0 -0
  104. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/data/simple.jsonl +0 -0
  105. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/evals.yaml +0 -0
  106. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/functional_tests.py +0 -0
  107. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/integration/langgraph_data.py +0 -0
  108. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/function_metric.py +0 -0
  109. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/functional_config.yaml +0 -0
  110. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/functional_evals.yaml +0 -0
  111. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/test_config.yaml +0 -0
  112. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/test_dataset.jsonl +0 -0
  113. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/test_evals.yaml +0 -0
  114. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/test_rubric_metrics.yaml +0 -0
  115. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/resources/unittest.env +0 -0
  116. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/__init__.py +0 -0
  117. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/io/test_yaml_parser.py +0 -0
  118. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/mixins.py +0 -0
  119. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_completions.py +0 -0
  120. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_compute_metrics.py +0 -0
  121. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_db_utils.py +0 -0
  122. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_dependency_graph.py +0 -0
  123. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_eval_runner.py +0 -0
  124. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_function_metrics.py +0 -0
  125. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_function_types.py +0 -0
  126. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_functional.py +0 -0
  127. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_rubric.py +0 -0
  128. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/tests/unit/test_schema.py +0 -0
  129. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/.gitignore +0 -0
  130. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/basic.py +0 -0
  131. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/basic_cli.md +0 -0
  132. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/basic_rubric.py +0 -0
  133. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/conversations.jsonl +0 -0
  134. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/custom_functions.py +0 -0
  135. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/custom_rubric.md +0 -0
  136. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/custom_rubrics.yaml +0 -0
  137. {python_flexeval-0.1.5 → python_flexeval-0.3.0}/vignettes/eval_run.yaml +0 -0
@@ -184,4 +184,5 @@ src/llm-evals/evals_sync/
184
184
 
185
185
  # Docs
186
186
  docs/generated
187
+ jupyter_execute/
187
188
 
@@ -6,7 +6,7 @@ SPHINXBUILD ?= uv run sphinx-build
6
6
  SOURCEDIR = docs
7
7
  BUILDDIR = build
8
8
 
9
- .PHONY: dochelp docautobuild docclean unittest Makefile
9
+ .PHONY: dochelp docautobuild docclean unittest get-version set-version Makefile
10
10
 
11
11
  # Put it first so that "make" without argument is like "make dochelp".
12
12
  dochelp:
@@ -23,6 +23,16 @@ docclean:
23
23
  unittest:
24
24
  @uv run python -m unittest discover -s tests.unit
25
25
 
26
+ # see: https://hatch.pypa.io/1.13/version/
27
+ get-version:
28
+ @uv run hatch version
29
+
30
+ set-version:
31
+ @if [ -z "$(VERSION)" ]; then \
32
+ echo "VERSION is not set. Usage: make set-version VERSION=x.y.z"; \
33
+ exit 1; \
34
+ fi
35
+ @uv run hatch version "$(VERSION)"
26
36
 
27
37
  # Catch-all target: route all unknown targets to Sphinx using the new
28
38
  # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-flexeval
3
- Version: 0.1.5
3
+ Version: 0.3.0
4
4
  Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
5
5
  Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
6
6
  Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
@@ -40,10 +40,12 @@ Description-Content-Type: text/markdown
40
40
 
41
41
  # FlexEval LLM Evals
42
42
 
43
+ [![PyPi](https://img.shields.io/pypi/v/python-flexeval)](https://pypi.org/project/python-flexeval/)
43
44
  [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12729993.svg)](https://doi.org/10.5281/zenodo.12729993)
44
45
  [![License](https://img.shields.io/github/license/DigitalHarborFoundation/FlexEval)](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/LICENSE)
46
+ [![GitHub issues](https://img.shields.io/badge/issue_tracking-github-blue.svg)](https://github.com/DigitalHarborFoundation/FlexEval/issues)
45
47
 
46
- ![FlexEval banner](/docs/_static/flexeval_banner.svg)
48
+ ![FlexEval banner](https://raw.githubusercontent.com/DigitalHarborFoundation/FlexEval/refs/heads/main/docs/_static/flexeval_banner.svg)
47
49
 
48
50
  FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
49
51
 
@@ -73,7 +75,7 @@ flexeval.run(eval_run)
73
75
 
74
76
  This example computes [Flesch reading ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease) for every turn in a list of conversations provided in JSONL format. The metric values are stored in an SQLite database called `eval_results.db`.
75
77
 
76
- See additional usage examples in the [vignettes](/vignettes).
78
+ See additional usage examples in the [vignettes](https://github.com/DigitalHarborFoundation/FlexEval/tree/main/vignettes).
77
79
 
78
80
  ## Installation
79
81
 
@@ -97,7 +99,7 @@ FlexEval is designed to be "batteries included" for many basic use cases. It sup
97
99
  - a set of useful rubrics
98
100
  - a set of useful Python functions
99
101
 
100
- Evaluation results are saved in an SQLite database. See the [Metric Analysis](/vignettes/metric_analysis.ipynb) vignette for a sample analysis demonstrating the structure and utility of the data saved by FlexEval.
102
+ Evaluation results are saved in an SQLite database. See the [Metric Analysis](https://digitalharborfoundation.github.io/FlexEval/generated/vignettes/metric_analysis.html) vignette for a sample analysis demonstrating the structure and utility of the data saved by FlexEval.
101
103
 
102
104
 
103
105
  Read more in the [Getting Started](https://digitalharborfoundation.github.io/FlexEval/getting_started.html) guide.
@@ -115,4 +117,4 @@ Pull requests are welcome. Feel free to contribute:
115
117
  - Bug fixes
116
118
  - New features
117
119
 
118
- See [DEVELOPMENT.md](DEVELOPMENT.md).
120
+ See [DEVELOPMENT.md](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/DEVELOPMENT.md).
@@ -1,9 +1,11 @@
1
1
  # FlexEval LLM Evals
2
2
 
3
+ [![PyPi](https://img.shields.io/pypi/v/python-flexeval)](https://pypi.org/project/python-flexeval/)
3
4
  [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12729993.svg)](https://doi.org/10.5281/zenodo.12729993)
4
5
  [![License](https://img.shields.io/github/license/DigitalHarborFoundation/FlexEval)](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/LICENSE)
6
+ [![GitHub issues](https://img.shields.io/badge/issue_tracking-github-blue.svg)](https://github.com/DigitalHarborFoundation/FlexEval/issues)
5
7
 
6
- ![FlexEval banner](/docs/_static/flexeval_banner.svg)
8
+ ![FlexEval banner](https://raw.githubusercontent.com/DigitalHarborFoundation/FlexEval/refs/heads/main/docs/_static/flexeval_banner.svg)
7
9
 
8
10
  FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
9
11
 
@@ -33,7 +35,7 @@ flexeval.run(eval_run)
33
35
 
34
36
  This example computes [Flesch reading ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease) for every turn in a list of conversations provided in JSONL format. The metric values are stored in an SQLite database called `eval_results.db`.
35
37
 
36
- See additional usage examples in the [vignettes](/vignettes).
38
+ See additional usage examples in the [vignettes](https://github.com/DigitalHarborFoundation/FlexEval/tree/main/vignettes).
37
39
 
38
40
  ## Installation
39
41
 
@@ -57,7 +59,7 @@ FlexEval is designed to be "batteries included" for many basic use cases. It sup
57
59
  - a set of useful rubrics
58
60
  - a set of useful Python functions
59
61
 
60
- Evaluation results are saved in an SQLite database. See the [Metric Analysis](/vignettes/metric_analysis.ipynb) vignette for a sample analysis demonstrating the structure and utility of the data saved by FlexEval.
62
+ Evaluation results are saved in an SQLite database. See the [Metric Analysis](https://digitalharborfoundation.github.io/FlexEval/generated/vignettes/metric_analysis.html) vignette for a sample analysis demonstrating the structure and utility of the data saved by FlexEval.
61
63
 
62
64
 
63
65
  Read more in the [Getting Started](https://digitalharborfoundation.github.io/FlexEval/getting_started.html) guide.
@@ -75,4 +77,4 @@ Pull requests are welcome. Feel free to contribute:
75
77
  - Bug fixes
76
78
  - New features
77
79
 
78
- See [DEVELOPMENT.md](DEVELOPMENT.md).
80
+ See [DEVELOPMENT.md](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/DEVELOPMENT.md).
@@ -89,6 +89,10 @@ intersphinx_mapping = {
89
89
  "pydantic": ("https://docs.pydantic.dev/latest", None),
90
90
  }
91
91
 
92
+ docutils_conf = {
93
+ "line-length-limit": None, # Disable docutils line-length limit
94
+ }
95
+
92
96
 
93
97
  def linkcode_resolve(domain, info):
94
98
  """
@@ -159,6 +163,10 @@ myst_enable_extensions = [
159
163
  ]
160
164
  myst_url_schemes = ("http", "https", "mailto")
161
165
 
166
+ # myst-nb configuration
167
+ nb_execution_mode = "off" # Don't re-execute, use existing outputs
168
+ nb_merge_streams = True
169
+
162
170
  autosummary_generate = True
163
171
  autodoc_typehints = "signature"
164
172
  autodoc_default_options = {
@@ -1,11 +1,12 @@
1
- .. FlexEval documentation master file, created by
2
- sphinx-quickstart on Thu Jul 3 12:21:33 2025.
3
- You can adapt this file completely to your liking, but it should at least
4
- contain the root `toctree` directive.
1
+ .. FlexEval documentation master file, originally created by sphinx-quickstart on 2025 July 3 12:21:33.
5
2
 
6
3
  FlexEval documentation
7
4
  ======================
8
5
 
6
+ .. image:: https://img.shields.io/pypi/v/python-flexeval
7
+ :target: https://pypi.org/project/python-flexeval/
8
+ :alt: PyPI
9
+
9
10
  .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.12729993.svg
10
11
  :target: https://doi.org/10.5281/zenodo.12729993
11
12
  :alt: Zenodo DOI
@@ -14,6 +15,10 @@ FlexEval documentation
14
15
  :target: https://github.com/DigitalHarborFoundation/FlexEval/blob/main/LICENSE
15
16
  :alt: FlexEval license
16
17
 
18
+ .. image:: https://img.shields.io/badge/issue_tracking-github-blue.svg
19
+ :target: https://github.com/DigitalHarborFoundation/FlexEval/issues
20
+ :alt: Issue tracking on GitHub
21
+
17
22
  .. raw:: html
18
23
 
19
24
  <br>
@@ -44,7 +44,7 @@ def extract_vignette_path_strings(file_contents: str) -> list[str]:
44
44
 
45
45
  def write_if_changed(path: Path, old_content: str, new_content: str):
46
46
  if path.exists():
47
- old_content = path.read_text()
47
+ # check if the file contents are different
48
48
  if old_content == new_content:
49
49
  return # No change — don't touch file
50
50
  path.write_text(new_content)
@@ -67,33 +67,30 @@ def generate_custom_stubs(app):
67
67
 
68
68
  for vignette_file in vignettes_dir.glob("*"):
69
69
  stem = vignette_file.stem
70
- rst_file = output_dir / f"{stem}.rst"
70
+ target_file = output_dir / f"{stem}.rst"
71
+ if vignette_file.suffix == ".ipynb":
72
+ target_file = output_dir / f"{stem}.ipynb"
71
73
 
72
74
  current_contents = ""
73
- if rst_file.exists():
74
- current_contents = rst_file.read_text()
75
+ if target_file.exists():
76
+ current_contents = target_file.read_text()
75
77
 
76
78
  if vignette_file.suffix == ".py":
77
- generate_python_stub(src_dir, vignette_file, rst_file, current_contents)
79
+ generate_python_stub(src_dir, vignette_file, target_file, current_contents)
78
80
  elif vignette_file.suffix == ".ipynb":
79
- generate_ipynb_stub(src_dir, vignette_file, rst_file, current_contents)
81
+ generate_ipynb_stub(vignette_file, target_file, current_contents)
80
82
  elif vignette_file.suffix == ".md":
81
- generate_md_stub(src_dir, vignette_file, rst_file, current_contents)
83
+ generate_md_stub(src_dir, vignette_file, target_file, current_contents)
82
84
  else:
83
85
  logger.info(
84
86
  f"Unsupported file type {vignette_file.suffix}; skipping while creating vignette stubs."
85
87
  )
86
88
 
87
89
 
88
- def generate_ipynb_stub(
89
- src_dir: Path, ipynb_file: Path, rst_file: Path, current_contents: str
90
- ):
91
- new_contents = f""".. _{ipynb_file.stem}:
92
-
93
- .. include:: ../../../{ipynb_file.relative_to(src_dir.parent)}
94
- :parser: myst_nb.docutils_
95
- """
96
- write_if_changed(rst_file, current_contents, new_contents)
90
def generate_ipynb_stub(ipynb_file: Path, target_file: Path, current_contents: str):
    """Mirror a notebook vignette into the docs output location.

    Notebooks are not wrapped in a generated stub; their text is passed
    through unchanged to ``write_if_changed``.

    Args:
        ipynb_file: Notebook whose text is read.
        target_file: Destination path handed to ``write_if_changed``.
        current_contents: Text currently held by ``target_file`` (empty
            string when the caller found no existing file).
    """
    # Pass the notebook text straight through; there is nothing to template.
    write_if_changed(target_file, current_contents, ipynb_file.read_text())
97
94
 
98
95
 
99
96
  def generate_md_stub(
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "python-flexeval"
3
- version = "0.1.5"
3
+ dynamic = ["version"]
4
4
  description = "FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems."
5
5
  authors = [
6
6
  { name="S. Thomas Christie" },
@@ -52,6 +52,7 @@ Issues = "https://github.com/DigitalHarborFoundation/FlexEval/issues"
52
52
 
53
53
  [dependency-groups]
54
54
  dev = [
55
+ "hatch>=1.14.1",
55
56
  "jupyter>=1.1.1",
56
57
  "matplotlib>=3.10.3",
57
58
  "pre-commit>=4.2.0",
@@ -81,3 +82,6 @@ build-backend = "hatchling.build"
81
82
 
82
83
  [tool.hatch.build.targets.wheel]
83
84
  packages = ["src/flexeval"]
85
+
86
+ [tool.hatch.version]
87
+ path = "src/flexeval/__about__.py"
@@ -0,0 +1 @@
1
+ __version__ = "0.3.0"
@@ -4,8 +4,10 @@ This top-level import exposes the :func:`~flexeval.runner.run` method."""
4
4
 
5
5
  from flexeval import metrics
6
6
  from flexeval.runner import run
7
+ from .__about__ import __version__
7
8
 
8
9
  __all__ = [
9
10
  "metrics",
10
11
  "run",
12
+ "__version__",
11
13
  ]
@@ -111,7 +111,7 @@ class EvalRunner:
111
111
 
112
112
  def load_evaluation_settings(self):
113
113
  """This function parses our eval suite and puts it in the data structure we'll need
114
- for easy use at run-time
114
+ for easy use at run-time.
115
115
  """
116
116
  # if the current eval has a 'config' entry, overwrite configuration options with its entries
117
117
  if (
@@ -0,0 +1,107 @@
1
+ import json
2
+ from collections import UserDict
3
+
4
+
5
class JsonViewDict(UserDict):
    """Dictionary view over a JSON-encoded text attribute of another object.

    The dictionary is populated by decoding the text attribute when the view
    is created. Every mutating method re-encodes the whole dictionary and
    assigns the resulting string back to that attribute, so the attribute
    tracks the dictionary's contents.

    Only mutations made through this object's own methods are detected;
    changing a nested list or dict stored as a value does not trigger a
    write-back until the next top-level mutation.

    Args:
        model_instance: Object that owns the text attribute.
        text_field_attr_name: Name of the attribute on ``model_instance``
            that holds the JSON text.
        json_dumps_fn: Callable that encodes the dictionary to a string.
        json_loads_fn: Callable that decodes the attribute's string.
    """

    def __init__(
        self,
        model_instance,
        text_field_attr_name,
        json_dumps_fn=json.dumps,
        json_loads_fn=json.loads,
    ):
        self.model_instance = model_instance
        self.text_field_attr_name = text_field_attr_name
        self.json_dumps_fn = json_dumps_fn
        self.json_loads_fn = json_loads_fn

        text_value = getattr(model_instance, text_field_attr_name)
        # A missing or empty value is treated as an empty mapping rather than
        # being handed to the decoder, which would reject it.
        initial_data = self.json_loads_fn(text_value) if text_value else {}

        # Fill the backing dict directly. Going through ``update`` here would
        # write back to the model merely because a view was created.
        super().__init__()
        self.data.update(initial_data)

    def _sync_to_model(self):
        """Encode the current data and store it on the model attribute."""
        json_str = self.json_dumps_fn(self.data)
        setattr(self.model_instance, self.text_field_attr_name, json_str)

    # Mutating methods are overridden so that each one ends with exactly one
    # write-back to the model.
    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        self._sync_to_model()

    def __delitem__(self, key):
        super().__delitem__(key)
        self._sync_to_model()

    def clear(self):
        # Clear the backing dict in one step; the inherited implementation
        # removes items one at a time through ``__delitem__``.
        self.data.clear()
        self._sync_to_model()

    def pop(self, key, *args):
        # Raises KeyError (before any write-back) when the key is missing and
        # no default was supplied.
        result = self.data.pop(key, *args)
        self._sync_to_model()
        return result

    def popitem(self):
        # Keep the inherited item-selection behaviour.
        result = super().popitem()
        self._sync_to_model()
        return result

    def setdefault(self, key, default=None):
        result = self.data.setdefault(key, default)
        self._sync_to_model()
        return result

    def update(self, *args, **kwargs):
        # Update the backing dict directly so the data is encoded once, not
        # once per inserted key.
        self.data.update(*args, **kwargs)
        self._sync_to_model()
60
+
61
+
62
class JsonView:
    """Descriptor that provides dict-like access to a JSON text field.

    On first access through an instance, a ``JsonViewDict`` bound to the
    linked text attribute is created and cached on that instance; later
    accesses return the cached object. The cached view is not refreshed if
    the linked text attribute is assigned to directly afterwards.

    Example:
        class SomeModel(pw.Model):
            some_field = pw.TextField(default="{}")
            some_field_dict = JsonView(text_field_attr_name="some_field")

        m = SomeModel()
        m.some_field_dict["chosen_mistake"] = "whatever"

    Args:
        text_field_attr_name: Name of the attribute, on the owning instance,
            that stores the JSON text.
    """

    def __init__(self, text_field_attr_name):
        self.text_field_attr_name = text_field_attr_name
        # Filled in by __set_name__ once the descriptor is bound to a class.
        self.attr_name = None

    def __set_name__(self, owner, name):
        """Record the per-instance attribute name used to cache the view."""
        self.attr_name = f"_{name}_dict"

    def __get__(self, instance, owner):
        """Return the cached view for ``instance``, creating it if needed.

        Returns the descriptor itself when accessed on the class.

        Raises:
            ValueError: If ``instance`` has no attribute named
                ``text_field_attr_name``.
        """
        if instance is None:
            return self

        # Build and cache a view only on first access.
        if not hasattr(instance, self.attr_name):
            if not hasattr(instance, self.text_field_attr_name):
                raise ValueError(
                    f"Failed to link this JsonView to field '{self.text_field_attr_name}' because it doesn't exist on this model instance."
                )
            json_dict = JsonViewDict(instance, self.text_field_attr_name)
            setattr(instance, self.attr_name, json_dict)

        return getattr(instance, self.attr_name)

    def __set__(self, instance, value):
        """Replace the entire dict, and therefore the linked text field.

        Args:
            instance: Object owning the linked text attribute.
            value: Mapping whose items become the new contents.

        Raises:
            ValueError: If ``value`` is not a mapping.
        """
        # Imported locally so the block does not depend on module-level imports.
        from collections.abc import Mapping

        # Accept any mapping, not only ``dict``: another JsonViewDict is a
        # UserDict and would otherwise be rejected.
        if not isinstance(value, Mapping):
            raise ValueError(
                f"This JsonView must be a dictionary to set linked field '{self.text_field_attr_name}' correctly."
            )

        json_dict = JsonViewDict(instance, self.text_field_attr_name)
        # The new view starts out holding the field's previous contents; drop
        # them so the assignment replaces rather than merges.
        json_dict.clear()
        json_dict.update(value)
        setattr(instance, self.attr_name, json_dict)
@@ -10,6 +10,7 @@ from flexeval.classes.dataset import Dataset
10
10
  from flexeval.classes.eval_set_run import EvalSetRun
11
11
  from flexeval.classes.thread import Thread
12
12
  from flexeval.classes.turn import Turn
13
+ from flexeval.classes.jsonview import JsonView
13
14
  from flexeval.configuration import completion_functions
14
15
 
15
16
  logger = logging.getLogger(__name__)
@@ -34,6 +35,10 @@ class Message(BaseModel):
34
35
  content = pw.TextField()
35
36
  context = pw.TextField(null=True) # Previous messages
36
37
 
38
+ # metadata
39
+ metadata = pw.TextField(default="{}", null=False)
40
+ metadata_dict = JsonView("metadata")
41
+
37
42
  # helpers
38
43
  system_prompt = pw.TextField(null=True)
39
44
  is_flexeval_completion = pw.BooleanField(null=True)
@@ -3,6 +3,7 @@ import peewee as pw
3
3
  from flexeval.classes.base import BaseModel
4
4
  from flexeval.classes.dataset import Dataset
5
5
  from flexeval.classes.eval_set_run import EvalSetRun
6
+ from flexeval.classes.jsonview import JsonView
6
7
 
7
8
 
8
9
  class Thread(BaseModel):
@@ -20,6 +21,9 @@ class Thread(BaseModel):
20
21
 
21
22
  system_prompt = pw.TextField(null=True)
22
23
 
24
+ metadata = pw.TextField(default="{}", null=False)
25
+ metadata_dict = JsonView("metadata")
26
+
23
27
  def __init__(self, **kwargs):
24
28
  super().__init__(**kwargs)
25
29
  self.metrics_to_evaluate = []
@@ -42,7 +42,7 @@ class ObjectMetric:
42
42
 
43
43
 
44
44
  class MetricGraphBuilder:
45
- """Builds :class:`networkx.DiGraph`\s of :class:`~flexeval.compute_metrics.ObjectMetric` instances that reflect any computational dependencies between them."""
45
+ """Builds :class:`networkx.DiGraph` s of :class:`~flexeval.compute_metrics.ObjectMetric` instances that reflect any computational dependencies between them."""
46
46
 
47
47
  def __init__(self):
48
48
  # key: tuple(metric_level, metric_id, object_id)
@@ -122,8 +122,8 @@ def is_role(object: Union[Turn, Message], role: str) -> dict:
122
122
  and 0 otherwise.
123
123
 
124
124
  Args:
125
- object: the Turn or Message
126
- role: a string with the desired role to check against
125
+ object: the Turn or Message
126
+ role: a string with the desired role to check against
127
127
  """
128
128
  return {role: int(object.role == role)}
129
129
 
@@ -54,18 +54,13 @@ def load_jsonl(
54
54
  max(1, nb_evaluations_per_thread)
55
55
  ): # duplicate stored threads for averaged evaluation results
56
56
  if thread_id in selected_thread_ids:
57
- thread_object = Thread.create(
58
- evalsetrun=dataset.evalsetrun,
59
- dataset=dataset,
60
- jsonl_thread_id=thread_id,
61
- eval_run_thread_id=str(thread_id)
62
- + "_"
63
- + str(thread_eval_run_id),
64
- )
57
+ thread_json = json.loads(thread)
58
+ # extract any metadata
59
+ thread_metadata = thread_json.copy()
60
+ del thread_metadata["input"]
65
61
 
66
- # Context
67
62
  context = []
68
- thread_input = json.loads(thread)["input"]
63
+ thread_input = thread_json["input"]
69
64
 
70
65
  # Get system prompt used in the thread - assuming only 1
71
66
  for message in thread_input:
@@ -78,15 +73,35 @@ def load_jsonl(
78
73
  # Add the system prompt as context
79
74
  context.append({"role": "system", "content": system_prompt})
80
75
 
76
+ thread_object: Thread = Thread.create(
77
+ evalsetrun=dataset.evalsetrun,
78
+ dataset=dataset,
79
+ jsonl_thread_id=thread_id,
80
+ eval_run_thread_id=str(thread_id)
81
+ + "_"
82
+ + str(thread_eval_run_id),
83
+ system_prompt=system_prompt,
84
+ metadata=json.dumps(thread_metadata),
85
+ )
86
+
81
87
  # Create messages
82
88
  index_in_thread = 0
83
89
  for message in thread_input:
90
+ if not isinstance(message, dict):
91
+ raise ValueError(
92
+ f"Can't load unknown object type; expected dict. Check JSONL format: {message}"
93
+ )
84
94
  role = message.get("role", None)
85
95
  if role != "system":
86
96
  # System message shouldn't be added as a separate message
87
97
  system_prompt_for_this_message = ""
88
98
  if role != "user":
89
99
  system_prompt_for_this_message = system_prompt
100
+ message_metadata = message.copy()
101
+ if "content" in message_metadata:
102
+ del message_metadata["content"]
103
+ if "role" in message_metadata:
104
+ del message_metadata["role"]
90
105
  Message.create(
91
106
  evalsetrun=dataset.evalsetrun,
92
107
  dataset=dataset,
@@ -95,9 +110,9 @@ def load_jsonl(
95
110
  role=role,
96
111
  content=message.get("content", None),
97
112
  context=json.dumps(context),
98
- metadata=message.get("metadata", None),
99
113
  is_flexeval_completion=False,
100
114
  system_prompt=system_prompt_for_this_message,
115
+ metadata=json.dumps(message_metadata),
101
116
  )
102
117
  # Update context
103
118
  context.append(
@@ -14,6 +14,11 @@ from flexeval.classes.turn import Turn
14
14
  DATABASE_TABLES = [EvalSetRun, Dataset, Thread, Turn, Message, ToolCall, Metric]
15
15
 
16
16
 
17
def ensure_database(database_path: str):
    """Initialize the database unless a usable connection already exists.

    Args:
        database_path (str): Path handed to ``initialize_database`` when the
            shared database reports that its connection is not usable.
    """
    if classes_base.database.is_connection_usable():
        # A usable connection is already in place; nothing to do.
        return
    initialize_database(database_path)
20
+
21
+
17
22
  def initialize_database(database_path: str, clear_tables: bool = False):
18
23
  classes_base.database.init(database_path)
19
24
  # classes_base.database.start()
@@ -34,5 +39,7 @@ def bind_to_database(database_path: str) -> pw.Database:
34
39
  new_database = classes_base.create_sqlite_database(database_path)
35
40
  new_database.bind(DATABASE_TABLES)
36
41
  # Verify the binding worked by checking one of the models
37
- assert classes_base.BaseModel._meta.database == new_database
42
+ assert classes_base.BaseModel._meta.database == new_database, (
43
+ f"Binding to '{database_path}' failed."
44
+ )
38
45
  return new_database
@@ -1,4 +1,4 @@
1
- """Utility functions for accessing metrics."""
1
+ """Utility functions for working with metrics."""
2
2
 
3
3
  from flexeval.metrics import access, save
4
4
 
@@ -0,0 +1,50 @@
1
+ """Utility functions for accessing metrics."""
2
+
3
+ from collections import Counter
4
+
5
+ from flexeval.classes import metric, message, turn, thread
6
+
7
+
8
def count_dict_values(lst: list[dict]) -> dict[str, Counter]:
    """Tally how often each value occurs under each key.

    Args:
        lst (list[dict]): Dictionaries to tally. Values are used as Counter
            keys, so they must be hashable.

    Returns:
        dict[str, Counter]: Maps every key seen in ``lst`` to a Counter of
        the values observed for that key.
    """
    tallies: dict[str, Counter] = {}
    for record in lst:
        for key, value in record.items():
            # setdefault creates the per-key Counter the first time a key is seen.
            tallies.setdefault(key, Counter())[value] += 1
    return tallies
24
+
25
+
26
def get_all_metrics() -> list[dict]:
    """Return a copy of the field data of every stored metric row.

    Returns:
        list[dict]: One dictionary per ``Metric`` row, copied from the row's
        ``__data__`` mapping so callers can modify it without touching the
        model instance.
    """
    return [row.__data__.copy() for row in metric.Metric.select()]
31
+
32
+
33
def get_first_user_message_for_threads(thread_ids: set) -> list[dict]:
    """Get the first user message in each thread.

    Args:
        thread_ids (set): The set of thread IDs to retrieve messages for.

    Returns:
        list[dict]: An iterable of messages, one dictionary per row. The
        value is the query built with ``.dicts()``, not a materialized list;
        wrap it in ``list(...)`` if a real list is required.
    """
    # NOTE(review): assumes Message has a foreign key to Turn - confirm
    # against the model definitions.
    return (
        message.Message.select()
        .join(thread.Thread)
        # join() moves the join context to Thread; switch back so that Turn
        # is joined to Message rather than to Thread. Without this, every
        # user message of a thread could match the thread's turn 0.
        .switch(message.Message)
        .join(turn.Turn)
        .where(
            thread.Thread.id.in_(thread_ids),
            message.Message.role == "user",
            turn.Turn.index_in_thread == 0,
        )
        .dicts()
    )
@@ -16,6 +16,8 @@ MetricLevel = Literal["Message", "Turn", "Thread", "ToolCall"]
16
16
 
17
17
 
18
18
  class DependsOnItem(BaseModel):
19
+ """Defines a metric dependency."""
20
+
19
21
  class Config:
20
22
  extra = "forbid"
21
23
 
@@ -56,6 +58,8 @@ class DependsOnItem(BaseModel):
56
58
 
57
59
 
58
60
  class MetricItem(BaseModel):
61
+ "Defines a metric."
62
+
59
63
  name: str = Field(
60
64
  ...,
61
65
  description="The function to call or name of rubric to use to compute this metric.",
@@ -72,6 +76,8 @@ class MetricItem(BaseModel):
72
76
 
73
77
 
74
78
  class FunctionItem(MetricItem):
79
+ """Defines a metric computed from a Python function."""
80
+
75
81
  kwargs: schema_utils.OptionalDict = Field(
76
82
  default_factory=dict,
77
83
  description="Keyword arguments for the function. Each key must correspond to an argument in the function. Extra keys will cause an error.",
@@ -80,6 +86,8 @@ class FunctionItem(MetricItem):
80
86
 
81
87
 
82
88
  class RubricItem(MetricItem):
89
+ """Defines a metric computed from a rubric."""
90
+
83
91
  # TODO is RubricItem.kwargs actually used?
84
92
  kwargs: Optional[Dict[str, Any]] = Field(
85
93
  default_factory=dict,
@@ -115,6 +123,8 @@ class CompletionLlm(BaseModel):
115
123
 
116
124
 
117
125
  class GraderLlm(BaseModel):
126
+ """Defines the LLM used for evaluating rubrics."""
127
+
118
128
  class Config:
119
129
  extra = "forbid"
120
130
 
@@ -37,7 +37,7 @@ class FileDataSource(DataSource):
37
37
 
38
38
 
39
39
  class FunctionsCollection(BaseModel):
40
- """Collection of functions that can be used as :class:`~flexeval.schema.eval_schema.FunctionItem`\s."""
40
+ """Collection of functions that can be used as :class:`~flexeval.schema.eval_schema.FunctionItem` s."""
41
41
 
42
42
  functions: list[Callable] = Field(
43
43
  default_factory=list,
@@ -32,7 +32,7 @@ class Rubric(BaseModel):
32
32
 
33
33
 
34
34
  class RubricsCollection(BaseModel):
35
- """Collection of rubrics that can be used as :class:`~flexeval.schema.eval_schema.RubricItem`\s."""
35
+ """Collection of rubrics that can be used as :class:`~flexeval.schema.eval_schema.RubricItem` s."""
36
36
 
37
37
  rubrics: dict[str, Rubric] = Field(
38
38
  default_factory=dict,
@@ -0,0 +1,2 @@
1
+ {"key_1": "value_1", "key_2": {"nested_key": "nested_value"}, "input":[{ "role": "system", "content": "my system prompt" }, {"role":"user", "content": "I need help.", "index": 0}, {"role":"assistant", "content": "Help with what?", "index": 1}, {"role":"user", "content": "My homework.", "index": 2}]}
2
+ {"input": [{ "role": "system", "content": "my system prompt" }, {"role": "user", "content": "Hi, Nice to meet you!"}, {"role": "assistant", "content": "Nice to meet you, too! How can I help you today?"}, {"role": "user", "content": "How do I find cube roots by hand?"}]}