python-flexeval 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/.github/dependabot.yml +1 -1
  2. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/CLAUDE.md +7 -4
  3. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/DEVELOPMENT.md +32 -2
  4. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/PKG-INFO +3 -3
  5. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/user_guide/abstractions.rst +22 -2
  6. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/vignettes.rst +1 -0
  7. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/pyproject.toml +2 -2
  8. python_flexeval-0.4.0/src/flexeval/__about__.py +1 -0
  9. python_flexeval-0.4.0/src/flexeval/classes/dataset.py +22 -0
  10. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/classes/eval_set_run.py +18 -7
  11. python_flexeval-0.4.0/src/flexeval/classes/jsonview.py +112 -0
  12. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/classes/message.py +16 -5
  13. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/classes/metric.py +0 -8
  14. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/classes/thread.py +4 -2
  15. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/classes/tool_call.py +0 -2
  16. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/classes/turn.py +7 -5
  17. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/completions.py +8 -5
  18. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/compute_metrics.py +45 -32
  19. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/configuration/evals.yaml +2 -25
  20. python_flexeval-0.4.0/src/flexeval/data_loader.py +430 -0
  21. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/db_utils.py +11 -2
  22. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/dependency_graph.py +3 -3
  23. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/eval_schema.json +0 -18
  24. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/function_types.py +2 -13
  25. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/metrics/save.py +12 -8
  26. python_flexeval-0.4.0/src/flexeval/run_utils.py +211 -0
  27. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/runner.py +6 -14
  28. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/schema/config_schema.py +12 -0
  29. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/schema/eval_schema.py +3 -0
  30. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/schema/evalrun_schema.py +41 -10
  31. python_flexeval-0.4.0/tests/data/simple_metadata.jsonl +2 -0
  32. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/integration/evals.yaml +3 -34
  33. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/integration/functional_tests.py +153 -241
  34. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/integration/langgraph_data.py +1 -1
  35. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/resources/functional_evals.yaml +2 -30
  36. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/mixins.py +4 -4
  37. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/test_completions.py +8 -9
  38. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/test_compute_metrics.py +30 -24
  39. python_flexeval-0.4.0/tests/unit/test_data_loader.py +652 -0
  40. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/test_function_types.py +11 -19
  41. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/test_functional.py +1 -4
  42. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/uv.lock +414 -247
  43. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/vignettes/eval_run.yaml +2 -1
  44. python_flexeval-0.4.0/vignettes/multiple_configs.py +87 -0
  45. python_flexeval-0.2.0/src/flexeval/__about__.py +0 -1
  46. python_flexeval-0.2.0/src/flexeval/classes/dataset.py +0 -82
  47. python_flexeval-0.2.0/src/flexeval/data_loader.py +0 -513
  48. python_flexeval-0.2.0/src/flexeval/run_utils.py +0 -65
  49. python_flexeval-0.2.0/tests/unit/test_data_loader.py +0 -100
  50. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/.env-example +0 -0
  51. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/.github/workflows/deploy-to-pypi.yml +0 -0
  52. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/.github/workflows/github-pages.yml +0 -0
  53. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/.github/workflows/validate.yaml +0 -0
  54. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/.gitignore +0 -0
  55. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/.pre-commit-config.yaml +0 -0
  56. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/.python-version +0 -0
  57. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/.vscode/settings.json +0 -0
  58. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/CITATION.bib +0 -0
  59. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/CITATION.cff +0 -0
  60. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/Dockerfile +0 -0
  61. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/EDM_2024_FlexEval.pdf +0 -0
  62. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/LICENSE +0 -0
  63. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/Makefile +0 -0
  64. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/README.md +0 -0
  65. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/data/metabase/.gitkeep +0 -0
  66. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docker-compose.yml +0 -0
  67. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/_static/flexeval_banner.svg +0 -0
  68. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/_static/flexeval_favicon.svg +0 -0
  69. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/_static/flexeval_logo.png +0 -0
  70. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/_static/flexeval_logo2.png +0 -0
  71. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/_templates/footer.html +0 -0
  72. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/api.rst +0 -0
  73. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/conf.py +0 -0
  74. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/getting_started.rst +0 -0
  75. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/index.rst +0 -0
  76. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/sphinxext/__init__.py +0 -0
  77. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/sphinxext/github.py +0 -0
  78. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/user_guide/cli.rst +0 -0
  79. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/user_guide/index.rst +0 -0
  80. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/user_guide/logging.rst +0 -0
  81. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/user_guide/motivation.md +0 -0
  82. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/user_guide/rubric_guide.md +0 -0
  83. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/docs/vignettes.py +0 -0
  84. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/example_project/example_specific_rubrics.yaml +0 -0
  85. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/logs/.gitkeep +0 -0
  86. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/make.bat +0 -0
  87. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/ruff.toml +0 -0
  88. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/__init__.py +0 -0
  89. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/__main__.py +0 -0
  90. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/classes/__init__.py +0 -0
  91. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/classes/base.py +0 -0
  92. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/classes/eval_runner.py +0 -0
  93. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/cli.py +0 -0
  94. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/config.yaml +0 -0
  95. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/configuration/__init__.py +0 -0
  96. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/configuration/completion_functions.py +0 -0
  97. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/configuration/function_metrics.py +0 -0
  98. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/configuration/rubric_metrics.yaml +0 -0
  99. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/helpers.py +0 -0
  100. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/io/__init__.py +0 -0
  101. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/io/parsers/yaml_parser.py +0 -0
  102. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/log_utils.py +0 -0
  103. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/metrics/__init__.py +0 -0
  104. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/metrics/access.py +0 -0
  105. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/rubric.py +0 -0
  106. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/schema/__init__.py +0 -0
  107. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/schema/rubric_schema.py +0 -0
  108. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/flexeval/schema/schema_utils.py +0 -0
  109. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/src/metabase/Dockerfile +0 -0
  110. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/__init__.py +0 -0
  111. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/data/multiturn.jsonl +0 -0
  112. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/data/plot-convos.jsonl +0 -0
  113. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/data/simple.jsonl +0 -0
  114. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/data/simple_nosystem.jsonl +0 -0
  115. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/integration/__init__.py +0 -0
  116. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/integration/config-tests.yaml +0 -0
  117. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/integration/data/multiturn.jsonl +0 -0
  118. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/integration/data/plot-convos.jsonl +0 -0
  119. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/integration/data/simple.jsonl +0 -0
  120. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/resources/function_metric.py +0 -0
  121. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/resources/functional_config.yaml +0 -0
  122. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/resources/test_config.yaml +0 -0
  123. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/resources/test_dataset.jsonl +0 -0
  124. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/resources/test_evals.yaml +0 -0
  125. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/resources/test_rubric_metrics.yaml +0 -0
  126. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/resources/unittest.env +0 -0
  127. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/__init__.py +0 -0
  128. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/io/test_yaml_parser.py +0 -0
  129. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/test_db_utils.py +0 -0
  130. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/test_dependency_graph.py +0 -0
  131. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/test_eval_runner.py +0 -0
  132. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/test_function_metrics.py +0 -0
  133. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/test_rubric.py +0 -0
  134. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/tests/unit/test_schema.py +0 -0
  135. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/vignettes/.gitignore +0 -0
  136. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/vignettes/basic.py +0 -0
  137. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/vignettes/basic_cli.md +0 -0
  138. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/vignettes/basic_rubric.py +0 -0
  139. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/vignettes/conversations.jsonl +0 -0
  140. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/vignettes/custom_functions.py +0 -0
  141. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/vignettes/custom_rubric.md +0 -0
  142. {python_flexeval-0.2.0 → python_flexeval-0.4.0}/vignettes/custom_rubrics.yaml +0 -0
@@ -5,4 +5,4 @@ updates:
5
5
  directory: "/"
6
6
  schedule:
7
7
  interval: "weekly"
8
- target-branch: "dev"
8
+ target-branch: "main"
@@ -59,7 +59,7 @@ FlexEval is a tool for evaluating LLM-powered systems using custom metrics, comp
59
59
  ### Core Abstractions
60
60
 
61
61
  **EvalRun** (`src/flexeval/schema/evalrun_schema.py`): The top-level execution unit that combines:
62
- - Data sources (conversations in JSONL format as inputs, an SQLite filepath as output)
62
+ - Data sources (polymorphic via `type` discriminator: `FileDataSource`, `NamedDataSource`, `IterableDataSource`)
63
63
  - An Eval specification (metrics to compute)
64
64
  - Configuration (workers, database path, etc.)
65
65
  - Rubric and function sources
@@ -71,15 +71,18 @@ FlexEval is a tool for evaluating LLM-powered systems using custom metrics, comp
71
71
  - Grader LLM (for rubric evaluation)
72
72
  - Dependencies between metrics
73
73
 
74
- **Config** (`src/flexeval/schema/config_schema.py`): Defines how to evaluate (e.g. single- vs multi-process, etc.)
74
+ **Config** (`src/flexeval/schema/config_schema.py`): Defines how to evaluate (e.g. single- vs multi-process, dataset reuse/naming constraints, etc.)
75
75
 
76
76
  ### Data Hierarchy
77
77
  The evaluation operates at multiple levels of granularity:
78
+ - **Dataset** (`src/flexeval/classes/dataset.py`): Container for loaded data, linked to EvalSetRuns via a many-to-many join table (`EvalSetRunDatasets`). Datasets can be reused across multiple eval runs.
78
79
  - **Thread**: Full conversation
79
- - **Turn**: User-assistant exchange pair
80
+ - **Turn**: User-assistant exchange pair
80
81
  - **Message**: Individual message from user or assistant
81
82
  - **ToolCall**: Function/tool invocation within a message
82
83
 
84
+ Thread, Turn, Message, and ToolCall belong to a Dataset. Metrics belong to both an EvalSetRun and a Dataset.
85
+
83
86
  ### Key Components
84
87
 
85
88
  **Configuration System**:
@@ -89,7 +92,7 @@ The evaluation operates at multiple levels of granularity:
89
92
 
90
93
  **Execution Pipeline** (`src/flexeval/runner.py`):
91
94
  1. Load configuration and eval specification
92
- 2. Create Dataset from data sources
95
+ 2. Create Datasets from data sources and link to EvalSetRun via `EvalSetRunDatasets`
93
96
  3. Run EvalRunner to compute metrics
94
97
  4. Store results in SQLite database
95
98
 
@@ -30,7 +30,9 @@ uv sync --upgrade --all-groups
30
30
  uv build
31
31
  ```
32
32
 
33
- ### Running tests
33
+ ### Unit tests
34
+
35
+ Unit tests live in `tests/unit/` and are run in CI.
34
36
 
35
37
  Run the unit tests:
36
38
 
@@ -46,7 +48,35 @@ To run a specific file's tests:
46
48
  uv run python -m unittest tests.unit.{module_name}
47
49
  ```
48
50
 
49
- There are integration tests in tests/integration that can be executed.
51
+ ### Integration tests
52
+
53
+ Integration tests live in `tests/integration/` and are **not** run in CI.
54
+
55
+ Run the integration tests:
56
+
57
+ ```bash
58
+ uv run python -m unittest tests.integration.functional_tests
59
+ ```
60
+
61
+ **Prerequisites:**
62
+ - An `.env` file at the repo root with `OPENAI_API_KEY` set
63
+ - Suites with rubric metrics (`TestSuite04`) make **real API calls** to OpenAI (gpt-5.4-nano)
64
+ - Function-only suites (`TestSuite01`, `TestSuite02`, `TestSuite03`) do not require API keys
65
+ - LangGraph-based test suites use pre-generated test data from `tests/resources/langgraph-test-data.db`
66
+
67
+ To run only the function-metric suites (no API key required):
68
+
69
+ ```bash
70
+ uv run python -m unittest tests.integration.functional_tests.TestSuite01 tests.integration.functional_tests.TestSuite02 tests.integration.functional_tests.TestSuite03
71
+ ```
72
+
73
+ **Regenerating LangGraph test data:**
74
+
75
+ The file `tests/resources/langgraph-test-data.db` is pre-generated. To regenerate it (requires `OPENAI_API_KEY`):
76
+
77
+ ```bash
78
+ uv run python tests/integration/langgraph_data.py
79
+ ```
50
80
 
51
81
  ### Adding or updating dependencies
52
82
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-flexeval
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
5
5
  Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
6
6
  Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
@@ -21,8 +21,8 @@ Requires-Dist: flatten-json>=0.1.14
21
21
  Requires-Dist: jsonschema>=4.23.0
22
22
  Requires-Dist: langchain-openai>=0.3.8
23
23
  Requires-Dist: langchain>=0.3.20
24
- Requires-Dist: langgraph-checkpoint-sqlite>=2.0.6
25
- Requires-Dist: langgraph>=0.3.6
24
+ Requires-Dist: langgraph-checkpoint-sqlite>=3.0.0
25
+ Requires-Dist: langgraph>=1.0.0
26
26
  Requires-Dist: litellm>=1.74.3
27
27
  Requires-Dist: msgpack>=1.1.0
28
28
  Requires-Dist: networkx>=3.4.2
@@ -16,7 +16,7 @@ An evaluation is represented by :class:`flexeval.schema.eval_schema.Eval`, and c
16
16
  - **Functions**: :class:`~flexeval.schema.eval_schema.FunctionItem`\s apply a Python function to the test data, returning a numeric value.
17
17
  - **Rubrics**: :class:`~flexeval.schema.eval_schema.RubricItem`\s use a configured :class:`~flexeval.schema.eval_schema.GraderLlm` function and the provided rubric template to generate a numeric score from an LLM's output.
18
18
 
19
- You execute an :class:`~flexeval.schema.eval_schema.Eval` by creating an :class:`flexeval.schema.evalrun_schema.EvalRun`.
19
+ You execute an :class:`~flexeval.schema.eval_schema.Eval` by creating an :class:`flexeval.schema.evalrun_schema.EvalRun`.
20
20
  EvalRun contains:
21
21
 
22
22
  - Data sources (conversations as inputs, an SQLite filepath as output)
@@ -26,11 +26,31 @@ EvalRun contains:
26
26
 
27
27
  The :class:`~flexeval.schema.config_schema.Config` includes details about multi-threaded metric computation, about logging, etc.
28
28
 
29
+ Data Sources
30
+ ------------
31
+
32
+ Data sources can be any of these types:
33
+
34
+ - :class:`~flexeval.schema.evalrun_schema.FileDataSource` (``type: file``): Load from a JSONL or LangGraph SQLite file. This is the most common data source.
35
+ - :class:`~flexeval.schema.evalrun_schema.NamedDataSource` (``type: named``): Reference a previously loaded dataset by name, enabling dataset reuse across eval runs.
36
+ - :class:`~flexeval.schema.evalrun_schema.IterableDataSource` (``type: iterable``): Load from an in-memory Python iterable (programmatic use only).
37
+
38
+ In YAML configurations, specify the ``type`` field::
39
+
40
+ data_sources:
41
+ - type: file
42
+ path: conversations.jsonl
43
+
44
+ In Python, the type is set automatically when you construct the appropriate class::
45
+
46
+ data_sources = [FileDataSource(path="conversations.jsonl")]
47
+
29
48
  Data Hierarchy
30
49
  --------------
31
50
 
32
- Metrics can operate at any of four levels of granularity:
51
+ Data is organized at several levels of granularity:
33
52
 
53
+ - :class:`~flexeval.classes.dataset.Dataset`: A loaded collection of conversations. Datasets can be shared across multiple eval runs.
34
54
  - :class:`~flexeval.classes.thread.Thread`: Full conversation
35
55
  - :class:`~flexeval.classes.turn.Turn`: Adjacent set of messages from the same user or assistant
36
56
  - :class:`~flexeval.classes.message.Message`: Individual message from user or assistant
@@ -13,6 +13,7 @@ These vignettes demonstrate how to use FlexEval.
13
13
  generated/vignettes/basic_rubric
14
14
  generated/vignettes/custom_rubric
15
15
  generated/vignettes/basic_cli
16
+ generated/vignettes/multiple_configs
16
17
  generated/vignettes/metric_analysis
17
18
 
18
19
 
@@ -28,8 +28,8 @@ dependencies = [
28
28
  "jsonschema>=4.23.0",
29
29
  "langchain>=0.3.20",
30
30
  "langchain-openai>=0.3.8",
31
- "langgraph>=0.3.6",
32
- "langgraph-checkpoint-sqlite>=2.0.6",
31
+ "langgraph>=1.0.0",
32
+ "langgraph-checkpoint-sqlite>=3.0.0",
33
33
  "litellm>=1.74.3",
34
34
  "msgpack>=1.1.0",
35
35
  "networkx>=3.4.2",
@@ -0,0 +1 @@
1
+ __version__ = "0.4.0"
@@ -0,0 +1,22 @@
1
+ import logging
2
+ from datetime import datetime
3
+
4
+ import peewee as pw
5
+
6
+ from flexeval.classes.base import BaseModel
7
+ from flexeval.classes.jsonview import JsonView
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class Dataset(BaseModel):
    """A stored dataset record, e.g. the contents of one jsonl file.

    ``metadata`` holds JSON text; ``metadata_dict`` exposes the same content
    as a dictionary whose changes are written back to ``metadata``.
    """

    # Integer primary key of the dataset row.
    id = pw.IntegerField(primary_key=True)
    # Defaults to the time the record is instantiated.
    timestamp = pw.DateTimeField(default=datetime.now)
    # Required. Presumably the data source `type` discriminator - TODO confirm.
    datasource_type = pw.TextField(null=False)
    # Optional free-text name and notes.
    name = pw.TextField(default=None, null=True)
    notes = pw.TextField(default=None, null=True)
    # Starts False. Presumably flipped once the data has been loaded - TODO confirm.
    is_loaded = pw.BooleanField(default=False)
    # JSON-encoded text; an empty JSON object unless set otherwise.
    metadata = pw.TextField(default="{}", null=False)
    # Dict-style access to `metadata`.
    metadata_dict = JsonView("metadata")
@@ -1,9 +1,9 @@
1
- import json
2
1
  from datetime import datetime
3
2
 
4
3
  import peewee as pw
5
4
 
6
5
  from flexeval.classes.base import BaseModel
6
+ from flexeval.classes.dataset import Dataset
7
7
 
8
8
 
9
9
  class EvalSetRun(BaseModel):
@@ -12,7 +12,6 @@ class EvalSetRun(BaseModel):
12
12
  id = pw.IntegerField(primary_key=True)
13
13
  name = pw.CharField(null=True)
14
14
  notes = pw.TextField(null=True)
15
- dataset_files = pw.TextField() # JSON string
16
15
  metrics = pw.TextField()
17
16
  metrics_graph_ordered_list = pw.TextField()
18
17
  do_completion = pw.BooleanField()
@@ -25,8 +24,20 @@ class EvalSetRun(BaseModel):
25
24
  default=datetime.now
26
25
  ) # Automatically set to current date and time
27
26
 
28
- def get_datasets(self) -> list[str]:
29
- # TODO Turn these into DataSource instances instead, returning list[DataSource]
30
- temp = json.loads(self.dataset_files)
31
- assert isinstance(temp, list), "The `data` entry in evals.yaml must be a list."
32
- return temp
27
+ @property
28
+ def dataset_list(self) -> list[Dataset]:
29
+ """Returns the actual Dataset objects linked to this EvalSetRun via the join table."""
30
+ return list(
31
+ Dataset.select()
32
+ .join(EvalSetRunDatasets)
33
+ .where(EvalSetRunDatasets.evalsetrun == self)
34
+ )
35
+
36
+
37
class EvalSetRunDatasets(BaseModel):
    """Join table recording which Datasets an EvalSetRun uses.

    Each row pairs one EvalSetRun with one Dataset.
    """

    # Integer primary key of the link row.
    id = pw.IntegerField(primary_key=True)
    # Defaults to the time the link is instantiated.
    timestamp = pw.DateTimeField(default=datetime.now)
    # The run side of the link; exposed on EvalSetRun as `dataset_links`.
    evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="dataset_links")
    # The dataset side of the link; exposed on Dataset as `evalsetrun_links`.
    dataset = pw.ForeignKeyField(Dataset, backref="evalsetrun_links")
@@ -0,0 +1,112 @@
1
+ import json
2
+ from collections import UserDict
3
+
4
+
5
class JsonViewDict(UserDict):
    """Dictionary view over a JSON-encoded text attribute of a model instance.

    The view is filled by decoding the attribute once at construction time.
    Every mutation made through the view is re-encoded and written back to
    the attribute, so the stored text follows the dictionary. Reading the
    view never writes to the model.

    Args:
        model_instance: Object that owns the JSON text attribute.
        text_field_attr_name: Name of that attribute on ``model_instance``.
        json_dumps_fn: Encoder used when writing back (default ``json.dumps``).
        json_loads_fn: Decoder used when reading (default ``json.loads``).
    """

    def __init__(
        self,
        model_instance,
        text_field_attr_name,
        json_dumps_fn=json.dumps,
        json_loads_fn=json.loads,
    ):
        # Start from an empty mapping; the content is loaded below without
        # going through the syncing mutators, so construction has no side
        # effect on the model.
        super().__init__()
        self.model_instance = model_instance
        self.text_field_attr_name = text_field_attr_name
        self.json_dumps_fn = json_dumps_fn
        self.json_loads_fn = json_loads_fn
        self._load_from_model()

    def _load_from_model(self):
        """Replace the view's content with the decoded model attribute."""
        text_value = getattr(self.model_instance, self.text_field_attr_name)
        decoded = self.json_loads_fn(text_value)
        fresh = {}
        # A decoded ``None`` (JSON null) is treated as an empty mapping.
        if decoded is not None:
            fresh.update(decoded)
        # Swap only after decoding succeeded, so a failure keeps the old data.
        self.data = fresh

    def _sync_to_model(self):
        """Sync the current data back to the model field."""
        json_str = self.json_dumps_fn(self.data)
        setattr(self.model_instance, self.text_field_attr_name, json_str)

    # Mutating methods: each one writes back exactly once per real change.
    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        self._sync_to_model()

    def __delitem__(self, key):
        super().__delitem__(key)
        self._sync_to_model()

    def __ior__(self, other):
        # UserDict.__ior__ mutates ``self.data`` directly and would skip the
        # write-back, so route ``view |= other`` through update().
        self.update(other)
        return self

    def clear(self):
        self.data.clear()
        self._sync_to_model()

    def pop(self, key, *args):
        existed = key in self.data
        # Raises KeyError when the key is missing and no default was given.
        result = self.data.pop(key, *args)
        if existed:
            self._sync_to_model()
        return result

    def popitem(self):
        # The inherited implementation removes the first key via
        # __delitem__, which already writes back.
        return super().popitem()

    def setdefault(self, key, default=None):
        if key in self.data:
            # Nothing changes, so nothing is written back.
            return self.data[key]
        self[key] = default
        return default

    def update(self, *args, **kwargs):
        # Apply the whole batch first, then serialise a single time.
        self.data.update(*args, **kwargs)
        self._sync_to_model()

    def refresh_from_model(self):
        """Reload the view from the model's text attribute.

        Use this if the text attribute was changed directly. The view's
        content is replaced, not merged, so keys that are no longer present
        in the text disappear from the view, and the model is not written to.

        If you're going to use the JsonView, avoid mutating the text attribute directly.
        """
        self._load_from_model()


class JsonView:
    """Descriptor that provides dict-like access to a JSON text field.

    Reading the attribute on an instance returns a cached ``JsonViewDict``
    bound to that instance; reading it on the class returns the descriptor.
    Assigning a dictionary replaces the stored JSON content.

    Example:
        class SomeModel(pw.Model):
            some_field = pw.TextField(default="{}")
            some_field_dict = JsonView(text_field_attr_name="some_field")
    """

    def __init__(self, text_field_attr_name):
        self.text_field_attr_name = text_field_attr_name
        # Filled in by __set_name__ when the owning class is created.
        self.attr_name = None

    def __set_name__(self, owner, name):
        """Called when the descriptor is assigned to a class attribute."""
        # Instance attribute under which the per-instance view is cached.
        self.attr_name = f"_{name}_dict"

    def __get__(self, instance, owner=None) -> "JsonViewDict | JsonView":
        """Return the cached view for ``instance``, creating it on first use.

        Raises:
            ValueError: if ``instance`` has no attribute named
                ``text_field_attr_name``.
        """
        if instance is None:
            return self

        # Check if we already have a cached JsonViewDict
        if not hasattr(instance, self.attr_name):
            if not hasattr(instance, self.text_field_attr_name):
                raise ValueError(
                    f"Failed to link this JsonView to field '{self.text_field_attr_name}' because it doesn't exist on this model instance."
                )
            # Cache a new JsonViewDict
            setattr(
                instance,
                self.attr_name,
                JsonViewDict(instance, self.text_field_attr_name),
            )

        return getattr(instance, self.attr_name)

    def __set__(self, instance, value):
        """Replace the entire dict stored in the linked field.

        Accepts a ``dict`` or a ``UserDict`` (which includes ``JsonViewDict``,
        so ``obj.view |= {...}`` and ``a.view = b.view`` work).

        Raises:
            ValueError: if ``value`` is not a dictionary.
        """
        if not isinstance(value, (dict, UserDict)):
            raise ValueError(
                f"This JsonView must be a dictionary to set linked field '{self.text_field_attr_name}' correctly."
            )
        # Snapshot first: ``value`` may be the view currently cached on
        # ``instance``.
        new_content = dict(value)
        json_dict = JsonViewDict(instance, self.text_field_attr_name)
        # Drop whatever the field held before, so this is a replacement and
        # not a merge.
        json_dict.clear()
        json_dict.update(new_content)
        setattr(instance, self.attr_name, json_dict)
@@ -7,9 +7,9 @@ from playhouse.shortcuts import model_to_dict
7
7
 
8
8
  from flexeval.classes.base import BaseModel
9
9
  from flexeval.classes.dataset import Dataset
10
- from flexeval.classes.eval_set_run import EvalSetRun
11
10
  from flexeval.classes.thread import Thread
12
11
  from flexeval.classes.turn import Turn
12
+ from flexeval.classes.jsonview import JsonView
13
13
  from flexeval.configuration import completion_functions
14
14
 
15
15
  logger = logging.getLogger(__name__)
@@ -23,7 +23,6 @@ class Message(BaseModel):
23
23
 
24
24
  id = pw.IntegerField(primary_key=True)
25
25
 
26
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="messages")
27
26
  dataset = pw.ForeignKeyField(Dataset, backref="messages")
28
27
  thread = pw.ForeignKeyField(Thread, backref="messages")
29
28
  index_in_thread = pw.IntegerField()
@@ -34,6 +33,10 @@ class Message(BaseModel):
34
33
  content = pw.TextField()
35
34
  context = pw.TextField(null=True) # Previous messages
36
35
 
36
+ # metadata
37
+ metadata = pw.TextField(default="{}", null=False)
38
+ metadata_dict = JsonView("metadata")
39
+
37
40
  # helpers
38
41
  system_prompt = pw.TextField(null=True)
39
42
  is_flexeval_completion = pw.BooleanField(null=True)
@@ -66,10 +69,18 @@ class Message(BaseModel):
66
69
  super().__init__(**kwargs)
67
70
  self.metrics_to_evaluate = []
68
71
 
69
- def get_completion(self, include_system_prompt=False):
72
+ def get_completion(
73
+ self,
74
+ include_system_prompt=False,
75
+ completion_config: dict | None = None,
76
+ evalsetrun=None,
77
+ ):
70
78
  # only get a completion if this is the final turn - we probably don't want to branch from mid-conversation
71
79
  if self.is_final_turn_in_input:
72
- completion_config = json.loads(self.evalsetrun.completion_llm)
80
+ if completion_config is None:
81
+ raise ValueError(
82
+ "completion_config must be provided to get_completion()"
83
+ )
73
84
  completion_fn_name = completion_config.get("function_name", None)
74
85
  completion_function_kwargs = completion_config.get("kwargs", None)
75
86
 
@@ -99,7 +110,7 @@ class Message(BaseModel):
99
110
  # which generally means it'll have a structure like this
100
111
  # {"choices": [{"message": {"content": "hi", "role": "assistant"}}]}
101
112
  result = model_to_dict(self, exclude=[self.id])
102
- result["evalsetrun"] = self.evalsetrun
113
+ result["evalsetrun"] = evalsetrun
103
114
  result["dataset"] = self.dataset
104
115
  result["datasetrow"] = self.datasetrow
105
116
  result["turn_number"] = self.turn_number + 1
@@ -37,14 +37,6 @@ class Metric(BaseModel):
37
37
  null=True
38
38
  ) # necessary if rubric result is INVALID or e.g. latency doesn't apply to the very first message
39
39
  kwargs = pw.TextField()
40
- # context_only allows us to create another kind of dependency
41
- # where we can quantify something about the previous conversation
42
- # and then use that quantity in a downstream analysis
43
- # e.g. 'would a plot be pedagogically appropriate here' is really a question about the PAST of the conversation
44
- # NOTE: but we have gotten rid of context_only for rubrics, where only {context} is used so technically here 'context_only' is False
45
- # or 'was the conversation ever flagged by the moderation api' would be a question about the previous turns that might
46
- # allow to have better context for the properties of this turn
47
- # context_only = pw.BooleanField(default=False)
48
40
  source = pw.TextField() # TODO - make another table for this? But maybe not, because this also contains filled-in rubrics
49
41
  depends_on = pw.TextField()
50
42
  rubric_prompt = pw.TextField(null=True)
@@ -2,7 +2,7 @@ import peewee as pw
2
2
 
3
3
  from flexeval.classes.base import BaseModel
4
4
  from flexeval.classes.dataset import Dataset
5
- from flexeval.classes.eval_set_run import EvalSetRun
5
+ from flexeval.classes.jsonview import JsonView
6
6
 
7
7
 
8
8
  class Thread(BaseModel):
@@ -12,7 +12,6 @@ class Thread(BaseModel):
12
12
 
13
13
  id = pw.IntegerField(primary_key=True)
14
14
  dataset = pw.ForeignKeyField(Dataset, backref="threads")
15
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="threads")
16
15
 
17
16
  langgraph_thread_id = pw.TextField(null=True)
18
17
  eval_run_thread_id = pw.TextField(null=True)
@@ -20,6 +19,9 @@ class Thread(BaseModel):
20
19
 
21
20
  system_prompt = pw.TextField(null=True)
22
21
 
22
+ metadata = pw.TextField(default="{}", null=False)
23
+ metadata_dict = JsonView("metadata")
24
+
23
25
  def __init__(self, **kwargs):
24
26
  super().__init__(**kwargs)
25
27
  self.metrics_to_evaluate = []
@@ -2,7 +2,6 @@ import peewee as pw
2
2
 
3
3
  from flexeval.classes.base import BaseModel
4
4
  from flexeval.classes.dataset import Dataset
5
- from flexeval.classes.eval_set_run import EvalSetRun
6
5
  from flexeval.classes.message import Message
7
6
  from flexeval.classes.thread import Thread
8
7
  from flexeval.classes.turn import Turn
@@ -16,7 +15,6 @@ class ToolCall(BaseModel):
16
15
 
17
16
  id = pw.IntegerField(primary_key=True)
18
17
 
19
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="toolcalls")
20
18
  dataset = pw.ForeignKeyField(Dataset, backref="toolcalls")
21
19
  thread = pw.ForeignKeyField(Thread, backref="toolcalls")
22
20
  message = pw.ForeignKeyField(Message, backref="toolcalls")
@@ -7,7 +7,6 @@ from playhouse.shortcuts import model_to_dict
7
7
 
8
8
  from flexeval.classes.base import BaseModel
9
9
  from flexeval.classes.dataset import Dataset
10
- from flexeval.classes.eval_set_run import EvalSetRun
11
10
  from flexeval.classes.thread import Thread
12
11
  from flexeval.configuration import completion_functions
13
12
 
@@ -22,7 +21,6 @@ class Turn(BaseModel):
22
21
 
23
22
  id = pw.IntegerField(primary_key=True)
24
23
 
25
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="turns")
26
24
  dataset = pw.ForeignKeyField(Dataset, backref="turns")
27
25
  thread = pw.ForeignKeyField(Thread, backref="turns")
28
26
  index_in_thread = pw.IntegerField()
@@ -32,10 +30,13 @@ class Turn(BaseModel):
32
30
  super().__init__(**kwargs)
33
31
  self.metrics_to_evaluate = []
34
32
 
35
- def get_completion(self):
33
+ def get_completion(self, completion_config: dict | None = None, evalsetrun=None):
36
34
  # only get a completion if this is the final turn - we probably don't want to branch from mid-conversation
37
35
  if self.is_final_turn_in_input:
38
- completion_config = json.loads(self.evalsetrun.completion_llm)
36
+ if completion_config is None:
37
+ raise ValueError(
38
+ "completion_config must be provided to get_completion()"
39
+ )
39
40
  completion_fn_name = completion_config.get("function_name", None)
40
41
  completion_function_kwargs = completion_config.get("kwargs", None)
41
42
 
@@ -69,7 +70,7 @@ class Turn(BaseModel):
69
70
  # - make the completion function just return content?
70
71
  # {"choices": [{"message": {"content": "hi", "role": "assistant"}}]}
71
72
  result = model_to_dict(self, exclude=[self.id])
72
- result["evalsetrun"] = self.evalsetrun
73
+ result["evalsetrun"] = evalsetrun
73
74
  result["dataset"] = self.dataset
74
75
  result["datasetrow"] = self.datasetrow
75
76
  result["turn_number"] = self.turn_number + 1
@@ -108,6 +109,7 @@ class Turn(BaseModel):
108
109
  """
109
110
  context = ""
110
111
  for message in self.messages:
112
+ # TODO why not just use message.get_context(include_system_prompt=include_system_prompt) here?
111
113
  context = message.context
112
114
  break
113
115
  context = json.loads(context)
@@ -55,10 +55,15 @@ def get_completion(turn: classes.turn.Turn, completion_llm: CompletionLlm):
55
55
  return completion
56
56
 
57
57
 
58
- def get_completions(eval_run: EvalRun, evalsetrun: classes.eval_set_run.EvalSetRun):
58
+ def get_completions(
59
+ eval_run: EvalRun,
60
+ evalsetrun: classes.eval_set_run.EvalSetRun,
61
+ datasets: list[classes.dataset.Dataset],
62
+ ):
59
63
  n_workers = eval_run.config.max_workers
64
+ threads = [thread for dataset in datasets for thread in dataset.threads]
60
65
  if n_workers == 1:
61
- for thread in evalsetrun.threads:
66
+ for thread in threads:
62
67
  # select last turn in thread
63
68
  if len(thread.turns) == 0:
64
69
  continue
@@ -75,7 +80,7 @@ def get_completions(eval_run: EvalRun, evalsetrun: classes.eval_set_run.EvalSetR
75
80
  else:
76
81
  with ThreadPoolExecutor(max_workers=n_workers) as executor:
77
82
  futures: dict[Future, classes.turn.Turn] = {}
78
- for thread in evalsetrun.threads:
83
+ for thread in threads:
79
84
  if len(thread.turns) == 0:
80
85
  continue
81
86
  turn = (
@@ -113,7 +118,6 @@ def save_completion(
113
118
  new_turn = turn
114
119
  else:
115
120
  new_turn = classes.turn.Turn.create(
116
- evalsetrun=evalsetrun,
117
121
  dataset=turn.dataset,
118
122
  thread=turn.thread,
119
123
  index_in_thread=turn.index_in_thread + 1,
@@ -129,7 +133,6 @@ def save_completion(
129
133
  {"role": prev_message.role, "content": prev_message.content}
130
134
  )
131
135
  classes.message.Message.create(
132
- evalsetrun=evalsetrun,
133
136
  dataset=turn.dataset,
134
137
  thread=turn.thread,
135
138
  turn=new_turn,