python-flexeval 0.3.0__tar.gz → 0.4.1__tar.gz

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (142)
  1. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/.github/dependabot.yml +1 -1
  2. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/CLAUDE.md +7 -4
  3. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/DEVELOPMENT.md +32 -2
  4. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/PKG-INFO +3 -3
  5. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/user_guide/abstractions.rst +22 -2
  6. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/vignettes.rst +1 -0
  7. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/pyproject.toml +2 -2
  8. python_flexeval-0.4.1/src/flexeval/__about__.py +1 -0
  9. python_flexeval-0.4.1/src/flexeval/classes/dataset.py +22 -0
  10. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/classes/eval_set_run.py +18 -7
  11. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/classes/jsonview.py +10 -5
  12. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/classes/message.py +11 -5
  13. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/classes/metric.py +0 -8
  14. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/classes/thread.py +0 -2
  15. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/classes/tool_call.py +0 -2
  16. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/classes/turn.py +7 -5
  17. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/completions.py +8 -5
  18. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/compute_metrics.py +45 -32
  19. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/configuration/evals.yaml +2 -25
  20. python_flexeval-0.4.1/src/flexeval/data_loader.py +430 -0
  21. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/db_utils.py +11 -2
  22. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/dependency_graph.py +3 -3
  23. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/eval_schema.json +0 -18
  24. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/function_types.py +2 -13
  25. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/metrics/save.py +12 -8
  26. python_flexeval-0.4.1/src/flexeval/run_utils.py +211 -0
  27. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/runner.py +6 -14
  28. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/schema/config_schema.py +12 -0
  29. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/schema/eval_schema.py +3 -0
  30. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/schema/evalrun_schema.py +41 -10
  31. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/integration/evals.yaml +3 -34
  32. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/integration/functional_tests.py +153 -241
  33. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/integration/langgraph_data.py +1 -1
  34. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/resources/functional_evals.yaml +2 -30
  35. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/mixins.py +4 -4
  36. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/test_completions.py +8 -9
  37. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/test_compute_metrics.py +30 -24
  38. python_flexeval-0.4.1/tests/unit/test_data_loader.py +652 -0
  39. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/test_function_types.py +11 -19
  40. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/test_functional.py +1 -4
  41. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/uv.lock +534 -354
  42. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/vignettes/eval_run.yaml +2 -1
  43. python_flexeval-0.4.1/vignettes/multiple_configs.py +87 -0
  44. python_flexeval-0.3.0/src/flexeval/__about__.py +0 -1
  45. python_flexeval-0.3.0/src/flexeval/classes/dataset.py +0 -82
  46. python_flexeval-0.3.0/src/flexeval/data_loader.py +0 -528
  47. python_flexeval-0.3.0/src/flexeval/run_utils.py +0 -65
  48. python_flexeval-0.3.0/tests/unit/test_data_loader.py +0 -133
  49. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/.env-example +0 -0
  50. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/.github/workflows/deploy-to-pypi.yml +0 -0
  51. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/.github/workflows/github-pages.yml +0 -0
  52. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/.github/workflows/validate.yaml +0 -0
  53. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/.gitignore +0 -0
  54. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/.pre-commit-config.yaml +0 -0
  55. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/.python-version +0 -0
  56. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/.vscode/settings.json +0 -0
  57. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/CITATION.bib +0 -0
  58. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/CITATION.cff +0 -0
  59. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/Dockerfile +0 -0
  60. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/EDM_2024_FlexEval.pdf +0 -0
  61. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/LICENSE +0 -0
  62. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/Makefile +0 -0
  63. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/README.md +0 -0
  64. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/data/metabase/.gitkeep +0 -0
  65. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docker-compose.yml +0 -0
  66. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/_static/flexeval_banner.svg +0 -0
  67. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/_static/flexeval_favicon.svg +0 -0
  68. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/_static/flexeval_logo.png +0 -0
  69. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/_static/flexeval_logo2.png +0 -0
  70. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/_templates/footer.html +0 -0
  71. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/api.rst +0 -0
  72. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/conf.py +0 -0
  73. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/getting_started.rst +0 -0
  74. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/index.rst +0 -0
  75. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/sphinxext/__init__.py +0 -0
  76. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/sphinxext/github.py +0 -0
  77. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/user_guide/cli.rst +0 -0
  78. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/user_guide/index.rst +0 -0
  79. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/user_guide/logging.rst +0 -0
  80. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/user_guide/motivation.md +0 -0
  81. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/user_guide/rubric_guide.md +0 -0
  82. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/docs/vignettes.py +0 -0
  83. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/example_project/example_specific_rubrics.yaml +0 -0
  84. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/logs/.gitkeep +0 -0
  85. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/make.bat +0 -0
  86. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/ruff.toml +0 -0
  87. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/__init__.py +0 -0
  88. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/__main__.py +0 -0
  89. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/classes/__init__.py +0 -0
  90. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/classes/base.py +0 -0
  91. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/classes/eval_runner.py +0 -0
  92. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/cli.py +0 -0
  93. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/config.yaml +0 -0
  94. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/configuration/__init__.py +0 -0
  95. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/configuration/completion_functions.py +0 -0
  96. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/configuration/function_metrics.py +0 -0
  97. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/configuration/rubric_metrics.yaml +0 -0
  98. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/helpers.py +0 -0
  99. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/io/__init__.py +0 -0
  100. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/io/parsers/yaml_parser.py +0 -0
  101. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/log_utils.py +0 -0
  102. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/metrics/__init__.py +0 -0
  103. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/metrics/access.py +0 -0
  104. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/rubric.py +0 -0
  105. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/schema/__init__.py +0 -0
  106. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/schema/rubric_schema.py +0 -0
  107. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/flexeval/schema/schema_utils.py +0 -0
  108. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/src/metabase/Dockerfile +0 -0
  109. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/__init__.py +0 -0
  110. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/data/multiturn.jsonl +0 -0
  111. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/data/plot-convos.jsonl +0 -0
  112. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/data/simple.jsonl +0 -0
  113. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/data/simple_metadata.jsonl +0 -0
  114. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/data/simple_nosystem.jsonl +0 -0
  115. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/integration/__init__.py +0 -0
  116. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/integration/config-tests.yaml +0 -0
  117. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/integration/data/multiturn.jsonl +0 -0
  118. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/integration/data/plot-convos.jsonl +0 -0
  119. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/integration/data/simple.jsonl +0 -0
  120. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/resources/function_metric.py +0 -0
  121. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/resources/functional_config.yaml +0 -0
  122. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/resources/test_config.yaml +0 -0
  123. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/resources/test_dataset.jsonl +0 -0
  124. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/resources/test_evals.yaml +0 -0
  125. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/resources/test_rubric_metrics.yaml +0 -0
  126. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/resources/unittest.env +0 -0
  127. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/__init__.py +0 -0
  128. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/io/test_yaml_parser.py +0 -0
  129. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/test_db_utils.py +0 -0
  130. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/test_dependency_graph.py +0 -0
  131. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/test_eval_runner.py +0 -0
  132. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/test_function_metrics.py +0 -0
  133. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/test_rubric.py +0 -0
  134. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/tests/unit/test_schema.py +0 -0
  135. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/vignettes/.gitignore +0 -0
  136. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/vignettes/basic.py +0 -0
  137. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/vignettes/basic_cli.md +0 -0
  138. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/vignettes/basic_rubric.py +0 -0
  139. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/vignettes/conversations.jsonl +0 -0
  140. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/vignettes/custom_functions.py +0 -0
  141. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/vignettes/custom_rubric.md +0 -0
  142. {python_flexeval-0.3.0 → python_flexeval-0.4.1}/vignettes/custom_rubrics.yaml +0 -0
@@ -5,4 +5,4 @@ updates:
5
5
  directory: "/"
6
6
  schedule:
7
7
  interval: "weekly"
8
- target-branch: "dev"
8
+ target-branch: "main"
@@ -59,7 +59,7 @@ FlexEval is a tool for evaluating LLM-powered systems using custom metrics, comp
59
59
  ### Core Abstractions
60
60
 
61
61
  **EvalRun** (`src/flexeval/schema/evalrun_schema.py`): The top-level execution unit that combines:
62
- - Data sources (conversations in JSONL format as inputs, an SQLite filepath as output)
62
+ - Data sources (polymorphic via `type` discriminator: `FileDataSource`, `NamedDataSource`, `IterableDataSource`)
63
63
  - An Eval specification (metrics to compute)
64
64
  - Configuration (workers, database path, etc.)
65
65
  - Rubric and function sources
@@ -71,15 +71,18 @@ FlexEval is a tool for evaluating LLM-powered systems using custom metrics, comp
71
71
  - Grader LLM (for rubric evaluation)
72
72
  - Dependencies between metrics
73
73
 
74
- **Config** (`src/flexeval/schema/config_schema.py`): Defines how to evaluate (e.g. single- vs multi-process, etc.)
74
+ **Config** (`src/flexeval/schema/config_schema.py`): Defines how to evaluate (e.g. single- vs multi-process, dataset reuse/naming constraints, etc.)
75
75
 
76
76
  ### Data Hierarchy
77
77
  The evaluation operates at multiple levels of granularity:
78
+ - **Dataset** (`src/flexeval/classes/dataset.py`): Container for loaded data, linked to EvalSetRuns via many-to-many join table (`EvalSetRunDatasets`). Datasets can be reused across multiple eval runs.
78
79
  - **Thread**: Full conversation
79
- - **Turn**: User-assistant exchange pair
80
+ - **Turn**: User-assistant exchange pair
80
81
  - **Message**: Individual message from user or assistant
81
82
  - **ToolCall**: Function/tool invocation within a message
82
83
 
84
+ Thread, Turn, Message, and ToolCall belong to a Dataset. Metrics belong to both an EvalSetRun and a Dataset.
85
+
83
86
  ### Key Components
84
87
 
85
88
  **Configuration System**:
@@ -89,7 +92,7 @@ The evaluation operates at multiple levels of granularity:
89
92
 
90
93
  **Execution Pipeline** (`src/flexeval/runner.py`):
91
94
  1. Load configuration and eval specification
92
- 2. Create Dataset from data sources
95
+ 2. Create Datasets from data sources and link to EvalSetRun via `EvalSetRunDatasets`
93
96
  3. Run EvalRunner to compute metrics
94
97
  4. Store results in SQLite database
95
98
 
@@ -30,7 +30,9 @@ uv sync --upgrade --all-groups
30
30
  uv build
31
31
  ```
32
32
 
33
- ### Running tests
33
+ ### Unit tests
34
+
35
+ Unit tests live in `tests/unit/` and are run in CI.
34
36
 
35
37
  Run the unit tests:
36
38
 
@@ -46,7 +48,35 @@ To run a specific file's tests:
46
48
  uv run python -m unittest tests.unit.{module_name}
47
49
  ```
48
50
 
49
- There are integration tests in tests/integration that can be executed.
51
+ ### Integration tests
52
+
53
+ Integration tests live in `tests/integration/` and are **not** run in CI.
54
+
55
+ Run the integration tests:
56
+
57
+ ```bash
58
+ uv run python -m unittest tests.integration.functional_tests
59
+ ```
60
+
61
+ **Prerequisites:**
62
+ - An `.env` file at the repo root with `OPENAI_API_KEY` set
63
+ - Suites with rubric metrics (`TestSuite04`) make **real API calls** to OpenAI (gpt-5.4-nano)
64
+ - Function-only suites (`TestSuite01`, `TestSuite02`, `TestSuite03`) do not require API keys
65
+ - LangGraph-based test suites use pre-generated test data from `tests/resources/langgraph-test-data.db`
66
+
67
+ To run only the function-metric suites (no API key required):
68
+
69
+ ```bash
70
+ uv run python -m unittest tests.integration.functional_tests.TestSuite01 tests.integration.functional_tests.TestSuite02 tests.integration.functional_tests.TestSuite03
71
+ ```
72
+
73
+ **Regenerating LangGraph test data:**
74
+
75
+ The file `tests/resources/langgraph-test-data.db` is pre-generated. To regenerate it (requires `OPENAI_API_KEY`):
76
+
77
+ ```bash
78
+ uv run python tests/integration/langgraph_data.py
79
+ ```
50
80
 
51
81
  ### Adding or updating dependencies
52
82
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-flexeval
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
5
5
  Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
6
6
  Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
@@ -21,8 +21,8 @@ Requires-Dist: flatten-json>=0.1.14
21
21
  Requires-Dist: jsonschema>=4.23.0
22
22
  Requires-Dist: langchain-openai>=0.3.8
23
23
  Requires-Dist: langchain>=0.3.20
24
- Requires-Dist: langgraph-checkpoint-sqlite>=2.0.6
25
- Requires-Dist: langgraph>=0.3.6
24
+ Requires-Dist: langgraph-checkpoint-sqlite>=3.0.0
25
+ Requires-Dist: langgraph>=1.0.0
26
26
  Requires-Dist: litellm>=1.74.3
27
27
  Requires-Dist: msgpack>=1.1.0
28
28
  Requires-Dist: networkx>=3.4.2
@@ -16,7 +16,7 @@ An evaluation is represented by :class:`flexeval.schema.eval_schema.Eval`, and c
16
16
  - **Functions**: :class:`~flexeval.schema.eval_schema.FunctionItem`\s apply a Python function to the test data, returning a numeric value.
17
17
  - **Rubrics**: :class:`~flexeval.schema.eval_schema.RubricItem`\s use a configured :class:`~flexeval.schema.eval_schema.GraderLlm` function and the provided rubric template to generate a numeric score from an LLM's output.
18
18
 
19
- You execute an :class:`~flexeval.schema.eval_schema.Eval` by creating an :class:`flexeval.schema.evalrun_schema.EvalRun`.
19
+ You execute an :class:`~flexeval.schema.eval_schema.Eval` by creating an :class:`flexeval.schema.evalrun_schema.EvalRun`.
20
20
  EvalRun contains:
21
21
 
22
22
  - Data sources (conversations as inputs, an SQLite filepath as output)
@@ -26,11 +26,31 @@ EvalRun contains:
26
26
 
27
27
  The :class:`~flexeval.schema.config_schema.Config` includes details about multi-threaded metric computation, about logging, etc.
28
28
 
29
+ Data Sources
30
+ ------------
31
+
32
+ Data sources can be any of these types:
33
+
34
+ - :class:`~flexeval.schema.evalrun_schema.FileDataSource` (``type: file``): Load from a JSONL or LangGraph SQLite file. This is the most common data source.
35
+ - :class:`~flexeval.schema.evalrun_schema.NamedDataSource` (``type: named``): Reference a previously loaded dataset by name, enabling dataset reuse across eval runs.
36
+ - :class:`~flexeval.schema.evalrun_schema.IterableDataSource` (``type: iterable``): Load from an in-memory Python iterable (programmatic use only).
37
+
38
+ In YAML configurations, specify the ``type`` field::
39
+
40
+ data_sources:
41
+ - type: file
42
+ path: conversations.jsonl
43
+
44
+ In Python, the type is set automatically when you construct the appropriate class::
45
+
46
+ data_sources = [FileDataSource(path="conversations.jsonl")]
47
+
29
48
  Data Hierarchy
30
49
  --------------
31
50
 
32
- Metrics can operate at any of four levels of granularity:
51
+ Data is organized at several levels of granularity:
33
52
 
53
+ - :class:`~flexeval.classes.dataset.Dataset`: A loaded collection of conversations. Datasets can be shared across multiple eval runs.
34
54
  - :class:`~flexeval.classes.thread.Thread`: Full conversation
35
55
  - :class:`~flexeval.classes.turn.Turn`: Adjacent set of messages from the same user or assistant
36
56
  - :class:`~flexeval.classes.message.Message`: Individual message from user or assistant
@@ -13,6 +13,7 @@ These vignettes demonstrate how to use FlexEval.
13
13
  generated/vignettes/basic_rubric
14
14
  generated/vignettes/custom_rubric
15
15
  generated/vignettes/basic_cli
16
+ generated/vignettes/multiple_configs
16
17
  generated/vignettes/metric_analysis
17
18
 
18
19
 
@@ -28,8 +28,8 @@ dependencies = [
28
28
  "jsonschema>=4.23.0",
29
29
  "langchain>=0.3.20",
30
30
  "langchain-openai>=0.3.8",
31
- "langgraph>=0.3.6",
32
- "langgraph-checkpoint-sqlite>=2.0.6",
31
+ "langgraph>=1.0.0",
32
+ "langgraph-checkpoint-sqlite>=3.0.0",
33
33
  "litellm>=1.74.3",
34
34
  "msgpack>=1.1.0",
35
35
  "networkx>=3.4.2",
@@ -0,0 +1 @@
1
+ __version__ = "0.4.1"
@@ -0,0 +1,22 @@
1
+ import logging
2
+ from datetime import datetime
3
+
4
+ import peewee as pw
5
+
6
+ from flexeval.classes.base import BaseModel
7
+ from flexeval.classes.jsonview import JsonView
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class Dataset(BaseModel):
13
+ """Holds a dataset, e.g. a jsonl file"""
14
+
15
+ id = pw.IntegerField(primary_key=True)
16
+ timestamp = pw.DateTimeField(default=datetime.now)
17
+ datasource_type = pw.TextField(null=False)
18
+ name = pw.TextField(default=None, null=True)
19
+ notes = pw.TextField(default=None, null=True)
20
+ is_loaded = pw.BooleanField(default=False)
21
+ metadata = pw.TextField(default="{}", null=False)
22
+ metadata_dict = JsonView("metadata")
@@ -1,9 +1,9 @@
1
- import json
2
1
  from datetime import datetime
3
2
 
4
3
  import peewee as pw
5
4
 
6
5
  from flexeval.classes.base import BaseModel
6
+ from flexeval.classes.dataset import Dataset
7
7
 
8
8
 
9
9
  class EvalSetRun(BaseModel):
@@ -12,7 +12,6 @@ class EvalSetRun(BaseModel):
12
12
  id = pw.IntegerField(primary_key=True)
13
13
  name = pw.CharField(null=True)
14
14
  notes = pw.TextField(null=True)
15
- dataset_files = pw.TextField() # JSON string
16
15
  metrics = pw.TextField()
17
16
  metrics_graph_ordered_list = pw.TextField()
18
17
  do_completion = pw.BooleanField()
@@ -25,8 +24,20 @@ class EvalSetRun(BaseModel):
25
24
  default=datetime.now
26
25
  ) # Automatically set to current date and time
27
26
 
28
- def get_datasets(self) -> list[str]:
29
- # TODO Turn these into DataSource instances instead, returning list[DataSource]
30
- temp = json.loads(self.dataset_files)
31
- assert isinstance(temp, list), "The `data` entry in evals.yaml must be a list."
32
- return temp
27
+ @property
28
+ def dataset_list(self) -> list[Dataset]:
29
+ """Returns the actual Dataset objects linked to this EvalSetRun via the join table."""
30
+ return list(
31
+ Dataset.select()
32
+ .join(EvalSetRunDatasets)
33
+ .where(EvalSetRunDatasets.evalsetrun == self)
34
+ )
35
+
36
+
37
+ class EvalSetRunDatasets(BaseModel):
38
+ """Datasets used by an EvalSetRun."""
39
+
40
+ id = pw.IntegerField(primary_key=True)
41
+ timestamp = pw.DateTimeField(default=datetime.now)
42
+ evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="dataset_links")
43
+ dataset = pw.ForeignKeyField(Dataset, backref="evalsetrun_links")
@@ -23,7 +23,7 @@ class JsonViewDict(UserDict):
23
23
 
24
24
  def _sync_to_model(self):
25
25
  """Sync the current data back to the model field."""
26
- json_str = self.json_loads_fn(self.data)
26
+ json_str = self.json_dumps_fn(self.data)
27
27
  setattr(self.model_instance, self.text_field_attr_name, json_str)
28
28
 
29
29
  # Override mutating methods to trigger sync
@@ -58,6 +58,14 @@ class JsonViewDict(UserDict):
58
58
  super().update(*args, **kwargs)
59
59
  self._sync_to_model()
60
60
 
61
+ def refresh_from_model(self):
62
+ """If the text attribute has been mutated in the model, this method brings the view back in sync.
63
+
64
+ If you're going to use the JsonView, avoid mutating the text attribute directly.
65
+ """
66
+ text_value = getattr(self.model_instance, self.text_field_attr_name)
67
+ self.update(self.json_loads_fn(text_value))
68
+
61
69
 
62
70
  class JsonView:
63
71
  """Descriptor that provides dict-like access to a JSON text field.
@@ -66,9 +74,6 @@ class JsonView:
66
74
  class SomeModel(pw.Model):
67
75
  some_field = pw.TextField(default="{}")
68
76
  some_field_dict = JsonView(text_field_attr_name="some_field")
69
-
70
- m = SomeModel()
71
- m.some_field_dict["chosen_mistake"] = "whatever"
72
77
  """
73
78
 
74
79
  def __init__(self, text_field_attr_name):
@@ -79,7 +84,7 @@ class JsonView:
79
84
  """Called when the descriptor is assigned to a class attribute."""
80
85
  self.attr_name = f"_{name}_dict"
81
86
 
82
- def __get__(self, instance, owner):
87
+ def __get__(self, instance, owner) -> JsonViewDict:
83
88
  if instance is None:
84
89
  return self
85
90
 
@@ -7,7 +7,6 @@ from playhouse.shortcuts import model_to_dict
7
7
 
8
8
  from flexeval.classes.base import BaseModel
9
9
  from flexeval.classes.dataset import Dataset
10
- from flexeval.classes.eval_set_run import EvalSetRun
11
10
  from flexeval.classes.thread import Thread
12
11
  from flexeval.classes.turn import Turn
13
12
  from flexeval.classes.jsonview import JsonView
@@ -24,7 +23,6 @@ class Message(BaseModel):
24
23
 
25
24
  id = pw.IntegerField(primary_key=True)
26
25
 
27
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="messages")
28
26
  dataset = pw.ForeignKeyField(Dataset, backref="messages")
29
27
  thread = pw.ForeignKeyField(Thread, backref="messages")
30
28
  index_in_thread = pw.IntegerField()
@@ -71,10 +69,18 @@ class Message(BaseModel):
71
69
  super().__init__(**kwargs)
72
70
  self.metrics_to_evaluate = []
73
71
 
74
- def get_completion(self, include_system_prompt=False):
72
+ def get_completion(
73
+ self,
74
+ include_system_prompt=False,
75
+ completion_config: dict | None = None,
76
+ evalsetrun=None,
77
+ ):
75
78
  # only get a completion if this is the final turn - we probably don't want to branch from mid-conversation
76
79
  if self.is_final_turn_in_input:
77
- completion_config = json.loads(self.evalsetrun.completion_llm)
80
+ if completion_config is None:
81
+ raise ValueError(
82
+ "completion_config must be provided to get_completion()"
83
+ )
78
84
  completion_fn_name = completion_config.get("function_name", None)
79
85
  completion_function_kwargs = completion_config.get("kwargs", None)
80
86
 
@@ -104,7 +110,7 @@ class Message(BaseModel):
104
110
  # which generally means it'll have a structure like this
105
111
  # {"choices": [{"message": {"content": "hi", "role": "assistant"}}]}
106
112
  result = model_to_dict(self, exclude=[self.id])
107
- result["evalsetrun"] = self.evalsetrun
113
+ result["evalsetrun"] = evalsetrun
108
114
  result["dataset"] = self.dataset
109
115
  result["datasetrow"] = self.datasetrow
110
116
  result["turn_number"] = self.turn_number + 1
@@ -37,14 +37,6 @@ class Metric(BaseModel):
37
37
  null=True
38
38
  ) # necessary if rubric result is INVALID or e.g. latency doesn't apply to the very first message
39
39
  kwargs = pw.TextField()
40
- # context_only allows us to create another kind of dependency
41
- # where we can quantify something about the previous conversation
42
- # and then use that quantity in a downstream analysis
43
- # e.g. 'would a plot be pedagogically appropriate here' is really a question about the PAST of the conversation
44
- # NOTE: but we have gotten rid of context_only for rubrics, where only {context} is used so technically here 'context_only' is False
45
- # or 'was the conversation ever flagged by the moderation api' would be a question about the previous turns that might
46
- # allow to have better context for the properties of this turn
47
- # context_only = pw.BooleanField(default=False)
48
40
  source = pw.TextField() # TODO - make another table for this? But maybe not, because this also contains filled-in rubrics
49
41
  depends_on = pw.TextField()
50
42
  rubric_prompt = pw.TextField(null=True)
@@ -2,7 +2,6 @@ import peewee as pw
2
2
 
3
3
  from flexeval.classes.base import BaseModel
4
4
  from flexeval.classes.dataset import Dataset
5
- from flexeval.classes.eval_set_run import EvalSetRun
6
5
  from flexeval.classes.jsonview import JsonView
7
6
 
8
7
 
@@ -13,7 +12,6 @@ class Thread(BaseModel):
13
12
 
14
13
  id = pw.IntegerField(primary_key=True)
15
14
  dataset = pw.ForeignKeyField(Dataset, backref="threads")
16
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="threads")
17
15
 
18
16
  langgraph_thread_id = pw.TextField(null=True)
19
17
  eval_run_thread_id = pw.TextField(null=True)
@@ -2,7 +2,6 @@ import peewee as pw
2
2
 
3
3
  from flexeval.classes.base import BaseModel
4
4
  from flexeval.classes.dataset import Dataset
5
- from flexeval.classes.eval_set_run import EvalSetRun
6
5
  from flexeval.classes.message import Message
7
6
  from flexeval.classes.thread import Thread
8
7
  from flexeval.classes.turn import Turn
@@ -16,7 +15,6 @@ class ToolCall(BaseModel):
16
15
 
17
16
  id = pw.IntegerField(primary_key=True)
18
17
 
19
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="toolcalls")
20
18
  dataset = pw.ForeignKeyField(Dataset, backref="toolcalls")
21
19
  thread = pw.ForeignKeyField(Thread, backref="toolcalls")
22
20
  message = pw.ForeignKeyField(Message, backref="toolcalls")
@@ -7,7 +7,6 @@ from playhouse.shortcuts import model_to_dict
7
7
 
8
8
  from flexeval.classes.base import BaseModel
9
9
  from flexeval.classes.dataset import Dataset
10
- from flexeval.classes.eval_set_run import EvalSetRun
11
10
  from flexeval.classes.thread import Thread
12
11
  from flexeval.configuration import completion_functions
13
12
 
@@ -22,7 +21,6 @@ class Turn(BaseModel):
22
21
 
23
22
  id = pw.IntegerField(primary_key=True)
24
23
 
25
- evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="turns")
26
24
  dataset = pw.ForeignKeyField(Dataset, backref="turns")
27
25
  thread = pw.ForeignKeyField(Thread, backref="turns")
28
26
  index_in_thread = pw.IntegerField()
@@ -32,10 +30,13 @@ class Turn(BaseModel):
32
30
  super().__init__(**kwargs)
33
31
  self.metrics_to_evaluate = []
34
32
 
35
- def get_completion(self):
33
+ def get_completion(self, completion_config: dict | None = None, evalsetrun=None):
36
34
  # only get a completion if this is the final turn - we probably don't want to branch from mid-conversation
37
35
  if self.is_final_turn_in_input:
38
- completion_config = json.loads(self.evalsetrun.completion_llm)
36
+ if completion_config is None:
37
+ raise ValueError(
38
+ "completion_config must be provided to get_completion()"
39
+ )
39
40
  completion_fn_name = completion_config.get("function_name", None)
40
41
  completion_function_kwargs = completion_config.get("kwargs", None)
41
42
 
@@ -69,7 +70,7 @@ class Turn(BaseModel):
69
70
  # - make the completion function just return content?
70
71
  # {"choices": [{"message": {"content": "hi", "role": "assistant"}}]}
71
72
  result = model_to_dict(self, exclude=[self.id])
72
- result["evalsetrun"] = self.evalsetrun
73
+ result["evalsetrun"] = evalsetrun
73
74
  result["dataset"] = self.dataset
74
75
  result["datasetrow"] = self.datasetrow
75
76
  result["turn_number"] = self.turn_number + 1
@@ -108,6 +109,7 @@ class Turn(BaseModel):
108
109
  """
109
110
  context = ""
110
111
  for message in self.messages:
112
+ # TODO why not just use message.get_context(include_system_prompt=include_system_prompt) here?
111
113
  context = message.context
112
114
  break
113
115
  context = json.loads(context)
@@ -55,10 +55,15 @@ def get_completion(turn: classes.turn.Turn, completion_llm: CompletionLlm):
55
55
  return completion
56
56
 
57
57
 
58
- def get_completions(eval_run: EvalRun, evalsetrun: classes.eval_set_run.EvalSetRun):
58
+ def get_completions(
59
+ eval_run: EvalRun,
60
+ evalsetrun: classes.eval_set_run.EvalSetRun,
61
+ datasets: list[classes.dataset.Dataset],
62
+ ):
59
63
  n_workers = eval_run.config.max_workers
64
+ threads = [thread for dataset in datasets for thread in dataset.threads]
60
65
  if n_workers == 1:
61
- for thread in evalsetrun.threads:
66
+ for thread in threads:
62
67
  # select last turn in thread
63
68
  if len(thread.turns) == 0:
64
69
  continue
@@ -75,7 +80,7 @@ def get_completions(eval_run: EvalRun, evalsetrun: classes.eval_set_run.EvalSetR
75
80
  else:
76
81
  with ThreadPoolExecutor(max_workers=n_workers) as executor:
77
82
  futures: dict[Future, classes.turn.Turn] = {}
78
- for thread in evalsetrun.threads:
83
+ for thread in threads:
79
84
  if len(thread.turns) == 0:
80
85
  continue
81
86
  turn = (
@@ -113,7 +118,6 @@ def save_completion(
113
118
  new_turn = turn
114
119
  else:
115
120
  new_turn = classes.turn.Turn.create(
116
- evalsetrun=evalsetrun,
117
121
  dataset=turn.dataset,
118
122
  thread=turn.thread,
119
123
  index_in_thread=turn.index_in_thread + 1,
@@ -129,7 +133,6 @@ def save_completion(
129
133
  {"role": prev_message.role, "content": prev_message.content}
130
134
  )
131
135
  classes.message.Message.create(
132
- evalsetrun=evalsetrun,
133
136
  dataset=turn.dataset,
134
137
  thread=turn.thread,
135
138
  turn=new_turn,
@@ -14,6 +14,7 @@ from typing import Iterable, Union
14
14
  import networkx as nx
15
15
 
16
16
  from flexeval import function_types
17
+ from flexeval.classes.dataset import Dataset
17
18
  from flexeval.classes.eval_set_run import EvalSetRun
18
19
  from flexeval.classes.message import Message
19
20
  from flexeval.classes.thread import Thread
@@ -159,8 +160,8 @@ class MetricGraphBuilder:
159
160
  metric = self.metric_id_map[metric_id]
160
161
  return self.get_or_create_object_metric(dependency_metric_level, object, metric)
161
162
 
162
- def build_thread_task_graphs(self, evalsetrun: EvalSetRun) -> Iterable[nx.DiGraph]:
163
- threads = evalsetrun.threads
163
+ def build_thread_task_graphs(self, dataset: Dataset) -> Iterable[nx.DiGraph]:
164
+ threads = dataset.threads
164
165
  for thread in threads:
165
166
  yield self.build_thread_task_graph(thread)
166
167
 
@@ -208,28 +209,35 @@ class MetricGraphBuilder:
208
209
  return g
209
210
 
210
211
 
211
- def compute_metrics(evalrun: EvalRun, evalsetrun: EvalSetRun) -> list[dict]:
212
+ def compute_metrics(
213
+ evalrun: EvalRun, evalsetrun: EvalSetRun, datasets: list[Dataset]
214
+ ) -> list[dict]:
212
215
  n_workers = evalrun.config.max_workers
213
216
  raise_on_error = evalrun.config.raise_on_metric_error
214
217
  mgb = MetricGraphBuilder()
215
218
  mgb.build_metric_structures(evalsetrun)
216
- graphs = mgb.build_thread_task_graphs(evalsetrun)
217
219
  mc = MetricComputer.from_evalrun(evalrun, evalsetrun)
218
220
  metrics = []
219
- if n_workers == 1:
220
- for graph in graphs:
221
- graph_metrics = mc.process_thread_dependency_graph(graph, raise_on_error)
222
- metrics.extend(graph_metrics)
223
- else:
224
- with ThreadPoolExecutor(max_workers=n_workers) as executor:
225
- futures = []
221
+ for dataset in datasets:
222
+ graphs = mgb.build_thread_task_graphs(dataset)
223
+ if n_workers == 1:
226
224
  for graph in graphs:
227
- future = executor.submit(mc.process_thread_dependency_graph, graph)
228
- futures.append(future)
229
- for i, future in enumerate(futures):
230
- metrics.extend(future.result())
231
- if i % 100 == 0:
232
- logger.info(f"Metrics futures resulted: {i + 1} / {len(futures)}")
225
+ graph_metrics = mc.process_thread_dependency_graph(
226
+ graph, raise_on_error
227
+ )
228
+ metrics.extend(graph_metrics)
229
+ else:
230
+ with ThreadPoolExecutor(max_workers=n_workers) as executor:
231
+ futures = []
232
+ for graph in graphs:
233
+ future = executor.submit(mc.process_thread_dependency_graph, graph)
234
+ futures.append(future)
235
+ for i, future in enumerate(futures):
236
+ metrics.extend(future.result())
237
+ if i % 100 == 0:
238
+ logger.info(
239
+ f"Metrics futures resulted: {i + 1} / {len(futures)}"
240
+ )
233
241
  return metrics
234
242
 
235
243
 
@@ -296,10 +304,18 @@ class MetricComputer:
296
304
  self.rubrics: dict | None = (
297
305
  self.load_rubrics(evalsetrun) if evalsetrun is not None else None
298
306
  )
307
+ self.do_completion: bool = (
308
+ evalsetrun.do_completion if evalsetrun is not None else False
309
+ )
310
+ self.grader_llm: str | None = (
311
+ evalsetrun.grader_llm if evalsetrun is not None else None
312
+ )
299
313
 
300
- def load_rubrics(self, evalsetrun: EvalSetRun):
301
- """Set the rubrics to be used by this MetricComputer from the given EvalSetRun."""
302
- self.rubrics = json.loads(evalsetrun.rubrics)
314
+ def load_rubrics(self, evalsetrun: EvalSetRun) -> dict:
315
+ """Load and return rubrics from the given EvalSetRun."""
316
+ rubrics = json.loads(evalsetrun.rubrics)
317
+ self.rubrics = rubrics
318
+ return rubrics
303
319
 
304
320
  def process_thread_dependency_graphs(
305
321
  self, graph_list: Iterable[nx.DiGraph]
@@ -467,7 +483,6 @@ class MetricComputer:
467
483
  evaluation_type: str,
468
484
  metric_level: str,
469
485
  kwargs: dict,
470
- context_only: bool = None,
471
486
  depends_on: list = None,
472
487
  id: int = None,
473
488
  notes: str = None, # just a placeholder
@@ -477,7 +492,6 @@ class MetricComputer:
477
492
  function_name=evaluation_name,
478
493
  metric_kwargs=kwargs,
479
494
  metric_level=metric_level,
480
- context_only=context_only,
481
495
  input_object=object,
482
496
  depends_on=depends_on,
483
497
  id=id,
@@ -515,10 +529,9 @@ class MetricComputer:
515
529
  metric_level: eval_schema.MetricLevel,
516
530
  input_object: function_types.AnyFunctionObjectInput,
517
531
  metric_kwargs: dict,
518
- context_only: bool,
519
532
  ):
520
533
  function_input = function_types.get_function_input(
521
- metric_function, metric_level, input_object, context_only
534
+ metric_function, metric_level, input_object
522
535
  )
523
536
  metrics_result = metric_function(function_input, **metric_kwargs)
524
537
  return metrics_result
@@ -541,7 +554,6 @@ class MetricComputer:
541
554
  metric_kwargs: dict,
542
555
  input_object: Union[Thread, Turn, Message, ToolCall],
543
556
  metric_level: eval_schema.MetricLevel,
544
- context_only: bool,
545
557
  depends_on: list,
546
558
  id: int,
547
559
  ):
@@ -552,7 +564,7 @@ class MetricComputer:
552
564
  # Check if the function exists in any of the function namespaces
553
565
  metric_function, metric_source = self.find_function(function_name)
554
566
  metrics_result = self.invoke_function(
555
- metric_function, metric_level, input_object, metric_kwargs, context_only
567
+ metric_function, metric_level, input_object, metric_kwargs
556
568
  )
557
569
 
558
570
  base_result = {
@@ -562,7 +574,6 @@ class MetricComputer:
562
574
  "metric_level": metric_level,
563
575
  "kwargs": metric_kwargs,
564
576
  "source": metric_source, # TODO - put this back?
565
- "context_only": context_only,
566
577
  "depends_on": depends_on,
567
578
  "id": id,
568
579
  }
@@ -611,7 +622,9 @@ class MetricComputer:
611
622
  if self.rubrics is not None:
612
623
  rubrics = self.rubrics
613
624
  else:
614
- rubrics = json.loads(object.evalsetrun.rubrics)
625
+ raise ValueError(
626
+ "No rubrics loaded. Rubrics must be loaded via MetricComputer.from_evalrun() before computing rubric metrics."
627
+ )
615
628
  if rubric_name not in rubrics:
616
629
  raise ValueError(
617
630
  f"You requested a rubric called '{rubric_name}', but only these were found: {rubrics.keys()}."
@@ -643,7 +656,7 @@ class MetricComputer:
643
656
  "Your rubric should not have both {content} and {completion}. Please check the README file for more information about how to write FlexEval rubrics."
644
657
  )
645
658
 
646
- if "{completion}" in prompt and not object.evalsetrun.do_completion:
659
+ if "{completion}" in prompt and not self.do_completion:
647
660
  raise Exception(
648
661
  "Your rubric has {completion}, but in your test specification for this rubric evaluation, do_completion is not True. Please check the README file for more information about how to write FlexEval rubrics."
649
662
  )
@@ -656,7 +669,7 @@ class MetricComputer:
656
669
  )
657
670
 
658
671
  # with do_completion == True, only the completion is evaluated with or without the context.
659
- if object.evalsetrun.do_completion and "{completion}" in prompt:
672
+ if self.do_completion and "{completion}" in prompt:
660
673
  # TODO revisit this logic
661
674
  # also included object.is_completion, which only works for Message rubrics
662
675
  # but we can in principle check for a message in either a turn or a thread with is_flexeval_completion true
@@ -665,11 +678,11 @@ class MetricComputer:
665
678
  choice_scores = rubrics.get(rubric_name).get("choice_scores")
666
679
 
667
680
  # get rubric grader
668
- if object.evalsetrun.grader_llm is None or object.evalsetrun.grader_llm == "":
681
+ if self.grader_llm is None or self.grader_llm == "":
669
682
  raise ValueError(
670
683
  "Attempting to evaluate a rubric metric, but no grader LLM defined."
671
684
  )
672
- grader_completion_function = json.loads(object.evalsetrun.grader_llm)
685
+ grader_completion_function = json.loads(self.grader_llm)
673
686
  if grader_completion_function is None or len(grader_completion_function) == 0:
674
687
  raise ValueError(
675
688
  "Attempting to evaluate a rubric metric, but no grader LLM defined."